In [113]:
import pandas as pd
import numpy as np
import pickle

In [114]:
#Uncomment the line below to install stanza
#!pip install stanza

In [115]:
from util import spacy_get_sents, stanza_get_sents, spacy_tokenize_text, stanza_tokenize_text, get_texts

In [116]:
from util import get_tokens_dict, common_tokens

# Data collection

For our further inverstigations we will choose 100 texts from "Writers" category that we previously stored:

In [117]:
texts = get_texts(source_type='folder', path='Writers', token_type='file', n=100)

Below we can see the sample of 5 files from the dataset:

In [118]:
texts[:5]

['Anne Von Bertouch, (29 June 1915 – 31 March 2003) was an Australian art dealer, author, environmentalist and director of the Von Bertouch Galleries in Newcastle, New South Wales, believed to be the first commercial gallery outside a capital city in Australia. Biography Born Anne Catherine, to parents Jean (née Duff) and George Whittle on 29 June 1915 in Eastwood, New South Wales, she was educated at Sydney Girls High School and Armidale Teachers College. After teacher training her first posting was at Adamstown Infants School in the 1930s. She married Roger Von Bertouch in 1939. In 1941 she was the organiser of a National Fitness Camp for girls at Broken Bay, New South Wales.She and Roger moved to Tasmania, where they taught and she studied at Hobart Technical College. In 1942 she performed modern interpretive dance at a Town Hall concert in Hobart organised by the Australian Broadcasting Commission Patriotic Committee as a fundraiser on Allies\' Appeal Day. She danced also in Hobart

# Sentence segmentation

For our selected 100 articles we will perform sentence tokenization using Spacy and Stanza.
Each file will be separated into sentences separately.

In [119]:
spacy_sents = [spacy_get_sents(x) for x in texts]

In [120]:
# To run for the first time Stanza tokenization:
""" 
stanza_sents = [stanza_get_sents(x) for x in texts]
with open('SpacyStanza/stanza_sents.pickle', 'wb') as f:
    pickle.dump(stanza_sents, f))

"""

# To load Stanza tokens from a pickle file

with open('SpacyStanza/stanza_sents.pickle', 'rb') as f:
    stanza_sents = pickle.load(f)

We can now show the number of sentences calculated by each library for our articles:

In [121]:
sents_df = pd.DataFrame(columns=['Spacy', 'Stanza'])

sents_df.Spacy = list(map(lambda x: len(x), spacy_sents))
sents_df.Stanza = list(map(lambda x: len(x), stanza_sents))

sents_df

Unnamed: 0,Spacy,Stanza
0,107,95
1,37,33
2,8,9
3,35,27
4,28,29
...,...,...
95,32,30
96,21,11
97,18,17
98,7,7


We can look at the stats for all articles, it appears that Spacy on average finds more sentences:

In [122]:
sents_df.describe()

Unnamed: 0,Spacy,Stanza
count,100.0,100.0
mean,30.77,28.23
std,26.211978,24.465129
min,3.0,3.0
25%,13.0,11.0
50%,25.0,21.5
75%,37.0,33.25
max,155.0,143.0


Below we can see the total number of sentences recognized by the both libraries:

In [123]:
sents_df.sum()

Spacy     3077
Stanza    2823
dtype: int64

We can notice that overall Stanza splits into sentences less frequently than Spacy does.

We can confirm it using a t-test. Since the tokenization by Spacy and Stanza are performed on the same articles, we will consider the samples to be dependent. Let's set p-value to be < 0.05.



$H_{0}$: Average number of sentences per article generated by Spacy and Stanza is the same.

$H_{1}$: Average number of sentences per article generated by Spacy and Stanza is the same.


In [124]:
from scipy.stats import ttest_rel

In [125]:
ttest_rel(sents_df.Spacy, sents_df.Stanza)

TtestResult(statistic=4.36851507267565, pvalue=3.085063323539573e-05, df=99)

Since p-value is very small we can reject the null hypothesis, and accept the $H_{1}$. And since the value of statistic is positive, we can accept the theory that average of Spacy sentences per article is higher than Stanza.

_____________

Now we can find all unique sentences that both recognize.
First we can look at the number shared sentences per article:

In [126]:
# Make a set of unique sentences for each article for both Spacy and Stanza.
# Find the intersection = these are sentences recognized by both libraries.

shared_sents_per_article = list(map(lambda x,y: set(x).intersection(set(y)), spacy_sents, stanza_sents))

shared_sents_per_article_df = pd.DataFrame(columns=['shared_sents', 'count'])
shared_sents_per_article_df['shared_sents'] = shared_sents_per_article
shared_sents_per_article_df['count'] = [len(x) for x in shared_sents_per_article]
shared_sents_per_article_df

Unnamed: 0,shared_sents,count
0,"{They mustn't attempt to please., In 1941 she ...",68
1,{He has been an invitee to numerous events on ...,24
2,{John Peck is an American Marine sergeant who ...,7
3,"{A statue of him, unveiled in 1950, stands at ...",12
4,{Northcroft graduated with an MA in English li...,27
...,...,...
95,"{According to Yehuda Marton, an Israeli-Hungar...",26
96,"{In Aghmat, in the year 1190, he wrote a Talmu...",10
97,"{Sir John Jervis White Jervis, 1st Baronet (17...",12
98,"{Notes, From 1968 to 1987, he was rector of Ch...",7


Below is the overall count of shared sentences:

In [127]:
total_sp_sent = sents_df.sum()['Spacy']
total_st_sent = sents_df.sum()['Stanza']
total_shared = shared_sents_per_article_df['count'].sum()

print(f"""
    Total Spacy sentences: {total_sp_sent}
    Total Stanza sentences: {total_st_sent}
    Shared sentences: {total_shared}
    Percentage of shared sentences for Spacy: {100*total_shared/total_sp_sent:.2f}%
    Percentage of shared sentences for Stanza: {100*total_shared/total_st_sent:.2f}%""")


    Total Spacy sentences: 3077
    Total Stanza sentences: 2823
    Shared sentences: 2263
    Percentage of shared sentences for Spacy: 73.55%
    Percentage of shared sentences for Stanza: 80.16%


We can also have a look at what different articles both libraries found looking at the first mismatched sentence pair 
for each article:

In [128]:
def get_first_mismatched_sent_pair(sp_sents, st_sents):
    n = len(sp_sents)
    m = len(st_sents)
    
    # Go through the array of sentences for both Spacy and Stanza till the end of smallest of them.
    for i in range(min(n, m)):
        # When we reach the first mismatch, print the sentences.
        if sp_sents[i] != st_sents[i]:
            print(f'Spacy sentence: {sp_sents[i]} \n\n'
            f'Stanza sentence: {st_sents[i]}\n'
            '_______________')
            return sp_sents[i], st_sents[i]
    
    # If there was no mismatch during the array traversal, then all the sentences were tokenized together.
    print('Text segmentized into sentences in the same way.\n_______________')
    return None

for i, pair in list(enumerate(zip(spacy_sents, stanza_sents)))[:20]:
    print(f'Article: {i}\n\n')
    get_first_mismatched_sent_pair(pair[0], pair[1])

Article: 0


Spacy sentence: She danced also in Hobart's Opera And Ballet Festival for International Week in 1945.Intending to develop land and to pursue their artistic interests, they moved to Mungo Brush in the Myall Lakes, New South Wales, in 1951 or 1954, living a subsistence existence from prawn fishing and trading their home-grown produce, and were appointed Honorary Rangers there in 1955 under the Wild Flowers and Native Plants Protection Act. 

Stanza sentence: She danced also in Hobart's Opera And Ballet Festival for International Week in 1945.
_______________
Article: 1


Spacy sentence: ISBN 9782844096265, 

Stanza sentence: ISBN 9782844096265, ASIN B07TTX6ZHQ
_______________
Article: 2


Spacy sentence: He successfully underwent a ground-breaking bilateral arm transplant in August 2016.Peck wrote a book, Rebuilding Sergeant Peck: How I Put Body and Soul Back Together After Afghanistan, that was released on May 7, 2019. 

Stanza sentence: He successfully underwent a ground-b

From the sample we see above we can note the following behavior for sentence tokenization for Spacy and Stanza:
- If there is a missing space before a period, separating sentences, Stanza seems to perform sentence segmentation better, e.g `Article 0, 2, 7, 24`...
- In our source material some sections have a title which is not punctuated. Stanza appears to recognize such cases as a separate sentence (correct) more frequently, e.g. `Article 3, 6, 12, 16, 17, 19`...
- However, Stanza seems to be incorrectly splitting a sentence more frequently if there is a punctuation sign followed by a capital letter, e.g. `Article 4, 5, 9, 11, 15, 18`...
- Both libraries struggle (as expected) with sententences that have a lot of non-English words or the input string is not a correct sentence.

______________________

Now we can form a list of shared sentences for all articles combined:

In [129]:
# Convert the sets into arrays and then flatted the 2 array into 1-d array using numpy.concatenate

shared_sents = np.concatenate(list(map(lambda x: list(x), shared_sents_per_article)))

In [131]:
shared_sents[:5]

array(["They mustn't attempt to please.",
       'In 1941 she was the organiser of a National Fitness Camp for girls at Broken Bay, New South Wales.',
       'Guy Boyd.',
       'Shortly before her death Von Bertouch bequeathed a total of 136 works from her personal collection to the Newcastle Region Art Gallery, the biggest art collection bestowed to it in 58 years and valued in the millions of dollars; they were exhibited there in March 2003.',
       'She was loved very much by all."'], dtype='<U592')

We can now store the shared sentences into a separate DataFrame and into its own CSV-file:

In [132]:
# Run to generate the DataFrame for this time
"""
shared_sents_df = pd.DataFrame(columns=['Sentence'])
shared_sents_df.Sentence = shared_sents
"""

shared_sents_df = pd.read_csv('SpacyStanza/spacy_stanza_shared_sents.csv', index_col=0)
shared_sents_df 

Unnamed: 0,Sentence
0,"Anne Von Bertouch, (29 June 1915 – 31 March 20..."
1,ISBN 978-0-9592824-1-2. OCLC 27623615.
2,"Von Bertouch, Anne (1983)."
3,The terraces were purchased by Dr Dick Lees fo...
4,What was it before it was a gallery?.
...,...
2258,"In popular culture In 1999, Komphet Phorncharo..."
2259,"In 1996, Butnakho started to write songs in th..."
2260,"The melody came from the pattern of ""Sao Simue..."
2261,"In 2007, Isan music became very popular in Lao..."


# Tokenization

For the experiment below we will use all the tokens returned by Spacy and Stanza, without filtering any punctuation or spaces.

However, prior to all the analysis texts were cleaned: we removed linebreaks, tabulation and special characters like `|`, `^`, `<`, `>`, `+`, and `=`. These symbols were not returned when we requested the original texts.

We will do the final step of preprocessing and lowercase the text in order to avoid the same word being in vocabulary twice: for different spelling when it's in the beginning or in the middle of a sentence.

For our experiment we will take only the sentences that were previously segmented in the same way by Spacy and Stanza.
For **SharedTokensNosentences** we will stich all the sentences together and separate them by spaces. For **SharedTokensInSentences** we will apply tokenization on the sentences from `shared_sents_df` above.

In [133]:
shared_sents_lower = [s.lower() for s in shared_sents_df.Sentence]

In [134]:
shared_sents_stiched = ' '.join(shared_sents_lower)

##  Vocabulary

To compare vocabularies we will run the comparison on the sentences stiched together into one text.

In [135]:
# Create a set to find unique tokens only
spacy_vocabulary = set(spacy_tokenize_text(shared_sents_stiched, no_filtering=True))

In [136]:
# To run for the first time Stanza tokenization:
""" 
stanza_vocabulary = set(stanza_tokenize_text(shared_sents_stiched))with open('SpacyStanza/no_sent_stanza_tokens.pickle', 'wb') as f:
with open('SpacyStanza/stanza_vocabulary.pickle', 'wb') as f:
    pickle.dump(stanza_vocabulary, f)

"""

# To load Stanza tokens from a pickle file

with open('SpacyStanza/stanza_vocabulary.pickle', 'rb') as f:
    stanza_vocabulary = pickle.load(f)

In [137]:
print(f'Spacy vocabulary size: {len(spacy_vocabulary)}\nStanza vocabulary size: {len(stanza_vocabulary)}')

Spacy vocabulary size: 8488
Stanza vocabulary size: 8526


### Shared vocabulary

Shared vocabulary is calculated as intersection of sets of Stanza and Spacy vocabulary:

In [30]:
shared_vocab = spacy_vocabulary.intersection(stanza_vocabulary)

In [31]:
shared_vocab_size = len(shared_vocab)

Size of shared vocabulary:

In [32]:
shared_vocab_size

8264

# <font color='red'>Please **DO NOT** rerun the cell below</font>

In [54]:
np.random.choice(list(shared_vocab), size=50)

array(['moving', 'specialized', 'there', 'ekspres', 'attended', 'pune',
       'foot', 'occasion', 'favored', '1961', '1659', 'believed', 'equal',
       'warsaw', 'turkish', 'height', 'growing', 'c.', 'having',
       'hinting', 'friends', 'sweetheart', 'organically', 'registry',
       'lhadj', 'nicholas', 'judgement', 'confinement', 'argentina',
       'pleasures', 'armidale', 'bunyan', 'immediate', 'emigrants',
       'ṛṛays', 'blessing', 'mae', 'extinct', 'mosques', 'stockholm',
       'stanitsa', 'matter', 'muffins', 'ceramics', 'egg', 'saitovic',
       'tourism', 'founded', 'hospitalized', 'an'], dtype='<U19')

From the sample above we can observe that both libraries seem to agree on tokenizing a wide range of words:
- Common words like `an`, `egg`, `there`, `having` are present in the shared vocabulary.
- Foreign to English words like `bunyan`, `stanitsa`, `lhadj` are also present in the shared vocabulary.
- We can as well see some Named Entities like dates and locations
- We can as well see an incorrect lemma `ṛṛays` (presumably a part of `arrays`) which was produced by both libraries.
- ...

### Spacy-only vocabulary

In [39]:
spacy_only_vocab = spacy_vocabulary.difference(stanza_vocabulary)
for w in spacy_only_vocab:
    print(w, end=', ')

pre, 1989–1991, awardee, 1798–1879, χρήστος, zu'bi,  , neo, 480–44, esq, 7349, 975, re, 35–6, u'uqinak'uuh, dmitrievakain, 39–40, 2008–13, 1877–1951, 0, 16–17, 1909–1910, 605, j.d, ):, kishinev, 1810–1860, vatican, editors, 1804–1871, 1785–1863, eminent, dc, vol, non, 976, 3865, suzuki, 1941–45, brassey, n.s.w, 1904–1920, ethno, co, yalıkavak, novo, 1913–1974, art;"i, e, ph.d, 1744–1816, liechtenstein, ju, 1476679488, heiress, 700–707, t.c, 1945–47, 1860–1923,1986, biografie, 8084, 1134068708, chan, 2018edited, yellow, inst, 95–96, l.r.c.s, c.e, coca,   , x, ¿, otto, bmj, kyoto, 9592824, mujeres, 1547–1564, pjesme/, l.r.c.p, 040, m.r.c.s, jr, semite, ¡, 205, 111, kei, prof, nesanice/, semi, 3905881028, p.j, 78442, mishnayot, unesco, fc, 1912–1928, lansdowne, m.r, multi, 1795–1864, madison, l.c, 7–17, 1882–1956, 978, u'uqiank'uuh, 1977–1980, 1996–2000, 9695188, thejournal.ie, 1998–2002, f.r.c.p, denominational, 53, 1928–1998, 1909–1919, 84.5월호, 1758, 1848–1905, 521, 1793–1869, v.g, 1969

In the Spacy-only tokens we can see a lot of numbers as well as non-english words (or non-words).

However, surprisingly we can also see some more or less common English words:
- Some words of relatively more formal register of English: `canonical`, `colonialism`, `denominational`, `editors`, `eminent`, `feudal`, `imperialism`, `instrumentalist`...
- Some NE: `kishinev`, `kyoto`, `liechtenstein`, `vatican` (Locations), `suzuki` (Name or Organisation name), `unesco` (Organisation name)...

### Stanza-only vocubulary

In [40]:
stanza_only_vocab = stanza_vocabulary.difference(spacy_vocabulary)
for w in stanza_only_vocab:
    print(w, end=', ')

choe's, m.d., 'e, esq., 84.5, –2, 978-1442252813, non-russian, dr., ب, 978-3-8392-1986-7, shin's, b.w., 978-605-09, us`, 978-1134068708, inst., .ie, 978-1472806871, non-denominational, e-, covid, 700, 1923-2011, –707, ruskin's, f.r.c.p., re-enactment, liechtenstein., –17, 호, –from, u'uqinak, st., ¡mujeres, l.r.c.s., 978-0-9592824-2-9., 1860, abc's, co-editor, gysin's, pre-islamic, 1810, ıkavak, anti-semite, 978-3-205-78442-5, ?., 441, non-humorous, ethno-, 978-976-8097-13-2., sun-baked, –2002, 1689, 978-605-111-401-9, wisconsin-madison, isl., 월, 1744, multi-instrumentalist, pre-eminent, 1785, mid-day, son-, thejournal, multi-awardee, pg., 978-3-8392-1500, anti-colonialism, otto-, ha-mishnayot, –march, hada, 978-3-7349-9213-1, –1956, mid-october, co-editors, c.e., s.s., co-presenter, –1816, unesco's, u-turn, 480, l.r.c.p., tambo's, رفان, yusuf's, co-, invasion`, -1758, co-heiress, el-, 978-0-9592824-3-6., ¿sueñan, asilah, chuch, 1798, pjesme, edit., –40, χρήστο, uqiank, yonsei-, doi:10.

We can see that Stanza tends tokenize words with prefixes like `co-`, `anti-`, `non-`, `neo-`... as one token, which explains the difference with Spacy on tokenization of words of relatively formal register.

We can se can see that there is the same tendency towards composed words: `pen-name`, `kingdom-based`, `serbo-croatian`...

As for Named Entities, there appears to be a lot of them tokenized together with `'s`: `yusuf's`, `unesco's`, `suzuki's`, `shin's`, `kyoto's`...

## Occurences

To calculate common accurances we will find all tokens produced by both models for a given string and will find tokens that have the same text for both Stanza and Spacy.

For this we will work with raw token entities from Stanza and Spacy to preserve PoS information for the following experiment since PoS information might be changed if a token is taken out of its context.

### Occurences without sentence separation

We will tokenize all stiched sentences together.

In [41]:
no_sent_spacy_tokens = spacy_tokenize_text(shared_sents_stiched, to_string=False, no_filtering=True)

In [42]:
# To run for the first time Stanza tokenization:
""" 
no_sent_stanza_tokens = stanza_tokenize_text(shared_sents_stiched, to_string=False)
with open('SpacyStanza/no_sent_stanza_tokens.pickle', 'wb') as f:
    pickle.dump(no_sent_stanza_tokens, f)

"""

# To load Stanza tokens from a pickle file
with open('SpacyStanza/no_sent_stanza_tokens.pickle', 'rb') as f:
    no_sent_stanza_tokens = pickle.load(f)

In [43]:
common_t = common_tokens(no_sent_spacy_tokens, no_sent_stanza_tokens)

Some token stats:

In [44]:
print(f"""
Spacy tokens: {len(no_sent_spacy_tokens)}
Stanza tokens: {len(no_sent_stanza_tokens)}
Shared tokens: {len(np.concatenate([x[1] for x in common_t]))}
% of shared tokens in Spacy: {100*len(np.concatenate([x[1] for x in common_t]))/len(no_sent_spacy_tokens):.2f}%
% of shared tokens in Stanza: {100*len(np.concatenate([x[1] for x in common_t]))/len(no_sent_stanza_tokens):.2f}%

"""
)


Spacy tokens: 48820
Stanza tokens: 48502
Shared tokens: 48070
% of shared tokens in Spacy: 98.46%
% of shared tokens in Stanza: 99.11%




We will store the results in a dictionary where the key is the word form and the value is another dictionary, storing all seen results for Spacy and Stanza separately.

In [45]:
no_sent_shared_tokens = {x[0][0].text: {'Spacy': x[0], 'Stanza': x[1]} for x in common_t}

In [46]:
no_sent_shared_tokens['record']

{'Spacy': [record, record, record, record, record, record, record],
 'Stanza': [[
    {
      "id": 39,
      "text": "record",
      "upos": "NOUN",
      "xpos": "NN",
      "feats": "Number=Sing",
      "start_char": 13283,
      "end_char": 13289
    }
  ],
  [
    {
      "id": 17,
      "text": "record",
      "upos": "NOUN",
      "xpos": "NN",
      "feats": "Number=Sing",
      "start_char": 27303,
      "end_char": 27309
    }
  ],
  [
    {
      "id": 23,
      "text": "record",
      "upos": "NOUN",
      "xpos": "NN",
      "feats": "Number=Sing",
      "start_char": 95765,
      "end_char": 95771
    }
  ],
  [
    {
      "id": 22,
      "text": "record",
      "upos": "NOUN",
      "xpos": "NN",
      "feats": "Number=Sing",
      "start_char": 117044,
      "end_char": 117050
    }
  ],
  [
    {
      "id": 12,
      "text": "record",
      "upos": "VERB",
      "xpos": "VB",
      "feats": "VerbForm=Inf",
      "start_char": 159799,
      "end_char": 159805
    }
  

# <font color='green'>Rerun the follwing cells</font>

### Occurences with segmented sentences

Now we repeat the experiment but for the separated sentences above:

In [34]:
# Uncomment this cell if you didn't run the cells above:

# shared_sents_df = pd.read_csv('SpacyStanza/spacy_stanza_shared_sents.csv')
# shared_sents_lower = [s.lower() for s in shared_sents_df.Sentence]

In [14]:
sp_sent_tokens = list(map(lambda x: spacy_tokenize_text(x, no_filtering=True, to_string=False), shared_sents_lower))

In [16]:
# To run for the first time Stanza tokenization:
""" 
st_sent_tokens = list(map(lambda x: stanza_tokenize_text(x, to_string=False), shared_sents_lower))
with open('SpacyStanza/st_sent_tockens.pickle', 'wb') as f:
    pickle.dump(st_sent_tokens, f)

"""

# To load Stanza tokens from a pickle file

with open('SpacyStanza/st_sent_tockens.pickle', 'rb') as f:
    st_sent_tokens = pickle.load(f)

In [17]:
common_sent_tokens = list(map(lambda x: common_tokens(x[0], x[1]), zip(sp_sent_tokens, st_sent_tokens)))

In [62]:
sent_tokens_df = pd.DataFrame(columns=['Spacy_token_count', 
                                       'Stanza_token_count', 
                                       'Shared_token_count', 
                                       'Spacy_shared_percentage',
                                      'Stanza_shared_percentage'])

In [63]:
sent_tokens_df.Spacy_token_count = list(map(len, sp_sent_tokens))
sent_tokens_df.Stanza_token_count = list(map(len, st_sent_tokens))
sent_tokens_df.Shared_token_count = list(map(len, common_sent_tokens))
sent_tokens_df.Spacy_shared_percentage = 100 * sent_tokens_df.Shared_token_count / sent_tokens_df.Spacy_token_count 
sent_tokens_df.Stanza_shared_percentage = 100 * sent_tokens_df.Shared_token_count / sent_tokens_df.Stanza_token_count 

sent_tokens_df

Unnamed: 0,Spacy_token_count,Stanza_token_count,Shared_token_count,Spacy_shared_percentage,Stanza_shared_percentage
0,50,50,42,84.000000,84.000000
1,14,5,4,28.571429,80.000000
2,8,8,8,100.000000,100.000000
3,25,27,21,84.000000,77.777778
4,10,9,6,60.000000,66.666667
...,...,...,...,...,...
2258,32,32,29,90.625000,90.625000
2259,16,16,14,87.500000,87.500000
2260,23,23,20,86.956522,86.956522
2261,15,15,14,93.333333,93.333333


Below are average stats for tokens (how many tokens in a sentence are on average provided by Spacy, how many by Stanza, how many are shared etc.):

In [138]:
sent_tokens_df[['Spacy_token_count', 'Stanza_token_count', 'Shared_token_count']].mean()

Spacy_token_count     21.573133
Stanza_token_count    21.438356
Shared_token_count    18.313301
dtype: float64

Total number of shared tokens across all sentences:

In [139]:
print(f"""
Total Spacy tokens: {sent_tokens_df.Spacy_token_count.sum()}
Total Stanza tokens: {sent_tokens_df.Stanza_token_count.sum()}
Total shared tokens: {sent_tokens_df.Shared_token_count.sum()}

Spacy shared tokens %: {sent_tokens_df.Shared_token_count.sum() * 100 /sent_tokens_df.Spacy_token_count.sum()}
Stanza shared tokens %: {sent_tokens_df.Shared_token_count.sum() * 100 /sent_tokens_df.Stanza_token_count.sum()}
""")


Total Spacy tokens: 48820
Total Stanza tokens: 48515
Total shared tokens: 41443

Spacy shared tokens %: 84.88938959442851
Stanza shared tokens %: 85.42306503143358



We have less aligned tokens now, meaning that in the experiment above we might have matched tokens from different sentences.

We will combine all the shared tokens from our sentences into one dictionary of the same structure as above: the keys are wordforms, the values are dictionaries containing arrays of all of the occurences of the wordforms.

Since for PoS analysis for Stanza requires words instead of tokens, we will convert the conversion as well.

In [18]:
shared_sent_tokens = {}

for sent in common_sent_tokens:
    for word in sent:
        word_text = word[0][0].text
        word_dict = shared_sent_tokens.get(word_text, {})
        
        sp_t_list = word_dict.get('Spacy', [])
        sp_t_list.extend(word[0])
        word_dict['Spacy'] = sp_t_list
        
        st_t_list = word_dict.get('Stanza', [])
        # Before extending the list convert tokens to words
        st_t_list.extend(list(map(lambda x: x.words[0], word[1])))
        word_dict['Stanza'] = st_t_list
        
        shared_sent_tokens[word_text] = word_dict


Below is an example of how the dictionary will work on an example of one word `chief`.

`shared_sent_tokens['chief']['Spacy']` can be called to get all Spacy tokens and similarly `shared_sent_tokens['chief']['Stanza']` can be called to get all Stanza tokens.

In this example we can see that this word was sometimes categorized as Noun and sometimes as Adjective.

In [57]:
shared_sent_tokens['chief']

{'Spacy': [chief,
  chief,
  chief,
  chief,
  chief,
  chief,
  chief,
  chief,
  chief,
  chief,
  chief,
  chief,
  chief,
  chief,
  chief],
 'Stanza': [{
    "id": 5,
    "text": "chief",
    "upos": "ADJ",
    "xpos": "JJ",
    "feats": "Degree=Pos",
    "start_char": 20,
    "end_char": 25
  },
  {
    "id": 12,
    "text": "chief",
    "upos": "ADJ",
    "xpos": "JJ",
    "feats": "Degree=Pos",
    "start_char": 68,
    "end_char": 73
  },
  {
    "id": 6,
    "text": "chief",
    "upos": "ADJ",
    "xpos": "JJ",
    "feats": "Degree=Pos",
    "start_char": 17,
    "end_char": 22
  },
  {
    "id": 8,
    "text": "chief",
    "upos": "NOUN",
    "xpos": "NN",
    "feats": "Number=Sing",
    "start_char": 35,
    "end_char": 40
  },
  {
    "id": 5,
    "text": "chief",
    "upos": "NOUN",
    "xpos": "NN",
    "feats": "Number=Sing",
    "start_char": 26,
    "end_char": 31
  },
  {
    "id": 21,
    "text": "chief",
    "upos": "NOUN",
    "xpos": "NN",
    "feats": "Number=Si

How PoS can be accessed for Spacy:

In [58]:
shared_sent_tokens['chief']['Spacy'][0], shared_sent_tokens['chief']['Spacy'][0].pos_

(chief, 'ADJ')

How PoS can be accessed for Stanza:

In [59]:
shared_sent_tokens['chief']['Stanza'][0], shared_sent_tokens['chief']['Stanza'][0].upos

({
   "id": 5,
   "text": "chief",
   "upos": "ADJ",
   "xpos": "JJ",
   "feats": "Degree=Pos",
   "start_char": 20,
   "end_char": 25
 },
 'ADJ')

# Accessing shared sent tokens

All the wordforms that were tokenized in the same way by Spacy and Stanza:

In [73]:
shared_sent_tokens.keys()

dict_keys(['be', 'anne', 'wales', 'city', 'australian', 'dealer', 'south', '2003', ',', 'march', ')', 'australia', 'author', '29', 'art', 'capital', 'the', 'believed', 'galleries', 'newcastle', '–', 'was', 'outside', 'in', 'a', '(', 'first', 'to', '.', 'environmentalist', 'bertouch', 'an', 'commercial', '1915', 'von', 'of', 'new', 'june', 'gallery', 'director', '31', 'and', '27623615', 'isbn', 'oclc', '1983', 'by', 'sale', 'million', 'at', '1,420,000', 'listed', '#', 'were', '2007', 'terraces', 'for', 'purchased', '2010', 'lees', 'dr', 'dick', 'what', 'it', 'before', 'née', 'eastwood', 'school', 'on', 'parents', 'she', 'biography', 'high', 'sydney', 'armidale', 'college', 'whittle', 'born', 'teachers', 'girls', 'catherine', 'george', 'duff', 'educated', 'jean', 'commission', 'modern', 'fundraiser', 'performed', 'day', 'as', 'interpretive', 'broadcasting', 'hall', '1942', "'", 'appeal', 'organised', 'hobart', 'patriotic', 'concert', 'town', 'committee', 'dance', 'allies', 'voyage', 'fle

______________
Access all the Spacy tokens of a wordform:

In [74]:
shared_sent_tokens['million']['Spacy']

[million, million, million]

_______________
Access all Stanza tokens of a wordform:

In [75]:
shared_sent_tokens['million']['Stanza']

[{
   "id": 26,
   "text": "million",
   "upos": "NUM",
   "xpos": "CD",
   "feats": "NumForm=Word|NumType=Card",
   "start_char": 110,
   "end_char": 117
 },
 {
   "id": 14,
   "text": "million",
   "upos": "NUM",
   "xpos": "CD",
   "feats": "NumForm=Word|NumType=Card",
   "start_char": 66,
   "end_char": 73
 },
 {
   "id": 28,
   "text": "million",
   "upos": "NUM",
   "xpos": "CD",
   "feats": "NumForm=Word|NumType=Card",
   "start_char": 123,
   "end_char": 130
 }]

____________________
**Access all Spacy PoS of a wordform**:

In [76]:
[x.pos_ for x in shared_sent_tokens['million']['Spacy']]

['NUM', 'NUM', 'NUM']

___________
**Access all Stanza PoS of a wordform:**

In [77]:
[x.upos for x in shared_sent_tokens['million']['Stanza']]

['NUM', 'NUM', 'NUM']

_________
**A helper function that would return all Spacy or Stanza UPOS by a wordform:**

In [2]:
def get_pos(wordform, library):
    if library == 'Spacy':
        return [x.pos_ for x in shared_sent_tokens[wordform][library]]
    elif library == 'Stanza':
        return [x.upos for x in shared_sent_tokens[wordform][library]]
    else:
        raise Exception('The supported library values are: Spacy, Stanza')

In [94]:
def get_occurence_count(wordform):
    return len(shared_sent_tokens[wordform]['Spacy'])

In [95]:
get_occurence_count('serious')

4

In [85]:
get_pos('serious', 'Spacy')

['ADJ', 'ADJ', 'ADJ', 'ADJ']

In [87]:
get_pos('serious', 'Stanza')

['ADJ', 'ADJ', 'ADJ', 'ADJ']

_____________________________________
For the task 
`For each token in SharedTokensInSentences,
your code should compute and output: the ratio and the number of times, the token is assigned the same UPOS by both libraries `

What we'll need to is:
- Get all the wordforms of `shared_sent_tokens.keys()`
- And then for each of them:
    - Get total number of occurences using = **n**  => `get_occurence_count(wordform)`
    - Create the count of the same POS
    - Get all the spacy POS using `get_pos(wordform, 'Spacy')` 
    - Get stanza POS using `get_pos(wordform, 'Stanza')`
    - For i in range(n) compare spacy_pos\[i\] and stanza_pos\[i\]: if they are the same, increase the same counter
    - **NB:** Spacy and Stanza occurences are ordered and expected to be in the same order, so spacy_pos\[i\] and stanza_pos\[i\] should refer to the same occurence. Not sure if it's needed for the task.
    - Get the ratio and the total c


## Part of Speech

### Creation of POS dictionnary

First we stitch every text. Then the function *spacy_tokenize_text* from **utils.py** is used to create a dictionnary containing every words of the text and their corresponding part-of-speech.

In [19]:
spacy_pos = [(k, get_pos(k, 'Spacy')) for k,v in shared_sent_tokens.items()]
d_spacy_pos = dict(spacy_pos)
d_spacy_pos

{'galleries': ['NOUN', 'NOUN', 'NOUN', 'PROPN', 'NOUN'],
 'commercial': ['ADJ', 'ADJ'],
 'an': ['DET',
  'DET',
  'DET',
  'DET',
  'DET',
  'DET',
  'DET',
  'DET',
  'DET',
  'DET',
  'DET',
  'DET',
  'DET',
  'DET',
  'DET',
  'DET',
  'DET',
  'DET',
  'DET',
  'DET',
  'DET',
  'DET',
  'DET',
  'DET',
  'DET',
  'DET',
  'DET',
  'DET',
  'DET',
  'DET',
  'DET',
  'DET',
  'DET',
  'DET',
  'DET',
  'DET',
  'DET',
  'DET',
  'DET',
  'DET',
  'DET',
  'DET',
  'DET',
  'DET',
  'DET',
  'DET',
  'DET',
  'DET',
  'DET',
  'DET',
  'DET',
  'DET',
  'DET',
  'DET',
  'DET',
  'DET',
  'DET',
  'DET',
  'DET',
  'DET',
  'DET',
  'DET',
  'DET',
  'DET',
  'DET',
  'DET',
  'DET',
  'DET',
  'DET',
  'DET',
  'DET',
  'DET',
  'DET',
  'DET',
  'DET',
  'DET',
  'DET',
  'DET',
  'DET',
  'DET',
  'DET',
  'DET',
  'DET',
  'DET',
  'DET',
  'DET',
  'DET',
  'DET',
  'DET',
  'DET',
  'DET',
  'DET',
  'DET',
  'DET',
  'DET',
  'DET',
  'DET',
  'DET',
  'DET',
  'DET',
  'DET

Using the file *st_sents_tockens.pickle* we retrieve all tokens from stanza and we create the equivalent dictionnary.

In [20]:
stanza_pos = [(k, get_pos(k, 'Stanza')) for k,v in shared_sent_tokens.items()]
d_stanza_pos = dict(stanza_pos)
d_stanza_pos

{'galleries': ['NOUN', 'NOUN', 'NOUN', 'NOUN', 'NOUN'],
 'commercial': ['ADJ', 'ADJ'],
 'an': ['DET',
  'DET',
  'DET',
  'DET',
  'DET',
  'DET',
  'DET',
  'DET',
  'DET',
  'DET',
  'DET',
  'DET',
  'DET',
  'DET',
  'DET',
  'DET',
  'DET',
  'DET',
  'DET',
  'DET',
  'DET',
  'DET',
  'DET',
  'DET',
  'DET',
  'DET',
  'DET',
  'DET',
  'DET',
  'DET',
  'DET',
  'DET',
  'DET',
  'DET',
  'DET',
  'DET',
  'DET',
  'DET',
  'DET',
  'DET',
  'DET',
  'DET',
  'DET',
  'DET',
  'DET',
  'DET',
  'DET',
  'DET',
  'DET',
  'DET',
  'DET',
  'DET',
  'DET',
  'DET',
  'DET',
  'DET',
  'DET',
  'DET',
  'DET',
  'DET',
  'DET',
  'DET',
  'DET',
  'DET',
  'DET',
  'DET',
  'DET',
  'DET',
  'DET',
  'DET',
  'DET',
  'DET',
  'DET',
  'DET',
  'DET',
  'DET',
  'DET',
  'DET',
  'DET',
  'DET',
  'DET',
  'DET',
  'DET',
  'DET',
  'DET',
  'DET',
  'DET',
  'DET',
  'DET',
  'DET',
  'DET',
  'DET',
  'DET',
  'DET',
  'DET',
  'DET',
  'DET',
  'DET',
  'DET',
  'DET',
  'DET'

As we can see (and as we arleady know) the two libraries do not find the same amount of tokens. 

### Analysis

Next, we create a dictionnary containing for each POS find by **Spacy** the corresponding POS find by **Stanza** if the two doesn't correspond. And a second dictionnary from **Stanza** to **Spacy**.

In [114]:
d_spacy_to_stanza_pos = dict()
d_stanza_to_spacy_pos = dict()
d_spacy_to_stanza_pos_tokens = dict()
d_stanza_to_spacy_pos_tokens = dict()

for k,v in shared_sent_tokens.items():
   pos_spacy = d_spacy_pos.get(k)
   pos_stanza = d_stanza_pos.get(k)
   for i in range(len(pos_spacy)):
      d_spacy_to_stanza_pos.update({pos_spacy[i]: d_spacy_to_stanza_pos.get(pos_spacy[i],dict())})
      d_spacy_to_stanza_pos.get(pos_spacy[i],dict()).update({pos_stanza[i]: (d_spacy_to_stanza_pos.get(pos_spacy[i],dict()).get(pos_stanza[i], 0) + 1)})
      d_stanza_to_spacy_pos.update({pos_stanza[i]: d_stanza_to_spacy_pos.get(pos_stanza[i],dict())})
      d_stanza_to_spacy_pos.get(pos_stanza[i],dict()).update({pos_spacy[i]: (d_stanza_to_spacy_pos.get(pos_stanza[i],dict()).get(pos_spacy[i], 0) + 1)})

      d_spacy_to_stanza_pos_tokens.update({k: d_spacy_to_stanza_pos_tokens.get(k,dict())})
      d_spacy_to_stanza_pos_tokens.get(k,dict()).update({pos_stanza[i]: (d_spacy_to_stanza_pos_tokens.get(k,dict()).get(pos_stanza[i], 0) + 1)})
      d_stanza_to_spacy_pos_tokens.update({k: d_stanza_to_spacy_pos_tokens.get(k,dict())})
      d_stanza_to_spacy_pos_tokens.get(k,dict()).update({pos_spacy[i]: (d_stanza_to_spacy_pos_tokens.get(k,dict()).get(pos_spacy[i], 0) + 1)})

d_spacy_to_stanza_pos, d_stanza_to_spacy_pos, d_spacy_to_stanza_pos_tokens, d_stanza_to_spacy_pos_tokens
   

({'NOUN': {'NOUN': 8719,
   'PROPN': 851,
   'X': 312,
   'VERB': 62,
   'ADV': 6,
   'AUX': 4,
   'PUNCT': 10,
   'ADJ': 115,
   'PRON': 1,
   'NUM': 4,
   'ADP': 1,
   'SYM': 2,
   'INTJ': 2},
  'PROPN': {'NOUN': 1217,
   'ADJ': 271,
   'PROPN': 2869,
   'X': 812,
   'PUNCT': 35,
   'VERB': 47,
   'PRON': 5,
   'ADP': 1,
   'AUX': 4,
   'NUM': 8,
   'ADV': 6,
   'PART': 1,
   'INTJ': 4},
  'ADJ': {'ADJ': 2707,
   'PROPN': 145,
   'NOUN': 142,
   'VERB': 48,
   'PUNCT': 4,
   'ADV': 7,
   'X': 54,
   'ADP': 2,
   'INTJ': 1},
  'DET': {'DET': 3575, 'X': 4, 'PRON': 5, 'CCONJ': 6, 'NOUN': 1, 'INTJ': 2},
  'PUNCT': {'PUNCT': 7046, 'X': 11, 'SYM': 41, 'PART': 2, 'PROPN': 1},
  'ADP': {'ADP': 5943,
   'X': 32,
   'SCONJ': 127,
   'ADV': 24,
   'PART': 3,
   'PROPN': 1,
   'PUNCT': 1,
   'NOUN': 1},
  'PRON': {'DET': 32,
   'PRON': 2189,
   'PROPN': 4,
   'X': 1,
   'ADV': 6,
   'SCONJ': 6,
   'CCONJ': 5},
  'NUM': {'NUM': 1888,
   'SYM': 2,
   'NOUN': 8,
   'PRON': 1,
   'PROPN': 3,
   'ADJ

For a better visualisation, we put everything we found into two dictionnaries.

In [42]:
df_spacy_to_stanza = pd.DataFrame(d_spacy_to_stanza_pos).fillna(0)
df_spacy_to_stanza

Unnamed: 0,NOUN,PROPN,ADJ,DET,PUNCT,ADP,PRON,NUM,AUX,ADV,VERB,PART,CCONJ,SCONJ,SYM,X,INTJ
NOUN,8719.0,1217.0,142.0,1.0,0.0,1.0,0.0,8.0,1.0,24.0,102.0,0.0,0.0,0.0,0.0,3.0,3.0
PROPN,851.0,2869.0,145.0,0.0,1.0,1.0,4.0,3.0,2.0,30.0,62.0,0.0,0.0,0.0,0.0,16.0,1.0
X,312.0,812.0,54.0,4.0,11.0,32.0,1.0,1.0,4.0,7.0,42.0,7.0,6.0,1.0,0.0,45.0,10.0
VERB,62.0,47.0,48.0,0.0,0.0,0.0,0.0,0.0,8.0,0.0,3497.0,0.0,0.0,0.0,0.0,0.0,0.0
ADV,6.0,6.0,7.0,0.0,0.0,24.0,6.0,0.0,0.0,814.0,1.0,0.0,0.0,126.0,0.0,0.0,0.0
AUX,4.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,1322.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0
PUNCT,10.0,35.0,4.0,0.0,7046.0,1.0,0.0,0.0,0.0,0.0,0.0,6.0,0.0,0.0,3.0,9.0,0.0
ADJ,115.0,271.0,2707.0,0.0,0.0,0.0,0.0,2.0,0.0,23.0,31.0,0.0,0.0,0.0,0.0,1.0,0.0
PRON,1.0,5.0,0.0,5.0,0.0,0.0,2189.0,1.0,0.0,0.0,0.0,0.0,0.0,6.0,0.0,0.0,0.0
NUM,4.0,8.0,0.0,0.0,0.0,0.0,0.0,1888.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,3.0,2.0


In [43]:
df_stanza_to_spacy = pd.DataFrame(d_stanza_to_spacy_pos).fillna(0)
df_stanza_to_spacy

Unnamed: 0,NOUN,ADJ,DET,PUNCT,X,ADP,SCONJ,ADV,NUM,PROPN,SYM,AUX,VERB,PART,CCONJ,PRON,INTJ
NOUN,8719.0,115.0,0.0,10.0,312.0,1.0,0.0,6.0,4.0,851.0,2.0,4.0,62.0,0.0,0.0,1.0,2.0
PROPN,1217.0,271.0,0.0,35.0,812.0,1.0,0.0,6.0,8.0,2869.0,0.0,4.0,47.0,1.0,0.0,5.0,4.0
ADJ,142.0,2707.0,0.0,4.0,54.0,2.0,0.0,7.0,0.0,145.0,0.0,0.0,48.0,0.0,0.0,0.0,1.0
VERB,102.0,31.0,0.0,0.0,42.0,0.0,0.0,1.0,0.0,62.0,0.0,2.0,3497.0,0.0,0.0,0.0,0.0
NUM,8.0,2.0,0.0,0.0,1.0,0.0,0.0,0.0,1888.0,3.0,2.0,0.0,0.0,0.0,0.0,1.0,0.0
ADV,24.0,23.0,1.0,0.0,7.0,3.0,2.0,814.0,1.0,30.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
INTJ,3.0,0.0,0.0,0.0,10.0,1.0,0.0,0.0,2.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0
DET,1.0,0.0,3575.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6.0,5.0,2.0
X,3.0,1.0,0.0,9.0,45.0,0.0,0.0,0.0,3.0,16.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
AUX,1.0,0.0,0.0,0.0,4.0,3.0,0.0,0.0,0.0,2.0,0.0,1322.0,8.0,1.0,0.0,0.0,0.0


In [115]:
df_spacy_to_stanza_tokens = pd.DataFrame(d_spacy_to_stanza_pos).fillna(0)
df_spacy_to_stanza_tokens

Unnamed: 0,NOUN,PROPN,ADJ,DET,PUNCT,ADP,PRON,NUM,AUX,ADV,VERB,PART,CCONJ,SCONJ,SYM,X,INTJ
NOUN,8719.0,1217.0,142.0,1.0,0.0,1.0,0.0,8.0,1.0,24.0,102.0,0.0,0.0,0.0,0.0,3.0,3.0
PROPN,851.0,2869.0,145.0,0.0,1.0,1.0,4.0,3.0,2.0,30.0,62.0,0.0,0.0,0.0,0.0,16.0,1.0
X,312.0,812.0,54.0,4.0,11.0,32.0,1.0,1.0,4.0,7.0,42.0,7.0,6.0,1.0,0.0,45.0,10.0
VERB,62.0,47.0,48.0,0.0,0.0,0.0,0.0,0.0,8.0,0.0,3497.0,0.0,0.0,0.0,0.0,0.0,0.0
ADV,6.0,6.0,7.0,0.0,0.0,24.0,6.0,0.0,0.0,814.0,1.0,0.0,0.0,126.0,0.0,0.0,0.0
AUX,4.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,1322.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0
PUNCT,10.0,35.0,4.0,0.0,7046.0,1.0,0.0,0.0,0.0,0.0,0.0,6.0,0.0,0.0,3.0,9.0,0.0
ADJ,115.0,271.0,2707.0,0.0,0.0,0.0,0.0,2.0,0.0,23.0,31.0,0.0,0.0,0.0,0.0,1.0,0.0
PRON,1.0,5.0,0.0,5.0,0.0,0.0,2189.0,1.0,0.0,0.0,0.0,0.0,0.0,6.0,0.0,0.0,0.0
NUM,4.0,8.0,0.0,0.0,0.0,0.0,0.0,1888.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,3.0,2.0


In [116]:
df_stanza_to_spacy_tokens = pd.DataFrame(d_spacy_to_stanza_pos).fillna(0)
df_stanza_to_spacy_tokens

Unnamed: 0,NOUN,PROPN,ADJ,DET,PUNCT,ADP,PRON,NUM,AUX,ADV,VERB,PART,CCONJ,SCONJ,SYM,X,INTJ
NOUN,8719.0,1217.0,142.0,1.0,0.0,1.0,0.0,8.0,1.0,24.0,102.0,0.0,0.0,0.0,0.0,3.0,3.0
PROPN,851.0,2869.0,145.0,0.0,1.0,1.0,4.0,3.0,2.0,30.0,62.0,0.0,0.0,0.0,0.0,16.0,1.0
X,312.0,812.0,54.0,4.0,11.0,32.0,1.0,1.0,4.0,7.0,42.0,7.0,6.0,1.0,0.0,45.0,10.0
VERB,62.0,47.0,48.0,0.0,0.0,0.0,0.0,0.0,8.0,0.0,3497.0,0.0,0.0,0.0,0.0,0.0,0.0
ADV,6.0,6.0,7.0,0.0,0.0,24.0,6.0,0.0,0.0,814.0,1.0,0.0,0.0,126.0,0.0,0.0,0.0
AUX,4.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,1322.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0
PUNCT,10.0,35.0,4.0,0.0,7046.0,1.0,0.0,0.0,0.0,0.0,0.0,6.0,0.0,0.0,3.0,9.0,0.0
ADJ,115.0,271.0,2707.0,0.0,0.0,0.0,0.0,2.0,0.0,23.0,31.0,0.0,0.0,0.0,0.0,1.0,0.0
PRON,1.0,5.0,0.0,5.0,0.0,0.0,2189.0,1.0,0.0,0.0,0.0,0.0,0.0,6.0,0.0,0.0,0.0
NUM,4.0,8.0,0.0,0.0,0.0,0.0,0.0,1888.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,3.0,2.0


Finally, we add some columns representing the errors by frequence.

In [44]:
for i in df_spacy_to_stanza.columns:
    df_spacy_to_stanza[i+'%'] = df_spacy_to_stanza[i]/df_spacy_to_stanza[i].sum()
df_spacy_to_stanza

Unnamed: 0,NOUN,PROPN,ADJ,DET,PUNCT,ADP,PRON,NUM,AUX,ADV,...,NUM%,AUX%,ADV%,VERB%,PART%,CCONJ%,SCONJ%,SYM%,X%,INTJ%
NOUN,8719.0,1217.0,142.0,1.0,0.0,1.0,0.0,8.0,1.0,24.0,...,0.004199,0.000746,0.02649,0.027295,0.0,0.0,0.0,0.0,0.038961,0.157895
PROPN,851.0,2869.0,145.0,0.0,1.0,1.0,4.0,3.0,2.0,30.0,...,0.001575,0.001491,0.033113,0.016591,0.0,0.0,0.0,0.0,0.207792,0.052632
X,312.0,812.0,54.0,4.0,11.0,32.0,1.0,1.0,4.0,7.0,...,0.000525,0.002983,0.007726,0.011239,0.010989,0.004118,0.002451,0.0,0.584416,0.526316
VERB,62.0,47.0,48.0,0.0,0.0,0.0,0.0,0.0,8.0,0.0,...,0.0,0.005966,0.0,0.935777,0.0,0.0,0.0,0.0,0.0,0.0
ADV,6.0,6.0,7.0,0.0,0.0,24.0,6.0,0.0,0.0,814.0,...,0.0,0.0,0.898455,0.000268,0.0,0.0,0.308824,0.0,0.0,0.0
AUX,4.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,1322.0,0.0,...,0.0,0.985831,0.0,0.000535,0.0,0.0,0.0,0.0,0.0,0.0
PUNCT,10.0,35.0,4.0,0.0,7046.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.009419,0.0,0.0,0.068182,0.116883,0.0
ADJ,115.0,271.0,2707.0,0.0,0.0,0.0,0.0,2.0,0.0,23.0,...,0.00105,0.0,0.025386,0.008295,0.0,0.0,0.0,0.0,0.012987,0.0
PRON,1.0,5.0,0.0,5.0,0.0,0.0,2189.0,1.0,0.0,0.0,...,0.000525,0.0,0.0,0.0,0.0,0.0,0.014706,0.0,0.0,0.0
NUM,4.0,8.0,0.0,0.0,0.0,0.0,0.0,1888.0,0.0,1.0,...,0.991076,0.0,0.001104,0.0,0.0,0.0,0.0,0.0,0.038961,0.105263


In [45]:
for i in df_stanza_to_spacy.columns:
    df_stanza_to_spacy[i+'%'] = df_stanza_to_spacy[i]/df_stanza_to_spacy[i].sum()
df_stanza_to_spacy

Unnamed: 0,NOUN,ADJ,DET,PUNCT,X,ADP,SCONJ,ADV,NUM,PROPN,...,ADV%,NUM%,PROPN%,SYM%,AUX%,VERB%,PART%,CCONJ%,PRON%,INTJ%
NOUN,8719.0,115.0,0.0,10.0,312.0,1.0,0.0,6.0,4.0,851.0,...,0.006061,0.002099,0.213551,0.023256,0.003003,0.016931,0.0,0.0,0.000453,0.166667
PROPN,1217.0,271.0,0.0,35.0,812.0,1.0,0.0,6.0,8.0,2869.0,...,0.006061,0.004197,0.71995,0.0,0.003003,0.012835,0.001681,0.0,0.002266,0.333333
ADJ,142.0,2707.0,0.0,4.0,54.0,2.0,0.0,7.0,0.0,145.0,...,0.007071,0.0,0.036386,0.0,0.0,0.013108,0.0,0.0,0.0,0.083333
VERB,102.0,31.0,0.0,0.0,42.0,0.0,0.0,1.0,0.0,62.0,...,0.00101,0.0,0.015558,0.0,0.001502,0.954943,0.0,0.0,0.0,0.0
NUM,8.0,2.0,0.0,0.0,1.0,0.0,0.0,0.0,1888.0,3.0,...,0.0,0.990556,0.000753,0.023256,0.0,0.0,0.0,0.0,0.000453,0.0
ADV,24.0,23.0,1.0,0.0,7.0,3.0,2.0,814.0,1.0,30.0,...,0.822222,0.000525,0.007528,0.0,0.0,0.0,0.0,0.0,0.0,0.083333
INTJ,3.0,0.0,0.0,0.0,10.0,1.0,0.0,0.0,2.0,1.0,...,0.0,0.001049,0.000251,0.0,0.0,0.0,0.0,0.0,0.0,0.166667
DET,1.0,0.0,3575.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.004104,0.002266,0.166667
X,3.0,1.0,0.0,9.0,45.0,0.0,0.0,0.0,3.0,16.0,...,0.0,0.001574,0.004015,0.0,0.0,0.0,0.0,0.0,0.0,0.0
AUX,1.0,0.0,0.0,0.0,4.0,3.0,0.0,0.0,0.0,2.0,...,0.0,0.0,0.000502,0.0,0.992492,0.002185,0.001681,0.0,0.0,0.0


In [117]:
for i in df_spacy_to_stanza_tokens.columns:
    df_spacy_to_stanza_tokens[i+'%'] = df_spacy_to_stanza_tokens[i]/df_spacy_to_stanza[i].sum()
df_spacy_to_stanza_tokens

Unnamed: 0,NOUN,PROPN,ADJ,DET,PUNCT,ADP,PRON,NUM,AUX,ADV,...,NUM%,AUX%,ADV%,VERB%,PART%,CCONJ%,SCONJ%,SYM%,X%,INTJ%
NOUN,8719.0,1217.0,142.0,1.0,0.0,1.0,0.0,8.0,1.0,24.0,...,0.004199,0.000746,0.02649,0.027295,0.0,0.0,0.0,0.0,0.038961,0.157895
PROPN,851.0,2869.0,145.0,0.0,1.0,1.0,4.0,3.0,2.0,30.0,...,0.001575,0.001491,0.033113,0.016591,0.0,0.0,0.0,0.0,0.207792,0.052632
X,312.0,812.0,54.0,4.0,11.0,32.0,1.0,1.0,4.0,7.0,...,0.000525,0.002983,0.007726,0.011239,0.010989,0.004118,0.002451,0.0,0.584416,0.526316
VERB,62.0,47.0,48.0,0.0,0.0,0.0,0.0,0.0,8.0,0.0,...,0.0,0.005966,0.0,0.935777,0.0,0.0,0.0,0.0,0.0,0.0
ADV,6.0,6.0,7.0,0.0,0.0,24.0,6.0,0.0,0.0,814.0,...,0.0,0.0,0.898455,0.000268,0.0,0.0,0.308824,0.0,0.0,0.0
AUX,4.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,1322.0,0.0,...,0.0,0.985831,0.0,0.000535,0.0,0.0,0.0,0.0,0.0,0.0
PUNCT,10.0,35.0,4.0,0.0,7046.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.009419,0.0,0.0,0.068182,0.116883,0.0
ADJ,115.0,271.0,2707.0,0.0,0.0,0.0,0.0,2.0,0.0,23.0,...,0.00105,0.0,0.025386,0.008295,0.0,0.0,0.0,0.0,0.012987,0.0
PRON,1.0,5.0,0.0,5.0,0.0,0.0,2189.0,1.0,0.0,0.0,...,0.000525,0.0,0.0,0.0,0.0,0.0,0.014706,0.0,0.0,0.0
NUM,4.0,8.0,0.0,0.0,0.0,0.0,0.0,1888.0,0.0,1.0,...,0.991076,0.0,0.001104,0.0,0.0,0.0,0.0,0.0,0.038961,0.105263


In [119]:
for i in df_stanza_to_spacy_tokens.columns:
    df_stanza_to_spacy_tokens[i+'%'] = df_stanza_to_spacy_tokens[i]/df_stanza_to_spacy[i].sum()
df_stanza_to_spacy_tokens

Unnamed: 0,NOUN,PROPN,ADJ,DET,PUNCT,ADP,PRON,NUM,AUX,ADV,...,NUM%,AUX%,ADV%,VERB%,PART%,CCONJ%,SCONJ%,SYM%,X%,INTJ%
NOUN,8719.0,1217.0,142.0,1.0,0.0,1.0,0.0,8.0,1.0,24.0,...,0.004197,0.000751,0.024242,0.027854,0.0,0.0,0.0,0.0,0.002224,0.25
PROPN,851.0,2869.0,145.0,0.0,1.0,1.0,4.0,3.0,2.0,30.0,...,0.001574,0.001502,0.030303,0.016931,0.0,0.0,0.0,0.0,0.011861,0.083333
X,312.0,812.0,54.0,4.0,11.0,32.0,1.0,1.0,4.0,7.0,...,0.000525,0.003003,0.007071,0.011469,0.011765,0.004104,0.002793,0.0,0.033358,0.833333
VERB,62.0,47.0,48.0,0.0,0.0,0.0,0.0,0.0,8.0,0.0,...,0.0,0.006006,0.0,0.954943,0.0,0.0,0.0,0.0,0.0,0.0
ADV,6.0,6.0,7.0,0.0,0.0,24.0,6.0,0.0,0.0,814.0,...,0.0,0.0,0.822222,0.000273,0.0,0.0,0.351955,0.0,0.0,0.0
AUX,4.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,1322.0,0.0,...,0.0,0.992492,0.0,0.000546,0.0,0.0,0.0,0.0,0.0,0.0
PUNCT,10.0,35.0,4.0,0.0,7046.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.010084,0.0,0.0,0.034884,0.006672,0.0
ADJ,115.0,271.0,2707.0,0.0,0.0,0.0,0.0,2.0,0.0,23.0,...,0.001049,0.0,0.023232,0.008465,0.0,0.0,0.0,0.0,0.000741,0.0
PRON,1.0,5.0,0.0,5.0,0.0,0.0,2189.0,1.0,0.0,0.0,...,0.000525,0.0,0.0,0.0,0.0,0.0,0.01676,0.0,0.0,0.0
NUM,4.0,8.0,0.0,0.0,0.0,0.0,0.0,1888.0,0.0,1.0,...,0.990556,0.0,0.00101,0.0,0.0,0.0,0.0,0.0,0.002224,0.166667
