In [1]:
import pandas as pd
import numpy as np
import pickle

In [2]:
#Uncomment the line below to install stanza
#!pip install stanza

In [2]:
from util import spacy_get_sents, stanza_get_sents, spacy_tokenize_text, stanza_tokenize_text, get_texts

2023-05-10 10:04:03 INFO: Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.4.1.json:   0%|   …

Downloading https://huggingface.co/stanfordnlp/stanza-en/resolve/v1.4.1/models/tokenize/combined.pt:   0%|    …

Downloading https://huggingface.co/stanfordnlp/stanza-en/resolve/v1.4.1/models/pos/combined.pt:   0%|         …

Downloading https://huggingface.co/stanfordnlp/stanza-en/resolve/v1.4.1/models/backward_charlm/1billion.pt:   …

Downloading https://huggingface.co/stanfordnlp/stanza-en/resolve/v1.4.1/models/pretrain/combined.pt:   0%|    …

Downloading https://huggingface.co/stanfordnlp/stanza-en/resolve/v1.4.1/models/forward_charlm/1billion.pt:   0…

2023-05-10 10:04:31 INFO: Loading these models for language: en (English):
| Processor | Package  |
------------------------
| tokenize  | combined |
| pos       | combined |

2023-05-10 10:04:31 INFO: Use device: cpu
2023-05-10 10:04:31 INFO: Loading: tokenize
2023-05-10 10:04:31 INFO: Loading: pos
2023-05-10 10:04:32 INFO: Done loading processors!


In [3]:
from util import get_tokens_dict, common_tokens

# Data collection

For our further inverstigations we will choose 100 texts from "Writers" category that we previously stored:

In [4]:
texts = get_texts(source_type='folder', path='Writers', token_type='file', n=100)

Below we can see the sample of 5 files from the dataset:

In [5]:
texts[:5]

["Shahid Shafaat (Urdu: شاہد شفاعت) is a Pakistani director, writer, producer and actor who works in television, film and theatre. Mostly known for his work as a director, Shafaat started his on-screen career in the early 2000s and directed more than 10 television plays. He received Lux Style Award nomination as a Best TV Director for Dil Mom Ka Diya. Career Shafaat started his career in theatre. He directed several plays, most notably Main Adakara Banu Gi. He directed the series Kaafir in 2012, which received critical praise, but was a commercial failure. In 2016, he directed Next Level Entertainment's production, Khuda Mera Bhi Hai. The series received critical acclaim. In 2018, he directed Khasara, which was a commercial success. His next play was another Next level Entertainment production, Dil Mom Ka Diya which became the highest-rated play in Pakistani television history. He then ventured in social causes television series, including Surkh Chandni and Bikhray Moti . Filmography F

# Sentence segmentation

For our selected 100 articles we will perform sentence tokenization using Spacy and Stanza.
Each file will be separated into sentences separately.

In [6]:
spacy_sents = [spacy_get_sents(x) for x in texts]

In [7]:
# To run for the first time Stanza tokenization:
""" 
stanza_sents = [stanza_get_sents(x) for x in texts]
with open('SpacyStanza/stanza_sents.pickle', 'wb') as f:
    pickle.dump(stanza_sents, f))

"""

# To load Stanza tokens from a pickle file

with open('SpacyStanza/stanza_sents.pickle', 'rb') as f:
    stanza_sents = pickle.load(f)

We can now show the number of sentences calculated by each library for our articles:

In [9]:
sents_df = pd.DataFrame(columns=['Spacy', 'Stanza'])

sents_df.Spacy = list(map(lambda x: len(x), spacy_sents))
sents_df.Stanza = list(map(lambda x: len(x), stanza_sents))

sents_df

Unnamed: 0,Spacy,Stanza
0,107,95
1,37,33
2,8,9
3,35,27
4,28,29
...,...,...
95,32,30
96,21,11
97,18,17
98,7,7


We can look at the stats for all articles, it appears that Spacy on average finds more sentences:

In [10]:
sents_df.describe()

Unnamed: 0,Spacy,Stanza
count,100.0,100.0
mean,30.77,28.23
std,26.211978,24.465129
min,3.0,3.0
25%,13.0,11.0
50%,25.0,21.5
75%,37.0,33.25
max,155.0,143.0


Below we can see the total number of sentences recognized by the both libraries:

In [11]:
sents_df.sum()

Spacy     3077
Stanza    2823
dtype: int64

Now we can find all unique sentences that both recognize.
First we can look at the number shared sentences per article:

In [12]:
# Make a set of unique sentences for each article for both Spacy and Stanza.
# Find the intersection = these are sentences recognized by both libraries.

shared_sents_per_article = list(map(lambda x,y: set(x).intersection(set(y)), spacy_sents, stanza_sents))

shared_sents_per_article_df = pd.DataFrame(columns=['shared_sents', 'count'])
shared_sents_per_article_df['shared_sents'] = shared_sents_per_article
shared_sents_per_article_df['count'] = [len(x) for x in shared_sents_per_article]
shared_sents_per_article_df

Unnamed: 0,shared_sents,count
0,"{In fact I won't sell to them., References, Ar...",68
1,"{The Impaler – المُخوزِق., A historical novel ...",24
2,"{References, Also a documentary was made about...",7
3,"{A statue of him, unveiled in 1950, stands at ...",12
4,"{References, His earliest effort came at the a...",27
...,...,...
95,{He was not only the opponent of the Hungarian...,26
96,"{References, In Aghmat, in the year 1190, he w...",10
97,"{Life, A Brief View of the Past and Present St...",12
98,"{Born in Pikiv, Kalynivka Raion, Vinnytsia Obl...",7


Below is the overall count of shared sentences:

In [13]:
total_sp_sent = sents_df.sum()['Spacy']
total_st_sent = sents_df.sum()['Stanza']
total_shared = shared_sents_per_article_df['count'].sum()

print(f"""
    Total Spacy sentences: {total_sp_sent}
    Total Stanza sentences: {total_st_sent}
    Shared sentences: {total_shared}
    Percentage of shared sentences for Spacy: {100*total_shared/total_sp_sent:.2f}%
    Percentage of shared sentences for Stanza: {100*total_shared/total_st_sent:.2f}%""")


    Total Spacy sentences: 3077
    Total Stanza sentences: 2823
    Shared sentences: 2263
    Percentage of shared sentences for Spacy: 73.55%
    Percentage of shared sentences for Stanza: 80.16%


We can also have a look at what different articles both libraries found looking at the first mismatched sentence pair 
for each article:

In [14]:
def get_first_mismatched_sent_pair(sp_sents, st_sents):
    n = len(sp_sents)
    m = len(st_sents)
    
    # Go through the array of sentences for both Spacy and Stanza till the end of smallest of them.
    for i in range(min(n, m)):
        # When we reach the first mismatch, print the sentences.
        if sp_sents[i] != st_sents[i]:
            print(f'Spacy sentence: {sp_sents[i]} \n\n'
            f'Stanza sentence: {st_sents[i]}\n'
            '_______________')
            return sp_sents[i], st_sents[i]
    
    # If there was no mismatch during the array traversal, then all the sentences were tokenized together.
    print('Text segmentized into sentences in the same way.\n_______________')
    return None

for i, pair in enumerate(zip(spacy_sents, stanza_sents)):
    print(f'Article: {i}\n\n')
    get_first_mismatched_sent_pair(pair[0], pair[1])

Article: 0


Spacy sentence: She danced also in Hobart's Opera And Ballet Festival for International Week in 1945.Intending to develop land and to pursue their artistic interests, they moved to Mungo Brush in the Myall Lakes, New South Wales, in 1951 or 1954, living a subsistence existence from prawn fishing and trading their home-grown produce, and were appointed Honorary Rangers there in 1955 under the Wild Flowers and Native Plants Protection Act. 

Stanza sentence: She danced also in Hobart's Opera And Ballet Festival for International Week in 1945.
_______________
Article: 1


Spacy sentence: ISBN 9782844096265, 

Stanza sentence: ISBN 9782844096265, ASIN B07TTX6ZHQ
_______________
Article: 2


Spacy sentence: He successfully underwent a ground-breaking bilateral arm transplant in August 2016.Peck wrote a book, Rebuilding Sergeant Peck: How I Put Body and Soul Back Together After Afghanistan, that was released on May 7, 2019. 

Stanza sentence: He successfully underwent a ground-b

From the sample we see above we can note the following behavior for sentence tokenization for Spacy and Stanza:
- If there is a missing space before a period, separating sentences, Stanza seems to perform sentence segmentation better, e.g `Article 0, 2, 7, 24`...
- In our source material some sections have a title which is not punctuated. Stanza appears to recognize such cases as a separate sentence (correct) more frequently, e.g. `Article 3, 6, 12, 16, 17, 19`...
- However, Stanza seems to be incorrectly splitting a sentence more frequently if there is a punctuation sign followed by a capital letter, e.g. `Article 4, 5, 9, 11, 15, 18`...
- Both libraries struggle (as expected) with sententences that have a lot of non-English words or the input string is not a correct sentence.

______________________

Now we can form a list of shared sentences for all articles combined:

In [15]:
# Convert the sets into arrays and then flatted the 2 array into 1-d array using numpy.concatenate

shared_sents = np.concatenate(list(map(lambda x: list(x), shared_sents_per_article)))

In [16]:
shared_sents[:5]

array(["In fact I won't sell to them.", 'References',
       'Artists must create for the sake of creating.',
       'Germaine, Max; Bertouch, Anne von (1991).',
       'Melbourne: Lansdowne.'], dtype='<U592')

We can now store the shared sentences into a separate DataFrame and into its own CSV-file:

In [9]:
# Run to generate the DataFrame for this time
"""
shared_sents_df = pd.DataFrame(columns=['Sentence'])
shared_sents_df.Sentence = shared_sents
"""

shared_sents_df = pd.read_csv('SpacyStanza/spacy_stanza_shared_sents.csv', index_col=0)
shared_sents_df 

Unnamed: 0,Sentence
0,"Anne Von Bertouch, (29 June 1915 – 31 March 20..."
1,ISBN 978-0-9592824-1-2. OCLC 27623615.
2,"Von Bertouch, Anne (1983)."
3,The terraces were purchased by Dr Dick Lees fo...
4,What was it before it was a gallery?.
...,...
2258,"In popular culture In 1999, Komphet Phorncharo..."
2259,"In 1996, Butnakho started to write songs in th..."
2260,"The melody came from the pattern of ""Sao Simue..."
2261,"In 2007, Isan music became very popular in Lao..."


# Tokenization

For the experiment below we will use all the tokens returned by Spacy and Stanza, without filtering any punctuation or spaces.

However, prior to all the analysis texts were cleaned: we removed linebreaks, tabulation and special characters like `|`, `^`, `<`, `>`, `+`, and `=`. These symbols were not returned when we requested the original texts.

We will do the final step of preprocessing and lowercase the text in order to avoid the same word being in vocabulary twice: for different spelling when it's in the beginning or in the middle of a sentence.

For our experiment we will take only the sentences that were previously segmented in the same way by Spacy and Stanza.
For **SharedTokensNosentences** we will stich all the sentences together and separate them by spaces. For **SharedTokensInSentences** we will apply tokenization on the sentences from `shared_sents_df` above.

In [10]:
shared_sents_lower = [s.lower() for s in shared_sents_df.Sentence]

In [11]:
shared_sents_stiched = ' '.join(shared_sents_lower)

##  Vocabulary

To compare vocabularies we will run the comparison on the sentences stiched together into one text.

In [12]:
# Create a set to find unique tokens only
spacy_vocabulary = set(spacy_tokenize_text(shared_sents_stiched, no_filtering=True))

In [13]:
# To run for the first time Stanza tokenization:
""" 
stanza_vocabulary = set(stanza_tokenize_text(shared_sents_stiched))with open('SpacyStanza/no_sent_stanza_tokens.pickle', 'wb') as f:
with open('SpacyStanza/stanza_vocabulary.pickle', 'wb') as f:
    pickle.dump(stanza_vocabulary, f)

"""

# To load Stanza tokens from a pickle file

with open('SpacyStanza/stanza_vocabulary.pickle', 'rb') as f:
    stanza_vocabulary = pickle.load(f)

In [22]:
print(f'Spacy vocabulary size: {len(spacy_vocabulary)}\nStanza vocabulary size: {len(stanza_vocabulary)}')

Spacy vocabulary size: 8488
Stanza vocabulary size: 8526


### Shared vocabulary

Shared vocabulary is calculated as intersection of sets of Stanza and Spacy vocabulary:

In [23]:
shared_vocab = spacy_vocabulary.intersection(stanza_vocabulary)

In [24]:
shared_vocab_size = len(shared_vocab)

Size of shared vocabulary:

In [25]:
shared_vocab_size

8264

# <font color='red'>Please **DO NOT** rerun the cell below</font>

In [54]:
np.random.choice(list(shared_vocab), size=50)

array(['moving', 'specialized', 'there', 'ekspres', 'attended', 'pune',
       'foot', 'occasion', 'favored', '1961', '1659', 'believed', 'equal',
       'warsaw', 'turkish', 'height', 'growing', 'c.', 'having',
       'hinting', 'friends', 'sweetheart', 'organically', 'registry',
       'lhadj', 'nicholas', 'judgement', 'confinement', 'argentina',
       'pleasures', 'armidale', 'bunyan', 'immediate', 'emigrants',
       'ṛṛays', 'blessing', 'mae', 'extinct', 'mosques', 'stockholm',
       'stanitsa', 'matter', 'muffins', 'ceramics', 'egg', 'saitovic',
       'tourism', 'founded', 'hospitalized', 'an'], dtype='<U19')

From the sample above we can observe that both libraries seem to agree on tokenizing a wide range of words:
- Common words like `an`, `egg`, `there`, `having` are present in the shared vocabulary.
- Foreign to English words like `bunyan`, `stanitsa`, `lhadj` are also present in the shared vocabulary.
- We can as well see some Named Entities like dates and locations
- We can as well see an incorrect lemma `ṛṛays` (presumably a part of `arrays`) which was produced by both libraries.
- ...

### Spacy-only vocabulary

In [26]:
spacy_only_vocab = spacy_vocabulary.difference(stanza_vocabulary)
spacy_only_vocab

{"'m",
 "'ve",
 '):',
 ',(1898',
 '/priče',
 '/zapisi',
 '0',
 '040',
 '09',
 '1031–1034',
 '111',
 '1134068708',
 '1442252813',
 '1472806871',
 '1476679488',
 '1500',
 '1547–1564',
 '1689–1887',
 '16–17',
 '1744–1816',
 '1758',
 '1766–1822',
 '1766–1830',
 '1785–1863',
 '1793–1869',
 '1795–1864',
 '1798–1879',
 '1804–1856',
 '1804–1871',
 '1810–1860',
 '1848–1905',
 '1860–1923,1986',
 '1868–1936',
 '1877–1951',
 '1882–1956',
 '1885–1900',
 '1904–1920',
 '1905–1907',
 '1909–1910',
 '1909–1919',
 '1912–1928',
 '1913–1974',
 '1918–1923',
 '1928–1998',
 '1935–37',
 '1937–39',
 '1941–45',
 '1943–24',
 '1945–47',
 '1949–2009',
 '1950/1951',
 '1969–70',
 '1977–1980',
 '1980–84',
 '1989–1991',
 '1990–2008',
 '1996–2000',
 '1998–2002',
 '19–24',
 '2001–2011',
 '2005–2006',
 '2008–13',
 '2011/2012',
 '2018edited',
 '2019–20',
 '205',
 '23833',
 '255',
 '25–28',
 '293',
 '300p',
 '313',
 '323',
 '35–6',
 '3865',
 '3905881028',
 '39–40',
 '401',
 '441–2',
 '480–44',
 '521',
 '53',
 '605',
 '700–7

In the Spacy-only tokens we can see a lot of numbers as well as non-english words (or non-words).

However, surprisingly we can also see some more or less common English words:
- Some words of relatively more formal register of English: `canonical`, `colonialism`, `denominational`, `editors`, `eminent`, `feudal`, `imperialism`, `instrumentalist`...
- Some NE: `kishinev`, `kyoto`, `liechtenstein`, `vatican` (Locations), `suzuki` (Name or Organisation name), `unesco` (Organisation name)...

### Stanza-only vocubulary

In [27]:
stanza_only_vocab = stanza_vocabulary.difference(spacy_vocabulary)
stanza_only_vocab

{"'a",
 "'e",
 "'t",
 '-1758',
 '-1980',
 '-2005',
 '-3865',
 '.ie',
 '/1951',
 '0-521-23833-1',
 '1-905237-53',
 '1031',
 '1689',
 '1744',
 '1766-1822',
 '1785',
 '1798',
 '1810',
 '1860',
 '1868',
 '1882',
 '1923,1986',
 '1923-2011',
 '1943-2012',
 '313-323',
 '39',
 '441',
 '47',
 '480',
 '700',
 '84',
 '84.5',
 '9695188.',
 '975-293-040-9',
 '975-293-255-x',
 '975-8084',
 '978-0-9592824-1-2.',
 '978-0-9592824-2-9.',
 '978-0-9592824-3-6.',
 '978-1134068708',
 '978-1442252813',
 '978-1472806871',
 '978-1476679488',
 '978-3-205-78442-5',
 '978-3-7349-9213-1',
 '978-3-7349-9223-0',
 '978-3-8392-1500',
 '978-3-8392-1978',
 '978-3-8392-1986-7',
 '978-3905881028',
 '978-605-09',
 '978-605-111-401-9',
 '978-976-8097-13-2.',
 '9781931883702',
 '?.',
 '[...]',
 '`the',
 '`yellow',
 "abc's",
 'ah.',
 'anc.',
 'anti-apartheid',
 'anti-colonialism',
 'anti-feudal',
 'anti-german',
 'anti-imperialism',
 'anti-love',
 'anti-semite',
 'anti-semitic',
 'anti-semitism',
 'anti-soviet',
 'anti-vatica

We can see that Stanza tends tokenize words with prefixes like `co-`, `anti-`, `non-`, `neo-`... as one token, which explains the difference with Spacy on tokenization of words of relatively formal register.

We can se can see that there is the same tendency towards composed words: `pen-name`, `kingdom-based`, `serbo-croatian`...

As for Named Entities, there appears to be a lot of them tokenized together with `'s`: `yusuf's`, `unesco's`, `suzuki's`, `shin's`, `kyoto's`...

## Occurences

To calculate common accurances we will find all tokens produced by both models for a given string and will find tokens that have the same text for both Stanza and Spacy.

For this we will work with raw token entities from Stanza and Spacy to preserve PoS information for the following experiment since PoS information might be changed if a token is taken out of its context.

### Occurences without sentence separation

We will tokenize all stiched sentences together.

In [28]:
no_sent_spacy_tokens = spacy_tokenize_text(shared_sents_stiched, to_string=False, no_filtering=True)

In [29]:
# To run for the first time Stanza tokenization:
""" 
no_sent_stanza_tokens = stanza_tokenize_text(shared_sents_stiched, to_string=False)
with open('SpacyStanza/no_sent_stanza_tokens.pickle', 'wb') as f:
    pickle.dump(no_sent_stanza_tokens, f)

"""

# To load Stanza tokens from a pickle file
""""""
with open('SpacyStanza/no_sent_stanza_tokens.pickle', 'rb') as f:
    no_sent_stanza_tokens = pickle.load(f)

In [30]:
common_t = common_tokens(no_sent_spacy_tokens, no_sent_stanza_tokens)

Some token stats:

In [31]:
print(f"""
Spacy tokens: {len(no_sent_spacy_tokens)}
Stanza tokens: {len(no_sent_stanza_tokens)}
Shared tokens: {len(np.concatenate([x[1] for x in common_t]))}
% of shared tokens in Spacy: {100*len(np.concatenate([x[1] for x in common_t]))/len(no_sent_spacy_tokens):.2f}%
% of shared tokens in Stanza: {100*len(np.concatenate([x[1] for x in common_t]))/len(no_sent_stanza_tokens):.2f}%

"""
)


Spacy tokens: 48820
Stanza tokens: 48502
Shared tokens: 48070
% of shared tokens in Spacy: 98.46%
% of shared tokens in Stanza: 99.11%




We will store the results in a dictionary where the key is the word form and the value is another dictionary, storing all seen results for Spacy and Stanza separately.

In [32]:
no_sent_shared_tokens = {x[0][0].text: {'Spacy': x[0], 'Stanza': x[1]} for x in common_t}

In [33]:
no_sent_shared_tokens['record']

{'Spacy': [record, record, record, record, record, record, record],
 'Stanza': [[
    {
      "id": 39,
      "text": "record",
      "upos": "NOUN",
      "xpos": "NN",
      "feats": "Number=Sing",
      "start_char": 13283,
      "end_char": 13289
    }
  ],
  [
    {
      "id": 17,
      "text": "record",
      "upos": "NOUN",
      "xpos": "NN",
      "feats": "Number=Sing",
      "start_char": 27303,
      "end_char": 27309
    }
  ],
  [
    {
      "id": 23,
      "text": "record",
      "upos": "NOUN",
      "xpos": "NN",
      "feats": "Number=Sing",
      "start_char": 95765,
      "end_char": 95771
    }
  ],
  [
    {
      "id": 22,
      "text": "record",
      "upos": "NOUN",
      "xpos": "NN",
      "feats": "Number=Sing",
      "start_char": 117044,
      "end_char": 117050
    }
  ],
  [
    {
      "id": 12,
      "text": "record",
      "upos": "VERB",
      "xpos": "VB",
      "feats": "VerbForm=Inf",
      "start_char": 159799,
      "end_char": 159805
    }
  

# <font color='green'>Rerun the follwing cells</font>

### Occurences with segmented sentences

Now we repeat the experiment but for the separated sentences above:

In [34]:
# Uncomment this cell if you didn't run the cells above:

# shared_sents_df = pd.read_csv('SpacyStanza/spacy_stanza_shared_sents.csv')
# shared_sents_lower = [s.lower() for s in shared_sents_df.Sentence]

In [35]:
sp_sent_tokens = list(map(lambda x: spacy_tokenize_text(x, no_filtering=True, to_string=False), shared_sents_lower))

In [53]:
st_sent_tokens = list(map(lambda x: stanza_tokenize_text(x, to_string=False), shared_sents_lower))
with open('SpacyStanza/st_sent_tockens.pickle', 'wb') as f:
    pickle.dump(st_sent_tokens, f)

In [36]:
# To run for the first time Stanza tokenization:
""" 
st_sent_tokens = list(map(lambda x: stanza_tokenize_text(x, to_string=False), shared_sents_lower))
with open('SpacyStanza/st_sent_tockens.pickle', 'wb') as f:
    pickle.dump(st_sent_tokens, f)

"""

# To load Stanza tokens from a pickle file

with open('SpacyStanza/st_sent_tockens.pickle', 'rb') as f:
    st_sent_tokens = pickle.load(f)

In [37]:
common_sent_tokens = list(map(lambda x: common_tokens(x[0], x[1]), zip(sp_sent_tokens, st_sent_tokens)))

In [38]:
sent_tokens_df = pd.DataFrame(columns=['Spacy_token_count', 
                                       'Stanza_token_count', 
                                       'Shared_token_count', 
                                       'Spacy_shared_percentage',
                                      'Stanza_shared_percentage'])

In [39]:
sent_tokens_df.Spacy_token_count = list(map(len, sp_sent_tokens))
sent_tokens_df.Stanza_token_count = list(map(len, st_sent_tokens))
sent_tokens_df.Shared_token_count = list(map(len, common_sent_tokens))
sent_tokens_df.Spacy_shared_percentage = 100 * sent_tokens_df.Shared_token_count / sent_tokens_df.Spacy_token_count 
sent_tokens_df.Stanza_shared_percentage = 100 * sent_tokens_df.Shared_token_count / sent_tokens_df.Stanza_token_count 

sent_tokens_df

Unnamed: 0,Spacy_token_count,Stanza_token_count,Shared_token_count,Spacy_shared_percentage,Stanza_shared_percentage
0,50,13,3,6.000000,23.076923
1,14,5,3,21.428571,60.000000
2,8,18,2,25.000000,11.111111
3,25,12,2,8.000000,16.666667
4,10,10,1,10.000000,10.000000
...,...,...,...,...,...
2258,32,10,3,9.375000,30.000000
2259,16,19,4,25.000000,21.052632
2260,23,16,4,17.391304,25.000000
2261,15,16,4,26.666667,25.000000


Below are average stats for tokens (how many tokens in a sentence are on average provided by Spacy, how many by Stanza, how many are shared etc.):

In [40]:
sent_tokens_df.mean()

Spacy_token_count           21.573133
Stanza_token_count          21.438356
Shared_token_count           3.977022
Spacy_shared_percentage     20.964089
Stanza_shared_percentage    21.208298
dtype: float64

Total number of shared tokens across all sentences:

In [41]:
sent_tokens_df.Shared_token_count.sum()

9000

We have less aligned tokens now, meaning that in the experiment above we might have matched tokens from different sentences.

We will combine all the shared tokens from our sentences into one dictionary of the same structure as above: the keys are wordforms, the values are dictionaries containing arrays of all of the occurences of the wordforms.

Since for PoS analysis for Stanza requires words instead of tokens, we will convert the conversion as well.

In [42]:
shared_sent_tokens = {}

for sent in common_sent_tokens:
    for word in sent:
        word_text = word[0][0].text
        word_dict = shared_sent_tokens.get(word_text, {})
        
        sp_t_list = word_dict.get('Spacy', [])
        sp_t_list.extend(word[0])
        word_dict['Spacy'] = sp_t_list
        
        st_t_list = word_dict.get('Stanza', [])
        # Before extending the list convert tokens to words
        st_t_list.extend(list(map(lambda x: x.words[0], word[1])))
        word_dict['Stanza'] = st_t_list
        
        shared_sent_tokens[word_text] = word_dict


Below is an example of how the dictionary will work on an example of one word `chief`.

`shared_sent_tokens['chief']['Spacy']` can be called to get all Spacy tokens and similarly `shared_sent_tokens['chief']['Stanza']` can be called to get all Stanza tokens.

In this example we can see that this word was sometimes categorized as Noun and sometimes as Adjective.

In [43]:
shared_sent_tokens['chief']

{'Spacy': [chief, chief],
 'Stanza': [{
    "id": 23,
    "text": "chief",
    "upos": "ADJ",
    "xpos": "JJ",
    "feats": "Degree=Pos",
    "start_char": 105,
    "end_char": 110
  },
  {
    "id": 4,
    "text": "chief",
    "upos": "ADJ",
    "xpos": "JJ",
    "feats": "Degree=Pos",
    "start_char": 11,
    "end_char": 16
  }]}

How PoS can be accessed for Spacy:

In [44]:
shared_sent_tokens['chief']['Spacy'][0], shared_sent_tokens['chief']['Spacy'][0].pos_

(chief, 'NOUN')

How PoS can be accessed for Stanza:

In [45]:
shared_sent_tokens['chief']['Stanza'][0], shared_sent_tokens['chief']['Stanza'][0].upos

({
   "id": 23,
   "text": "chief",
   "upos": "ADJ",
   "xpos": "JJ",
   "feats": "Degree=Pos",
   "start_char": 105,
   "end_char": 110
 },
 'ADJ')

# Part of Speech

In [133]:
texts_stitched = ' '.join(texts)
spacy_pos = [(x.text, x.pos_) for x in set(spacy_tokenize_text(texts_stitched, to_string=False))]
d_spacy_pos = dict(spacy_pos)

In [132]:
with open('SpacyStanza/st_sent_tockens.pickle', 'rb') as f:
    stanza_vocabulary = pickle.load(f)

stanza_tokens = []
for x in list(stanza_vocabulary):
    for y in x:
        stanza_tokens.append(y)

stanza_pos = [(x.words[0].text, x.words[0].upos) for x in stanza_tokens]
d_stanza_pos = dict(stanza_pos)


In [77]:
len(stanza_pos), len(spacy_pos)

(48515, 30461)

In [91]:
common_pos = list(set(spacy_pos).difference(set(stanza_pos)))

In [93]:
len(common_pos), len(common_pos)/len(spacy_pos), len(common_pos)/len(stanza_pos)

(9200, 0.3020255408555202, 0.18963207255487993)

In [148]:
d_spacy_to_stanza_pos = dict()
for k,v in d_spacy_pos.items():
   tmp = d_stanza_pos.get(k,'NOT FIND')
   if (v!=tmp):
      d_spacy_to_stanza_pos.update({v: d_spacy_to_stanza_pos.get(v,dict())})
      d_spacy_to_stanza_pos.get(v,dict()).update({tmp: (d_spacy_to_stanza_pos.get(v,dict()).get(tmp, 0) + 1)})
print(d_spacy_to_stanza_pos)
   

{'PROPN': {'NOT FIND': 5272, 'X': 16, 'ADJ': 6, 'NOUN': 9}, 'NOUN': {'NOT FIND': 1644, 'VERB': 41, 'ADJ': 24, 'X': 10, 'PROPN': 2, 'NUM': 1}, 'ADJ': {'NOT FIND': 454, 'NOUN': 21, 'VERB': 9, 'ADV': 3, 'X': 3, 'PROPN': 1, 'INTJ': 1}, 'VERB': {'NOT FIND': 648, 'NOUN': 54, 'ADJ': 8, 'X': 6}, 'NUM': {'NOT FIND': 308, 'NOUN': 1}, 'X': {'NOT FIND': 35, 'PROPN': 1}, 'ADV': {'ADJ': 6, 'NOT FIND': 97, 'SCONJ': 1}, 'SCONJ': {'NOT FIND': 2}, 'PUNCT': {'NOT FIND': 5}, 'SPACE': {'NOT FIND': 4}, 'ADP': {'ADV': 1, 'SCONJ': 1, 'NOT FIND': 23, 'X': 2, 'NOUN': 1}, 'AUX': {'NOT FIND': 4, 'X': 1}, 'PRON': {'NOT FIND': 2}, 'INTJ': {'NOT FIND': 11, 'X': 1}, 'CCONJ': {'NOT FIND': 1}, 'SYM': {'NOT FIND': 1}, 'DET': {'NOT FIND': 1}}


In [149]:
d_stanza_to_spacy_pos = dict()
for k,v in d_stanza_pos.items():
   tmp = d_spacy_pos.get(k,'NOT FIND')
   if (v!=tmp):
      d_stanza_to_spacy_pos.update({v: d_stanza_to_spacy_pos.get(v,dict())})
      d_stanza_to_spacy_pos.get(v,dict()).update({tmp: (d_stanza_to_spacy_pos.get(v,dict()).get(tmp, 0) + 1)})
print(d_stanza_to_spacy_pos)

{'PROPN': {'NOT FIND': 1531, 'ADJ': 1, 'X': 1, 'NOUN': 2}, 'X': {'NOT FIND': 845, 'NOUN': 10, 'VERB': 6, 'PROPN': 16, 'ADP': 2, 'AUX': 1, 'ADJ': 3, 'INTJ': 1}, 'PUNCT': {'NOT FIND': 27}, 'SYM': {'NOT FIND': 5}, 'AUX': {'NOT FIND': 23}, 'DET': {'NOT FIND': 14}, 'ADJ': {'NOT FIND': 566, 'PROPN': 6, 'ADV': 6, 'VERB': 8, 'NOUN': 24}, 'NOUN': {'NOT FIND': 2107, 'PROPN': 9, 'VERB': 54, 'ADJ': 21, 'NUM': 1, 'ADP': 1}, 'CCONJ': {'NOT FIND': 5}, 'ADP': {'NOT FIND': 41}, 'PART': {'NOT FIND': 7}, 'ADV': {'ADP': 1, 'NOT FIND': 124, 'ADJ': 3}, 'NUM': {'NOT FIND': 252, 'NOUN': 1}, 'VERB': {'NOT FIND': 577, 'NOUN': 41, 'ADJ': 9}, 'PRON': {'NOT FIND': 36}, 'SCONJ': {'NOT FIND': 11, 'ADV': 1, 'ADP': 1}, 'INTJ': {'ADJ': 1, 'NOT FIND': 8}}


In [156]:
df_spacy_to_stanza = pd.DataFrame(d_spacy_to_stanza_pos).fillna(0)
df_spacy_to_stanza

Unnamed: 0,PROPN,NOUN,ADJ,VERB,NUM,X,ADV,SCONJ,PUNCT,SPACE,ADP,AUX,PRON,INTJ,CCONJ,SYM,DET
NOT FIND,5272.0,1644.0,454.0,648.0,308.0,35.0,97.0,2.0,5.0,4.0,23.0,4.0,2.0,11.0,1.0,1.0,1.0
X,16.0,10.0,3.0,6.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,1.0,0.0,1.0,0.0,0.0,0.0
ADJ,6.0,24.0,0.0,8.0,0.0,0.0,6.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
NOUN,9.0,0.0,21.0,54.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
VERB,0.0,41.0,9.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
PROPN,0.0,2.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
NUM,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
ADV,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
INTJ,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
SCONJ,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0


In [157]:
df_stanza_to_spacy = pd.DataFrame(d_stanza_to_spacy_pos).fillna(0)
df_stanza_to_spacy

Unnamed: 0,PROPN,X,PUNCT,SYM,AUX,DET,ADJ,NOUN,CCONJ,ADP,PART,ADV,NUM,VERB,PRON,SCONJ,INTJ
NOT FIND,1531.0,845.0,27.0,5.0,23.0,14.0,566.0,2107.0,5.0,41.0,7.0,124.0,252.0,577.0,36.0,11.0,8.0
ADJ,1.0,3.0,0.0,0.0,0.0,0.0,0.0,21.0,0.0,0.0,0.0,3.0,0.0,9.0,0.0,0.0,1.0
X,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
NOUN,2.0,10.0,0.0,0.0,0.0,0.0,24.0,0.0,0.0,0.0,0.0,0.0,1.0,41.0,0.0,0.0,0.0
VERB,0.0,6.0,0.0,0.0,0.0,0.0,8.0,54.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
PROPN,0.0,16.0,0.0,0.0,0.0,0.0,6.0,9.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
ADP,0.0,2.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
AUX,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
INTJ,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
ADV,0.0,0.0,0.0,0.0,0.0,0.0,6.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


In [162]:
for i in df_spacy_to_stanza.columns:
    df_spacy_to_stanza[i+'%'] = df_spacy_to_stanza[i]/df_spacy_to_stanza[i].sum()
df_spacy_to_stanza

Unnamed: 0,PROPN,NOUN,ADJ,VERB,NUM,X,ADV,SCONJ,PUNCT,SPACE,...,SCONJ%%%,PUNCT%%%,SPACE%%%,ADP%%%,AUX%%%,PRON%%%,INTJ%%%,CCONJ%%%,SYM%%%,DET%%%
NOT FIND,5272.0,1644.0,454.0,648.0,308.0,35.0,97.0,2.0,5.0,4.0,...,1.0,1.0,1.0,0.821429,0.8,1.0,0.916667,1.0,1.0,1.0
X,16.0,10.0,3.0,6.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.071429,0.2,0.0,0.083333,0.0,0.0,0.0
ADJ,6.0,24.0,0.0,8.0,0.0,0.0,6.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
NOUN,9.0,0.0,21.0,54.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.035714,0.0,0.0,0.0,0.0,0.0,0.0
VERB,0.0,41.0,9.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
PROPN,0.0,2.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
NUM,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
ADV,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.035714,0.0,0.0,0.0,0.0,0.0,0.0
INTJ,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
SCONJ,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.035714,0.0,0.0,0.0,0.0,0.0,0.0


In [163]:
for i in df_stanza_to_spacy.columns:
    df_stanza_to_spacy[i+'%'] = df_stanza_to_spacy[i]/df_stanza_to_spacy[i].sum()
df_stanza_to_spacy

Unnamed: 0,PROPN,X,PUNCT,SYM,AUX,DET,ADJ,NOUN,CCONJ,ADP,...,NOUN%,CCONJ%,ADP%,PART%,ADV%,NUM%,VERB%,PRON%,SCONJ%,INTJ%
NOT FIND,1531.0,845.0,27.0,5.0,23.0,14.0,566.0,2107.0,5.0,41.0,...,0.960784,1.0,1.0,1.0,0.96875,0.996047,0.920255,1.0,0.846154,0.888889
ADJ,1.0,3.0,0.0,0.0,0.0,0.0,0.0,21.0,0.0,0.0,...,0.009576,0.0,0.0,0.0,0.023438,0.0,0.014354,0.0,0.0,0.111111
X,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
NOUN,2.0,10.0,0.0,0.0,0.0,0.0,24.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.003953,0.065391,0.0,0.0,0.0
VERB,0.0,6.0,0.0,0.0,0.0,0.0,8.0,54.0,0.0,0.0,...,0.024624,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
PROPN,0.0,16.0,0.0,0.0,0.0,0.0,6.0,9.0,0.0,0.0,...,0.004104,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
ADP,0.0,2.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.000456,0.0,0.0,0.0,0.007812,0.0,0.0,0.0,0.076923,0.0
AUX,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
INTJ,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
ADV,0.0,0.0,0.0,0.0,0.0,0.0,6.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.076923,0.0
