# Compare normalization of tokens via stemming and lemmatization

In [1]:
# TODO measure vocab overlap after pruning
# TODO Visualize author correlations for each stemmed model in a way that they can be compared
# TODO "Influential Words" and stemming conflation (sec 5.5 in Apples to Apple)
from pathlib import Path

import pandas as pd
import seaborn as sns

from topic_modeling.analysis import *
from topic_modeling.mallet_parser import *
import topic_modeling.preprocessing as preprocessing

pd.set_option('display.max_rows', 100)
pd.set_option('display.max_colwidth', None)


all_stemmers = ['pymystem3', 'snowball', 'stanza', 'truncate']

def read_tf_csv(filepath):
    return pd.read_csv(filepath, sep='\t', encoding='utf-8', 
                       names=['token', 'term_freq'], usecols=[0, 1])

In [2]:
# Load in original author counts
UNSTEMMED_CORPUS_DIR = Path('/home/virginia/workspace/topic-modeling-study/russian_novels') 
ORIGINAL_CORPUS = UNSTEMMED_CORPUS_DIR / 'russian_novels.tsv'
ORIGINAL_CORPUS_COUNTS = UNSTEMMED_CORPUS_DIR / 'russian_novels_counts.tsv'
ORIGINAL_PRUNED_COUNTS = UNSTEMMED_CORPUS_DIR  / 'russian_novels_pruned_counts.tsv'

In [3]:
original_tf = read_tf_csv(ORIGINAL_CORPUS_COUNTS)
print("Unpruned corpus:")
print("Character to token ratio:", get_character_token_ratio(original_tf, 'token', 'term_freq'))
print("Dataframe size:", len(original_tf))
original_tf.head()

Unpruned corpus:
Character to token ratio: 5.211265027848076
Dataframe size: 319459


Unnamed: 0,token,term_freq
0,и,247754
1,не,122673
2,на,91762
3,что,88897
4,в,145809


In [4]:
original_pruned_tf = read_tf_csv(ORIGINAL_PRUNED_COUNTS)
print("Character to token ratio:", get_character_token_ratio(original_pruned_tf, 'token', 'term_freq'))
original_pruned_vocab_size =  len(original_pruned_tf)
original_pruned_token_count = original_pruned_tf["term_freq"].sum()
print("Dataframe size:", original_pruned_vocab_size)
original_pruned_tf.head()

Character to token ratio: 6.678288550826788
Dataframe size: 80540


Unnamed: 0,token,term_freq
0,совсем,3618
1,стал,3395
2,этой,3205
3,которые,3638
4,много,3355


In [5]:
original_stop_words = original_tf[~original_tf['token'].isin(original_pruned_tf['token'])].dropna()
print("Number of words pruned from original vocab:", len(original_stop_words))
display(original_stop_words.head())
display(original_stop_words.tail())

Number of words pruned from original vocab: 238919


Unnamed: 0,token,term_freq
0,и,247754
1,не,122673
2,на,91762
3,что,88897
4,в,145809


Unnamed: 0,token,term_freq
319454,abencerage,1
319455,abc,1
319456,abbasi,1
319457,aaa,1
319458,á,1


# Comparison with stemmed corpus
Keep lemmas/stems, the vocabulary elements in stemmed corpora, as the 'normalized' column and the actual raw 'tokens', the vocabulary elements in the original corpus as 'token' column.

In [6]:
# Just start by comparing pymystem to original 
stemmer = 'pymystem3'
stemmed_dir_path = Path(f'/home/virginia/workspace/topic-modeling-study/russian_novels_{stemmer}')
stemmed_experiment_path = stemmed_dir_path / f'russian_novels_{stemmer}_100topics_1000iters'
stemmed_lemma_counts_path = stemmed_dir_path / f'russian_novels_{stemmer}_lemma_counts.tsv'
# This gets the counds of all lemma/token pairs
stemmed_token_counts_by_author = pd.read_csv(stemmed_lemma_counts_path, sep='\t', header=0, encoding='utf-8')
stemmed_token_counts_by_author.head()

Unnamed: 0,author,token,normalized,count
0,Turgenev,было,быть,919
1,Turgenev,тихое,тихий,9
2,Turgenev,летнее,летний,3
3,Turgenev,утро,утро,58
4,Turgenev,солнце,солнце,42


In [7]:
# Remove the by-author aggregation to get total token, lemma pair counts
token_lemma_counts = stemmed_token_counts_by_author.groupby(["token", "normalized"]).agg({"count":"sum"}).reset_index().sort_values(['count'], ascending=False).rename(columns={'count':'pair_count'})
# Sanity check pair count - should match # of tokens in corpus
print("Total tokens", token_lemma_counts["pair_count"].sum())
display(token_lemma_counts.head())

Total tokens 6084073


Unnamed: 0,token,normalized,pair_count
80685,и,и,248005
18617,в,в,145881
131444,не,не,122758
122817,на,на,91965
297786,что,что,89135


In [8]:
# Do the normalized vocab elements with very high word_type counts make sense? 
count_word_types = token_lemma_counts.groupby(["normalized"]).size().reset_index(name="num_word_types").sort_values("num_word_types", ascending=False)
display(count_word_types.head())
count_word_types = pd.merge(count_word_types, token_lemma_counts, on="normalized")
display(count_word_types.head(100))


Unnamed: 0,normalized,num_word_types
49594,подымать,93
54756,принимать,90
19550,замечать,89
13929,давать,84
19680,занимать,83


Unnamed: 0,normalized,num_word_types,token,pair_count
0,подымать,93,поднял,796
1,подымать,93,подняв,316
2,подымать,93,подняла,289
3,подымать,93,поднимая,248
4,подымать,93,поднять,196
5,подымать,93,подняли,117
6,подымать,93,поднимал,111
7,подымать,93,поднимать,52
8,подымать,93,поднимала,40
9,подымать,93,подымая,36


In [9]:
# Vocab and counts of LEMMAS in unpruned corpus
stemmed_unpruned_tf = read_tf_csv(stemmed_dir_path/f'russian_novels_{stemmer}_counts.tsv').rename(columns={"token":"normalized"})
print("Unpruned")
print("Character to token ratio:", get_character_token_ratio(stemmed_unpruned_tf, 'normalized', 'term_freq'))
print("Dataframe size:", len(stemmed_unpruned_tf))
stemmed_unpruned_tf.head()


Unpruned
Character to token ratio: 5.191822977797933
Dataframe size: 80163


Unnamed: 0,normalized,term_freq
0,и,248005
1,в,149230
2,не,122761
3,на,91969
4,с,76022


In [10]:
# Vocab and counts of LEMMAS in pruned corpus
stemmed_pruned_tf = read_tf_csv(stemmed_dir_path/f'russian_novels_{stemmer}_pruned_counts.tsv').rename(columns={'token':'normalized'})
stemmed_pruned_vocab_size = len(stemmed_pruned_tf)
print("Pruned")
print("Character to token ratio:", get_character_token_ratio(stemmed_pruned_tf, 'normalized', 'term_freq'))
print("Vocab size:", stemmed_pruned_vocab_size)
stemmed_pruned_tf.head()


Pruned
Character to token ratio: 7.252728765481368
Vocab size: 32648


Unnamed: 0,normalized,term_freq
0,сейчас,3532
1,каждый,3594
2,ночь,3885
3,должный,3519
4,душа,3863


In [11]:
# Get dataframe with counts of stopped lemmas, token pairs in stemmed corpus
stemmed_stop_words_df = stemmed_unpruned_tf[~stemmed_unpruned_tf['normalized'].isin(stemmed_pruned_tf['normalized'])].dropna()
print("Number of words pruned from stemmed vocab:", len(stemmed_stop_words_df))
# Sanity check against comparison spreadsheet
total_stopped_tokens =  stemmed_stop_words_df["term_freq"].sum()
print("Total stopped tokens", total_stopped_tokens)
display(stemmed_stop_words_df)



Number of words pruned from stemmed vocab: 47515
Total stopped tokens 3107269


Unnamed: 0,normalized,term_freq
0,и,248005
1,в,149230
2,не,122761
3,на,91969
4,с,76022
...,...,...
80158,аать,1
80159,аасбаа,1
80160,аардваркский,1
80161,аардварка,1


In [12]:
print("Joining token,lemma pair counts with stemmed stop words")
stopped_token_lemma_pairs = pd.merge(token_lemma_counts, stemmed_stop_words_df, how='right', on=['normalized'])
print(len(stopped_token_lemma_pairs), "total stopped (token, lemma) pairs")
total_stopped_pair_counts = stopped_token_lemma_pairs["pair_count"].sum()
print("Sanity check: total stopped tokens according to groupby 'pair_count' (does this match previous cell?)", total_stopped_pair_counts)
assert total_stopped_tokens == total_stopped_pair_counts

display(stopped_token_lemma_pairs)

Joining token,lemma pair counts with stemmed stop words
66594 total stopped (token, lemma) pairs
Sanity check: total stopped tokens according to groupby 'pair_count' (does this match previous cell?) 3107269


Unnamed: 0,token,normalized,pair_count,term_freq
0,и,и,248005,248005
1,в,в,145881,149230
2,въ,в,3349,149230
3,не,не,122758,122761
4,нe,не,3,122761
...,...,...,...,...
66589,аать,аать,1,1
66590,аасбаа,аасбаа,1,1
66591,аардваркской,аардваркский,1,1
66592,аардварка,аардварка,1,1


In [13]:
# What tokens were unstopped originally, but their lemmas became stopped by going over the 25% document frequency threshold? 
print("These (token,lemma) pairs are pruned due to stemming putting the normalized form over the max-df threshold:")
new_stopped_tokens_when_stemming = pd.merge(stopped_token_lemma_pairs, original_pruned_tf, how="inner", on=["token"], suffixes=("_normalized", "_original_token"))
print("Number of original vocab (AKA number of token-lemma pairs) removed by stemming putting the normalized form above the max df threshold:", len(new_stopped_tokens_when_stemming))
print("Number of normalized forms (stems/lemmas) affected:", len(set(new_stopped_tokens_when_stemming["normalized"])))
print("Actual token count removed by stemming putting the normalized form above the max df threshold:", new_stopped_tokens_when_stemming["term_freq_original_token"].sum())
display(new_stopped_tokens_when_stemming.head(100))




These (token,lemma) pairs are pruned due to stemming putting the normalized form over the max-df threshold:
Number of original vocab (AKA number of token-lemma pairs) removed by stemming putting the normalized form above the max df threshold: 2566
Number of normalized forms (stems/lemmas) affected: 609
Actual token count removed by stemming putting the normalized form above the max df threshold: 688006


Unnamed: 0,token,normalized,pair_count,term_freq_normalized,term_freq_original_token
0,въ,в,3349,149230,3332
1,съ,с,1918,76022,1887
2,чему,что,1002,96162,999
3,чѣмъ,что,58,96162,128
4,чѣмъ,чем,80,4780,128
5,чём,что,40,96162,40
6,чемъ,что,32,96162,81
7,чемъ,чем,55,4780,81
8,чeм,что,5,96162,21
9,чeм,чем,16,4780,21


In [14]:
# What tokens weren't in the pruned vocab originally because they were too rare? 
pruned_token_lemma_pairs = pd.merge(stemmed_pruned_tf, token_lemma_counts, on="normalized")
count_pruned_token_lemma_pairs = len(pruned_token_lemma_pairs)
print("These are tokens which are kept (or added to the pruned vocabulary) through stemming, because conflation puts them with a normalized term with frequency higher than the min-tf threshold.")
pruned_stemmed_token_counts = pruned_token_lemma_pairs["pair_count"].sum()
print("Sanity check, does this match the token count for the pruned stemmed corpus?", pruned_stemmed_token_counts)
unstopped_by_stemming = pd.merge(pruned_token_lemma_pairs, original_stop_words, on="token", how="inner", suffixes=("_normalized", "_original_token") )

print("Number of original vocab items (AKA number of token-lemma pairs) kept/added to vocabulary:", len(unstopped_by_stemming))
print("Total tokens kept/added to vocabulary:", unstopped_by_stemming["pair_count"].sum())
print("Total stems (normalized vocab items after stemming) affected:", len(set(unstopped_by_stemming["normalized"])))
display(unstopped_by_stemming)

These are tokens which are kept (or added to the pruned vocabulary) through stemming, because conflation puts them with a normalized term with frequency higher than the min-tf threshold.
Sanity check, does this match the token count for the pruned stemmed corpus? 2976804
Number of original vocab items (AKA number of token-lemma pairs) kept/added to vocabulary: 158245
Total tokens kept/added to vocabulary: 292362
Total stems (normalized vocab items after stemming) affected: 28748


Unnamed: 0,normalized,term_freq_normalized,token,pair_count,term_freq_original_token
0,каждый,3594,каждых,3,3
1,каждый,3594,каждыя,1,1
2,каждый,3594,каждыми,1,1
3,ночь,3885,ночах,2,2
4,должный,3519,должною,3,3
...,...,...,...,...,...
158240,авдеев,5,авдееву,1,1
158241,авдеев,5,авдеевым,1,1
158242,авдеев,5,авдеева,1,1
158243,абезон,5,абезона,3,3


In [16]:
# Theoretically, 'pair_count' and 'term_freq_original_token' should match, but only if tokenizaiton is the same
pruned_token_intersection = pd.merge(pruned_token_lemma_pairs, original_pruned_tf, on="token", how="inner", suffixes=("_normalized", "_original_token"))
num_lemmas = len(set(pruned_token_intersection["normalized"]))
print("Total lemmas/stems/normalized forms in intersection:", num_lemmas)
print("\t... as a percentage of the pruned stemmed vocabulary:", num_lemmas/stemmed_pruned_vocab_size)
num_original_tokens = len(set(pruned_token_intersection["token"]))
print("Distinct tokens (original vocab items) in intersection according to original corpus pairs:", num_original_tokens)
print("\t... as a percentage of the original vocabulary size:", num_original_tokens/original_pruned_vocab_size)
print("\t... as a percentage of the unique (token,lemma) pairs in the pruned stemmed corpus:", num_original_tokens/count_pruned_token_lemma_pairs)
count_token_lemma_pairs_intersect = pruned_token_intersection["pair_count"].sum()
print("Total tokens in intersection acccording to token,lemma pair counts:", count_token_lemma_pairs)
print("\t... as a percentage of the overall (token,lemma) pair counts in the full pruned stemmed corpus:", count_token_lemma_pairs_intersect/pruned_stemmed_token_counts)
count_original_tokens = pruned_token_intersection["term_freq_original_token"].sum()
print("Total tokens in intersection according to original token term freq counts:", count_original_tokens)
print("\t... as a percentage of the overall token count in pruned original corpus:", count_original_tokens/original_pruned_token_count)
print("Intersection snippet:")
display(pruned_token_intersection)

Total lemmas/stems/normalized forms in intersection: 23407
	... as a percentage of the pruned stemmed vocabulary: 0.7169505023278608
Distinct tokens (original vocab items) in intersection according to original corpus pairs: 77540
	... as a percentage of the original vocabulary size: 0.9627514278619319
	... as a percentage of the unique (token,lemma) pairs in the pruned stemmed corpus: 0.3233312206024619


NameError: name 'count_token_lemma_pairs' is not defined

In [16]:
stemmed_experiment_metrics = diagnostics_xml_to_dataframe(stemmed_experiment_path /f'russian_novels_{stemmer}_100topics_1000iters_diagnostics.xml' ).rename(columns={"tokens":"tokens_metric"})
display(stemmed_experiment_metrics.head())
stemmed_experiment_metrics.describe()

Unnamed: 0_level_0,tokens_metric,document_entropy,word-length,coherence,uniform_dist,corpus_dist,eff_num_words,token-doc-diff,rank_1_docs,allocation_ratio,allocation_count,exclusivity
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
0,8111.0,6.3338,5.65,-660.2322,4.3344,3.9184,144.7236,0.0082,0.0141,0.0,0.0019,0.5346
1,4972.0,4.4918,5.9,-633.5989,4.6936,4.2196,102.7199,0.0242,0.1731,0.024,0.1394,0.6468
2,83010.0,8.1134,5.1,-393.8316,3.4836,2.0321,525.2401,0.0003,0.0259,0.0,0.0043,0.1874
3,8054.0,4.9823,5.7,-413.8807,4.0737,3.927,154.024,0.0486,0.0758,0.0,0.0072,0.5684
4,7991.0,5.0774,6.5,-468.5135,4.7438,3.8186,92.4072,0.039,0.3162,0.0,0.0909,0.5532


Unnamed: 0,tokens_metric,document_entropy,word-length,coherence,uniform_dist,corpus_dist,eff_num_words,token-doc-diff,rank_1_docs,allocation_ratio,allocation_count,exclusivity
count,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0
mean,29768.04,6.288914,6.2665,-419.502443,4.190007,3.034068,231.226355,0.017592,0.14984,0.004375,0.053295,0.406579
std,27634.79199,1.222173,0.726833,123.424425,0.538936,0.705475,203.693676,0.016911,0.136242,0.011553,0.071661,0.176703
min,4325.0,4.3953,4.75,-781.6283,2.6399,1.563,24.6487,0.0002,0.0013,0.0,0.0,0.1172
25%,11038.25,5.20535,5.7875,-449.73515,3.7783,2.556,86.061775,0.002225,0.01985,0.0,0.002975,0.25695
50%,18077.5,5.94,6.25,-393.96505,4.1501,3.0752,166.92935,0.0146,0.13375,0.0,0.01745,0.43475
75%,37028.75,7.451475,6.75,-349.7799,4.613225,3.582775,319.32655,0.027675,0.25645,0.0009,0.08065,0.549775
max,124161.0,8.5486,8.0,-204.155,5.4125,4.397,1246.9633,0.0783,0.4857,0.0632,0.3295,0.7937
