# Zimp Vocabulary Size Evaluation
How many distinct words does each dataset contain, and how frequently do they occur?

In [1]:
import pandas as pd
import numpy as np
import os
import logging
import matplotlib.pyplot as plt
import time
from zimp.pos.countvectorizer_analyzer import CountVectorizerAnalyzer
from glob import glob

# matplotlib 3.6 renamed its bundled seaborn styles to 'seaborn-v0_8-*' and later
# releases dropped the old names, so use whichever name this installation provides.
plt.style.use(
    'seaborn-whitegrid'
    if 'seaborn-whitegrid' in plt.style.available
    else 'seaborn-v0_8-whitegrid'
)

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Martin\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Martin\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package perluniprops to
[nltk_data]     C:\Users\Martin\AppData\Roaming\nltk_data...
[nltk_data]   Package perluniprops is already up-to-date!


In [2]:
# Folder that is scanned for datasets: each sub-directory is treated as one dataset.
source_dir = '../../zimp_orchestrator/orch/resources'
# dataset name -> [path to train.csv, path to test.csv]; starts empty.
files = dict()

In [3]:
# Register every sub-directory of source_dir as a dataset, keyed by directory name.
# glob() yields entries in arbitrary, filesystem-dependent order; sorting keeps the
# dataset order (and therefore the row order of the derived tables) reproducible.
for ds_path in sorted(glob(source_dir + '/*')):
    if not os.path.isdir(ds_path):
        # Plain files next to the dataset folders are not datasets.
        continue
    ds_name = os.path.basename(ds_path)
    # Order matters: index 0 is the train split, index 1 the test split.
    files[ds_name] = [os.path.join(ds_path, 'train.csv'), os.path.join(ds_path, 'test.csv')]

In [4]:
def get_ds_stats(paths, cva_builder, track_name):
    """Build the per-token metric table for one dataset.

    Parameters
    ----------
    paths : sequence of str
        CSV files (e.g. train and test split) that each contain a ``text`` column.
    cva_builder : callable
        Called with the concatenated ``text`` Series; must return an object that
        offers ``extract_dataset_metric()``.
    track_name : str
        Name given to the ``count`` column of the returned table.

    Returns
    -------
    The result of ``extract_dataset_metric()`` with ``count`` renamed to ``track_name``.
    """
    # pd.concat replaces Series.append, which was removed in pandas 2.0.
    # Original row labels are kept (no ignore_index), exactly as append did.
    texts = pd.concat([pd.read_csv(path).text for path in paths])
    return cva_builder(texts).extract_dataset_metric().rename(columns={'count': track_name})

def get_ds_language(track):
    """Return 'german' for the German dataset tracks and 'english' for any other track."""
    german_tracks = ('10K-GNAD', 'GERMEVAL-2018', 'GERMEVAL-2020')
    return 'german' if track in german_tracks else 'english'

In [5]:
# Per-dataset token-count tables, cached on disk so that the expensive analysis
# only runs for datasets that have no CSV yet.
file_path = 'measures/vocab_size_<DS>.csv'
results_dfs = {}
for dataset in files:
    dataset_path = file_path.replace('<DS>', dataset)
    if os.path.exists(dataset_path):
        # keep_default_na=False: the index holds vocabulary tokens, and with the
        # default NA parsing tokens such as "null", "nan" or "NA" would be read
        # back as NaN and no longer match a freshly computed table.
        df_dataset = pd.read_csv(dataset_path, index_col=0, keep_default_na=False)
        print('Loaded existing stats for', dataset)
    else:
        # The lambda is invoked inside get_ds_stats during this iteration, so it
        # sees the current value of `dataset`.
        df_dataset = get_ds_stats(
            files[dataset],
            lambda texts: CountVectorizerAnalyzer(texts, language=get_ds_language(dataset)),
            'count',
        )
        # A fresh checkout may not have the output folder yet.
        os.makedirs(os.path.dirname(dataset_path), exist_ok=True)
        df_dataset.to_csv(dataset_path)
        print('Calculated stats for', dataset)

    results_dfs[dataset] = df_dataset

results_dfs['TREC-6'].head(n=5)

Calculated stats for 10K-GNAD
Calculated stats for DBP-14
Calculated stats for GERMEVAL-2018
Calculated stats for GERMEVAL-2020
Loaded existing stats for TREC-6
Calculated stats for YELP-5


Unnamed: 0,count
?,5850
the,4066
what,3726
is,1970
of,1641


In [6]:
def get_vocab_stats(df_vocab):
    """Summarise a token-frequency table into vocabulary statistics.

    Parameters
    ----------
    df_vocab : pandas.DataFrame
        One row per token with its number of occurrences in a ``count`` column.
        Rows are expected to be ordered by descending count; the function does
        not sort, and the "top N" / "N percent" figures are only meaningful for
        that ordering.

    Returns
    -------
    dict
        ``vocab_size``       number of rows (distinct tokens).
        ``min_K_occ``        number of tokens whose count is greater than K - 1.
        ``vocab_P_percent``  number of leading tokens needed to reach at least
                             P percent of all occurrences.
        ``vocab_topN``       share of all occurrences covered by the first N
                             tokens. When the table has fewer than N rows this is
                             the share covered by all rows; NaN for an empty table.
    """
    counts = df_vocab['count']
    size = df_vocab.index.size
    # Running share of all occurrences covered by the first k rows.
    cum_ratio = counts.cumsum() / counts.sum()

    def tokens_to_reach(share):
        # Rows strictly below the threshold, plus the row that crosses it.
        # Capped at `size` so an empty table reports 0 rather than 1.
        return min((cum_ratio < share).sum() + 1, size)

    def top_n_coverage(n):
        if size == 0:
            return float('nan')
        # Clamp to the last row: indexing past the end used to raise IndexError
        # for vocabularies with fewer than n tokens.
        return cum_ratio.iloc[min(n, size) - 1]

    return {
        'vocab_size': size,
        'min_2_occ': (counts > 1).sum(),
        'min_3_occ': (counts > 2).sum(),
        'min_5_occ': (counts > 4).sum(),
        'vocab_5_percent': tokens_to_reach(0.05),
        'vocab_10_percent': tokens_to_reach(0.1),
        'vocab_25_percent': tokens_to_reach(0.25),
        'vocab_50_percent': tokens_to_reach(0.5),
        'vocab_top10': top_n_coverage(10),
        'vocab_top100': top_n_coverage(100),
        'vocab_top300': top_n_coverage(300),
        'vocab_top500': top_n_coverage(500),
    }

In [7]:
# Summary table: one row per dataset, one column per vocabulary statistic.
# Cached on disk; delete the CSV to force a recomputation.
file_path = 'measures/vocab_stats.csv'
if os.path.exists(file_path):
    # index_col=0 restores the dataset names as the index, so the cached frame has
    # the same layout as the freshly computed one in the other branch.
    df_vocab = pd.read_csv(file_path, index_col=0)
else:
    # One named Series per dataset; the Series name becomes the row label.
    vocab_series = [
        pd.Series(get_vocab_stats(results_dfs[dataset]), name=dataset)
        for dataset in results_dfs
    ]
    df_vocab = pd.DataFrame(vocab_series)
    # A fresh checkout may not have the output folder yet.
    os.makedirs(os.path.dirname(file_path), exist_ok=True)
    df_vocab.to_csv(file_path)

df_vocab

Unnamed: 0,vocab_size,min_2_occ,min_3_occ,min_5_occ,vocab_5_percent,vocab_10_percent,vocab_25_percent,vocab_50_percent,vocab_top10,vocab_top100,vocab_top300,vocab_top500
10K-GNAD,227275.0,99285.0,67546.0,43569.0,1.0,2.0,13.0,125.0,0.230348,0.480994,0.569957,0.61009
DBP-14,967723.0,493036.0,282784.0,166010.0,1.0,3.0,9.0,148.0,0.279432,0.469249,0.562971,0.612887
GERMEVAL-2018,25822.0,9643.0,6102.0,3579.0,1.0,2.0,11.0,89.0,0.243331,0.514432,0.626194,0.674467
GERMEVAL-2020,217703.0,54480.0,33770.0,20778.0,1.0,2.0,6.0,43.0,0.324037,0.594837,0.711502,0.759534
TREC-6,8965.0,3638.0,2277.0,1277.0,1.0,2.0,4.0,30.0,0.370228,0.595349,0.68012,0.722379
YELP-5,768397.0,236568.0,157574.0,104285.0,2.0,3.0,10.0,62.0,0.262256,0.569781,0.705211,0.765781
