In [1]:
import pandas as pd
import nltk

In [2]:
# Load the raw word list. A well-formed record has exactly three
# whitespace-separated fields: an index, a word, and a count.
with open('../raw-data/words.txt', 'r', encoding='utf-8') as f:
    lines = f.readlines()

# Tokenise every line once and keep only the three-field records;
# anything with more or fewer fields is skipped.
tokenized = (line.split() for line in lines)
list_for_df = [fields for fields in tokenized if len(fields) == 3]

In [3]:
# Build the working frame: one row per parsed record, one column per field.
column_names = ['index', 'word', 'count']
df = pd.DataFrame(data=list_for_df, columns=column_names)

In [4]:
# Keep only the word and its count; the source file's own index column is
# not used by any later cell.
df = df.loc[:, ["word", "count"]]

In [5]:
# Drop every row that has a missing value in any column.
# NOTE(review): rows are built from three-token splits, so this is presumably
# only a safety net - confirm whether it ever removes anything.
df = df.dropna(axis=0, how='any')

In [6]:
# Keep only words made up entirely of the letters a-z / A-Z; anything that
# contains digits, punctuation, or non-English characters is filtered out.
is_ascii_alpha = df['word'].str.contains("^[a-zA-Z]+$")
df = df.loc[is_ascii_alpha].reset_index(drop=True)

In [7]:
# Normalise case so that differently-capitalised spellings share one form.
df = df.assign(word=df['word'].str.lower())

In [8]:
# The counts were read from text, so cast them to integers before any
# arithmetic is done on them.
df = df.astype({'count': int})

In [9]:
# Collapse rows that share a spelling (e.g. after lower-casing) into a single
# row whose count is the total across the duplicates.
df = df.groupby('word', as_index=False)['count'].sum()

In [10]:
# Order the words from most to least frequent, then renumber the rows.
by_count_desc = df.sort_values(by='count', ascending=False)
df = by_count_desc.reset_index(drop=True)

In [11]:
# Remove stopwords
# The NLTK stopword corpus is a separate download and is not installed with
# the nltk package; without it the lookup raises LookupError. Fetch it on
# first use so the notebook runs end-to-end on a fresh environment.
try:
    stopwords = nltk.corpus.stopwords.words('english')
except LookupError:
    nltk.download('stopwords', quiet=True)
    stopwords = nltk.corpus.stopwords.words('english')

# NOTE(review): NLTK's English list includes contractions written with an
# apostrophe; apostrophe-less spellings such as "dont" are presumably not in
# the list and would survive this filter - confirm whether that matters.
df = df[~df['word'].isin(stopwords)].reset_index(drop=True)

In [12]:
# Summarise the count distribution, including the upper-tail percentiles
# that are used to pick the "frequent word" cutoff.
inspect_percentiles = [.25, .5, .75, .875, .9, .95, .99]
df["count"].describe(percentiles=inspect_percentiles)

count    212775.000000
mean         48.351724
std         519.391128
min           1.000000
25%           1.000000
50%           2.000000
75%           6.000000
87.5%        21.000000
90%          31.000000
95%         101.000000
99%         916.000000
max      100600.000000
Name: count, dtype: float64

In [13]:
# Keep the top ~10% most frequent words: every word whose count is at or
# above the 90th percentile of the count distribution.
# The cutoff is computed from the data rather than copied by hand from the
# describe() output, so it stays correct when the input corpus changes
# (on the data summarised above it evaluates to 31).
frequent_threshold = df["count"].quantile(0.9)
df_frequent = df[df["count"] >= frequent_threshold].reset_index(drop=True)

In [14]:
# Persist the frequent-word table as CSV, without the row index.
output_csv = "../metrics/wordcoverage/frequent_words.csv"
df_frequent.to_csv(output_csv, index=False)