In [49]:
from collections import Counter
from multiprocessing import Pool

import pandas as pd
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import TweetTokenizer

# Load Data

In [50]:
path = "input/AllCombined.txt"
# Decode explicitly instead of relying on the platform's locale-dependent
# default encoding; the corpus contains non-ASCII words.
# NOTE(review): assumes the file is UTF-8 encoded - confirm against the data source.
with open(path, "r", encoding="utf-8") as f:
    # Keep one entry per non-empty line, with surrounding whitespace removed.
    data = [stripped for stripped in map(str.strip, f) if stripped]

data[:5]

['April',
 'April (Apr.) is the fourth month of the year in the Julian and Gregorian calendars, and comes between March and May. It is one of four months to have 30 days.',
 'April always begins on the same day of the week as July, and additionally, January in leap years. April always ends on the same day of the week as December.',
 'April comes between March and May, making it the fourth month of the year. It also comes first in the year out of the four months that have 30 days, as June, September and November are later in the year.',
 "April begins on the same day of the week as July every year and on the same day of the week as January in leap years. April ends on the same day of the week as December every year, as each other's last days are exactly 35 weeks (245 days) apart."]

# Preprocess

In [51]:
# English stopword list held as a set, so the per-token `in` check in
# preprocess() is a hash lookup rather than a list scan.
stops = set(stopwords.words("english"))
# Lemmatizer and tokenizer instances created once at module level and reused
# by every preprocess() call.
lemmatizer = WordNetLemmatizer()
tokenizer = TweetTokenizer()


def preprocess(doc):
    """
    Normalize one document into a space-separated string of lemmas.

    The text is lowercased and split with the module-level ``tokenizer``.
    A token is discarded when it is a stopword, has at most one character,
    consists only of digits, or is numeric once every "." (or every ",")
    has been removed. Each surviving token is stripped and lemmatized with
    the module-level ``lemmatizer``.
    """
    kept = []
    for raw in tokenizer.tokenize(doc.lower()):
        if raw in stops:  # Stopword
            continue
        if len(raw) <= 1:  # Single character
            continue
        if raw.isdigit():  # Plain digits
            continue
        # Numbers written with decimal points or thousands separators
        if raw.replace(".", "").isnumeric() or raw.replace(",", "").isnumeric():
            continue
        kept.append(lemmatizer.lemmatize(raw.strip()))

    return " ".join(kept)


def parallel_preprocess(data, num_workers=4):
    """
    Apply ``preprocess`` to every item of ``data`` using a process pool.

    Args:
        data: iterable of documents, each passed to ``preprocess``.
        num_workers (int): number of worker processes in the pool.

    Returns:
        list: the preprocessed documents, in the same order as ``data``.
    """
    with Pool(processes=num_workers) as workers:
        return workers.map(preprocess, data)


# Preprocess every loaded line of the corpus, spread over 4 worker processes.
data_preprocessed = parallel_preprocess(data, num_workers=4)

In [52]:
# Inspect the first five preprocessed documents
data_preprocessed[:5]

['april',
 'april apr fourth month year julian gregorian calendar come march may one four month day',
 'april always begin day week july additionally january leap year april always end day week december',
 'april come march may making fourth month year also come first year four month day june september november later year',
 "april begin day week july every year day week january leap year april end day week december every year other's last day exactly week day apart"]

# Count Words

In [62]:
# Count words in the corpus
# Flatten all preprocessed documents into one list of whitespace-separated tokens.
all_words = [
    token
    for document in data_preprocessed
    for token in document.lower().split()
]

# Tally the occurrences of each distinct token.
word_counts = Counter(all_words)

In [None]:
# Create DF
# One row per distinct word, then sorted from most to least frequent with a
# fresh 0..n-1 index.
df = pd.DataFrame(
    {
        "word": list(word_counts.keys()),
        "count": list(word_counts.values()),
    }
).sort_values(by="count", ascending=False, ignore_index=True)

# See the DF
pd.concat([df.head(), df.tail()])

Unnamed: 0,word,count
0,also,97050
1,state,77272
2,people,75698
3,first,70500
4,one,68676
477713,huraira,1
477714,al-qur'āniyya,1
477715,shellroom,1
477716,delila,1
477717,al-jamil,1


In [64]:
# Drop outliers
# Some words occur extremely rarely,
# while other words occur extremely frequently.


def _outlier_labels(series: pd.Series, lower_limit: float, upper_limit: float) -> list:
    """Return the index labels of values strictly outside [lower_limit, upper_limit]."""
    outside = (series < lower_limit) | (series > upper_limit)
    return series.loc[outside].index.to_list()


def _quantile_fences(series: pd.Series, lower_q: float, upper_q: float) -> tuple:
    """Return (lower_limit, upper_limit): the two quantiles widened by 1.5x their spread."""
    lower_value = series.quantile(lower_q)
    upper_value = series.quantile(upper_q)
    spread = upper_value - lower_value
    return lower_value - 1.5 * spread, upper_value + 1.5 * spread


def drop_outliers(df: pd.DataFrame, columns: list, method: str, percentile: tuple = None):
    """
    Drop outlier rows from a dataframe.

    A row is an outlier when, for any of the checked columns, its value lies
    strictly below the lower limit or strictly above the upper limit computed
    for that column. Limits depend on ``method``:

    * ``"z_score"``: mean - 3 * std and mean + 3 * std.
    * ``"iqr"``: the 0.05 and 0.95 quantiles, widened on each side by 1.5x
      the distance between them. Note that this uses the 5th/95th
      percentiles, not the 25th/75th quartiles.
    * ``"percentile"``: same as ``"iqr"`` but with the two quantiles taken
      from ``percentile``.

    Rows are removed by index label, so every row sharing a label with an
    outlier is removed too. The input dataframe is not modified.

    Args:
        df (pd.DataFrame): dataframe to drop outliers from
        columns (list): list of columns to check
        method (str): method to use: "z_score", "iqr" or "percentile"
        percentile (tuple, optional): two quantiles in [0, 1], e.g.
            ``(0.02, 0.98)``. Required by the "percentile" method, ignored
            otherwise. The pair is sorted before use, so the order in which
            the two values are supplied does not matter.

    Returns:
        pd.DataFrame: dataframe with outliers dropped

    Raises:
        ValueError: if ``method`` is not recognized, or if the "percentile"
            method is used without exactly two quantiles.
    """
    # Resolve the per-column limit computation first, so that an invalid
    # method or percentile is reported even when `columns` is empty.
    if method == "z_score":

        def limits(series):
            mean, std = series.mean(), series.std()
            return mean - 3 * std, mean + 3 * std

    elif method == "iqr":

        def limits(series):
            return _quantile_fences(series, 0.05, 0.95)

    elif method == "percentile":
        if percentile is None:
            raise ValueError("Percentile must be filled when using percentile method.")
        # Sorting keeps the limits meaningful for reversed or unordered input;
        # a reversed pair would otherwise give a negative spread and mark
        # every row as an outlier.
        bounds = sorted(percentile)
        if len(bounds) != 2:
            raise ValueError("Percentile must contain exactly two quantiles: (lower, upper).")
        lower_q, upper_q = bounds

        def limits(series):
            return _quantile_fences(series, lower_q, upper_q)

    else:
        raise ValueError("Method unrecognized.")

    index_outliers = []
    for col in columns:
        lower_limit, upper_limit = limits(df[col])
        index_outliers.extend(_outlier_labels(df[col], lower_limit, upper_limit))

    return df.drop(index=index_outliers)


# Despite its name, `df_outliers` holds the rows that REMAIN after outliers in
# "count" are dropped, using the 0.02 and 0.98 quantiles as the base limits.
df_outliers = drop_outliers(df, ["count"], "percentile", (0.02, 0.98))

In [65]:
# Create percentage
# Share of all remaining word occurrences contributed by each word.
total_count = df_outliers["count"].sum()
df_outliers["percentage"] = df_outliers["count"] / total_count

# Create the word occurrence bin
# Split the rows into ten equal-sized groups by index label.
decile_labels = ["10%", "20%", "30%", "40%", "50%", "60%", "70%", "80%", "90%", "100%"]
df_outliers["bin"] = pd.qcut(df_outliers.index + 1, q=10, labels=decile_labels)

# Group by bin
df_grouped = df_outliers.groupby("bin", observed=True)[["percentage"]].sum()
# Create cumsum
df_grouped = df_grouped.assign(percentage_cumsum=df_grouped["percentage"].cumsum())

df_grouped

Unnamed: 0_level_0,percentage,percentage_cumsum
bin,Unnamed: 1_level_1,Unnamed: 2_level_1
10%,0.7374,0.7374
20%,0.099158,0.836558
30%,0.048158,0.884716
40%,0.030366,0.915082
50%,0.023369,0.938451
60%,0.014812,0.953262
70%,0.011684,0.964947
80%,0.011684,0.976631
90%,0.011684,0.988316
100%,0.011684,1.0


In [69]:
# See the top words
# Label every row of the full word table with one of five equal-sized groups
# by index label.
quintile_labels = ["20%", "40%", "60%", "80%", "100%"]
df["bin"] = pd.qcut(df.index + 1, q=5, labels=quintile_labels)
df.head(10)

Unnamed: 0,word,count,bin
0,also,97050,20%
1,state,77272,20%
2,people,75698,20%
3,first,70500,20%
4,one,68676,20%
5,born,66575,20%
6,city,54822,20%
7,new,53674,20%
8,united,52519,20%
9,year,51806,20%


In [72]:
# Number of words that fall into each bin
df["bin"].value_counts()

bin
20%     95544
60%     95544
100%    95544
40%     95543
80%     95543
Name: count, dtype: int64