## Notebook to clean and save raw data

In [None]:
import spacy
import pandas as pd
import time
from math import ceil

In [None]:
# create directories for data
!mkdir -p data/clean
!mkdir -p data/interim/tfidfs
!mkdir -p data/interim/vectorizers
!mkdir -p data/interim/vocabs
!mkdir -p data/raw
!mkdir -p data/results/models

In [None]:
# Run once if the en_core_web_lg model is not installed yet:
# !python3 -m spacy download en_core_web_lg

In [None]:
# Load the large English spaCy model with the parser and NER components
# disabled; the preprocessing below only reads per-token attributes.
nlp = spacy.load(
    "en_core_web_lg",
    disable=["parser", "ner"],
)

In [None]:
#### PREPROCESSING FUNCTION

# apply preprocessing: stop-word removal, lemmatization
# will take a long time! experiment with small batches first.

def nlp_preprocess(df, *, nlp_model=None, log_every=10000):
    """Return a new frame with a stop-word-free, lemmatized text column.

    Every value of ``df["text"]`` is run through the pipeline. Tokens flagged
    as stop words, punctuation or whitespace are dropped, and the lemmas of
    the remaining tokens are joined with single spaces.

    The input frame is not modified: the new column is attached to a copy.

    Args:
        df: DataFrame with at least a "label" and a "text" column. The text
            values are handed to the pipeline as-is, so they should be
            strings.
        nlp_model: Object exposing ``pipe(texts)`` that yields iterables of
            tokens with ``lemma_``, ``is_stop``, ``is_punct`` and ``is_space``
            attributes. Defaults to the module-level ``nlp`` pipeline.
        log_every: Print a progress line after every ``log_every`` documents.
            Pass 0 or None to disable progress output.

    Returns:
        A new DataFrame holding the columns "label", "text_preproced" and
        "text" (in that order) with the same index as ``df``.
    """
    model = nlp if nlp_model is None else nlp_model

    docs = df["text"].tolist()
    total = len(docs)
    start_time = time.time()

    preprocessed_texts = []

    # pipe() streams the texts through the pipeline instead of processing
    # them one call at a time.
    for num_docs, doc in enumerate(model.pipe(docs), start=1):
        preprocessed_texts.append(
            " ".join(
                token.lemma_
                for token in doc
                if not (token.is_stop or token.is_punct or token.is_space)
            )
        )

        # timing for Google Colab tracking
        if log_every and num_docs % log_every == 0:
            curr_time = time.time()
            print(
                f"Cleaned: {num_docs}/{total} docs. Time elapsed: {((curr_time - start_time) / 60):.1f} min."
            )

    # assign() works on a copy, so the caller's frame keeps its original columns.
    result = df.assign(text_preproced=preprocessed_texts)
    return result[["label", "text_preproced", "text"]]

In [None]:
# Training split: read the raw workbook, preprocess it, write the cleaned copy.
load_path = "../data/raw/training_dataset.xlsx"
save_path = "../data/clean/training_data_cleaned.xlsx"

df = pd.read_excel(load_path)
print("Loaded dataset.")

df_clean = nlp_preprocess(df)
print("Cleaned dataset.")

df_clean.to_excel(save_path)
print("Saved dataset.")

In [None]:
# Validation split: read the raw workbook, preprocess it, write the cleaned copy.
load_path = "../data/raw/validation_dataset.xlsx"
save_path = "../data/clean/validation_data_cleaned.xlsx"

df = pd.read_excel(load_path)
print("Loaded dataset.")

df_clean = nlp_preprocess(df)
print("Cleaned dataset.")

df_clean.to_excel(save_path)
print("Saved dataset.")

In [None]:
# Test split: read the raw workbook, preprocess it, write the cleaned copy.
load_path = "../data/raw/test_dataset.xlsx"
save_path = "../data/clean/test_data_cleaned.xlsx"

df = pd.read_excel(load_path)
print("Loaded dataset.")

df_clean = nlp_preprocess(df)
print("Cleaned dataset.")

df_clean.to_excel(save_path)
print("Saved dataset.")

In [None]:
# Write a copy of every raw split without its "text" column
# (the label files are used in the test inference script).
data_types = ["test", "validation", "training"]

for data_type in data_types:
    load_path = f"../data/raw/{data_type}_dataset.xlsx"
    # drop() returns a new frame that keeps every column except "text"
    df = pd.read_excel(load_path).drop(columns=["text"])
    df.to_excel(f"../data/clean/{data_type}_labels.xlsx")