In [1]:
import yaml
import pandas as pd
from urllib.parse import unquote
import preprocessor as tweet_preprocessor

# Load the notebook settings from params.yaml into `config`; later cells read
# the "raw_dataset_path" and "processed_dataset_path" keys from it.
# The encoding is pinned so the file is decoded the same way on every
# platform instead of depending on the locale's default encoding.
with open("params.yaml", "r", encoding="utf-8") as stream:
    config = yaml.safe_load(stream)

In [2]:
def extract_match(text):
    """Collect the ``match`` attribute of every item in ``text``.

    Args:
        text: An iterable of objects exposing a ``match`` attribute, or
            ``None``.

    Returns:
        A list with one ``match`` value per item, in iteration order, or
        ``None`` when ``text`` is ``None``.
    """
    # Nothing to extract: hand the None straight back to the caller.
    if text is None:
        return text
    return [item.match for item in text]

In [3]:
def len_if_not_none(list):
    """Return ``len(list)``, treating ``None`` as an empty collection.

    Args:
        list: Any sized object, or ``None``. The parameter name shadows the
            ``list`` builtin inside this function; it is kept as-is so that
            keyword callers keep working.

    Returns:
        The length of ``list``, or ``0`` when ``list`` is ``None``.
    """
    if list is None:
        return 0
    return len(list)

In [4]:
def apply_row_by_row(df, column, function):
    """Apply ``function`` to every value of ``df[column]``.

    The column is mapped directly instead of going through
    ``df.apply(..., axis=1)``. That avoids building a full row Series for
    every record, keeps each value's original type (row-wise apply upcasts
    mixed numeric columns, e.g. ints become floats next to a float column),
    and always yields a Series, including for an empty frame.

    Args:
        df: The DataFrame holding the data.
        column: Label of the column whose values are passed to ``function``.
        function: Callable taking a single cell value and returning the
            transformed value.

    Returns:
        An unnamed Series aligned with ``df.index`` holding one result per
        row.

    Raises:
        KeyError: If ``column`` is not a column of ``df``.
    """
    result = df[column].map(function)
    # Row-wise apply returned an unnamed Series; keep that contract.
    result.name = None
    return result

In [5]:
# Read the raw dataset, derive the text/keyword feature columns and write the
# processed dataset. Both paths come from `config`.
(
    pd.read_csv(config["raw_dataset_path"], index_col=0)
    .drop_duplicates()
    .assign(text_cleaned=lambda df: apply_row_by_row(df, "text", tweet_preprocessor.clean))
    .assign(tweet_tokenized=lambda df: apply_row_by_row(df, "text", tweet_preprocessor.tokenize))
    # A previous step filled `text_hashtags` with a second tokenize pass that
    # was overwritten below; it has been removed. The output columns and
    # their order are unchanged.
    # `text_parsed` is a temporary column: the parse result's `hashtags`,
    # `mentions` and `urls` attributes feed the next three columns, and the
    # column itself is dropped before writing.
    .assign(text_parsed=lambda df: apply_row_by_row(df, "text", tweet_preprocessor.parse))
    .assign(text_hashtags=lambda df: apply_row_by_row(df, "text_parsed", lambda parsed: extract_match(parsed.hashtags)))
    .assign(text_mentions=lambda df: apply_row_by_row(df, "text_parsed", lambda parsed: extract_match(parsed.mentions)))
    # `text_urls` holds the number of URLs (0 when there are none), not the URLs.
    .assign(text_urls=lambda df: apply_row_by_row(df, "text_parsed", lambda parsed: len_if_not_none(parsed.urls)))
    # Percent-decode the keyword; missing keywords stay missing.
    .assign(keyword_cleaned=lambda df: apply_row_by_row(df, "keyword", lambda keyword: unquote(keyword) if pd.notnull(keyword) else None))
    .drop(columns=["keyword", "text", "text_parsed"])
    .reset_index(drop=True)
    .to_csv(config["processed_dataset_path"], index=False)
)