### IS4242 Group 8 Project

### Data Pre-processing

In [112]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import nltk
import re
import string
import time
import datetime 

In [113]:
# Constants

# Label columns every prepared frame exposes; they follow "body" in this order.
TARGET_VARS = [
    "hate",
    "privacy",
    "sexual",
    "impersonation",
    "illegal",
    "advertisement",
    "ai",
]

### Loading Datasets

In [114]:
# Reddit Tagged Content
# T001 is read with pandas' default index; T002 and T003 take their first column as the index.
T001 = pd.read_excel("data/reddit/T001.xlsx")
T002, T003 = [
    pd.read_excel(path, index_col=0)
    for path in ("data/reddit/T002.xlsx", "data/reddit/T003.xlsx")
]

# AI Tagged Content
# Read without a header row, so the text column is named 0 until it is renamed later.
A001, A002, A003 = [
    pd.read_excel(path, header=None)
    for path in ("data/ai/A001.xlsx", "data/ai/A002.xlsx", "data/ai/A003.xlsx")
]

# Additional Data
E001 = pd.read_csv("data/additional/E001.csv", index_col=0)

In [115]:
# Preview the first rows of each Reddit frame, one rendered table per frame.
display(T001.head(), T002.head(), T003.head())


Unnamed: 0,topic,thread,author,timestamp,body,hate,privacy,sexual,impersonation,illegal,advertisement
0,career,Career,kalleron,1679710497,Design refuse collection and recycling sites. ...,0,0,0,0,0,0
1,career,Career,Hot-Meringue2880,1679678508,Hi I am an Economics graduate with 2+ years of...,0,0,0,0,0,0
2,career,Career,milenakowalska,1679673888,"Hey, in about a year I’ll finish my bachelor d...",0,0,0,0,0,0
3,career,Career,Rstonerphd,1679676533,I am studying what makes employees want to lea...,0,0,0,0,0,1
4,career,Career,Temporary-Section-50,1679611586,Hi all! I accepted a job offer earlier this mo...,0,0,0,0,0,0


Unnamed: 0,topic,thread,author,timestamp,body,hate,privacy,sexual,impersonation,illegal,advertisement
0,School,School,PrimaryColt,1628458147,"You will be banned. \n\nThat’s the end of it,\...",0,0,0,0,0,0
1,School,School,Quirky_Emu_781,1632795360,good mod,0,0,0,0,0,0
2,School,School,Magical_UnicornCat,1644244704,So grades count? I understand not posting your...,0,0,0,0,0,0
3,School,School,Affectionate-Yeet,1644284600,Ok,0,0,0,0,0,0
4,School,School,sammywriter,1653912733,Can you post you assist students with assignme...,0,0,0,0,0,0


Unnamed: 0,topic,thread,author,timestamp,body,hate,privacy,sexual,impersonation,illegal,advertisement
0,music,r/Music,DANNY_PROPERTY,1670004290,I’m Awsten from the band Waterparks. We just a...,0,0,0,0,0,0
1,music,r/Music,crazy-cat67,1670006076,"Hypothetically, if fob8 were to drop the same ...",0,0,0,0,0,0
2,music,r/Music,piercetheangelina,1670007207,Waterparks merch mystery boxes would be cool! ...,0,0,0,0,0,0
3,music,r/Music,Danteb132,1670005754,What is the greatest accomplishment you’ve eve...,0,0,0,0,0,0
4,music,r/Music,briannalvnn,1670006030,thoughts on a waterparks holiday album? 🎤,0,0,0,0,0,0


In [116]:
# Preview the first rows of each AI frame, one rendered table per frame.
display(A001.head(), A002.head(), A003.head())

Unnamed: 0,0
0,Just got back from an amazing vacation in Hawa...
1,Can't believe it's already been a year since I...
2,Just finished a great workout at the gym. Feel...
3,Tried a new recipe for dinner tonight and it t...
4,Watching the sunset with my significant other....


Unnamed: 0,0
0,"""OMG, can't believe she's wearing that outfit...."
1,"""Ugh, this meeting is taking forever. Can't wa..."
2,"""Why are you still single? Maybe if you weren'..."
3,"""I hate this class and the professor is so bor..."
4,"""Why are you posting about your boring life? N..."


Unnamed: 0,0
0,"""Just started a new job and feeling overwhelme..."
1,"""Finally finished reading 'The Great Gatsby' a..."
2,"""I'm turning 30 next week and feeling a little..."
3,"""I've been trying to eat healthier but struggl..."
4,"""Feeling a little burned out from social media..."


In [117]:
# Preview the first rows of the additional dataset.
display(E001.head())

Unnamed: 0,count,hate_speech,offensive_language,neither,class,tweet
0,3,0,0,3,2,!!! RT @mayasolovely: As a woman you shouldn't...
1,3,0,3,0,1,!!!!! RT @mleew17: boy dats cold...tyga dwn ba...
2,3,0,3,0,1,!!!!!!! RT @UrKindOfBrand Dawg!!!! RT @80sbaby...
3,3,0,2,1,1,!!!!!!!!! RT @C_G_Anderson: @viva_based she lo...
4,6,0,6,0,1,!!!!!!!!!!!!! RT @ShenikaRoberts: The shit you...


### Combining Datasets

In [118]:
def populate_target_cols(df, target_vars):
    """Ensure ``df`` has a column for every name in ``target_vars``.

    Columns that are absent are created and filled with 0; columns that
    already exist are left untouched. The frame is modified in place and the
    same object is returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to extend.
    target_vars : iterable of str
        Label column names that must be present.

    Returns
    -------
    pandas.DataFrame
        The ``df`` object that was passed in.
    """
    absent = [name for name in target_vars if name not in df.columns]
    for name in absent:
        df[name] = 0
    return df

In [119]:
# Reddit Content

def _prepare_reddit_frame(frame):
    """Add missing label columns, fold the metadata into ``body``, keep body + labels."""
    # Populate target columns
    frame = populate_target_cols(frame, TARGET_VARS)
    # Concatenating columns: topic, thread and author are prepended to the post text.
    frame["body"] = frame["topic"] + " " + frame["thread"] + " " + frame["author"] + " " + frame["body"]
    # Selecting relevant columns
    return frame[['body'] + TARGET_VARS]


T001 = _prepare_reddit_frame(T001)
T002 = _prepare_reddit_frame(T002)
T003 = _prepare_reddit_frame(T003)

In [120]:
# AI Content

# The AI sheets were read without a header, so the text sits in column 0; expose it as "body".
A001 = A001.rename(columns={0:"body"})
A002 = A002.rename(columns={0:"body"})
A003 = A003.rename(columns={0:"body"})

# Add the label columns these frames do not carry (each is filled with 0).
A001 = populate_target_cols(A001, TARGET_VARS)
A002 = populate_target_cols(A002, TARGET_VARS)
A003 = populate_target_cols(A003, TARGET_VARS)

# These frames hold the AI content, so flag every row. Without this the "ai"
# column keeps the 0 default from populate_target_cols for these rows as well.
A001["ai"] = 1
A002["ai"] = 1
A003["ai"] = 1


In [121]:
# Extra Content

E001 = E001.rename(columns={"tweet":"body"})
E001 = populate_target_cols(E001, TARGET_VARS)
# In the preview, class 2 lines up with a `neither` majority and class 1 with an
# `offensive_language` majority, which leaves class 0 for `hate_speech`
# (NOTE(review): confirm against the dataset's README).
# Flag both 0 and 1 so tweets annotated as hate speech are not labelled hate=0.
E001["hate"] = E001["class"].isin([0, 1]).astype("int64")
E001 = E001[['body'] + TARGET_VARS]

In [122]:
# Stack all prepared frames row-wise and renumber the index from 0.
df_combined_raw = pd.concat(
    objs=[T001, T002, T003, A001, A002, A003, E001],
    axis=0,
    ignore_index=True,
)
df_combined_raw

Unnamed: 0,body,hate,privacy,sexual,impersonation,illegal,advertisement,ai
0,career Career kalleron Design refuse collectio...,0,0,0,0,0,0,0
1,career Career Hot-Meringue2880 Hi I am an Econ...,0,0,0,0,0,0,0
2,"career Career milenakowalska Hey, in about a y...",0,0,0,0,0,0,0
3,career Career Rstonerphd I am studying what ma...,0,0,0,0,0,1,0
4,career Career Temporary-Section-50 Hi all! I a...,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...
41637,you's a muthaf***in lie &#8220;@LifeAsKing: @2...,1,0,0,0,0,0,0
41638,"you've gone and broke the wrong heart baby, an...",0,0,0,0,0,0,0
41639,young buck wanna eat!!.. dat nigguh like I ain...,1,0,0,0,0,0,0
41640,youu got wild bitches tellin you lies,1,0,0,0,0,0,0


In [123]:
# Snapshot the combined raw data; the file name carries today's date (YYYY-MM-DD).
df_combined_raw.to_csv(
    path_or_buf=f"data/df_combined_raw_{datetime.date.today().strftime('%Y-%m-%d')}.csv",
    index=False,
)

### Preprocessing Raw Text Data

In this section, we will perform the following steps, in this order:
1. Remove punctuation
2. Convert to lowercase
3. Remove non-alphanumeric characters
4. Remove stopwords
5. Remove extra spaces, new lines, and tabs
6. Lemmatize and stem the text
7. Remove words with length < 2

In [124]:
# Enforce text to string type
def enforce_text_to_string(text):
    """Return ``text`` unchanged when it is exactly a ``str``; otherwise return ``str(text)``."""
    return text if type(text) is str else str(text)

# Stopword lists already loaded in this session, keyed by cache file path.
_STOPWORDS_CACHE = {}


def load_stopwords(cache_path='cache_files/stopwords.txt'):
    """Return the English stopword list, using a text-file cache when available.

    The list is read from ``cache_path`` (one word per line). If that file is
    missing, unreadable or empty, the list is taken from NLTK's stopwords
    corpus (downloading it only if it is not installed) and written back to
    ``cache_path``, creating the parent directory when needed. Results are
    memoised per path so repeated calls do not re-read the file.

    Parameters
    ----------
    cache_path : str, optional
        Location of the cache file. Defaults to ``'cache_files/stopwords.txt'``.

    Returns
    -------
    list of str
        A fresh list on every call, so callers may modify it freely.
    """
    import os

    if cache_path in _STOPWORDS_CACHE:
        return list(_STOPWORDS_CACHE[cache_path])

    try:
        with open(cache_path, "r", encoding="utf-8") as word_list:
            # Skip blank lines so a trailing newline does not yield an empty "word".
            stopwords = [line.strip() for line in word_list if line.strip()]
    except OSError:
        stopwords = []

    if not stopwords:
        try:
            stopwords = list(nltk.corpus.stopwords.words('english'))
        except LookupError:
            # NLTK raises LookupError when the corpus is not installed locally.
            nltk.download('stopwords')
            stopwords = list(nltk.corpus.stopwords.words('english'))
        cache_dir = os.path.dirname(cache_path)
        if cache_dir:
            os.makedirs(cache_dir, exist_ok=True)
        with open(cache_path, "w", encoding="utf-8") as word_list:
            word_list.write("\n".join(stopwords))

    _STOPWORDS_CACHE[cache_path] = stopwords
    return list(stopwords)

# Removing punctuations

def remove_punctuations(text):
    """Delete every character listed in ``string.punctuation`` from ``text``."""
    deletion_table = str.maketrans('', '', string.punctuation)
    return text.translate(deletion_table)

# Converting to lowercase

def to_lowercase(text):
    """Return a lower-cased copy of ``text``."""
    lowered = text.lower()
    return lowered

# Removing non-alphanumeric characters

def remove_non_alphanumeric(text):
    """Replace each run of non-word characters with a single space.

    "Word" characters follow the regex word class, so letters, digits and the
    underscore are kept.
    """
    non_word_run = re.compile(r'\W+')
    return non_word_run.sub(' ', text)

# Removing stopwords

def remove_stopwords(text, stopwords=None):
    """Remove stopwords from whitespace-separated ``text``.

    Matching is exact and case-sensitive, so lower-case the text first if the
    stopword list is lower-case.

    Parameters
    ----------
    text : str
        Text to filter.
    stopwords : iterable of str, optional
        Words to drop. When omitted, the list from ``load_stopwords()`` is
        used. Passing a pre-loaded list avoids loading it again for every
        document.

    Returns
    -------
    str
        The remaining words joined by single spaces.
    """
    if stopwords is None:
        stopwords = load_stopwords()
    # A set gives constant-time membership checks instead of scanning the list per word.
    blocked = set(stopwords)
    return " ".join(word for word in text.split() if word not in blocked)

# Removing numbers

def remove_numbers(text):
    """Delete every run of digits from ``text``; surrounding characters are left as they are."""
    digit_run = re.compile(r'\d+')
    return digit_run.sub('', text)

# Removing words with length less than 2

def remove_words_with_length_less_than_2(text):
    """Keep only the whitespace-separated tokens that are at least two characters long."""
    return " ".join(token for token in text.split() if len(token) >= 2)

# Removing extra spaces

def remove_extra_spaces(text):
    """Collapse every whitespace run to one space and trim both ends."""
    tokens = text.split()
    return " ".join(tokens)

# Removing extra newlines

def remove_extra_newlines(text):
    """Collapse each run of consecutive newline characters into a single newline.

    Text without repeated newlines is returned unchanged.
    """
    return re.sub(r'\n+', '\n', text)

# Removing extra tabs

def remove_extra_tabs(text):
    """Collapse each run of consecutive tab characters into a single tab.

    Text without repeated tabs is returned unchanged.
    """
    return re.sub(r'\t+', '\t', text)

# Removing punctuations

def remove_punctuations(text):
    """Drop every character found in ``string.punctuation`` from ``text``.

    NOTE(review): this is the second definition of ``remove_punctuations`` in
    this cell; being the later one, it is the binding in effect once the cell
    has run.
    """
    punctuation_chars = set(string.punctuation)
    return "".join(char for char in text if char not in punctuation_chars)

# Lemmatizing text

def lemmatize_text(text):
    """Lemmatize every whitespace-separated word of ``text`` with NLTK's WordNet lemmatizer.

    If the WordNet corpus is not installed, it is downloaded once and the
    lemmatization is retried.

    Parameters
    ----------
    text : str
        Text to lemmatize.

    Returns
    -------
    str
        The lemmatized words joined by single spaces.
    """
    lemmatizer = nltk.stem.WordNetLemmatizer()
    words = text.split()
    try:
        lemmas = [lemmatizer.lemmatize(word) for word in words]
    except LookupError:
        # The corpus is loaded lazily, so a missing WordNet install surfaces on the
        # first lemmatize() call rather than when the lemmatizer is constructed.
        nltk.download('wordnet')
        lemmas = [lemmatizer.lemmatize(word) for word in words]
    return " ".join(lemmas)

# Stemming text

def stem_text(text):
    """Apply NLTK's Porter stemmer to every whitespace-separated word of ``text``.

    Returns the stemmed words joined by single spaces.
    """
    porter = nltk.stem.PorterStemmer()
    return " ".join(porter.stem(token) for token in text.split())

# Text Preprocessing Pipeline

def preprocess_text(text, 
                    _remove_punctuations=True, 
                    _to_lowercase=True, 
                    _remove_non_alphanumeric=True, 
                    _remove_stopwords=True, 
                    _remove_extra_spaces=True, 
                    _remove_extra_newlines=True, 
                    _remove_extra_tabs=True, 
                    _lemmatize_text=True, 
                    _stem_text=True,
                    _remove_words_with_length_less_than_2=True):
    """Run the text-cleaning pipeline on a single value.

    The input is first coerced to ``str``. Each cleaning step then runs, in
    the fixed order listed below, only when its flag is true; the output of
    one step is the input of the next.

    Parameters
    ----------
    text : object
        Value to clean; non-strings are converted with ``str``.
    _remove_punctuations, _to_lowercase, _remove_non_alphanumeric,
    _remove_stopwords, _remove_extra_spaces, _remove_extra_newlines,
    _remove_extra_tabs, _lemmatize_text, _stem_text,
    _remove_words_with_length_less_than_2 : bool, optional
        Switch for the step of the same name (without the leading underscore).
        All default to ``True``.

    Returns
    -------
    str
        The cleaned text.
    """
    # (enabled?, step) pairs in execution order.
    pipeline = (
        (_remove_punctuations, remove_punctuations),
        (_to_lowercase, to_lowercase),
        (_remove_non_alphanumeric, remove_non_alphanumeric),
        (_remove_stopwords, remove_stopwords),
        (_remove_extra_spaces, remove_extra_spaces),
        (_remove_extra_newlines, remove_extra_newlines),
        (_remove_extra_tabs, remove_extra_tabs),
        (_lemmatize_text, lemmatize_text),
        (_stem_text, stem_text),
        (_remove_words_with_length_less_than_2, remove_words_with_length_less_than_2),
    )

    cleaned = enforce_text_to_string(text)
    for enabled, step in pipeline:
        if enabled:
            cleaned = step(cleaned)
    return cleaned

In [125]:
# Preprocessing text
time_start = time.time()
# Drop rows without text before cleaning instead of matching the string "nan" afterwards,
# so a post whose cleaned text happens to be "nan" is not discarded by accident.
df_combined_processed = df_combined_raw.dropna(subset=["body"]).copy()
df_combined_processed["body"] = df_combined_processed["body"].apply(preprocess_text)
# Cleaning can reduce a post to an empty string (e.g. only punctuation or stopwords).
# An empty field is read back from CSV as NaN by default, so drop such rows here.
df_combined_processed = df_combined_processed[df_combined_processed["body"] != ""]
df_combined_processed = df_combined_processed.dropna()
df_combined_processed = df_combined_processed.reset_index(drop=True)
df_combined_processed.to_csv(f"data/df_combined_processed_{datetime.date.today().strftime('%Y-%m-%d')}.csv", index=False)
print(f"Time taken to preprocess text: {round(time.time() - time_start, 2)} seconds")

Time taken to preprocess text: 41.59 seconds


In [126]:
# Show the cleaned dataset (the rendered table elides the middle rows of a long frame).
display(df_combined_processed)

Unnamed: 0,body,hate,privacy,sexual,impersonation,illegal,advertisement,ai
0,career career kalleron design refus collect re...,0,0,0,0,0,0,0
1,career career hotmeringue2880 hi econom gradua...,0,0,0,0,0,0,0
2,career career milenakowalska hey year finish b...,0,0,0,0,0,0,0
3,career career rstonerphd studi make employe wa...,0,0,0,0,0,1,0
4,career career temporarysection50 hi accept job...,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...
41198,you muthafin lie 8220lifeask 20pearl coreyeman...,1,0,0,0,0,0,0
41199,youv gone broke wrong heart babi drove redneck...,0,0,0,0,0,0,0
41200,young buck wanna eat dat nigguh like aint fuck...,1,0,0,0,0,0,0
41201,youu got wild bitch tellin lie,1,0,0,0,0,0,0
