In [None]:
import pandas as pd
import numpy as np
import regex as re
import string
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import time
import random

import nltk
nltk.download('omw-1.4')
from nltk.corpus import wordnet as wn
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [None]:
# Mount Google Drive at /content/drive so the CSV files referenced below are
# reachable. This import only exists inside a Google Colab runtime.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Load the labelled corpus. usecols=[1,2] keeps only the 2nd and 3rd CSV columns
# (displayed as `text` and `Label` in the preview below);
# encoding_errors='ignore' skips bytes that cannot be decoded instead of raising.
df = pd.read_csv('/content/drive/MyDrive/Train/Uncleaned_Data/labelled.csv', encoding_errors = 'ignore', usecols=[1,2])

In [None]:
# Preview the first five rows of the loaded frame.
df.head(5)

Unnamed: 0,text,Label
0,"""He then went on to tell me about what the US ...",0.0
1,"Im crying, but I know you dont need my tears. ...",1.0
2,Most successful first-world countries have alm...,1.0
3,Look at Scandinavian countries too. Many have ...,-1.0
4,what an incredibly stale and ignorant take on ...,1.0


# Preprocessing

In [None]:
def mention_hashtags(text):
    """Delete @mentions and #hashtags from ``text``.

    A token is the marker character followed by one or more ASCII letters,
    digits or underscores. The whole token is removed and nothing is inserted
    in its place. Mentions are processed before hashtags.
    """
    for marker in ("@", "#"):
        text = re.sub(marker + "[A-Za-z0-9_]+", "", text)
    return text
    
def clean_text(text):
    """Reduce ``text`` to lowercase ASCII letters and spaces.

    Steps, in order:
      1. coerce the input to ``str``;
      2. replace URL-like tokens (anything starting with ``http`` up to the
         next whitespace) and ``pic.twitter...`` tokens with a space;
      3. replace every run of characters other than ASCII letters and spaces
         with a single space;
      4. drop a leading ``RT`` retweet marker;
      5. lowercase.

    The URL patterns must run before step 3. Once punctuation has been turned
    into spaces, ``https://a.b/c`` has already become ``https a b c`` and only
    the bare scheme can still be matched, leaving the host and path behind as
    junk words.

    Returns the cleaned string. Whitespace is not collapsed.
    """
    text = str(text)
    # Link removal happens while ':' '/' '.' are still present.
    # `http\S+` already consumes every `https?://...` match, so no separate
    # scheme-specific pattern is needed.
    text = re.sub(r'http\S+', ' ', text)
    # Dot escaped so that only the literal host "pic.twitter..." is matched.
    text = re.sub(r'pic\.twitter\S+', ' ', text)
    # Letters-only filter. This also turns '#' into a space, which is what the
    # original pipeline effectively produced for hashtags.
    text = re.sub(r'[^a-zA-Z ]+', ' ', text)
    text = re.sub(r'^RT[\s]+', '', text)
    return text.lower()

def clean_urls(review):
    """Drop whitespace-delimited tokens that begin with ``http``.

    The text is split on any whitespace and the surviving tokens are re-joined
    with single spaces, so runs of whitespace and newlines are collapsed as a
    side effect.
    """
    kept = []
    for token in review.split():
        if token.startswith('http'):
            continue
        kept.append(token)
    return ' '.join(kept)

def decontracted(text):
    """Expand common English contractions and drop curly quotes / ellipsis.

    Every rule accepts both the ASCII apostrophe (') and the typographic one
    (U+2019), so the two spellings always produce the same result.

    Irregular forms ("won't", "can't", "don't", "it's") are handled before the
    generic suffix rules. Otherwise the generic "n't" rule would turn "can't"
    into "ca not" and "won't" into "wo not". The first letter of an irregular
    form is matched in either case and preserved ("Won't" -> "Will not").

    Finally the characters “ ” … are deleted.

    Returns the expanded string.
    """
    apos = "['’]"

    # (pattern, replacement) pairs. Group 1 carries the original first letter
    # so sentence-initial capitals survive.
    irregular = [
        ("([Ww])on" + apos + "t", r"\1ill not"),
        ("([Cc])an" + apos + "t", r"\1an not"),
        ("([Dd])on" + apos + "t", r"\1o not"),
        ("([Ii])t" + apos + "s", r"\1t is"),
    ]
    # Generic suffixes. "n't" must precede the bare "'t" rule.
    suffixes = [
        ("n" + apos + "t", " not"),
        (apos + "re", " are"),
        (apos + "d", " would"),
        (apos + "ll", " will"),
        (apos + "t", " not"),
        (apos + "ve", " have"),
        (apos + "m", " am"),
    ]

    for pattern, replacement in irregular + suffixes:
        text = re.sub(pattern, replacement, text)

    # Typographic quotes and the single-character ellipsis carry no content.
    return re.sub("[“”…]", "", text)

import html
def remove_html_punc(text):
    """Decode HTML entities, delete ASCII punctuation, tidy whitespace, lowercase.

    Punctuation characters (``string.punctuation``) are deleted rather than
    replaced, so words joined by punctuation are fused ("a/b" -> "ab").
    Whitespace runs are collapsed to single spaces and leading/trailing
    whitespace is dropped.
    """
    decoded = html.unescape(text)
    # Map every punctuation character to None, i.e. delete it.
    strip_punct = str.maketrans(dict.fromkeys(string.punctuation))
    without_punct = decoded.translate(strip_punct)
    return ' '.join(without_punct.split()).lower()

## Example of Cleaned Text

In [None]:
# Raw text of the row labelled 6, shown for comparison with its cleaned version below.
df['text'][6]

'&gt;there are rifles capable of murdering a dozen people in 5 minutes\n\nMy brother in christ there are *knives* capable of murdering a dozen people in under a minute. Fun fact: pistols have extended/double stack mags.  You can get a 19 round mag for a glock for $30, and if we pick a very conservative time for drawing on a close target (10 seconds), it could be emptied by somebody who\'s trained and calm, firing well-aimed shots, in about 3 minutes. \n\n&gt;Handgun deaths are usually related to crime and poverty\n\nAu contraire, mon ami. For decades, most firearm deaths (the overwhelming majority of which involving handguns) [are suicides](https://usafacts.org/data/topics/security-safety/crime-and-justice/firearms/firearm-deaths/), which indicates mental health issues, not criminality. Even in cases identified as homicide, common motivations such as intimate partner violence or personal vendettas are, again, indicators of insufficient mental health care, not necessarily criminality or

In [None]:
# Decode HTML entities before anything else. Numeric entities such as "&#39;"
# contain a '#', so running mention_hashtags on the raw text strips "#39" and the
# apostrophe can no longer be decoded or expanded by decontracted.
# Remaining order: drop @mentions/#hashtags -> drop URL tokens -> expand
# contractions -> strip punctuation and lowercase -> keep letters only.
df['Cleaned_Text'] = (
    df['text']
    .apply(html.unescape)
    .apply(mention_hashtags)
    .apply(clean_urls)
    .apply(decontracted)
    .apply(remove_html_punc)
    .apply(clean_text)
)

In [None]:
# Same row after the full cleaning pipeline.
df['Cleaned_Text'][6]

'there are rifles capable of murdering a dozen people in   minutes my brother in christ there are knives capable of murdering a dozen people in under a minute fun fact pistols have extendeddouble stack mags you can get a   round mag for a glock for   and if we pick a very conservative time for drawing on a close target   seconds it could be emptied by somebody whos trained and calm firing wellaimed shots in about   minutes handgun deaths are usually related to crime and poverty au contraire mon ami for decades most firearm deaths the overwhelming majority of which involving handguns are suicides  which indicates mental health issues not criminality even in cases identified as homicide common motivations such as intimate partner violence or personal vendettas are again indicators of insufficient mental health care not necessarily criminality or poverty gangrelated violence is massively overblown by popular media and orders of magnitude more black men die serving life in prison than in t

# Text Normalization

In [None]:
# Fetch the NLTK data packages needed by the normalization cells below
# (presumably: tokenizer models for word_tokenize, the stopword lists, and WordNet
# for the lemmatizer). Already-installed packages are reported as up to date.
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

## Stemming & Lemmatization Functions

In [None]:
def stem_text(x):
    """Tokenize ``x`` and return the Porter stems, each followed by one space.

    The result therefore ends with a trailing space whenever ``x`` contains at
    least one token, and is the empty string otherwise.
    """
    porter = PorterStemmer()
    pieces = [porter.stem(token) + ' ' for token in word_tokenize(x)]
    return ''.join(pieces)

def lemmatize_text(corpus):
    """Lemmatize every whitespace-separated word of every document in ``corpus``.

    ``corpus`` is any iterable of strings. The return value is a plain list
    with one space-joined string per input document, in input order.
    """
    lemmatizer = WordNetLemmatizer()
    lemmatized = []
    for tweet in corpus:
        lemmas = map(lemmatizer.lemmatize, tweet.split())
        lemmatized.append(' '.join(lemmas))
    return lemmatized


In [None]:
# Extra tokens treated as stopwords on top of NLTK's English list:
# contraction fragments plus quote, ellipsis and dash marks.
additional_stopwords = ["'s","...","'ve","``","''","'m",'--',"'ll","'d"]
# A set gives constant-time membership checks in remove_stopwords.
stop = set(stopwords.words('english')).union(additional_stopwords)

def remove_stopwords(x):
    """Tokenize ``x`` and drop every token found in the module-level ``stop`` set.

    Surviving tokens are concatenated, each followed by a single space, so a
    non-empty result always ends with a trailing space.
    """
    kept = [token + ' ' for token in word_tokenize(x) if token not in stop]
    return ''.join(kept)

### Remove **stopwords** from Cleaned_Text, then apply the stemming and lemmatization functions


In [None]:
# Stopword removal tokenizes every row, so run it once and feed the same result
# to both normalizers instead of recomputing it for each column.
cleaned_no_stopwords = df['Cleaned_Text'].apply(remove_stopwords)

# lemmatize_text returns a plain list, assigned to the frame by position.
df['Lemmatized_Text'] = lemmatize_text(cleaned_no_stopwords)
df['Stemmed_Text'] = cleaned_no_stopwords.apply(stem_text)

In [None]:
# Reassign rather than rename in place, so the step reads as an explicit
# transformation of the frame. Later cells rely on df carrying 'Original_Text'.
df = df.rename(columns={'text': 'Original_Text'})

# Fix the column order of the exported file.
new_df = df[['Original_Text', 'Cleaned_Text', 'Lemmatized_Text', 'Stemmed_Text', 'Label']]

# Use the same absolute mount point as the input path: a './drive/...' path only
# resolves when the working directory happens to be /content.
new_df.to_csv('/content/drive/MyDrive/Train/Cleaned_Data/cleaned_labelled.csv', index=False)

In [None]:
print("Text Corpus Information for Labelled data:")
print()

# One (column, message template) pair per pipeline stage, reported in order.
corpus_reports = [
    ('Original_Text', "There are {} words in the corpus."),
    ('Cleaned_Text', "There are {} words in the corpus after cleaning."),
    ('Stemmed_Text', "There are {} words in the corpus after stemming and removal of stopwords."),
    ('Lemmatized_Text', "There are {} words in the corpus after lemmatization and removal of stopwords."),
]

for column, template in corpus_reports:
    # Word count = number of whitespace-separated tokens summed over all rows.
    wordcount = df[column].apply(lambda x: len(x.split())).sum()
    print(template.format(wordcount))



Text Corpus Information for Labelled data:

There are 244375 words in the corpus.
There are 242243 words in the corpus after cleaning.
There are 126856 words in the corpus after stemming and removal of stopwords.
There are 126856 words in the corpus after lemmatization and removal of stopwords.


In [None]:
# Frequency of every whitespace-separated token across all stemmed texts:
# split each row into columns, stack them into one long Series, then count.
df.Stemmed_Text.str.split(expand=True).stack().value_counts()

gun        3926
peopl      1553
violenc    1512
america    1369
like        939
           ... 
forearm       1
ensnar        1
bothsu        1
humili        1
hush          1
Length: 10119, dtype: int64

In [None]:
# Same token-frequency table for the lemmatized texts.
df.Lemmatized_Text.str.split(expand=True).stack().value_counts()

gun               3920
people            1553
violence          1511
america           1369
like               818
                  ... 
postcapitalist       1
widescale            1
roleplay             1
detector             1
dragged              1
Length: 13324, dtype: int64