# Thesis 2020-2021: N-Grams model 

In this notebook, we will create an N-Grams model.

In [85]:
import pandas as pd
import numpy as np
import math

import matplotlib
import matplotlib.pyplot as plt

In [86]:
import csv
    
df_train = pd.read_csv('data/hateval2019_en_train.csv')
df_dev = pd.read_csv('data/hateval2019_en_dev.csv')

df_train_dev = df_train.append(df_dev, ignore_index=True)
df_train_dev = df_train_dev.drop(['TR', 'AG'], axis=1)
df_train_dev

Unnamed: 0,id,text,HS
0,201,"Hurray, saving us $$$ in so many ways @potus @...",1
1,202,Why would young fighting age men be the vast m...,1
2,203,@KamalaHarris Illegals Dump their Kids at the ...,1
3,204,NY Times: 'Nearly All White' States Pose 'an A...,0
4,205,Orban in Brussels: European leaders are ignori...,0
...,...,...,...
9995,19196,@SamEnvers you unfollowed me? Fuck you pussy,0
9996,19197,@DanReynolds STFU BITCH! AND YOU GO MAKE SOME ...,1
9997,19198,"@2beornotbeing Honey, as a fellow white chick,...",0
9998,19199,I hate bitches who talk about niggaz with kids...,1


In [87]:
df_test = pd.read_csv('data/hateval2019_en_test.csv')
df_test = df_test.drop(['TR', 'AG'], axis=1)
df_test

Unnamed: 0,id,text,HS
0,34243,"@local1025 @njdotcom @GovMurphy Oh, I could ha...",0
1,30593,Several of the wild fires in #california and #...,0
2,31427,@JudicialWatch My question is how do you reset...,0
3,31694,"#Europe, you've got a problem! We must hurry...",1
4,31865,This is outrageous! #StopIllegalImmigration #...,1
...,...,...,...
2995,31368,you can never take a L off a real bitchðŸ˜© im ho...,1
2996,30104,@Brian_202 likes to call me a cunt & a bitch b...,1
2997,31912,@kusha1a @Camio_the_wise @shoe0nhead 1. Never ...,1
2998,31000,If i see and know you a hoe why would i hit yo...,1


In [88]:
import re
from pattern.text.en import singularize
from nltk.tokenize import TweetTokenizer
from nltk.corpus import stopwords

tokenizer = TweetTokenizer()
stop_words = set(stopwords.words('english'))

# Create a function to clean the tweets
def cleanTxt(text):
    text = text.lower() # Convert everything to lower case
    text = re.sub(r'@[a-zA-Z0-9]+', '', text) # Remove @mentions
    text = re.sub(r'rt[\s]+', '', text) # Remove RT (retweet symbol)
    text = re.sub(r'&amp;', 'and', text) # Replace '&amp;' by 'and'
    text = re.sub(r'https?:\/\/\S+', '', text) # Remove hyper link  
    #text = re.sub(r'\d+', '0', text) # Replace all numbers by a zero
    text = " ".join([singularize(word) for word in tokenizer.tokenize(text) if word not in stop_words]) # Remove stopwords
    #text = " ".join([singularize(word) for word in text])
    text = re.sub(r'[^\w\s#]', ' ', text) # Remove all non-alphanumeric symbols (excluding whitespace and # characters)
    text = re.sub(r'\s+', ' ', text) # Replace multiple whitespaces by a single whitespace
    text = text.strip() # Remove whitespaces at the beginning and at the end
    
    return text

In [89]:
# Clean the data

df_train_dev['text_cleaned'] = df_train_dev['text'].apply(cleanTxt)
df_test['text_cleaned'] = df_test['text'].apply(cleanTxt)
df_train_dev

Unnamed: 0,id,text,HS,text_cleaned
0,201,"Hurray, saving us $$$ in so many ways @potus @...",1,hurray saving u many way #lockthemup #buildthe...
1,202,Why would young fighting age men be the vast m...,1,would young fighting age man vast majority one...
2,203,@KamalaHarris Illegals Dump their Kids at the ...,1,illegal dump kid border like road kill refuse ...
3,204,NY Times: 'Nearly All White' States Pose 'an A...,0,ny time s nearly white s state pose s array pr...
4,205,Orban in Brussels: European leaders are ignori...,0,orban brussel european leader ignoring person ...
...,...,...,...,...
9995,19196,@SamEnvers you unfollowed me? Fuck you pussy,0,unfollowed fuck pussy
9996,19197,@DanReynolds STFU BITCH! AND YOU GO MAKE SOME ...,1,stfu bitch go make satanic music u illuminatus...
9997,19198,"@2beornotbeing Honey, as a fellow white chick,...",0,honey fellow white chick let tell need shut fu...
9998,19199,I hate bitches who talk about niggaz with kids...,1,hate bitch talk niggaz kid everybody cant find...


In [96]:
X_train = df_train_dev.text_cleaned
X_test = df_test.text_cleaned
y_train = df_train_dev.HS
y_test = df_test.HS

In [97]:
from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer(ngram_range=(2,2)) # Convert a collection of text documents to a matrix of token counts
X_train = vectorizer.fit_transform(X_train)
X_test = vectorizer.transform(X_test)

In [98]:
print(vectorizer.get_feature_names()[4000:4050])
X_train.toarray()
#print(X_train.shape, y_train.shape, X_test.shape, y_test.shape)

['almost like', 'almost nothing', 'almost seriously', 'almost three', 'aloe banged', 'alon damn', 'alone 10', 'alone according', 'alone aka', 'alone animated', 'alone annoying', 'alone benefit', 'alone broad', 'alone cannot', 'alone capital', 'alone cros', 'alone cunt', 'alone done', 'alone dont', 'alone end', 'alone forever', 'alone good', 'alone got', 'alone hangtown', 'alone italian', 'alone leave', 'alone loser', 'alone man', 'alone mind', 'alone nobody', 'alone pay', 'alone recount', 'alone rubbing', 'alone ur', 'alone we', 'alone whore', 'along artist', 'along border', 'along condition', 'along entire', 'along first', 'along foium', 'along gife', 'along growing', 'along lyric', 'along man', 'along melanium', 'along poor', 'along porou', 'along really']


MemoryError: Unable to allocate 6.49 GiB for an array with shape (10000, 87110) and data type int64

In [46]:
# Normalize the data via StandardScaler

from sklearn import preprocessing
scaler = preprocessing.StandardScaler().fit(X_train.toarray())
X_train_scaled = scaler.transform(X_train.toarray())
X_test_scaled = scaler.transform(X_test.toarray())

#clf_scaled = LogisticRegression().fit(X_train_scaled, y_train)
#y_scaled_predict = clf_scaled.predict(X_test_scaled)

MemoryError: Unable to allocate 9.05 GiB for an array with shape (10000, 121406) and data type int64