In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import RandomOverSampler
from sklearn.feature_extraction import text 
from sklearn.feature_extraction.text import CountVectorizer as countvec
from sklearn.ensemble import VotingClassifier 
from sklearn.linear_model import LogisticRegression 
from sklearn.svm import SVC
from sklearn.linear_model import SGDClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import f1_score
import random
random.seed(0)

In [2]:
df = pd.read_csv(r'./data/train.tsv', sep='\t') # import training set
df

Unnamed: 0,text,label
0,RT @USER: when you in a public place and ya si...,NG
1,@USER he ate my pussy i could..,NG
2,"When you give somebody that ""bitch"" look",NG
3,Double Stuff Oreos mane say,OK
4,@USER lol la bitch happy bout Beatin our bench...,NG
...,...,...
18677,RT @USER: @USER @USER @USER @USER fuck no id b...,NG
18678,"""@USER: ""@USER: ""@USER: ""@USER: @USER swears I...",OK
18679,Its wayyyyy more going on now than 2 have u pu...,NG
18680,"Mum agrees with me, ur being a wee moody cunt ...",NG


In [3]:
print(df.shape) 
print(df.label.value_counts()) #data unbalanced: need to be under/oversampled

(18682, 2)
NG    15352
OK     3330
Name: label, dtype: int64


In [4]:
X = df[['text']]
y = df[['label']].replace({'NG': 0, 'OK': 1}) 

In [5]:
# Oversample the data
print(X.shape)
print(y.shape)
ros = RandomOverSampler(random_state=0)
X_oversampled, y_oversampled = ros.fit_resample(X, y)
print(X_oversampled.shape)
print(y_oversampled.shape)

(18682, 1)
(18682, 1)
(30704, 1)
(30704, 1)


In [6]:
df = pd.concat([X_oversampled, y_oversampled], axis = 1)
df

Unnamed: 0,text,label
0,RT @USER: when you in a public place and ya si...,0
1,@USER he ate my pussy i could..,0
2,"When you give somebody that ""bitch"" look",0
3,Double Stuff Oreos mane say,1
4,@USER lol la bitch happy bout Beatin our bench...,0
...,...,...
30699,“@USER: When twitter rappers dm me their trash...,1
30700,RT @USER: Autographed Photo of Babe Ruth and L...,1
30701,Why can't I have colored eyes like my siblings 😭😭,1
30702,@USER you a Oreo cookie now.,1


In [7]:
X_oversampled = df.iloc[:, 0]
y_oversampled = df.iloc[:, 1]

In [8]:
# Split data in train and test with 0.2 factor
X_train, X_test, y_train, y_test = train_test_split(X_oversampled, y_oversampled, test_size=0.2, random_state=0)

# Create customized stopping words set by including english stop words and @USER and {{URL}}
my_additional_stop_words = {'@USER', '{{URL}}', 'url', 'user'}
stop_words = text.ENGLISH_STOP_WORDS.union(my_additional_stop_words)

# Bag of Words and delete stopping words
countvec = countvec(strip_accents='unicode', stop_words=stop_words, analyzer='word')

X_train_counts = countvec.fit_transform(X_train)
X_test_counts = countvec.transform(X_test)

# Initialize classifiers
log_clf = LogisticRegression(n_jobs=-1)
sgd_clf = SGDClassifier(loss='modified_huber', n_jobs=-1)
svm_clf = SVC(probability=True)
mnb_clf = MultinomialNB()
voting_clf = VotingClassifier( 
    estimators=[('lr', log_clf), ('svm', svm_clf), ('mnb', mnb_clf)], 
    voting='soft', n_jobs=-1)

#Train and evaluation
for clf in (log_clf, svm_clf, mnb_clf, voting_clf):
    clf.fit(X_train_counts, y_train)
    y_pred = clf.predict(X_test_counts)
    print(clf.__class__.__name__, f1_score(y_test, y_pred, average = 'macro')) 

LogisticRegression 0.9737717299220434
SVC 0.9770319302962809
MultinomialNB 0.9607540745284919
VotingClassifier 0.9780126893517511
