In [71]:
import pandas as pd
import os
import preprocessor as tweet_preprocessor
import numpy as np
import matplotlib.pyplot as plt
import spacy

from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import confusion_matrix, classification_report

In [2]:
nlp = spacy.load("en_core_web_sm")

In [91]:
df_tweets = pd.read_csv(os.path.join("data", "twitter_parsed_dataset.csv"), usecols = ['Text','Annotation', 'oh_label'])
df_tweets = df_tweets[df_tweets['Text'].notna()]
df_tweets = df_tweets[df_tweets['oh_label'].notna()]
df_tweets

Unnamed: 0,Text,Annotation,oh_label
0,@halalflaws @biebervalue @greenlinerzjm I read...,none,0.0
1,@ShreyaBafna3 Now you idiots claim that people...,none,0.0
2,"RT @Mooseoftorment Call me sexist, but when I ...",sexism,1.0
3,"@g0ssipsquirrelx Wrong, ISIS follows the examp...",racism,1.0
4,#mkr No No No No No No,none,0.0
...,...,...,...
16846,"Feeling so sorry for the girls, they should be...",none,0.0
16847,#MKR 'pretty good dishes we're happy with' - O...,none,0.0
16848,RT @colonelkickhead: Deconstructed lemon tart!...,none,0.0
16849,@versacezaynx @nyazpolitics @greenlinerzjm You...,none,0.0


In [92]:
KEEP_STOP_WORD = False
KEEP_PUNCTUATION = False
KEEP_SPECIAL_TOKEN = False

tweets = []

for tweet in df_tweets['Text'].tolist():
    if KEEP_SPECIAL_TOKEN:
        tweet = tweet_preprocessor.tokenize(tweet)
    else:
        tweet = tweet_preprocessor.clean(tweet)
        
    final_sentence = []

    nlp_sentence = nlp(tweet)
    for word in nlp_sentence:
        if not KEEP_PUNCTUATION and word.is_punct:
            continue
            
        if not KEEP_STOP_WORD and word.is_stop:
            continue
            
        final_sentence.append(word.lemma_.lower())    
    tweets.append(" ".join(final_sentence))

In [93]:
tweets

['read context change meaning history islamic slavery',
 'idiot claim people try stop terrorist terrorist islamically brain dead',
 'sexist auto place talk guy',
 'wrong isis follow example mohammed quran exactly',
 '',
 'saudi preacher rape torture -year old daughter death release',
 'nooo sexist woman bad driver',
 "go pancake don't hve strawberry hve banana",
 'dare feeling fantastic way dehumanize',
 'wrong girl win wayne rooney street striker',
 'autoblocker run blockbot block manually',
 'good muslim good despite bad religion',
 'help ask pass judgment comes look help',
 "yeah call care human life idiot genocidal daesh would'nt understand",
 'otoh eventually like wait harper randi harper know adjust',
 'roflmao putin delusion factory run speed putin pay troll distribute',
 'kat amp andre drown river amp chance save type sandwich',
 'literally site today',
 'juxtaposition',
 'woo wait happen',
 'tend document hair color experiment later gt;.&gt pigtail pic year old',
 'accord holl

In [98]:
X_train, X_test, y_train, y_test = train_test_split(tweets, df_tweets['oh_label'].to_numpy(), test_size=0.20)

In [99]:
pipeline = Pipeline([('tfidf',TfidfVectorizer()), 
                       ('model',LogisticRegression())])

In [100]:
pipeline.fit(X_train, y_train)

Pipeline(steps=[('tfidf', TfidfVectorizer()), ('model', LogisticRegression())])

In [101]:
pred = pipeline.predict(X_test)

In [103]:
print(classification_report(y_test, pred))

              precision    recall  f1-score   support

         0.0       0.81      0.93      0.86      2249
         1.0       0.79      0.55      0.65      1121

    accuracy                           0.80      3370
   macro avg       0.80      0.74      0.75      3370
weighted avg       0.80      0.80      0.79      3370

