In [29]:
import pandas as pd
import os
import numpy as np
import matplotlib.pyplot as plt
import spacy
import preprocessor as tp


from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import confusion_matrix, classification_report

from SBM.TextProcessor import *

In [3]:
df_tweets = pd.read_csv(os.path.join("data", "twitter_parsed_dataset.csv"), usecols = ['Text','Annotation', 'oh_label'])
df_tweets = df_tweets[df_tweets['Text'].notna()]
df_tweets = df_tweets[df_tweets['oh_label'].notna()]
df_tweets

Unnamed: 0,Text,Annotation,oh_label
0,@halalflaws @biebervalue @greenlinerzjm I read...,none,0.0
1,@ShreyaBafna3 Now you idiots claim that people...,none,0.0
2,"RT @Mooseoftorment Call me sexist, but when I ...",sexism,1.0
3,"@g0ssipsquirrelx Wrong, ISIS follows the examp...",racism,1.0
4,#mkr No No No No No No,none,0.0
...,...,...,...
16846,"Feeling so sorry for the girls, they should be...",none,0.0
16847,#MKR 'pretty good dishes we're happy with' - O...,none,0.0
16848,RT @colonelkickhead: Deconstructed lemon tart!...,none,0.0
16849,@versacezaynx @nyazpolitics @greenlinerzjm You...,none,0.0


In [7]:
X_train, X_test, y_train, y_test = train_test_split(df_tweets['Text'].to_numpy(), df_tweets['oh_label'].to_numpy(), test_size=0.20)

In [8]:
pipeline = Pipeline([('text_proccesor', TextProcessor(remove_stop_word=True, 
                                                      remove_punctuation=True, 
                                                      twitter_tokens=None, 
                                                      min_word_size=2,
                                                      special_token_method=SpecialTokenMethod.PREPROCESS)),
                     ('tfidf',TfidfVectorizer()), 
                       ('model',LogisticRegression())])

In [9]:
pipeline.fit(X_train, y_train)

Pipeline(steps=[('text_proccesor',
                 TextProcessor(remove_punctuation=True, remove_stop_word=True)),
                ('tfidf', TfidfVectorizer()), ('model', LogisticRegression())])

In [12]:
y_pred = pipeline.predict(X_test)

In [13]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

         0.0       0.82      0.94      0.88      2297
         1.0       0.81      0.57      0.67      1073

    accuracy                           0.82      3370
   macro avg       0.81      0.75      0.77      3370
weighted avg       0.82      0.82      0.81      3370



In [14]:
print(confusion_matrix(y_test, y_pred))

[[2151  146]
 [ 463  610]]
