In [None]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn import metrics
import string
import spacy
# Seed NumPy's legacy global random state once, before any other cell runs.
SEED = 42
np.random.seed(SEED)

In [None]:
# Read the comment dataset from the CSV next to the notebook, then preview the first rows.
data = pd.read_csv("./youtoxic_english_1000.csv")
data.head()

Unnamed: 0,CommentId,VideoId,Text,IsToxic,IsAbusive,IsThreat,IsProvocative,IsObscene,IsHatespeech,IsRacist,IsNationalist,IsSexist,IsHomophobic,IsReligiousHate,IsRadicalism
0,Ugg2KwwX0V8-aXgCoAEC,04kJtp6pVXI,If only people would just take a step back and...,False,False,False,False,False,False,False,False,False,False,False,False
1,Ugg2s5AzSPioEXgCoAEC,04kJtp6pVXI,Law enforcement is not trained to shoot to app...,True,True,False,False,False,False,False,False,False,False,False,False
2,Ugg3dWTOxryFfHgCoAEC,04kJtp6pVXI,\nDont you reckon them 'black lives matter' ba...,True,True,False,False,True,False,False,False,False,False,False,False
3,Ugg7Gd006w1MPngCoAEC,04kJtp6pVXI,There are a very large number of people who do...,False,False,False,False,False,False,False,False,False,False,False,False
4,Ugg8FfTbbNF8IngCoAEC,04kJtp6pVXI,"The Arab dude is absolutely right, he should h...",False,False,False,False,False,False,False,False,False,False,False,False


In [None]:
# Turn the IsToxic label column into integers (it is the target `y` further down).
data["IsToxic"] = data["IsToxic"].astype("int")

In [None]:
# Preview the frame again after the IsToxic cast.
data.head(n=5)

Unnamed: 0,CommentId,VideoId,Text,IsToxic,IsAbusive,IsThreat,IsProvocative,IsObscene,IsHatespeech,IsRacist,IsNationalist,IsSexist,IsHomophobic,IsReligiousHate,IsRadicalism
0,Ugg2KwwX0V8-aXgCoAEC,04kJtp6pVXI,If only people would just take a step back and...,0,False,False,False,False,False,False,False,False,False,False,False
1,Ugg2s5AzSPioEXgCoAEC,04kJtp6pVXI,Law enforcement is not trained to shoot to app...,1,True,False,False,False,False,False,False,False,False,False,False
2,Ugg3dWTOxryFfHgCoAEC,04kJtp6pVXI,\nDont you reckon them 'black lives matter' ba...,1,True,False,False,True,False,False,False,False,False,False,False
3,Ugg7Gd006w1MPngCoAEC,04kJtp6pVXI,There are a very large number of people who do...,0,False,False,False,False,False,False,False,False,False,False,False
4,Ugg8FfTbbNF8IngCoAEC,04kJtp6pVXI,"The Arab dude is absolutely right, he should h...",0,False,False,False,False,False,False,False,False,False,False,False


In [None]:
# Load spaCy's small English pipeline and take its default stop-word set;
# both are read by `spacy_tokenizer`.
nlp = spacy.load("en_core_web_sm")
stop_words = nlp.Defaults.stop_words
# A set of strings iterates in a hash-dependent order that can differ between
# kernel sessions, so show the size plus a sorted preview instead of dumping
# the whole set: the output stays short and identical on every run.
print(f"{len(stop_words)} stop words, e.g. {sorted(stop_words)[:20]}")

{'in', "'re", 'six', 'she', 're', 'there', 'an', 'he', 'seem', 'every', 'nobody', 'thru', 'whoever', 'each', 'both', 'nor', 'did', 'throughout', 'here', 'already', 'still', 'back', 'everywhere', 'otherwise', 'you', 'now', "'ve", 'over', 'becomes', 'after', 'noone', 'would', 'around', 'among', 'none', 'hereupon', 'it', 'side', 'yourselves', 'amongst', 'top', 'between', 'before', 'such', 'part', 'will', 'been', 'themselves', 'most', 'ten', 'thereafter', 'all', 'seeming', 'besides', 'get', 'nowhere', 'quite', 'that', 'unless', 'anywhere', 'formerly', 'if', 'yet', 'own', 'latterly', 'many', 'fifteen', 'whole', 'call', 'also', 'who', 'when', 'not', 'but', 'our', 'more', 'beside', 'moreover', 'ours', 'except', 'its', 'fifty', 'off', 'then', 'wherever', 'whose', 'see', 'first', 'four', 'for', 'something', 'whom', 'must', 'be', 'together', 'on', 'therefore', 'were', 'me', 'cannot', 'just', 'whereas', 'others', 'whence', 'always', 'only', 'do', 'or', 'empty', '’s', 'have', 'my', 'up', 'say', 'a

In [None]:
# ASCII punctuation characters from the standard library; `spacy_tokenizer`
# checks tokens against this string.
punctuations = string.punctuation
print(punctuations)

!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~


In [None]:
# Creating tokenizer function
def spacy_tokenizer(sentence):
    """Tokenize ``sentence`` with spaCy and return its cleaned lemmas.

    Each token's lemma is lower-cased and stripped of surrounding whitespace.
    A lemma is dropped when it is in ``stop_words`` or when it consists
    entirely of characters from ``punctuations`` (which also drops lemmas
    that are empty after stripping).

    Args:
        sentence: Text to tokenize; passed unchanged to the ``nlp`` pipeline.

    Returns:
        list[str]: The remaining lemmas, in document order.
    """
    lemmas = (token.lemma_.lower().strip() for token in nlp(sentence))

    # `lemma in punctuations` would be a *substring* test against one long
    # string, so runs such as "..." or "!!" (not contiguous in
    # string.punctuation) would slip through as tokens. Checking every
    # character instead removes any all-punctuation lemma; `all()` over an
    # empty lemma is True, so empty strings are still removed.
    return [
        lemma
        for lemma in lemmas
        if lemma not in stop_words
        and not all(char in punctuations for char in lemma)
    ]

In [None]:
# Quick sanity check of the tokenizer on a short sentence.
sentence = 'I am eating Apple'
spacy_tokenizer(sentence)

['eat', 'apple']

In [None]:
# Bag-of-words vectorizer that delegates tokenization to `spacy_tokenizer`.
# With a custom tokenizer the default `token_pattern` regex is never used and
# scikit-learn emits a UserWarning about it on fit; passing None makes the
# intent explicit and silences the warning.
count_vector = CountVectorizer(tokenizer=spacy_tokenizer, token_pattern=None)


In [None]:
# Fit the vectorizer on two toy documents and show the dense count matrix.
demo_docs = ["I am eating apple, I like apple", "I am playing cricket"]
demo_counts = count_vector.fit_transform(demo_docs)
demo_counts.toarray()




array([[2, 0, 1, 1, 0],
       [0, 1, 0, 0, 1]])

In [None]:
# Vocabulary terms learned by the most recent fit; each term corresponds to one
# column of the count matrix.
count_vector.get_feature_names_out()


array(['apple', 'cricket', 'eat', 'like', 'play'], dtype=object)

In [None]:
# Mapping from each learned term to its column index in the count matrix.
count_vector.vocabulary_


{'eat': 2, 'apple': 0, 'like': 3, 'play': 4, 'cricket': 1}

In [None]:
from sklearn.model_selection import train_test_split

# Features are the raw comment text; the target is the integer IsToxic label.
X = data['Text']
y = data['IsToxic']

# Hold out 25% of the rows for testing, stratified on the label.
# `random_state` is pinned so that re-running this cell alone reproduces the
# same split; without it the split depends on how much of NumPy's global
# random state has already been consumed in the session.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, stratify=y, random_state=42
)

In [None]:
from sklearn.linear_model import LogisticRegression

# Logistic regression with library defaults; fitted on the count features below.
classifier = LogisticRegression()

In [None]:
# Peek at a few training comments.
X_train.head(n=5)


Unnamed: 0,Text
101,I wonder what the police expect will happen wh...
67,bassem I think walmart is hiring for the holid...
541,Anybody got a cigar?
616,*Monkey screamin bout honkies intensifies*
917,5:53 did you see that brick hit that white pig...


In [None]:
# Learn the vocabulary from the training text only, then encode the test text
# with that same vocabulary.
X_train_vectors = count_vector.fit_transform(X_train)
X_test_vectors = count_vector.transform(X_test)



In [None]:
# Inspect the container type returned by the vectorizer.
type(X_train_vectors)


In [None]:
# Shape of the encoded test matrix.
# NOTE(review): the recorded output shows 200 rows, yet the split uses
# test_size=0.25 and X_train.head() shows row labels as high as 917; assuming
# the default integer index, that implies at least 230 test rows. The saved
# output looks stale — re-run the notebook to confirm.
X_test_vectors.shape


(200, 3153)

In [None]:
# Train the classifier on the bag-of-words training matrix.
classifier.fit(X_train_vectors, y_train)


In [None]:
# Score the count-feature model on the held-out set.
predicted = classifier.predict(X_test_vectors)
for label, score_fn in (
    ("Logistic Regression Accuracy:", metrics.accuracy_score),
    ("Logistic Regression Precision:", metrics.precision_score),
    ("Logistic Regression Recall:", metrics.recall_score),
):
    print(label, score_fn(y_test, predicted))

Logistic Regression Accuracy: 0.705
Logistic Regression Precision: 0.7538461538461538
Logistic Regression Recall: 0.532608695652174


In [None]:
# TF-IDF vectorizer that delegates tokenization to `spacy_tokenizer`.
# With a custom tokenizer the default `token_pattern` regex is never used and
# scikit-learn emits a UserWarning about it on fit; passing None makes the
# intent explicit and silences the warning.
tfidf_vector = TfidfVectorizer(tokenizer=spacy_tokenizer, token_pattern=None)
# Fit on the training text only, then encode the test text with the same
# vocabulary and IDF weights. The `vetcors` spelling is kept because the
# following cell reads these exact names.
X_train_vetcors = tfidf_vector.fit_transform(X_train)
X_test_vetcors = tfidf_vector.transform(X_test)



In [None]:
# Fresh logistic regression trained on the TF-IDF features (`fit` returns the
# estimator itself), scored with the same three metrics as the count-feature model.
classifier = LogisticRegression().fit(X_train_vetcors, y_train)
predicted = classifier.predict(X_test_vetcors)

tfidf_scores = {
    "Logistic Regression Accuracy:": metrics.accuracy_score(y_test, predicted),
    "Logistic Regression Precision:": metrics.precision_score(y_test, predicted),
    "Logistic Regression Recall:": metrics.recall_score(y_test, predicted),
}
for label, value in tfidf_scores.items():
    print(label, value)

Logistic Regression Accuracy: 0.695
Logistic Regression Precision: 0.746031746031746
Logistic Regression Recall: 0.5108695652173914
