In [1]:
#Importing the required libraries
import pandas as pd
import numpy as np

import nltk
from nltk.tokenize import word_tokenize  
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression 

from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

In [2]:
#Reading the required file

train = pd.read_csv('jigsaw-toxic-comment-classification-challenge/train.csv')

In [3]:
#displaying the first five rows in Train dataset

train.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0


In [4]:
#shape of training dataset

train.shape

(159571, 8)

In [13]:
#Segregating both feature column and target columns
list_sentences_train = train.iloc[:,1]
label_train = train.iloc[:, 2:8]

print(list_sentences_train)

0         Explanation\nWhy the edits made under my usern...
1         D'aww! He matches this background colour I'm s...
2         Hey man, I'm really not trying to edit war. It...
3         "\nMore\nI can't make any real suggestions on ...
4         You, sir, are my hero. Any chance you remember...
                                ...                        
159566    ":::::And for the second time of asking, when ...
159567    You should be ashamed of yourself \n\nThat is ...
159568    Spitzer \n\nUmm, theres no actual article for ...
159569    And it looks like it was actually you who put ...
159570    "\nAnd ... I really don't think you understand...
Name: comment_text, Length: 159571, dtype: object


In [6]:
#Importing string lib for punctuations

import string
string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [14]:
#Removing punctuations from text

def remove_punctuations(sentence):
  txt_nopunct = "".join([c for c in sentence if c not in string.punctuation])
  return txt_nopunct

list_sentences_train = list_sentences_train.apply(lambda x:remove_punctuations(x))

print(list_sentences_train)

0         Explanation\nWhy the edits made under my usern...
1         Daww He matches this background colour Im seem...
2         Hey man Im really not trying to edit war Its j...
3         \nMore\nI cant make any real suggestions on im...
4         You sir are my hero Any chance you remember wh...
                                ...                        
159566    And for the second time of asking when your vi...
159567    You should be ashamed of yourself \n\nThat is ...
159568    Spitzer \n\nUmm theres no actual article for p...
159569    And it looks like it was actually you who put ...
159570    \nAnd  I really dont think you understand  I c...
Name: comment_text, Length: 159571, dtype: object


In [15]:
#Tokenizing and removing stopwords

stop_words = set(stopwords.words('english')) 

def tokenize_sentences(sentence):
  tokens = word_tokenize(sentence)
  filtered_sentence = [word.lower() for word in tokens if word.lower() not in stop_words]
  return filtered_sentence

list_sentences_train = list_sentences_train.apply(lambda x : tokenize_sentences(x))

print(list_sentences_train)

0         [explanation, edits, made, username, hardcore,...
1         [daww, matches, background, colour, im, seemin...
2         [hey, man, im, really, trying, edit, war, guy,...
3         [cant, make, real, suggestions, improvement, w...
4                [sir, hero, chance, remember, page, thats]
                                ...                        
159566    [second, time, asking, view, completely, contr...
159567    [ashamed, horrible, thing, put, talk, page, 12...
159568    [spitzer, umm, theres, actual, article, prosti...
159569    [looks, like, actually, put, speedy, first, ve...
159570    [really, dont, think, understand, came, idea, ...
Name: comment_text, Length: 159571, dtype: object


In [16]:
#Lemmatizing

wn = nltk.WordNetLemmatizer()

def lemmatization(tokenized_text):
  text = " ".join([wn.lemmatize(word) for word in tokenized_text])
  return text

list_sentences_train_lemma = list_sentences_train.apply(lambda x: lemmatization(x))

print(list_sentences_train_lemma)

0         explanation edits made username hardcore metal...
1         daww match background colour im seemingly stuc...
2         hey man im really trying edit war guy constant...
3         cant make real suggestion improvement wondered...
4                       sir hero chance remember page thats
                                ...                        
159566    second time asking view completely contradicts...
159567       ashamed horrible thing put talk page 128611993
159568    spitzer umm there actual article prostitution ...
159569    look like actually put speedy first version de...
159570    really dont think understand came idea bad rig...
Name: comment_text, Length: 159571, dtype: object


In [17]:
#Vectorization
#Process of encoding text as integers to create Feature Vectors
#Feature Vector: vector of numerical features that represent an object

# Instantiate the vectorizer
word_vectorizer = TfidfVectorizer(
    stop_words='english',
    sublinear_tf=True,
    strip_accents='unicode',
    analyzer='word',
    token_pattern=r'\w{2,}',  #vectorize 2-character words or more
    ngram_range=(1, 1),
    max_features=30000)

# fit and transform on it the training features
word_vectorizer.fit(list_sentences_train_lemma)
X_train_word_features = word_vectorizer.transform(list_sentences_train_lemma)

In [18]:
#Vectored matrix

print(X_train_word_features)

  (0, 29798)	0.21370421658218852
  (0, 28855)	0.22915641879892687
  (0, 28537)	0.24502732596469912
  (0, 28188)	0.1521339619561107
  (0, 28052)	0.19434663672764638
  (0, 26365)	0.16917491539311266
  (0, 26126)	0.09686503344819589
  (0, 22615)	0.1632751731066122
  (0, 22559)	0.2551043308922774
  (0, 22254)	0.1504893135303345
  (0, 19479)	0.08711972514281327
  (0, 18276)	0.13282644778226682
  (0, 17164)	0.32222275040479137
  (0, 13595)	0.11215758229293775
  (0, 12560)	0.2807022467718628
  (0, 11600)	0.2505527407985963
  (0, 10450)	0.2001534511033996
  (0, 10324)	0.2491177485248909
  (0, 10200)	0.18454401523372843
  (0, 9261)	0.13497106289643243
  (0, 8841)	0.10220808487793585
  (0, 8796)	0.31747993851804324
  (0, 6017)	0.2821920770988895
  (1, 28099)	0.17682068642782556
  (1, 26504)	0.14986144835669837
  :	:
  (159568, 21110)	0.3929105451235221
  (159568, 7375)	0.4388896373537555
  (159568, 5124)	0.3428512093895419
  (159568, 2944)	0.10496360798136958
  (159568, 1739)	0.24224340027771488

In [19]:
#Splitting the dataset into training and testing

X_train, X_test, y_train, y_test = train_test_split(X_train_word_features, label_train, test_size= 0.2, random_state=13)

In [20]:
#Training the model and evaluating it 
#Logistic Regression ,storing all the six models (trained on each target column) in classifier array

CLASSES = ['toxic','severe_toxic','obscene', 'threat', 'insult', 'identity_hate']
classifier=[]
auc = []

for class_name in CLASSES:
    #Taking each column label each time to train the classifier on it
    train_target = y_train[class_name]
    test_target = y_test[class_name]
    
    #Initializing Classifier
    classifier_logistic= LogisticRegression()
    classifier.append(classifier_logistic)
    
    cv_score = np.mean(cross_val_score(classifier_logistic, X_train, train_target, cv=5, scoring='accuracy'))
    print('CV Accuracy score for class {} is {}'.format(class_name, cv_score))
    auc.append(cv_score)
    
    classifier_logistic.fit(X_train, train_target)
    y_pred = classifier_logistic.predict(X_test)

    print(confusion_matrix(test_target, y_pred))
    print(classification_report(test_target, y_pred))

print('Total average Auccarcy score is {}'.format(np.mean(auc)))

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


CV Accuracy score for class toxic is 0.9548865689803255


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


[[28750   177]
 [ 1224  1764]]
              precision    recall  f1-score   support

           0       0.96      0.99      0.98     28927
           1       0.91      0.59      0.72      2988

    accuracy                           0.96     31915
   macro avg       0.93      0.79      0.85     31915
weighted avg       0.95      0.96      0.95     31915

CV Accuracy score for class severe_toxic is 0.9904587351126126
[[31558    54]
 [  237    66]]
              precision    recall  f1-score   support

           0       0.99      1.00      1.00     31612
           1       0.55      0.22      0.31       303

    accuracy                           0.99     31915
   macro avg       0.77      0.61      0.65     31915
weighted avg       0.99      0.99      0.99     31915

CV Accuracy score for class obscene is 0.9763740167306384
[[30191   100]
 [  620  1004]]
              precision    recall  f1-score   support

           0       0.98      1.00      0.99     30291
           1       0.91

In [21]:
#Doing the same preprocessing steps on unseen data
test = pd.read_csv('jigsaw-toxic-comment-classification-challenge/test.csv')

#Taking commennts Column
list_sentences_test = test.iloc[:,1]

#Removing Punctations
list_sentences_test = list_sentences_test.apply(lambda x:remove_punctuations(x)) 

#Tokenizing and removing stopwords
list_sentences_test = list_sentences_test.apply(lambda x : tokenize_sentences(x))

#Lemmatizing
list_sentences_test_lemma = list_sentences_test.apply(lambda x: lemmatization(x))

In [22]:
print(list_sentences_test_lemma)

0         yo bitch ja rule succesful youll ever whats ha...
1                                        rfc title fine imo
2                              source zawe ashton lapland —
3         look back source information updated correct f...
4                             dont anonymously edit article
                                ...                        
153159              totally agree stuff nothing toolongcrap
153160    throw field home plate get faster throwing cut...
153161    okinotorishima category see change agree corre...
153162    one founding nation eu germany law return quit...
153163    stop already bullshit welcome im fool think ki...
Name: comment_text, Length: 153164, dtype: object


In [23]:
#Vectorization
X_test_word_features = word_vectorizer.transform(list_sentences_test_lemma)

In [24]:
print(X_test_word_features)

  (0, 29812)	0.13397809784610532
  (0, 29783)	0.16642249784669857
  (0, 29610)	0.10456603197974933
  (0, 28925)	0.22931712381462893
  (0, 28887)	0.12754911466212576
  (0, 27977)	0.16246453661751956
  (0, 27328)	0.2367330103048625
  (0, 26732)	0.07776576681974647
  (0, 26590)	0.08833771056432077
  (0, 25659)	0.22536846112051273
  (0, 24488)	0.19861726358400716
  (0, 24101)	0.12999190427356352
  (0, 23178)	0.15029102890662024
  (0, 23072)	0.18838316421100115
  (0, 22754)	0.09215444645906201
  (0, 20863)	0.18125281898577186
  (0, 18628)	0.23367617846788133
  (0, 17883)	0.13813715217326333
  (0, 16558)	0.11981680272624233
  (0, 15925)	0.06906446455843968
  (0, 15327)	0.1833768001045843
  (0, 14662)	0.36421657175902555
  (0, 12647)	0.20583017251626448
  (0, 12359)	0.11526352669361355
  (0, 10326)	0.13982856522366022
  :	:
  (153162, 15643)	0.12112109949347674
  (153162, 14835)	0.13301797657289877
  (153162, 14587)	0.29929478504856766
  (153162, 14576)	0.09973665722378781
  (153162, 11760)	0

In [25]:
print(X_test_word_features.shape)

(153164, 30000)


In [31]:
#testing on first comment in test dataset

CLASSES = ['toxic','severe_toxic','obscene', 'threat', 'insult', 'identity_hate']

print("comment -- {} ".format(test.iloc[0,1]))
i=0
for class_name in CLASSES:
    #Taking each column label each time to train the classifier on it
    y_pred = classifier[i].predict(X_test_word_features[0])
    i=i+1
    print(class_name,y_pred)    

comment -- Yo bitch Ja Rule is more succesful then you'll ever be whats up with you and hating you sad mofuckas...i should bitch slap ur pethedic white faces and get you to kiss my ass you guys sicken me. Ja rule is about pride in da music man. dont diss that shit on him. and nothin is wrong bein like tupac he was a brother too...fuckin white boys get things right next time., 
toxic [1]
severe_toxic [0]
obscene [1]
threat [0]
insult [1]
identity_hate [0]


In [None]:
#Testing on all the rows in test dataset and storing it as a DataFrame
columns = ['comment_text'] + CLASSES
df = pd.DataFrame(columns=columns)
rows = 138484

def get_predictions(i):
    predict=[test.iloc[i,1]]
    j=0
    for class_name in CLASSES:
        y_pred = classifier[j].predict(X_test_word_features[i])
        j=j+1
        predict.append(y_pred)
    return predict

for i in range(0,test.shape[0]+1):
    p = get_predictions(i)
    df.loc[i]=p

In [33]:
df

Unnamed: 0,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,Yo bitch Ja Rule is more succesful then you'll...,[1],[0],[1],[0],[1],[0]
1,== From RfC == \n\n The title is fine as it is...,[0],[0],[0],[0],[0],[0]
2,""" \n\n == Sources == \n\n * Zawe Ashton on Lap...",[0],[0],[0],[0],[0],[0]
3,":If you have a look back at the source, the in...",[0],[0],[0],[0],[0],[0]
4,I don't anonymously edit articles at all.,[0],[0],[0],[0],[0],[0]
...,...,...,...,...,...,...,...
138479,Wikipedia:Criticism#Avoid_sections_and_article...,[0],[0],[0],[0],[0],[0]
138480,(night after night that wanker is causing trou...,[1],[0],[0],[0],[0],[0]
138481,.The Macanese Yu-7 killed 10 officers during a...,[0],[0],[0],[0],[0],[0]
138482,==Oi!!== \n\n Please do not remove me abusing ...,[1],[0],[0],[0],[0],[0]
