In [35]:
import numpy as np
import pandas as pd

import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.multioutput import ClassifierChain
from sklearn.metrics import accuracy_score, f1_score, classification_report, roc_auc_score

import os
import re

In [18]:
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\denis\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\denis\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [19]:
train_path = os.path.join('data', 'train.csv')
target_columns = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']

df = pd.read_csv(train_path)
df

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\r\nWhy the edits made under my use...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\r\nMore\r\nI can't make any real suggestions...",0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0
...,...,...,...,...,...,...,...,...
159566,ffe987279560d7ff,""":::::And for the second time of asking, when ...",0,0,0,0,0,0
159567,ffea4adeee384e90,You should be ashamed of yourself \r\n\r\nThat...,0,0,0,0,0,0
159568,ffee36eab5c267c9,"Spitzer \r\n\r\nUmm, theres no actual article ...",0,0,0,0,0,0
159569,fff125370e4aaaf3,And it looks like it was actually you who put ...,0,0,0,0,0,0


In [20]:
def custom_tokenizer(text):
    stemmer = SnowballStemmer(language='english')
    eng_stopwords = stopwords.words('english')

    text_tokenized = word_tokenize(text)
    
    text_stp = [word for word in text_tokenized if (word.lower() not in eng_stopwords) and word.isalpha()]
    
    stemmer = SnowballStemmer(language='english')
    processed_text = [stemmer.stem(word) for word in text_stp]

    return processed_text

In [21]:
max_features = 512
eng_stopwords = stopwords.words('english')

vectirizer = TfidfVectorizer(
    lowercase=True,
    tokenizer=custom_tokenizer,
    stop_words=eng_stopwords,
    ngram_range=(1, 2),
    max_features=max_features
)

In [22]:
%%time
vectirizer.fit(df['comment_text'])



CPU times: total: 3min 3s
Wall time: 3min 14s


In [23]:
df

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\r\nWhy the edits made under my use...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\r\nMore\r\nI can't make any real suggestions...",0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0
...,...,...,...,...,...,...,...,...
159566,ffe987279560d7ff,""":::::And for the second time of asking, when ...",0,0,0,0,0,0
159567,ffea4adeee384e90,You should be ashamed of yourself \r\n\r\nThat...,0,0,0,0,0,0
159568,ffee36eab5c267c9,"Spitzer \r\n\r\nUmm, theres no actual article ...",0,0,0,0,0,0
159569,fff125370e4aaaf3,And it looks like it was actually you who put ...,0,0,0,0,0,0


In [25]:
X = df['comment_text']
y = df.drop(columns=['id', 'comment_text'])

In [26]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=7)

X_train.shape, X_test.shape, y_train.shape, y_test.shape

((127656,), (31915,), (127656, 6), (31915, 6))

In [28]:
%%time
X_train_vec = vectirizer.transform(X_train)
X_test_vec = vectirizer.transform(X_test)

X_train_vec.shape, X_test_vec.shape

CPU times: total: 2min 46s
Wall time: 2min 53s


((127656, 512), (31915, 512))

In [33]:
base_est = LogisticRegression(
    solver = 'lbfgs',
    penalty = 'l2',
    max_iter = 500,
    random_state = 7,
    n_jobs = -1,
)

cv = StratifiedKFold(
    n_splits=5,
    shuffle=True
)

chain = ClassifierChain(
    base_estimator = base_est,
    cv = cv,
    verbose = True,
    random_state = 7
)

CPU times: total: 0 ns
Wall time: 0 ns


In [34]:
%%time
chain.fit(X_train_vec, y_train)

[Chain] ................... (1 of 6) Processing order 0, total=   2.6s
[Chain] ................... (2 of 6) Processing order 1, total=   1.4s
[Chain] ................... (3 of 6) Processing order 2, total=   1.5s
[Chain] ................... (4 of 6) Processing order 3, total=   1.0s
[Chain] ................... (5 of 6) Processing order 4, total=   0.8s
[Chain] ................... (6 of 6) Processing order 5, total=   0.8s
CPU times: total: 7.02 s
Wall time: 27.4 s


In [None]:
train_pred = chain.predict(X_train_vec)
val_pred = chain.predict(X_test_vec)

print("Train ROC:", roc_auc_score(y_train, train_pred))
print("VAl ROC:", roc_auc_score(y_test, val_pred))
print("Train Acc:", accuracy_score(y_train, train_pred))
print("Val Acc:", accuracy_score(y_test, val_pred))

print(classification_report(y_train, train_pred))
print(classification_report(y_test, val_pred))