In [1]:
import numpy as np
import pandas as pd
import re, string
import matplotlib as mpl
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

pd.set_option('display.max_rows', 20)
pd.set_option('display.max_columns', None)
pd.set_option('max_colwidth', None)

In [2]:
path = '/Users/my_MAC/Coding/data/jigsaw-toxic-comment-classification-challenge/'
df = pd.read_csv(path+'train.csv')
df.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,"Explanation\nWhy the edits made under my username Hardcore Metallica Fan were reverted? They weren't vandalisms, just closure on some GAs after I voted at New York Dolls FAC. And please don't remove the template from the talk page since I'm retired now.89.205.38.27",0,0,0,0,0,0
1,000103f0d9cfb60f,"D'aww! He matches this background colour I'm seemingly stuck with. Thanks. (talk) 21:51, January 11, 2016 (UTC)",0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It's just that this guy is constantly removing relevant information and talking to me through edits instead of my talk page. He seems to care more about the formatting than the actual info.",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on improvement - I wondered if the section statistics should be later on, or a subsection of """"types of accidents"""" -I think the references may need tidying so that they are all in the exact same format ie date format etc. I can do that later on, if no-one else does first - if you have any preferences for formatting style on references or want to do it yourself please let me know.\n\nThere appears to be a backlog on articles for review so I guess there may be a delay until a reviewer turns up. It's listed in the relevant form eg Wikipedia:Good_article_nominations#Transport """,0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember what page that's on?",0,0,0,0,0,0


In [3]:
from sklearn import preprocessing, decomposition, model_selection, metrics, pipeline

import nltk
from nltk import sent_tokenize,word_tokenize
from nltk.corpus import stopwords
from tqdm import tqdm
from nltk.stem.porter import *
from nltk.stem import WordNetLemmatizer

import gensim


from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import TruncatedSVD

from sklearn.feature_selection import chi2

from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split

In [4]:
def pre_preprocess_text(text, stemming=False, lemmatizing=True, sw=None, punct_list = None):
    ## clean (convert to lowercase and remove punctuations and characters and then strip)
    text = re.sub(r'[^\w\s]', '', str(text).lower().strip())
            
    ## Tokenize (convert from string to list)
    text_token = text.split()
    ## remove Stopwords
    if sw is not None:
        text_stopwordFree = [word for word in text_token if word not in sw]
        
    # Checking all words is alphabets or not - if not remove
    text_alphabets = [i for i in text_stopwordFree if i.isalpha()]
    
    # punctuations free
    if punct_list is not None:
        text_punctFree = [w for w in text_alphabets if w not in punct_list]
                
    ## Stemming (remove -ing, -ly, ...)
    if stemming == True:
        ps = nltk.stem.porter.PorterStemmer()
        text_clean = [ps.stem(word) for word in text_punctFree]
                
    ## Lemmatisation (convert the word into root word)
    if lemmatizing == True:
        lem = nltk.stem.wordnet.WordNetLemmatizer()
        text_clean = [lem.lemmatize(word) for word in text_punctFree]
            
    ## back to string from list
    text = " ".join(text_clean)
    
    return text

stopwords_list = nltk.corpus.stopwords.words("english")
puncts = [i for i in string.punctuation]

df['clean_text'] = df['comment_text'].apply(lambda x: pre_preprocess_text(x, sw=stopwords_list, 
                                                                            punct_list = puncts))

df[['comment_text','clean_text']].head()

Unnamed: 0,comment_text,clean_text
0,"Explanation\nWhy the edits made under my username Hardcore Metallica Fan were reverted? They weren't vandalisms, just closure on some GAs after I voted at New York Dolls FAC. And please don't remove the template from the talk page since I'm retired now.89.205.38.27",explanation edits made username hardcore metallica fan reverted werent vandalism closure gas voted new york doll fac please dont remove template talk page since im retired
1,"D'aww! He matches this background colour I'm seemingly stuck with. Thanks. (talk) 21:51, January 11, 2016 (UTC)",daww match background colour im seemingly stuck thanks talk january utc
2,"Hey man, I'm really not trying to edit war. It's just that this guy is constantly removing relevant information and talking to me through edits instead of my talk page. He seems to care more about the formatting than the actual info.",hey man im really trying edit war guy constantly removing relevant information talking edits instead talk page seems care formatting actual info
3,"""\nMore\nI can't make any real suggestions on improvement - I wondered if the section statistics should be later on, or a subsection of """"types of accidents"""" -I think the references may need tidying so that they are all in the exact same format ie date format etc. I can do that later on, if no-one else does first - if you have any preferences for formatting style on references or want to do it yourself please let me know.\n\nThere appears to be a backlog on articles for review so I guess there may be a delay until a reviewer turns up. It's listed in the relevant form eg Wikipedia:Good_article_nominations#Transport """,cant make real suggestion improvement wondered section statistic later subsection type accident think reference may need tidying exact format ie date format etc later noone else first preference formatting style reference want please let know appears backlog article review guess may delay reviewer turn listed relevant form eg
4,"You, sir, are my hero. Any chance you remember what page that's on?",sir hero chance remember page thats


In [5]:
tfv = TfidfVectorizer(max_features=1000, strip_accents='unicode', analyzer='word',token_pattern=r'\w{1,}',
                      ngram_range=(1, 1), use_idf=1,smooth_idf=1,sublinear_tf=1, stop_words = 'english')

x = tfv.fit_transform(df['clean_text'])
all_labels = ['toxic','severe_toxic','obscene','threat','insult','identity_hate']
y = df[all_labels]

x_train, x_valid, y_train, y_valid = train_test_split(x,y, random_state=43, test_size=0.2)

In [6]:
import sklearn.metrics as metrics

import warnings
# warnings.filterwarnings('ignore')

## A. Simple Problem Transformation

In [7]:
# !pip install scikit-multilearn
from skmultilearn.problem_transform import BinaryRelevance
from skmultilearn.problem_transform import ClassifierChain
from skmultilearn.problem_transform import LabelPowerset

In [8]:
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier

In [9]:
import time
def mlc_LP(base_clf):
    strt = time.time()
    clf = LabelPowerset(classifier=base_clf)
    clf.fit(x_train, y_train)
    y_hat = clf.predict(x_valid)
    hamming_loss = metrics.hamming_loss(y_valid, y_hat)
    accuracy = metrics.accuracy_score(y_valid, y_hat)
    time_taken = time.time() - strt
    print("hamming loss = ", hamming_loss, "accuracy = ", accuracy, "time taken = ", time_taken)
    return hamming_loss, accuracy, time_taken

def mlc_BR(base_clf):
    strt = time.time()
    clf = BinaryRelevance(classifier=base_clf)
    clf.fit(x_train, y_train)
    y_hat = clf.predict(x_valid)
    hamming_loss = metrics.hamming_loss(y_valid, y_hat)
    accuracy = metrics.accuracy_score(y_valid, y_hat)
    time_taken = time.time() - strt
    print("hamming loss = ", hamming_loss, "accuracy = ", accuracy, "time taken = ", time_taken)
    return hamming_loss, accuracy, time_taken

def mlc_CC(base_clf):
    strt = time.time()
    clf = ClassifierChain(classifier=base_clf)
    clf.fit(x_train, y_train)
    y_hat = clf.predict(x_valid)
    hamming_loss = metrics.hamming_loss(y_valid, y_hat)
    accuracy = metrics.accuracy_score(y_valid, y_hat)
    time_taken = time.time() - strt
    print("hamming loss = ", hamming_loss, "accuracy = ", accuracy, "time taken = ", time_taken)
    return hamming_loss, accuracy, time_taken

In [10]:
CLFs = [LogisticRegression(random_state=43), MultinomialNB(), LinearSVC()]

entries = []
for i in CLFs:
    base_clf = str(i.__class__.__name__)
    print(base_clf)
    a,b,c = mlc_LP(i)
    entries.append((base_clf,a,b,c))
        
results_LP = pd.DataFrame(entries, columns=['Base_classifier', 'Hamming_loss', 'Accuracy', 'Time_taken'])
results_LP['MLC'] = "LP"


entries = []
for i in CLFs:
    base_clf = str(i.__class__.__name__)
    print(base_clf)
    a,b,c = mlc_CC(i)
    entries.append((base_clf,a,b,c))
        
results_CC = pd.DataFrame(entries, columns=['Base_classifier', 'Hamming_loss', 'Accuracy', 'Time_taken'])
results_CC['MLC'] = "CC"



entries = []
for i in CLFs:
    base_clf = str(i.__class__.__name__)
    print(base_clf)
    a,b,c = mlc_BR(i)
    entries.append((base_clf,a,b,c))
        
results_BR = pd.DataFrame(entries, columns=['Base_classifier', 'Hamming_loss', 'Accuracy', 'Time_taken'])
results_BR['MLC'] = "BR"

LogisticRegression


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


hamming loss =  0.021144707295420128 accuracy =  0.9160269465768447 time taken =  38.135905027389526
MultinomialNB
hamming loss =  0.024842028304350096 accuracy =  0.9106689644367852 time taken =  1.894563913345337
LinearSVC
hamming loss =  0.021515483837276098 accuracy =  0.9157449475168417 time taken =  10.236384868621826
LogisticRegression


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


hamming loss =  0.020371820982818947 accuracy =  0.917248942503525 time taken =  33.25800704956055
MultinomialNB
hamming loss =  0.031845004961094575 accuracy =  0.8988563371455428 time taken =  3.4958109855651855
LinearSVC
hamming loss =  0.02063815342837746 accuracy =  0.9167476108412972 time taken =  12.775354146957397
LogisticRegression


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


hamming loss =  0.02028304350096611 accuracy =  0.9168102772990757 time taken =  23.507508993148804
MultinomialNB
hamming loss =  0.022580813619510157 accuracy =  0.9129562901456996 time taken =  3.3632960319519043
LinearSVC
hamming loss =  0.020298710115410727 accuracy =  0.9169042769857434 time taken =  236.77040886878967


In [11]:
results = pd.concat([results_LP, results_BR, results_CC])
results

Unnamed: 0,Base_classifier,Hamming_loss,Accuracy,Time_taken,MLC
0,LogisticRegression,0.021145,0.916027,38.135905,LP
1,MultinomialNB,0.024842,0.910669,1.894564,LP
2,LinearSVC,0.021515,0.915745,10.236385,LP
0,LogisticRegression,0.020283,0.91681,23.507509,BR
1,MultinomialNB,0.022581,0.912956,3.363296,BR
2,LinearSVC,0.020299,0.916904,236.770409,BR
0,LogisticRegression,0.020372,0.917249,33.258007,CC
1,MultinomialNB,0.031845,0.898856,3.495811,CC
2,LinearSVC,0.020638,0.916748,12.775354,CC


In [12]:
results.groupby('MLC').mean().reset_index()

Unnamed: 0,MLC,Hamming_loss,Accuracy,Time_taken
0,BR,0.021054,0.915557,87.880405
1,CC,0.024285,0.910951,16.509724
2,LP,0.022501,0.914147,16.755618


## B. Algorithm Adaptation

In [15]:
from skmultilearn.adapt import MLkNN
from skmultilearn.adapt import BRkNNaClassifier
from skmultilearn.adapt import MLTSVM

In [16]:
adapt_clf = [MLkNN(k=2), BRkNNaClassifier(k=2)]

entries = []
for clf in adapt_clf:
    strt = time.time()
#     clf.fit(np.array(x_train), np.array(y_train))
    clf.fit(x_train.toarray(), np.array(y_train))
    y_hat = clf.predict(x_valid)
    hamming_loss = metrics.hamming_loss(y_valid, y_hat)
    accuracy = metrics.accuracy_score(y_valid, y_hat)
    time_taken = time.time() - strt
    print("hamming loss = ", hamming_loss, "accuracy = ", accuracy, "time taken = ", time_taken)
    entries.append((str(clf.__class__.__name__), hamming_loss, accuracy, time_taken))
    
results_adaptalgo = pd.DataFrame(entries, columns=['MLC', 'Hamming_loss', 'Accuracy', 'Time_taken'])

results_adaptalgo



hamming loss =  0.04026842132748446 accuracy =  0.8519191602694658 time taken =  811.156965970993




hamming loss =  0.027395686458822916 accuracy =  0.9018643271189096 time taken =  115.82356405258179


Unnamed: 0,Adapt_algo,Hamming_loss,Accuracy,Time_taken
0,MLkNN,0.040268,0.851919,811.156966
1,BRkNNaClassifier,0.027396,0.901864,115.823564


In [18]:
results_adaptalgo = pd.DataFrame(entries, columns=['MLC', 'Hamming_loss', 'Accuracy', 'Time_taken'])

results_adaptalgo

Unnamed: 0,MLC,Hamming_loss,Accuracy,Time_taken
0,MLkNN,0.040268,0.851919,811.156966
1,BRkNNaClassifier,0.027396,0.901864,115.823564


## C. Ensemble

In [13]:
from skmultilearn.ensemble import RakelD

def ensemble_rakel(base_clf):
    strt = time.time()
    clf = RakelD(base_classifier=base_clf)
    clf.fit(x_train, y_train)
    y_hat = clf.predict(x_valid)
    hamming_loss = metrics.hamming_loss(y_valid, y_hat)
    accuracy = metrics.accuracy_score(y_valid, y_hat)
    time_taken = time.time() - strt
    print("hamming loss = ", hamming_loss, "accuracy = ", accuracy, "time taken = ", time_taken)
    return hamming_loss, accuracy, time_taken

entries = []
for i in CLFs:
    base_clf = str(i.__class__.__name__)
    print(base_clf)
    a,b,c = ensemble_rakel(i)
    entries.append((base_clf,a,b,c))
        
results_rakel = pd.DataFrame(entries, columns=['Base_classifier', 'Hamming_loss', 'Accuracy', 'Time_taken'])
results_rakel['MLC'] = "Ensemble"

LogisticRegression


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


hamming loss =  0.020789597368008773 accuracy =  0.9154629484568385 time taken =  30.27391791343689
MultinomialNB
hamming loss =  0.023714032064337563 accuracy =  0.9104182986056714 time taken =  3.6457748413085938
LinearSVC
hamming loss =  0.021071596428011905 accuracy =  0.9160896130346232 time taken =  8.934327125549316


In [14]:
results_rakel

Unnamed: 0,Base_classifier,Hamming_loss,Accuracy,Time_taken,MLC
0,LogisticRegression,0.02079,0.915463,30.273918,Ensemble
1,MultinomialNB,0.023714,0.910418,3.645775,Ensemble
2,LinearSVC,0.021072,0.91609,8.934327,Ensemble


In [19]:
pd.concat([results,results_adaptalgo,results_rakel])

Unnamed: 0,Base_classifier,Hamming_loss,Accuracy,Time_taken,MLC
0,LogisticRegression,0.021145,0.916027,38.135905,LP
1,MultinomialNB,0.024842,0.910669,1.894564,LP
2,LinearSVC,0.021515,0.915745,10.236385,LP
0,LogisticRegression,0.020283,0.91681,23.507509,BR
1,MultinomialNB,0.022581,0.912956,3.363296,BR
2,LinearSVC,0.020299,0.916904,236.770409,BR
0,LogisticRegression,0.020372,0.917249,33.258007,CC
1,MultinomialNB,0.031845,0.898856,3.495811,CC
2,LinearSVC,0.020638,0.916748,12.775354,CC
0,,0.040268,0.851919,811.156966,MLkNN
