### Resources:
* https://github.com/Jcharis/Python-Machine-Learning/blob/master/Multi_Label_Text_Classification_with_Skmultilearn/Multi-Label%20Classification%20with%20Python%20and%20Scikit-Multilearn-.ipynb
* https://www.youtube.com/watch?v=YyOuDi-zSiI


### Dataset: 
* https://drive.google.com/drive/folders/19Jc42hPSd45PjaYl34iG1RFNFxVyW8zm
* https://www.kaggle.com/c/jigsaw-toxic-comment-classification-challenge

In [1]:
!pip install scikit-multilearn





In [2]:
import numpy as np
import pandas as pd
import 

from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB,MultinomialNB
from sklearn.metrics import accuracy_score,classification_report,

### Split Dataset into Train and Text
from sklearn.model_selection import train_test_split
# Feature engineering
from sklearn.feature_extraction.text import TfidfVectorizer

# Multi Label Pkgs
from skmultilearn.problem_transform import 
from skmultilearn.problem_transform import 
from skmultilearn.problem_transform import 
from skmultilearn.adapt import MLkNN

In [3]:
df = pd.read_csv('toxic_comment_classification.csv', index_col=0)  
df.head()

Unnamed: 0_level_0,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0
0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0


In [4]:
df.shape

(159571, 7)

In [5]:
df = df.head(5000)

### Creation of New Columns Needed for the Model

In [6]:
df['labels'] = list(zip(df.toxic.to_list(), df.severe_toxic.to_list(), 
                        df.obscene.to_list(),  df.threat.to_list(),  
                        df.insult.to_list(),  df.identity_hate.to_list()))

In [7]:
df.head()

Unnamed: 0_level_0,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate,labels
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0,"(0, 0, 0, 0, 0, 0)"
000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0,"(0, 0, 0, 0, 0, 0)"
000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0,"(0, 0, 0, 0, 0, 0)"
0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0,"(0, 0, 0, 0, 0, 0)"
0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0,"(0, 0, 0, 0, 0, 0)"


In [8]:
with_labels = df[df['labels'] != (0, 0, 0, 0, 0, 0)]
with_labels.shape

(538, 8)

In [9]:
non_labels = df[df['labels'] == (0, 0, 0, 0, 0, 0)]
n = 35
non_labels = non_labels.head(int(len(non_labels)*(n/100)))
non_labels

Unnamed: 0_level_0,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate,labels
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0,"(0, 0, 0, 0, 0, 0)"
000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0,"(0, 0, 0, 0, 0, 0)"
000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0,"(0, 0, 0, 0, 0, 0)"
0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0,"(0, 0, 0, 0, 0, 0)"
0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0,"(0, 0, 0, 0, 0, 0)"
...,...,...,...,...,...,...,...,...
04b92907a0db6e77,Evan Blass (Update) \n\nI have made the follow...,0,0,0,0,0,0,"(0, 0, 0, 0, 0, 0)"
04baeea8d61cac54,Awful bio \nObviously a hatchet job from a pre...,0,0,0,0,0,0,"(0, 0, 0, 0, 0, 0)"
04bb978332bbe669,"Yes, of course. I totally forgot we have tho...",0,0,0,0,0,0,"(0, 0, 0, 0, 0, 0)"
04bbc7a4ebe7919a,"""\n\n About Einstein \n\nMy search of any docu...",0,0,0,0,0,0,"(0, 0, 0, 0, 0, 0)"


In [10]:
current_dataset = pd.concat([with_labels, non_labels])
current_dataset.head()

Unnamed: 0_level_0,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate,labels
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0002bcb3da6cb337,COCKSUCKER BEFORE YOU PISS AROUND ON MY WORK,1,1,1,0,1,0,"(1, 1, 1, 0, 1, 0)"
0005c987bdfc9d4b,Hey... what is it..\n@ | talk .\nWhat is it......,1,0,0,0,0,0,"(1, 0, 0, 0, 0, 0)"
0007e25b2121310b,"Bye! \n\nDon't look, come or think of comming ...",1,0,0,0,0,0,"(1, 0, 0, 0, 0, 0)"
001810bf8c45bf5f,You are gay or antisemmitian? \n\nArchangel WH...,1,0,1,0,1,1,"(1, 0, 1, 0, 1, 1)"
00190820581d90ce,"FUCK YOUR FILTHY MOTHER IN THE ASS, DRY!",1,0,1,0,1,0,"(1, 0, 1, 0, 1, 0)"


### Clean Text

In [11]:
import nltk
from nltk.corpus import stopwords
import re
import sys
import warnings

if not sys.warnoptions:
    warnings.simplefilter("ignore")
    
def cleanHtml(sentence):
    cleanr = re.compile('<.*?>')
    cleantext = re.sub(cleanr, ' ', str(sentence))
    return cleantext

def cleanPunc(sentence): #function to clean the word of any punctuation or special characters
    cleaned = re.sub(r'[?|!|\'|"|#]',r'',sentence)
    cleaned = re.sub(r'[.|,|)|(|\|/]',r' ',cleaned)
    cleaned = cleaned.strip()
    cleaned = cleaned.replace("\n"," ")
    return cleaned

def keepAlpha(sentence):
    alpha_sent = ""
    for word in sentence.split():
        alpha_word = re.sub('[^a-z A-Z]+', ' ', word)
        alpha_sent += alpha_word
        alpha_sent += " "
    alpha_sent = alpha_sent.strip()
    return alpha_sent

current_dataset['text'] = current_dataset['comment_text'].apply(cleanHtml)
current_dataset['text'] = current_dataset['text'].apply(cleanPunc)
current_dataset['text'] = current_dataset['text'].apply(keepAlpha)

In [12]:
stop_words = set(stopwords.words('english'))
stop_words.update(['zero','one','two','three','four','five','six','seven','eight','nine','ten','may','also','across','among','beside','however','yet','within'])
re_stop_words = re.compile(r"\b(" + "|".join(stop_words) + ")\\W", re.I)

def removeStopWords(sentence):
    global re_stop_words
    return re_stop_words.sub(" ", sentence)

current_dataset['text'] = current_dataset['text'].apply(removeStopWords)

### Lemmatization

In [13]:
import nltk

lemmatizer=nltk.stem.()
def process_lemma(sentence):
    lemmaSentence = ""
    for word in sentence.split():
        lemma = lemmatizer.lemmatize(word)
        lemmaSentence += lemma
        lemmaSentence += " "
    lemmaSentence = lemmaSentence.strip()
    return lemmaSentence

current_dataset['text'] = current_dataset['text'].apply(process_lemma)

In [14]:
current_dataset = current_dataset.reset_index()

In [15]:
current_dataset

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate,labels,text
0,0002bcb3da6cb337,COCKSUCKER BEFORE YOU PISS AROUND ON MY WORK,1,1,1,0,1,0,"(1, 1, 1, 0, 1, 0)",COCKSUCKER PISS AROUND WORK
1,0005c987bdfc9d4b,Hey... what is it..\n@ | talk .\nWhat is it......,1,0,0,0,0,0,"(1, 0, 0, 0, 0, 0)",Hey talk exclusive group WP TALIBANS good dest...
2,0007e25b2121310b,"Bye! \n\nDon't look, come or think of comming ...",1,0,0,0,0,0,"(1, 0, 0, 0, 0, 0)",Bye Dont look come think comming back Tosser
3,001810bf8c45bf5f,You are gay or antisemmitian? \n\nArchangel WH...,1,0,1,0,1,1,"(1, 0, 1, 0, 1, 1)",gay antisemmitian Archangel WHite Tiger Meow G...
4,00190820581d90ce,"FUCK YOUR FILTHY MOTHER IN THE ASS, DRY!",1,0,1,0,1,0,"(1, 0, 1, 0, 1, 0)",FUCK FILTHY MOTHER ASS DRY
...,...,...,...,...,...,...,...,...,...,...
2094,04b92907a0db6e77,Evan Blass (Update) \n\nI have made the follow...,0,0,0,0,0,0,"(0, 0, 0, 0, 0, 0)",Evan Blass Update made following comment Talk ...
2095,04baeea8d61cac54,Awful bio \nObviously a hatchet job from a pre...,0,0,0,0,0,0,"(0, 0, 0, 0, 0, 0)",Awful bio Obviously hatchet job press release ...
2096,04bb978332bbe669,"Yes, of course. I totally forgot we have tho...",0,0,0,0,0,0,"(0, 0, 0, 0, 0, 0)",Yes course totally forgot guideline Thanks Per...
2097,04bbc7a4ebe7919a,"""\n\n About Einstein \n\nMy search of any docu...",0,0,0,0,0,0,"(0, 0, 0, 0, 0, 0)",Einstein search document might bear Einsteins ...


In [16]:
current_dataset[['text', 'labels', 'toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']]

Unnamed: 0,text,labels,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,COCKSUCKER PISS AROUND WORK,"(1, 1, 1, 0, 1, 0)",1,1,1,0,1,0
1,Hey talk exclusive group WP TALIBANS good dest...,"(1, 0, 0, 0, 0, 0)",1,0,0,0,0,0
2,Bye Dont look come think comming back Tosser,"(1, 0, 0, 0, 0, 0)",1,0,0,0,0,0
3,gay antisemmitian Archangel WHite Tiger Meow G...,"(1, 0, 1, 0, 1, 1)",1,0,1,0,1,1
4,FUCK FILTHY MOTHER ASS DRY,"(1, 0, 1, 0, 1, 0)",1,0,1,0,1,0
...,...,...,...,...,...,...,...,...
2094,Evan Blass Update made following comment Talk ...,"(0, 0, 0, 0, 0, 0)",0,0,0,0,0,0
2095,Awful bio Obviously hatchet job press release ...,"(0, 0, 0, 0, 0, 0)",0,0,0,0,0,0
2096,Yes course totally forgot guideline Thanks Per...,"(0, 0, 0, 0, 0, 0)",0,0,0,0,0,0
2097,Einstein search document might bear Einsteins ...,"(0, 0, 0, 0, 0, 0)",0,0,0,0,0,0


### Get TFIDF

In [17]:
tfidf = ()

In [18]:
Xfeatures = tfidf.(current_dataset['text']).toarray()

In [19]:
Xfeatures

array([[0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       ...,
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.03895882, 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ]])

In [20]:
y = current_dataset[['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']]

In [21]:
y

Unnamed: 0,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,1,1,1,0,1,0
1,1,0,0,0,0,0
2,1,0,0,0,0,0
3,1,0,1,0,1,1
4,1,0,1,0,1,0
...,...,...,...,...,...,...
2094,0,0,0,0,0,0
2095,0,0,0,0,0,0
2096,0,0,0,0,0,0
2097,0,0,0,0,0,0


### Train, Test Split

In [22]:
from sklearn.model_selection import train_test_split

# Split Data 
X_train,X_test,y_train,y_test = train_test_split(Xfeatures,y,test_size=0.2, random_state=42)

In [23]:
import skmultilearn

In [24]:
categories = ['toxic', 'severe_toxic', 
              'obscene', 'threat', 
              'insult', 'identity_hate']

### Binary Relevance

In [25]:
binary_rel_clf = (MultinomialNB())

In [26]:
binary_rel_clf.(X_train,y_train)

In [27]:
br_prediction = binary_rel_clf.predict(X_test)

In [28]:
accuracy_score(y_test,br_prediction)

0.7285714285714285

In [29]:
(y_test,br_prediction)

0.09047619047619047

Hamming Loss: https://scikit-learn.org/stable/modules/generated/sklearn.metrics.hamming_loss.html
the fraction of labels that are incorrectly predicted.

In [30]:
from sklearn.metrics import 
y_pred = [1, 2, 3, 4]
y_true = [2, 2, 3, 4]
hamming_loss(y_true, y_pred)

0.25

### Classifier Chains

In [31]:
def build_model(model, mlb_estimator,xtrain,ytrain,xtest,ytest):
    # Create an Instance
    clf = mlb_estimator(model)
    clf.fit(xtrain,ytrain)
    # Predict
    clf_predictions = clf.predict(xtest)
    # Check For Accuracy
    acc = accuracy_score(ytest,clf_predictions)
    ham = hamming_loss(ytest,clf_predictions)
    result = {"accuracy:":acc,"hamming_score":ham}
    return result

In [32]:
clf_chain_model = build_model(,,X_train,y_train,X_test,y_test)

In [33]:
clf_chain_model

{'accuracy:': 0.7380952380952381, 'hamming_score': 0.08611111111111111}

### Label Powerset

In [34]:
clf_labelP_model = build_model(,,X_train,y_train,X_test,y_test)

In [35]:
clf_labelP_model

{'accuracy:': 0.7261904761904762, 'hamming_score': 0.10119047619047619}