# Réseau de neurones BPMLL pour classification multi label 

In [2]:
import pandas as pd
import numpy as np
import random
import nltk
from nltk.corpus import wordnet, stopwords
import re
import string
from nltk import word_tokenize
import torch as th
import torch.nn as nn
from sklearn.metrics import precision_score, recall_score, f1_score, confusion_matrix, accuracy_score
import torch.nn.functional as F
from sklearn.metrics import hamming_loss
from sklearn.linear_model import Ridge
from skmultilearn.adapt import MLkNN

## Data 

In [4]:
data_train = pd.read_csv("train.csv")
data_test = pd.read_csv("test.csv")
data_labels = pd.read_csv("test_labels.csv")

In [5]:
data_train

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0
...,...,...,...,...,...,...,...,...
159566,ffe987279560d7ff,""":::::And for the second time of asking, when ...",0,0,0,0,0,0
159567,ffea4adeee384e90,You should be ashamed of yourself \n\nThat is ...,0,0,0,0,0,0
159568,ffee36eab5c267c9,"Spitzer \n\nUmm, theres no actual article for ...",0,0,0,0,0,0
159569,fff125370e4aaaf3,And it looks like it was actually you who put ...,0,0,0,0,0,0


In [6]:
data_train['toxic_general'] = np.where((data_train['toxic'] == 0) & (data_train['severe_toxic'] == 0) &
                              (data_train['obscene'] == 0) & (data_train['threat'] == 0) &
                              (data_train['insult'] == 0) & (data_train['identity_hate'] == 0), 0, 1)

# Data preprocessing : data augmentation

In [8]:
Y_train = data_train.iloc[:,2:8].to_numpy()
np.sum(Y_train, axis=0)/len(Y_train) #very imbalanced classes 

array([0.09584448, 0.00999555, 0.05294822, 0.00299553, 0.04936361,
       0.00880486])

In [9]:
data_train['toxic_general'].value_counts()

0    143346
1     16225
Name: toxic_general, dtype: int64

The distribution of the classes is very imbalanced. The minority classes are "severe_toxic", "threat" and "identity_hate"

In [94]:
data_minor_class = data_train[(data_train['severe_toxic']==1) | (data_train['threat']==1) | (data_train['identity_hate']==1)]

In [12]:
data_minor_class 

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate,toxic_general
6,0002bcb3da6cb337,COCKSUCKER BEFORE YOU PISS AROUND ON MY WORK,1,1,1,0,1,0,1
42,001810bf8c45bf5f,You are gay or antisemmitian? \n\nArchangel WH...,1,0,1,0,1,1,1
55,0020e7119b96eeeb,Stupid peace of shit stop deleting my stuff as...,1,1,1,0,1,0,1
79,003217c3eb469ba9,Hi! I am back again!\nLast warning!\nStop undo...,1,0,0,1,0,0,1
105,00472b8e2d38d1ea,A pair of jew-hating weiner nazi schmucks.,1,0,1,0,1,1,1
...,...,...,...,...,...,...,...,...,...
159312,fbf20e312cd4a78d,"Walter Mercado \n\nAntonio, quite frankly, you...",1,1,1,0,1,0,1
159336,fc3efa2f6f025f6d,"Oh, fuck off. The pansy Jew would just whine a...",1,0,1,0,1,1,1
159400,fd052883fa6a8697,"Shalom \n\nSemite, get the fuck out of here. I...",1,1,1,1,1,1,1
159449,fdce660ddcd6d7ca,I think he is a gay fag!!!,1,0,0,0,0,1,1


In [13]:
data_unique = data_minor_class.copy()
data_random = data_minor_class.copy()
data_synonym = data_minor_class.copy()

In [14]:
def unique(phrase):
    d = {}
    s = 0
    for mot in phrase.split(" "):
        if mot in d :
            s = ""
            s += phrase
        else : 
            d[mot]=len(d)
    return s

def remove_dup(phrase):
    strg=""
    d = set()
    for mot in phrase.split(" "):
        if mot not in d :
            strg= strg + " " + mot
            d.add(mot)
    return strg

def remove_random(phrase):
    L = phrase.split(" ")
    rand = random.sample(L,int(20/100*len(L)))
    for el in rand :
        L.remove(el)
    return L

def list_to_str(L):
    phrase = str()
    for el in L:
        phrase+= " " + el
    return phrase

def synonyme(phrase):
    S = str()
    for word in phrase.split(" "):
        synonyms = [] 
        for syn in wordnet.synsets(word):
            for l in syn.lemmas():
                synonyms.append(l.name())
        if set(synonyms) != set():
            S=S+" "+(random.sample(set(synonyms),1))[0]
    return S

### Unique words augmentation

In [15]:
data_unique['comment_text'] = data_unique['comment_text'].apply(unique)
data_unique = data_unique[data_unique["comment_text"]!=0]
data_unique["comment_text"].apply(remove_dup)
data_unique

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate,toxic_general
42,001810bf8c45bf5f,You are gay or antisemmitian? \n\nArchangel WH...,1,0,1,0,1,1,1
55,0020e7119b96eeeb,Stupid peace of shit stop deleting my stuff as...,1,1,1,0,1,0,1
176,006b94add72ed61c,I think that your a Fagget get a oife and burn...,1,0,1,1,1,1,1
218,008e0818dde894fb,"Kill all niggers. \n\nI have hard, that others...",1,0,1,0,1,1,1
238,0097dd5c29bf7a15,u r a tw@ fuck off u gay boy.U r smelly.Fuck u...,1,0,1,0,1,1,1
...,...,...,...,...,...,...,...,...,...
159041,f79a44cb4a4d17a8,"""\nARE YOU FUCKING RETARDED?\n\nPlease do not ...",1,0,1,0,1,1,1
159096,f871b05d4caa6f20,"You filthy, stinking crow! \n\nI am back! Dirt...",1,1,1,0,1,0,1
159239,fab1d401d504cf68,whats up gook \n\nWhats up you asian gook chin...,1,0,0,0,0,1,1
159400,fd052883fa6a8697,"Shalom \n\nSemite, get the fuck out of here. I...",1,1,1,1,1,1,1


### Random mask 

In [16]:
#Random Mask 

data_random['comment_text'] = data_random['comment_text'].apply(remove_random).apply(list_to_str)
data_random

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate,toxic_general
6,0002bcb3da6cb337,COCKSUCKER BEFORE YOU PISS AROUND ON WORK,1,1,1,0,1,0,1
42,001810bf8c45bf5f,are or antisemmitian? \n\nArchangel WHite the...,1,0,1,0,1,1,1
55,0020e7119b96eeeb,Stupid of shit stop my stuff asshole go die a...,1,1,1,0,1,0,1
79,003217c3eb469ba9,I am back again!\nLast warning!\nStop my edit...,1,0,0,1,0,0,1
105,00472b8e2d38d1ea,pair of jew-hating weiner nazi schmucks.,1,0,1,0,1,1,1
...,...,...,...,...,...,...,...,...,...
159312,fbf20e312cd4a78d,"Mercado quite frankly, you're a fucker for Me...",1,1,1,0,1,0,1
159336,fc3efa2f6f025f6d,"Oh, fuck off. The Jew would just whine about ...",1,0,1,0,1,1,1
159400,fd052883fa6a8697,"\n\nSemite, the fuck out here. kill son of bi...",1,1,1,1,1,1,1
159449,fdce660ddcd6d7ca,I think is a gay fag!!!,1,0,0,0,0,1,1


### Synonyms Replacement

In [17]:
data_synonym['comment_text'] = data_synonym['comment_text'].apply(synonyme)
data_synonym

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate,toxic_general
6,0002bcb3da6cb337,asshole in_front piss close_to on study,1,1,1,0,1,0,1
42,001810bf8c45bf5f,equal festive operating_theatre albumen there...,1,0,1,0,1,1,1
55,0020e7119b96eeeb,stupe pacification dirt period edit glut arse...,1,1,1,0,1,0,1
79,003217c3eb469ba9,ace embody punt untying edit_out Oregon,1,0,0,1,0,0,1
105,00472b8e2d38d1ea,ampere copulate nazi,1,0,1,0,1,1,1
...,...,...,...,...,...,...,...,...,...
159312,fbf20e312cd4a78d,Walter rather group_A fucker compare sexualit...,1,1,1,0,1,0,1
159336,fc3efa2f6f025f6d,have_it_away pansy Israelite but yammer almos...,1,0,1,0,1,1,1
159400,fd052883fa6a8697,nonplus nooky tabu single testament toss_off ...,1,1,1,1,1,1,1
159449,fdce660ddcd6d7ca,iodine believe atomic_number_2 comprise adeni...,1,0,0,0,0,1,1


In [18]:
data_augm = pd.concat([data_train,data_unique,data_random,data_synonym]).sample(frac=1).reset_index(drop=True)
data_augm

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate,toxic_general
0,707bc41cf4e12f6a,"Busy in real life?! LOL, pathetic cunt. Learn ...",1,0,1,0,1,1,1
1,2faddd32d4c28c11,"""Also the same citation is used for another se...",0,0,0,0,0,0,0
2,23e8718ee04a896d,"Okay, that's enough. Three tries at this one ...",0,0,0,0,0,0,0
3,440beaa1f848200c,"These are your views, but are they the consensus?",0,0,0,0,0,0,0
4,af4f6699bb5e7fac,"""\nI would have to look at the 2004 DVD for th...",0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...
167412,b09788abfea10662,"Hi Tom,\n\nSorry to see you are blocked again....",0,0,0,0,0,0,0
167413,f6678813cf615dd2,3-D Bonding Structure \n\nThis image should be...,0,0,0,0,0,0,0
167414,d95429961f576658,"""\n\nCan someone explain to me why the club's ...",0,0,0,0,0,0,0
167415,294e2e2095e50b7a,"]], Template:Moogfest 2010, Template:Moogfest ...",0,0,0,0,0,0,0


In [19]:
Y_train = data_augm.iloc[:,2:8].to_numpy()
(np.sum(Y_train, axis=0)/len(Y_train))*100 #les classes sont plus équilibrées

array([13.60136665,  3.44947048,  8.80854394,  1.06440804,  8.44896277,
        3.04389638])

In [20]:
data_augm['toxic_general'].value_counts()

0    143346
1     24071
Name: toxic_general, dtype: int64

## Data Preprocessing : Cleaning

In [22]:
def remove_punct(text):
    text_nopunct = ''
    text_nopunct = re.sub('['+string.punctuation+']', '', text)
    return text_nopunct

def lower_token(tokens): 
    return [w.lower() for w in tokens]    

def removeStopWords(tokens): 
    return [word for word in tokens if word not in stoplist and len(word)>2]

#remove punctuation 
data_augm['Text_Clean'] = data_augm['comment_text'].apply(lambda x: remove_punct(x))

#Tokenization
tokens = [word_tokenize(sen) for sen in data_augm.Text_Clean]

#Lower case
lower_tokens = [lower_token(token) for token in tokens]

#remove stop words
stoplist = stopwords.words('english')
filtered_words = [removeStopWords(sen) for sen in lower_tokens]

data_augm['Text_Final'] = [' '.join(sen) for sen in filtered_words]
data_augm['tokens'] = filtered_words

In [23]:
data_augm

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate,toxic_general,Text_Clean,Text_Final,tokens
0,707bc41cf4e12f6a,"Busy in real life?! LOL, pathetic cunt. Learn ...",1,0,1,0,1,1,1,Busy in real life LOL pathetic cunt Learn to r...,busy real life lol pathetic cunt learn read en...,"[busy, real, life, lol, pathetic, cunt, learn,..."
1,2faddd32d4c28c11,"""Also the same citation is used for another se...",0,0,0,0,0,0,0,Also the same citation is used for another sen...,also citation used another sentence also life ...,"[also, citation, used, another, sentence, also..."
2,23e8718ee04a896d,"Okay, that's enough. Three tries at this one ...",0,0,0,0,0,0,0,Okay thats enough Three tries at this one of ...,okay thats enough three tries one came explain...,"[okay, thats, enough, three, tries, one, came,..."
3,440beaa1f848200c,"These are your views, but are they the consensus?",0,0,0,0,0,0,0,These are your views but are they the consensus,views consensus,"[views, consensus]"
4,af4f6699bb5e7fac,"""\nI would have to look at the 2004 DVD for th...",0,0,0,0,0,0,0,\nI would have to look at the 2004 DVD for tha...,would look 2004 dvd since vhs original broadca...,"[would, look, 2004, dvd, since, vhs, original,..."
...,...,...,...,...,...,...,...,...,...,...,...,...
167412,b09788abfea10662,"Hi Tom,\n\nSorry to see you are blocked again....",0,0,0,0,0,0,0,Hi Tom\n\nSorry to see you are blocked again ...,tom sorry see blocked ill bring issue administ...,"[tom, sorry, see, blocked, ill, bring, issue, ..."
167413,f6678813cf615dd2,3-D Bonding Structure \n\nThis image should be...,0,0,0,0,0,0,0,3D Bonding Structure \n\nThis image should be ...,bonding structure image added information box ...,"[bonding, structure, image, added, information..."
167414,d95429961f576658,"""\n\nCan someone explain to me why the club's ...",0,0,0,0,0,0,0,\n\nCan someone explain to me why the clubs we...,someone explain clubs website biased source tr...,"[someone, explain, clubs, website, biased, sou..."
167415,294e2e2095e50b7a,"]], Template:Moogfest 2010, Template:Moogfest ...",0,0,0,0,0,0,0,TemplateMoogfest 2010 TemplateMoogfest 2011 T...,templatemoogfest 2010 templatemoogfest 2011 te...,"[templatemoogfest, 2010, templatemoogfest, 201..."


## Creating the vocabulary

In [24]:
#occurence de chaque mot 
occ = {} 
for liste in data_augm.tokens:
    for w in liste:
        if w in occ:
            occ[w]+=1
        else :
            occ[w]=1

In [25]:
occ

{'creating': 1491,
 'maintaining': 149,
 'articlescreating': 1,
 'new': 10584,
 'article': 55628,
 'another': 6716,
 'explanation': 1762,
 'error': 1203,
 'open': 1687,
 'envelopes': 5,
 'doesnt': 6725,
 'seem': 3683,
 'unreasonable': 183,
 'assume': 1395,
 'said': 7727,
 'contain': 630,
 'without': 7182,
 'distinguishing': 41,
 'envelope': 23,
 'contains': 626,
 'amount': 912,
 'values': 246,
 'equally': 402,
 'likely': 1741,
 'better': 6175,
 'information': 12310,
 'however': 6420,
 'follow': 2272,
 'assumption': 273,
 'given': 3826,
 'found': 4437,
 'first': 10860,
 '12a': 4,
 'every': 4365,
 'conceivable': 19,
 'value': 875,
 'something': 7508,
 'implied': 162,
 'subsequent': 265,
 'calculation': 58,
 'expected': 422,
 'specific': 2335,
 'money': 1054,
 'fill': 569,
 'always': 3615,
 'find': 9093,
 'second': 2955,
 'whereas': 393,
 'possible': 2986,
 'half': 1035,
 'time': 15637,
 'see': 21697,
 'like': 29031,
 'say': 10260,
 '23m': 3,
 '13m': 3,
 'often': 1856,
 'chance': 1092,
 '

We create our dictionary : we delete the most occurent words. 

In [26]:
dic = {}
for key in occ : 
    if occ[key] > 10 :
        dic[key] = len(dic)

In [27]:
len(dic),dic 

(25728,
 {'creating': 0,
  'maintaining': 1,
  'new': 2,
  'article': 3,
  'another': 4,
  'explanation': 5,
  'error': 6,
  'open': 7,
  'doesnt': 8,
  'seem': 9,
  'unreasonable': 10,
  'assume': 11,
  'said': 12,
  'contain': 13,
  'without': 14,
  'distinguishing': 15,
  'envelope': 16,
  'contains': 17,
  'amount': 18,
  'values': 19,
  'equally': 20,
  'likely': 21,
  'better': 22,
  'information': 23,
  'however': 24,
  'follow': 25,
  'assumption': 26,
  'given': 27,
  'found': 28,
  'first': 29,
  'every': 30,
  'conceivable': 31,
  'value': 32,
  'something': 33,
  'implied': 34,
  'subsequent': 35,
  'calculation': 36,
  'expected': 37,
  'specific': 38,
  'money': 39,
  'fill': 40,
  'always': 41,
  'find': 42,
  'second': 43,
  'whereas': 44,
  'possible': 45,
  'half': 46,
  'time': 47,
  'see': 48,
  'like': 49,
  'say': 50,
  'often': 51,
  'chance': 52,
  'finding': 53,
  'equal': 54,
  'assumed': 55,
  'even': 56,
  'though': 57,
  'still': 58,
  'herd': 59,
  'behavi

# Modeling : BPMLL 

We ommit the comments that belong to every class or to zero class.

In [24]:
data_augm['all']=np.where((data_augm['toxic']==1)&(data_augm['severe_toxic']==1)&(data_augm['obscene']==1)&(data_augm['threat']==1)&(data_augm['insult']==1)&(data_augm['identity_hate']==1),1,0)
data_augm2 = data_augm[(data_augm['all']==0) & (data_augm['toxic_general']==1)]
data_augm2

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate,toxic_general,Text_Clean,Text_Final,tokens,all
0,707bc41cf4e12f6a,"Busy in real life?! LOL, pathetic cunt. Learn ...",1,0,1,0,1,1,1,Busy in real life LOL pathetic cunt Learn to r...,busy real life lol pathetic cunt learn read en...,"[busy, real, life, lol, pathetic, cunt, learn,...",0
24,c3cea5cbd987b22a,Wuhwuzdat a bitch wit no life. Yeah u heard u...,1,1,1,0,1,0,1,Wuhwuzdat a bitch wit no life Yeah u heard u ...,wuhwuzdat bitch wit life yeah heard bitch fuck...,"[wuhwuzdat, bitch, wit, life, yeah, heard, bit...",0
48,91c19244efc9d251,Damn the killer Armenians... \n\nI damn all th...,1,0,1,0,1,1,1,Damn the killer Armenians \n\nI damn all the k...,damn killer armenians damn killer armenians ev...,"[damn, killer, armenians, damn, killer, armeni...",0
63,746eebc57015043c,Suck my dick and swallow,1,1,1,0,1,0,1,Suck my dick and swallow,suck dick swallow,"[suck, dick, swallow]",0
75,ace4e064e619d873,"Hello, you Indian cannibal and child molester ...",1,0,0,0,1,1,1,Hello you Indian cannibal and child molester \...,hello indian cannibal child molester vandal do...,"[hello, indian, cannibal, child, molester, van...",0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
167366,b8ffe81d3d7c1865,You are the biggest and dumbest idiot I have e...,1,0,1,0,1,0,1,You are the biggest and dumbest idiot I have e...,biggest dumbest idiot ever met online second r...,"[biggest, dumbest, idiot, ever, met, online, s...",0
167384,844f2753c02f6720,WHY THE FUCK DID YOU DELETE MY PAGE YOU FUCKIN...,1,0,1,0,1,0,1,WHY THE FUCK DID YOU DELETE MY PAGE YOU FUCKIN...,fuck delete page fucking stuck snobbish paedop...,"[fuck, delete, page, fucking, stuck, snobbish,...",0
167386,6137384a25134aca,"""\n\n Good Luck \n\nwikipedia sucks\n71.186.17...",1,0,1,0,0,0,1,\n\n Good Luck \n\nwikipedia sucks\n7118617019...,good luck wikipedia sucks 71186170196 oooh loo...,"[good, luck, wikipedia, sucks, 71186170196, oo...",0
167395,e5c518c821550c86,lasers. What an asshat,1,0,1,0,0,0,1,lasers What an asshat,lasers asshat,"[lasers, asshat]",0


In [31]:
txtidx2 = []
for el in data_augm2.tokens:
    isent = []
    for w in el:
        if w in dic:
            widx=dic[w]
        isent.append(widx)
    txtidx2.append(th.LongTensor(list(set(isent))))

In [37]:
txtidx2[2]

tensor([682, 683, 684, 685])

### BP MLL loss

In [47]:
 
def pairwise_sub(first_tensor: th.Tensor, second_tensor: th.Tensor) -> th.Tensor:
    """
    Computes pairwise difference between elements of two tensors
    :param first_tensor: the first tensor
    :param second_tensor: the second tensor
    :return: pairwise difference between the two tensors
    """

    column = first_tensor.unsqueeze(2)
    row = second_tensor.unsqueeze(1)
    return column-row

def pairwise_and(first_tensor: th.Tensor, second_tensor: th.Tensor) -> th.Tensor:
    """
    Computes pairwise logical and between elements of two tensors
    :param first_tensor: the first tensor
    :param second_tensor: the second tensor
    :return: pairwise logical and between the two tensors
    """

    column = first_tensor.unsqueeze(2)
    row = second_tensor.unsqueeze(1)
    return column & row


def bp_mll_loss(y_true : th.tensor, y_pred : th.tensor):
    shape = y_true.shape
    y_i = (y_true==th.ones(shape))
    y_i_bar = (y_true!=th.ones(shape))
    truth_matrix = pairwise_and(y_i, y_i_bar).double()
    
    sub_matrix = pairwise_sub(y_pred, y_pred)
    exp_matrix = th.exp(-(sub_matrix))
    
    sparse_matrix = th.mul(exp_matrix, truth_matrix)
    sums = th.sum(sparse_matrix, axis=[1,2])
    
    y_i_sizes = th.sum(y_i.double(), axis=1)
    y_i_bar_sizes = th.sum(y_i_bar.double(), axis=1)
    normalizers = th.mul(y_i_sizes, y_i_bar_sizes)
    results = sums/normalizers
    
    return th.mean(results)


In [39]:
class BPMLL_classifier(nn.Module):

    def __init__(self, vocab_size, embedding_dim):
        super(BPMLL_classifier, self).__init__()
        self.embeddings = th.nn.Embedding(num_embeddings=vocab_size,embedding_dim = embedding_dim)
        self.lin1 = th.nn.Linear(embedding_dim,64)
        self.lin2 = th.nn.Linear(64,64)
        self.lin3 = th.nn.Linear(64,6)

    def forward(self, inputs):
        out = self.embeddings(inputs)
        out = out.sum(dim=0)
        out = F.tanh(self.lin1(out))
        out = F.tanh(self.lin2(out))
        out = F.sigmoid(self.lin3(out))
        return out

In [40]:
bpmll_classifier = BPMLL_classifier(vocab_size=len(dic),embedding_dim=10)
loss_mll = bp_mll_loss
optimizer = th.optim.SGD(bpmll_classifier.parameters(), lr=0.01)

In [41]:
Y_train = data_augm2.iloc[:,2:8].to_numpy()
Y_train = th.from_numpy(Y_train)
Y_train

tensor([[1, 0, 0, 0, 0, 0],
        [1, 1, 1, 0, 0, 0],
        [1, 0, 1, 0, 1, 0],
        ...,
        [1, 0, 1, 0, 1, 1],
        [1, 0, 1, 0, 1, 1],
        [1, 0, 1, 0, 0, 0]])

### Training 

In [42]:
total = len(data_augm2)
preds = th.zeros(total)
randomidx = list(range(total))
for epoch in range(100):
    total_loss = th.Tensor([0])
    random.shuffle(randomidx)
    for i in randomidx :
        x = txtidx2[i]
        optimizer.zero_grad()
        probs = bpmll_classifier(x)
        loss = loss_mll(Y_train[i].unsqueeze(0),probs.unsqueeze(0))
        total_loss += loss
        loss.backward()
        optimizer.step()
    print(epoch,total_loss[0])



0 tensor(15024.2549, grad_fn=<SelectBackward>)
1 tensor(14546.0029, grad_fn=<SelectBackward>)
2 tensor(14518.9844, grad_fn=<SelectBackward>)
3 tensor(14492.9766, grad_fn=<SelectBackward>)
4 tensor(14469.1943, grad_fn=<SelectBackward>)
5 tensor(14440.8682, grad_fn=<SelectBackward>)
6 tensor(14413.6523, grad_fn=<SelectBackward>)
7 tensor(14379.4736, grad_fn=<SelectBackward>)
8 tensor(14343.6650, grad_fn=<SelectBackward>)
9 tensor(14313.8340, grad_fn=<SelectBackward>)
10 tensor(14283.2197, grad_fn=<SelectBackward>)
11 tensor(14251.2852, grad_fn=<SelectBackward>)
12 tensor(14215.7295, grad_fn=<SelectBackward>)
13 tensor(14184.0498, grad_fn=<SelectBackward>)
14 tensor(14157.5537, grad_fn=<SelectBackward>)
15 tensor(14119.0049, grad_fn=<SelectBackward>)
16 tensor(14092.0713, grad_fn=<SelectBackward>)
17 tensor(14064.7168, grad_fn=<SelectBackward>)
18 tensor(14039.3447, grad_fn=<SelectBackward>)
19 tensor(13997.3799, grad_fn=<SelectBackward>)
20 tensor(13972.5986, grad_fn=<SelectBackward>)
21

In [43]:
save_path = "/home/maxence/Documents/NLP_projet/classif_multilab.pt"
output = open(save_path, mode="wb")
th.save(bpmll_classifier, output)
output.close()

  "type " + obj.__name__ + ". It won't be checked "


### Learning the thresholds

In [45]:
def threshold_train(i, classifier, txtidx, labels):
    liste_threshold = list(np.arange(0, 1, 0.1))
    liste_hl = []
    for valeur in liste_threshold:
        y_pred = classifier(txtidx[i]) > valeur
        liste_hl.append(hamming_loss(labels[i], y_pred))
    return liste_threshold[np.argmin(liste_hl)]


def phi_thr(data, labels, classifier, txtidx):
    phi = np.zeros((data.shape[0], labels.shape[1]+1))
    thr = np.zeros((data.shape[0], 1))
    for i in range(data.shape[0]):
        phi[i] = np.concatenate([classifier(txtidx[i]).detach().numpy(), np.array(([1]))], axis=0)
        thr[i] = threshold_train(i, classifier, txtidx, labels)
    return phi, thr


def threshold_predict(data, labels, classifier, txtidx):
    phi, thr = phi_thr(data, labels, classifier, txtidx)
    clf = Ridge(alpha=0.5)
    clf.fit(phi, thr)
    return clf.predict(phi)


def performance_hl(data, labels, classifier, txtidx, threshold):
    ychap = np.zeros((data.shape[0], labels.shape[1]))
    hl_total = 0
    for k in range(data.shape[0]):
        ychap[k] = classifier(txtidx[k]) > th.from_numpy(threshold[k])
        hl_total += hamming_loss(labels[k], th.from_numpy(ychap[k]))
    return hl_total / data.shape[0]

## Evaluation of our model 

### On the training set

In [47]:
threshold = threshold_predict(data_augm2, Y_train, bpmll_classifier, txtidx2)
hl_train = performance_hl(data_augm2, Y_train, bpmll_classifier, txtidx2, threshold)
hl_train



0.14473995518627136

### On the testing set 

In [48]:
data_labels = pd.read_csv("test_labels.csv")
data_labels = data_labels[(data_labels['toxic']!=-1) & (data_labels['severe_toxic']!=-1) & (data_labels['obscene']!=-1)& (data_labels['threat']!=-1)&(data_labels['insult']!=-1)& (data_labels['identity_hate']!=-1)]
data_labels['toxic_general']=np.where((data_labels['toxic']==0)&(data_labels['severe_toxic']==0)&(data_labels['obscene']==0)&(data_labels['threat']==0)&(data_labels['insult']==0)&(data_labels['identity_hate']==0),0,1)

data_labels2 = data_labels[data_labels['toxic_general'] == 1]
data_Test2 = pd.merge(data_labels2, data_test)


#nettoyage données test

#remove punctuation 
data_Test2['Text_Clean'] = data_Test2['comment_text'].apply(lambda x: remove_punct(x))

#Tokenization
tokens = [word_tokenize(sen) for sen in data_Test2.Text_Clean]

#Lower case
lower_tokens = [lower_token(token) for token in tokens]

#remove stop words
stoplist = stopwords.words('english')
filtered_words = [removeStopWords(sen) for sen in lower_tokens]

data_Test2['Text_Final'] = [' '.join(sen) for sen in filtered_words]
data_Test2['tokens'] = filtered_words

data_Test2


Unnamed: 0,id,toxic,severe_toxic,obscene,threat,insult,identity_hate,toxic_general,comment_text,Text_Clean,Text_Final,tokens
0,00091c35fa9d0465,1,0,0,0,0,0,1,"== Arabs are committing genocide in Iraq, but ...",Arabs are committing genocide in Iraq but no ...,arabs committing genocide iraq protests europe...,"[arabs, committing, genocide, iraq, protests, ..."
1,0013fed3aeae76b7,1,0,1,0,1,1,1,DJ Robinson is gay as hell! he sucks his dick ...,DJ Robinson is gay as hell he sucks his dick s...,robinson gay hell sucks dick much,"[robinson, gay, hell, sucks, dick, much]"
2,0017d4d47894af05,1,0,1,0,1,0,1,":Fuck off, you anti-semitic cunt. |",Fuck off you antisemitic cunt,fuck antisemitic cunt,"[fuck, antisemitic, cunt]"
3,001d739c97bc2ae4,1,0,0,0,0,0,1,How dare you vandalize that page about the HMS...,How dare you vandalize that page about the HMS...,dare vandalize page hms beagle dont vandalize ...,"[dare, vandalize, page, hms, beagle, dont, van..."
4,001eff4007dbb65b,1,0,1,0,1,0,1,"::No, he is an arrogant, self serving, immatur...",No he is an arrogant self serving immature idi...,arrogant self serving immature idiot get right,"[arrogant, self, serving, immature, idiot, get..."
...,...,...,...,...,...,...,...,...,...,...,...,...
6238,ff91c3d8a3e34398,1,0,0,0,1,0,1,NIGEL IS A CRAZY IDIOT!!!,NIGEL IS A CRAZY IDIOT,nigel crazy idiot,"[nigel, crazy, idiot]"
6239,ffd49b8defd069d0,0,0,0,0,1,0,1,""" \n ::Well, now don't I feel stupid.... · """,\n Well now dont I feel stupid ·,well dont feel stupid,"[well, dont, feel, stupid]"
6240,ffdf6854b41d9102,1,0,0,0,0,0,1,==Fourth Baldrick possibly being cleverer than...,Fourth Baldrick possibly being cleverer than i...,fourth baldrick possibly cleverer made anyone ...,"[fourth, baldrick, possibly, cleverer, made, a..."
6241,ffebe90c8d5acaba,1,0,1,0,0,0,1,""" \n\n == IRAN == \n That’s right, Iran. It wa...",\n\n IRAN \n That’s right Iran It was our d...,iran right iran drone spreading homosexual wes...,"[iran, right, iran, drone, spreading, homosexu..."


In [49]:
txtidx_test2 = []
for el in data_Test2.tokens:
    isent = []
    for w in el:
        if w in dic:
            widx=dic[w]
        isent.append(widx)
    txtidx_test2.append(th.LongTensor(list(set(isent))))

y_test2 = data_Test2.iloc[:, 1:7].to_numpy()
y_test2.shape

(6243, 6)

In [50]:
threshold_test = threshold_predict(data_Test2, y_test2, bpmll_classifier, txtidx_test2)
hl_test = performance_hl(data_Test2, y_test2, bpmll_classifier, txtidx_test2, threshold_test)
hl_test



0.17419509851033074

In [52]:
y_pred2 = np.zeros((data_Test2.shape[0], y_test2.shape[1]))
for k in range(data_Test2.shape[0]):
    y_pred2[k] = bpmll_classifier(txtidx_test2[k]) > th.from_numpy(threshold_test[k])

y_pred2

array([[1., 0., 0., 0., 0., 0.],
       [1., 0., 1., 0., 0., 0.],
       [1., 0., 1., 0., 1., 0.],
       ...,
       [1., 0., 1., 0., 1., 0.],
       [1., 0., 1., 0., 1., 0.],
       [1., 0., 1., 0., 1., 1.]])

In [53]:
y_pred2, y_test2

(array([[1., 0., 0., 0., 0., 0.],
        [1., 0., 1., 0., 0., 0.],
        [1., 0., 1., 0., 1., 0.],
        ...,
        [1., 0., 1., 0., 1., 0.],
        [1., 0., 1., 0., 1., 0.],
        [1., 0., 1., 0., 1., 1.]]),
 array([[1, 0, 0, 0, 0, 0],
        [1, 0, 1, 0, 1, 1],
        [1, 0, 1, 0, 1, 0],
        ...,
        [1, 0, 0, 0, 0, 0],
        [1, 0, 1, 0, 0, 0],
        [1, 0, 1, 0, 1, 0]]))

In [54]:
f1_score(y_test2, y_pred2, average='weighted')

0.7724554622283952

In [55]:
recall_score(y_test2,y_pred2,average='weighted')

0.8350117257552766

In [56]:
precision_score(y_test2,y_pred2,average='weighted')

  _warn_prf(average, modifier, msg_start, len(result))


0.7295615284277791

In [57]:
accuracy_score(y_test2,y_pred2)

0.3131507288162742

# MLkNN

In [36]:
classifier_new = MLkNN(k=10)
classifier_new.fit(X_train, Y_train)
predictions_new = classifier_new.predict(X_test)

hamming_loss(Y_test, predictions_new)
f1_score(Y_test, predictions_new,average="weighted")
precision_score(Y_test,predictions_new,average="weighted")
recall_score(Y_test,predictions_new,average="weighted")
accuracy_score(Y_test,predictions_new)


MLkNN(ignore_first_neighbours=0, k=10, s=1.0)