In [0]:
import pandas as pd
import cython
import nltk
import string
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset
import numpy as np
from scipy.sparse import coo_matrix
from sklearn.feature_extraction.text import TfidfVectorizer

import matplotlib.pyplot as plt
%matplotlib inline

In [14]:
from google.colab import drive
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [0]:
# taille du dataset
size = 10000

# ne pas oublier de mettre le bon path
train = pd.read_csv('/content/gdrive/My Drive/IASD_NLP/data/train.csv').drop(columns='id').iloc[:size]
test_comments = pd.read_csv('/content/gdrive/My Drive/IASD_NLP/data/test.csv').drop(columns='id').iloc[:size]
test_labels = pd.read_csv('/content/gdrive/My Drive/IASD_NLP/data/test_labels.csv').drop(columns='id').iloc[:size]
test = pd.concat((test_comments, test_labels), axis=1)

In [16]:
# pour avoir une idée du dataset
train.head()

Unnamed: 0,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0
4,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0


## Preprocessing

In [0]:
# conversion des commentaires en liste
train_comments = train["comment_text"].values.tolist()
test_comments = test["comment_text"].values.tolist()

# récupération des labels en array
train_labels = train.drop(columns='comment_text').values
test_labels = test.drop(columns='comment_text').values

In [18]:
  # sélection des caractères à garder/supprimer
  printable = set(string.printable)
  excluded_caracters = string.punctuation.replace('!', '').replace('?', '').replace('*', '')
  excluded_caracters = excluded_caracters + '0123456789' + '—'
  print("keep only caracters found in : ", string.printable)
  print("to remove : ", excluded_caracters)

keep only caracters found in :  0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~ 	

to remove :  "#$%&'()+,-./:;<=>@[\]^_`{|}~0123456789—


In [0]:
# récupéreration des commentaires après filtrage des caractères 
train_txt = []
for comment in train_comments:
  comment = comment.replace('\n', ' ').replace('?', ' ?').replace('!', ' !')
  comment = ''.join(filter(lambda x: x in printable, comment)) #take out the non english caract
  train_txt.append(''.join([c for c in comment if c not in excluded_caracters]))
#train_txt

In [0]:
# la même chose pour le test 
test_txt = []
for comment in test_comments:
  comment = comment.replace('\n', ' ').replace('?', ' ?').replace('!', ' !')
  comment = ''.join(filter(lambda x: x in printable, comment)) #take out the non english caract
  test_txt.append(''.join([c for c in comment if c not in excluded_caracters]))
# test_txt

In [21]:
# tokenizer, vectorizer instantiation + fit + transform
vect = TfidfVectorizer(lowercase=True, stop_words={'english'}, max_features=2000)
Xtrain = vect.fit_transform(train_txt)
Xtest = vect.transform(test_comments)
print(vect.get_feature_names())



In [0]:
# conversion en torch sparse tensor 
coo = coo_matrix(Xtrain)
values = coo.data
indices = np.vstack((coo.row, coo.col))
i = torch.LongTensor(indices)
v = torch.FloatTensor(values)
shape = coo.shape
Xtrain_t = torch.sparse.FloatTensor(i, v, torch.Size(shape))

In [0]:
# idem test
coo = coo_matrix(Xtest)
values = coo.data
indices = np.vstack((coo.row, coo.col))
i = torch.LongTensor(indices)
v = torch.FloatTensor(values)
shape = coo.shape
Xtest_t = torch.sparse.FloatTensor(i, v, torch.Size(shape))

In [0]:
# conversion en torch
ytrain_t = torch.from_numpy(train_labels)
ytest_t = torch.from_numpy(test_labels)

In [0]:
# creation des Tensor datasets
train_data = TensorDataset(Xtrain_t, ytrain_t)
test_data = TensorDataset(Xtest_t, ytest_t)

In [0]:
batch_size = 50

# shuffle des data
train_loader = DataLoader(train_data, shuffle=True, batch_size=batch_size)
test_loader = DataLoader(test_data, shuffle=True, batch_size=batch_size)

In [27]:
# itérateur pour obtenir un batch du train
dataiter = iter(train_loader)
sample_x, sample_y = dataiter.next()
print('Sample input size: ', sample_x.size()) # batch_size, seq_length
print('Sample input: \n', sample_x)
print()
print('Sample label size: ', sample_y.size()) # batch_size
#print('Sample label: \n', sample_y)

Sample input size:  torch.Size([50, 2000])
Sample input: 
 tensor(indices=tensor([[   0,    0,    1,  ...,   49,   49,   49],
                       [1402, 1788,  350,  ..., 1915, 1029, 1735]]),
       values=tensor([0.6224, 0.7827, 0.4878,  ..., 0.0112, 0.0122, 0.4424]),
       size=(50, 2000), nnz=1508, layout=torch.sparse_coo)

Sample label size:  torch.Size([50, 6])


In [0]:
# classe du RNN LSTM
class SentimentLSTM(nn.Module):
    def __init__(self, vocab_size, output_size, embedding_dim, hidden_dim, n_layers, drop_prob=0.5):
        super().__init__()
        self.output_size = output_size
        self.n_layers = n_layers
        self.hidden_dim = hidden_dim
        # embedding et LSTM layers
        self.embedding = nn.Embedding(vocab_size, embedding_dim, sparse=True)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, n_layers, dropout=drop_prob, batch_first=True)
        # dropout layer
        self.dropout = nn.Dropout(0.3)
        # linear et sigmoid layers
        self.fc = nn.Linear(hidden_dim, output_size)
        self.sig = nn.Sigmoid()
    def forward(self, x, hidden):
        batch_size = x.size(0)
        # embeddings et lstm_out
        embeds = self.embedding(x)
        lstm_out, hidden = self.lstm(embeds, hidden)
        # supperposition lstm outputs
        lstm_out = lstm_out.contiguous().view(-1, self.hidden_dim)
        # dropout et fully-connected layer
        out = self.dropout(lstm_out)
        out = self.fc(out)
        # sigmoid function
        sig_out = self.sig(out)
        # reshape pour avoir batch_size first
        sig_out = sig_out.view(batch_size, -1)
        sig_out = sig_out[:, -1] # récupère le dernier batch des labels    
        # return dernier sigmoid output et hidden state
        return sig_out, hidden
    def init_hidden(self, batch_size):
        # crée 2 nouveaux tenseurs de tailles n_layers x batch_size x hidden_dim,
        # init à zero, pour hidden state et cell state du LSTM
        weight = next(self.parameters()).data
        if (train_on_gpu):
            hidden = (weight.new(self.n_layers, batch_size, self.hidden_dim).zero_().cuda(), weight.new(self.n_layers, batch_size, self.hidden_dim).zero_().cuda())
        else:
            hidden = (weight.new(self.n_layers, batch_size, self.hidden_dim).zero_(), weight.new(self.n_layers, batch_size, self.hidden_dim).zero_())
        return hidden
        

In [35]:
# Instantiation du model et hyperparam
vocab_size = len(vect.vocabulary_)
output_size = 6
embedding_dim = 400
hidden_dim = 256
n_layers = 2
net = SentimentLSTM(vocab_size, output_size, embedding_dim, hidden_dim, n_layers)
print(net)

SentimentLSTM(
  (embedding): Embedding(2000, 400, sparse=True)
  (lstm): LSTM(400, 256, num_layers=2, batch_first=True, dropout=0.5)
  (dropout): Dropout(p=0.3, inplace=False)
  (fc): Linear(in_features=256, out_features=6, bias=True)
  (sig): Sigmoid()
)


In [0]:
# loss et optimizer
lr=0.001
criterion = nn.BCELoss() #multiclass
optimizer = torch.optim.SparseAdam(net.parameters(), lr=lr)

In [37]:
# training params
epochs = 4 
train_on_gpu = True
counter = 0
print_every = 100
clip=5 # gradient clipping contre l'explosion du gradient
if(train_on_gpu):
    net.cuda()
net.train()
for e in range(epochs):
    # initialialisation hidden state
    h = net.init_hidden(batch_size)
    for inputs, labels in train_loader:
        counter += 1
        if(train_on_gpu):
            inputs, labels = inputs.cuda(), labels.cuda()
        h = tuple([each.data for each in h])
        net.zero_grad()
        inputs = inputs.type(torch.LongTensor)
        output, h = net(inputs, h)
        loss = criterion(output.squeeze(), labels.float())
        loss.backward()
        nn.utils.clip_grad_norm_(net.parameters(), clip)
        optimizer.step()
        # loss stats
        if counter % print_every == 0:
            val_h = net.init_hidden(batch_size)
            val_losses = []
            net.eval()
            for inputs, labels in valid_loader:
                val_h = tuple([each.data for each in val_h])
                if(train_on_gpu):
                    inputs, labels = inputs.cuda(), labels.cuda()
                inputs = inputs.type(torch.LongTensor)
                output, val_h = net(inputs, val_h)
                val_loss = criterion(output.squeeze(), labels.float())
                val_losses.append(val_loss.item())
            net.train()
            print("Epoch: {}/{}...".format(e+1, epochs),
                  "Step: {}...".format(counter),
                  "Loss: {:.6f}...".format(loss.item()),
                  "Val Loss: {:.6f}".format(np.mean(val_losses)))

RuntimeError: ignored