Environnement
=============

In [1]:
import os
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
import pandas as pd
from collections import Counter
from torch.utils.data import DataLoader, Dataset, random_split

# Input data files are available in the "../input/" directory.
# Listing that directory shows which files the rest of the notebook can load.
print(os.listdir("../input"))

['sentimentIMDB_train.csv', 'sentimentIMDB_test.csv']


Chargement des données
======================

In [2]:
class SentimentDataset(Dataset):
    """
    Thin wrapper that exposes an in-memory sequence of examples through the
    torch.utils.data.Dataset protocol, so that pytorch data utilities
    (notably the DataLoader) can consume it.
    """
    def __init__(self,datalines):
        # the examples are stored as given (no copy is made)
        self.xydata = datalines

    def __getitem__(self,idx):      #required by the Dataset API
        examples = self.xydata
        return examples[idx]

    def __len__(self):              #required by the Dataset API
        examples = self.xydata
        return len(examples)

def load_data_set(filename, encoding=None):
    """
    Loads a dataset as a list of tuples: (text,label)

    The first line of the file is treated as a header and skipped. Every
    other non-blank line is cut at its FIRST comma: what precedes the comma
    is the label, what follows is the text (further commas stay in the text,
    and so does the line terminator).

    Args:
       filename (str): the dataset filename
       encoding (str): text encoding passed to open(); None (the default)
                       keeps the platform default encoding
    Returns:
       A pytorch compatible Dataset object wrapping the list of
       (text,label) tuples.
    """
    xydataset = [ ]
    # 'with' guarantees the file is closed even if reading fails midway
    with open(filename, encoding=encoding) as istream:
        istream.readline()#skips header
        for line in istream:
            if not line.strip():
                continue #a blank line carries no example
            label, _, text = line.partition(',')
            xydataset.append( (text,label) )
    return SentimentDataset(xydataset)

train_set = load_data_set('../input/sentimentIMDB_train.csv')
print('Loaded %d examples as train set. '%(len(train_set)))

#Basic DataLoader usage: draw a single shuffled batch of 4 examples and show it
print('Train data Sample (1st batch only)')
train_loader = DataLoader(train_set, batch_size=4, shuffle=True)
first_batch  = next(iter(train_loader), None)  #only the first batch is fetched
if first_batch is not None:
    text_batch, label_batch = first_batch
    for text,label in zip(text_batch, label_batch):#iterates over example in the batch
        print('  ',text[:50],'...',label)

test_set = load_data_set('../input/sentimentIMDB_test.csv')
print('Loaded %d examples as test set. '%(len(test_set)))

Loaded 25000 examples as train set. 
Train data Sample (1st batch only)
   This movie is very cool. If you're a fan of Tsui H ... 1
   Simply put, this is the worst movie since "Police  ... 0
   I cannot say enough bad things about this train wr ... 0
   I've just had the evidence that confirmed my suspi ... 0
Loaded 25000 examples as test set. 


Codage
======
Fonctionnalités pour associer les mots à des entiers et pour vectoriser un texte.

In [3]:
def  make_w2idx(dataset):
    """
    Maps words to integers

    Words are the whitespace-separated tokens of every text in the dataset.
    Indices are assigned in sorted word order, so the same dataset always
    yields the same mapping (iterating the set directly would give an order
    that can change from one interpreter run to the next).

    Args:
       dataset: an iterable of (text,label) pairs
    Returns:
       A dictionary mapping words to integers in range(number of words)
    """
    wordset = set()
    for text,label in dataset:
        wordset.update(text.split())
    return {word:idx for idx,word in enumerate(sorted(wordset))}

def vectorize_text(text,w2idx):
    """
    Encodes a text as a bag-of-words count vector.

    Args:
       text  (str) : the text; words are its whitespace-separated tokens
       w2idx (dict): maps each known word to its position in the vector
    Returns:
       A 1-D float tensor of length len(w2idx) holding, at position
       w2idx[word], the number of occurrences of word in text.
       Words missing from w2idx are ignored.
    """
    counts = Counter(text.split())
    xvec = torch.zeros(len(w2idx))
    for word,count in counts.items():
        idx = w2idx.get(word)
        if idx is not None:     #manages unk words (ignored)
            xvec[idx] = count
    # No squeeze(): it would turn the vector of a one-word vocabulary into a
    # 0-dim tensor, which a Linear layer cannot take as input.
    return xvec

def vectorize_target(ylabel):
    """
    Converts a label (anything float() accepts, e.g. the string '1')
    into a 0-dim float tensor.
    """
    numeric_label = float(ylabel)
    return torch.tensor(numeric_label)



Classifieur
===========

In [4]:
class SentimentAnalyzer(nn.Module): 
    """
    Logistic regression over bag-of-words count vectors: a single Linear
    layer followed by a sigmoid.

    Attributes:
       W              : the Linear layer (vocabulary size -> number of labels)
       w2idx          : word -> index dictionary built by train(); None before
       dev_accuracies : dev-set accuracy after each epoch of the last train()
    """
    
    def __init__(self):    
        super(SentimentAnalyzer, self).__init__()
        self.w2idx = None
        self.dev_accuracies = []
        self.reset_structure(1,1)
        
    def reset_structure(self,vocab_size, num_labels):
        """Replaces the Linear layer by a freshly initialised one of the given size."""
        self.W = nn.Linear(vocab_size, num_labels)
            
    def forward(self, text_vec):    
        """Returns the sigmoid of the Linear layer applied to text_vec."""
        return torch.sigmoid(self.W(text_vec)) #sigmoid is the logistic activation
        
    def train(self,train_set=True,learning_rate=None,epochs=None,dev_fraction=0.2):
        """
        Fits the classifier on train_set with per-example SGD.

        This method has the same name as nn.Module.train(mode), which
        nn.Module.eval() relies on. To keep both usable, a boolean first
        argument is forwarded to nn.Module.train and only toggles the
        training mode; anything else is treated as a dataset to fit on.

        Args:
           train_set     : a Dataset of (text,label) pairs (or a bool, see above)
           learning_rate : SGD learning rate
           epochs        : number of passes over the training part
           dev_fraction  : share of train_set held out to select the best
                           epoch (0.2 gives the 20000/5000 split for 25000
                           examples); 0 disables the selection
        Returns:
           None after fitting; self when only the mode is switched.
        Raises:
           TypeError  : dataset given without learning_rate or epochs
           ValueError : dev_fraction outside [0,1) or no training example left
        """
        if isinstance(train_set, bool):
            return super(SentimentAnalyzer, self).train(train_set)
        if learning_rate is None or epochs is None:
            raise TypeError("train() needs learning_rate and epochs when given a dataset")
        if not 0 <= dev_fraction < 1:
            raise ValueError("dev_fraction must be in [0,1)")
        n_examples = len(train_set)
        n_dev      = int(round(n_examples * dev_fraction))
        n_train    = n_examples - n_dev
        if n_train <= 0:
            raise ValueError("train() needs at least one training example")
            
        #the vocabulary is built from the whole set, before the train/dev split
        self.w2idx = make_w2idx(train_set)
        self.reset_structure(len(self.w2idx),1)
            
        #remind that minimizing Binary Cross Entropy <=> minimizing NLL
        loss_func   = nn.BCELoss() 
        optimizer   = optim.SGD(self.parameters(), lr=learning_rate)
        train,dev   = random_split(train_set,[n_train,n_dev])
        #We do not take advantage of the DataLoader here but we demonstrate how to use it
        data_loader_train = DataLoader(train, batch_size=len(train), shuffle=True)

        self.dev_accuracies = []
        best_correct = -1     #below any real count: the first evaluated epoch is always kept
        best_state   = None
        for epoch in range(epochs):
            global_logloss = 0.0
            for Xbatch,Ybatch in data_loader_train: #there is a single batch,this loop does a single iteration
                for X, Y in zip(Xbatch,Ybatch): 
                    optimizer.zero_grad()
                    xvec            = vectorize_text(X,self.w2idx)
                    yvec            = vectorize_target(Y)
                    prob            = self(xvec).squeeze()
                    loss            = loss_func(prob,yvec)
                    loss.backward()
                    optimizer.step()
                    global_logloss += loss.item()
            if n_dev > 0:
                correct = self._count_correct(dev)
                self.dev_accuracies.append(correct/n_dev)
                if correct > best_correct:
                    best_correct = correct
                    best_state   = self._snapshot_weights()
            print("Epoch %d, mean cross entropy = %f"%(epoch,global_logloss/len(train)))
        if best_state is not None:
            self.W.load_state_dict(best_state)

    def _snapshot_weights(self):
        """
        Returns a copy of the Linear layer parameters. state_dict() alone
        hands out tensors sharing memory with the live parameters, so later
        optimizer steps would silently overwrite the 'saved' values.
        """
        return {name: tensor.detach().clone() for name,tensor in self.W.state_dict().items()}

    def _count_correct(self,dataset):
        """Counts the (text,label) examples whose rounded probability equals the label."""
        correct = 0
        with torch.no_grad(): #evaluation only, no gradient bookkeeping
            for i in range(len(dataset)):
                X, Y = dataset[i]
                prob = self(vectorize_text(X,self.w2idx)).squeeze()
                if prob.round()==vectorize_target(Y):
                    correct += 1
        return correct
    
    def run_test(self,test_set,filename='DugrainCharlotte_pred_sentimentanalysis.csv'):
        """
        Predicts the sentiment of every example of test_set and writes the
        predictions to a CSV file with two columns:
           idx   : example identifier (the first field of each test line)
           SentY : predicted sentiment (0 or 1)

        Args:
           test_set : dataset whose xydata attribute lists (text,identifier) pairs
           filename : path of the CSV file to write
        Returns:
           (pred, submission): the list of predictions and the DataFrame written
        Raises:
           RuntimeError : called before train(), i.e. without a vocabulary
        """
        if self.w2idx is None:
            raise RuntimeError("run_test() called before train(): no vocabulary available")
        idx=[]
        pred=[]
        with torch.no_grad(): #inference only
            for text,label in test_set.xydata : 
                xvec= vectorize_text(text,self.w2idx)
                pred.append(int(self(xvec).squeeze().round().item()))
                idx.append(label)
        # NOTE(review): the assignment text spells this column 'sentY'; the
        # capitalised name is kept as-is - confirm which one is expected.
        submission = pd.DataFrame({"idx" : idx ,"SentY" : pred})
        #saves the predictions
        submission.to_csv(filename,index=False)
        return pred,submission

Inférences
==========

In [5]:
#Several hyperparameter settings were tried; a more thorough/systematic search
#would need more time and computing power.
sent = SentimentAnalyzer()
sent.train(train_set, learning_rate=0.003, epochs=45)


Epoch 0, mean cross entropy = 0.554869
Epoch 1, mean cross entropy = 0.392729
Epoch 2, mean cross entropy = 0.328108
Epoch 3, mean cross entropy = 0.291463
Epoch 4, mean cross entropy = 0.263224
Epoch 5, mean cross entropy = 0.242480
Epoch 6, mean cross entropy = 0.223494
Epoch 7, mean cross entropy = 0.206584
Epoch 8, mean cross entropy = 0.191539
Epoch 9, mean cross entropy = 0.182082
Epoch 10, mean cross entropy = 0.172040
Epoch 11, mean cross entropy = 0.163025
Epoch 12, mean cross entropy = 0.153125
Epoch 13, mean cross entropy = 0.149091
Epoch 14, mean cross entropy = 0.141943
Epoch 15, mean cross entropy = 0.135028
Epoch 16, mean cross entropy = 0.130518
Epoch 17, mean cross entropy = 0.125687
Epoch 18, mean cross entropy = 0.121449
Epoch 19, mean cross entropy = 0.116693
Epoch 20, mean cross entropy = 0.111859
Epoch 21, mean cross entropy = 0.108955
Epoch 22, mean cross entropy = 0.105741
Epoch 23, mean cross entropy = 0.102551
Epoch 24, mean cross entropy = 0.100250
Epoch 25, 

In [6]:
# Predict the test set; run_test also writes the predictions to a CSV file.
pred,submission=sent.run_test(test_set)

In [7]:
# Preview the first rows of the submission table.
submission.head()

Unnamed: 0,idx,SentY
0,0,1
1,1,1
2,2,1
3,3,1
4,4,1
