In [1]:
import pandas as pd
import numpy as np

In [2]:
#Extraction des data
def get_data (url):
    data = pd.read_csv(url,sep="\t", header=None)
    data.columns = ["id", "label", "text"]
    return data

train_data = get_data('twitter-2013train-A.txt')
dev_data = get_data('twitter-2013dev-A.txt')
test_data = get_data('twitter-2013test-A.txt')



Unnamed: 0,id,label,text
0,264183816548130816,positive,Gas by my house hit $3.39!!!! I\u2019m going t...
1,263405084770172928,negative,Theo Walcott is still shit\u002c watch Rafa an...
2,262163168678248449,negative,its not that I\u2019m a GSP fan\u002c i just h...
3,264249301910310912,negative,Iranian general says Israel\u2019s Iron Dome c...
4,262682041215234048,neutral,Tehran\u002c Mon Amour: Obama Tried to Establi...
...,...,...,...
9679,103158179306807296,positive,RT @MNFootNg It's monday and Monday Night Foot...
9680,103157324096618497,positive,All I know is the road for that Lomardi start ...
9681,100259220338905089,neutral,"All Blue and White fam, we r meeting at Golden..."
9682,104230318525001729,positive,@DariusButler28 Have a great game agaist Tam...


In [3]:
#2) Mise en place du lexique du corpus
#nous supprimons toutes les ponctuations
# avec string.punctuation: !"#$%&'()*+, -./:;?@[\]^_`{|}~
import string

def remove_punctuation(text):
    translator = str.maketrans('','', string.punctuation)
    return text.translate(translator)
#nous supprimons les espaces en trop dans un tweet
def remove_whitespace(text):
    return " ".join(text.split())

#tokenisation
from nltk.tokenize import word_tokenize

def tokenization(text):
    tokens = word_tokenize(text)
    return tokens

#Amelioration
#Supprimons la casse
def text_lowercase(text):
    return text.lower()

#Stemming: Ne garder que la racine des mots
from nltk.stem.porter import PorterStemmer
def stem_words(tokens):
    stemmer = PorterStemmer()
    stems = [stemmer.stem(word) for word in tokens]
    return stems

#Suppression des mots outils, des mots qui n'apporte pas d'information donc on peut les retirer sans affecter le sens de la phrase
from nltk.corpus import stopwords

def remove_stopwords(tokens):
    stop_words = set(stopwords.words('english'));
    return [token for token in tokens if token not in stop_words]
#construction du lexique

def create_lexique(data):
    lexique = []
    for tweet in data['text']:
        lower_tweet = text_lowercase(tweet)
        tweet_without_punctuation = remove_punctuation(lower_tweet)
        new_tweet = remove_whitespace(tweet_without_punctuation)
        tokens = tokenization(new_tweet)
        tokens_with_stemming = stem_words(tokens)
        tokens_wthout_stpWords = remove_stopwords(tokens_with_stemming)
        for word in tokens_wthout_stpWords:
            if word not in lexique:
                lexique.append(word)
    return lexique
    
train_data_lexique = create_lexique(train_data)

#train_data_lexique
len(train_data_lexique)
print(len(train_data_lexique))

23849


In [4]:
#Assignation d'un numero unique a chaque lexique

def assign_id_to_lexique(lexique):
    new_lexique = {}
    for i in range(1,len(lexique)+1):
        new_lexique[lexique[i-1]] = i
    return new_lexique

train_lexique_with_id = assign_id_to_lexique(train_data_lexique)
len(train_lexique_with_id)

#test_lexique_with_id

23849

In [5]:
#4) Decomptage pour chaque message, le nombre d'occurence des mots
# [0 for i in range(len(train_data['text'])]


def words_occurence(data, data_lexique_with_id):
    tab = []
    for tweet in data['text']:
        vector = [0 for i in range(len(data_lexique_with_id)+1)]
        for word in tokenization(tweet):
            if word in data_lexique_with_id.keys():
                vector[data_lexique_with_id[word]] +=1
               
        tab.append(vector)
    return np.array(tab)

train_tab = words_occurence(train_data, train_lexique_with_id)
dev_tab = words_occurence(dev_data, train_lexique_with_id)
test_tab = words_occurence(test_data, train_lexique_with_id)

print(train_tab)
print(dev_tab)
print(test_tab)


[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 1 1]]
[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]
[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]


In [6]:

def returnlabelVector(data):
    label_vector = []
    for label in data['label']:
        if label == 'positive':
            label = 0
        elif label == 'negative':
            label = 1
        elif label == 'neutral':
            label = 2
        label_vector.append(label)
    return np.array(label_vector)

train_label_vector = returnlabelVector(train_data)
test_label_vector = returnlabelVector(test_data)
dev_label_vector = returnlabelVector(dev_data)
print(train_tab.shape)
print(test_tab.shape)
print(dev_tab.shape)

(9684, 23850)
(3547, 23850)
(1654, 23850)


In [7]:
import torch
from torch.utils.data import DataLoader
from torch import nn


In [8]:
import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader

class Data(Dataset):
    def __init__(self, X_train, y_train):
        self.X = torch.from_numpy(X_train.astype(np.float32))
        
        self.y = torch.from_numpy(y_train).type(torch.LongTensor)
        self.len = self.X.shape[0]
  
    def __getitem__(self, index):
        return self.X[index], self.y[index]
    
    def __len__(self):
        return self.len

In [9]:
traindata = Data(train_tab, train_label_vector)
testData = Data(test_tab, test_label_vector)
devData = Data(dev_tab, test_label_vector)
print(traindata[100:200])
print(testData[100:200])
print(devData[100:200])

(tensor([[0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        ...,
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.]]), tensor([0, 2, 1, 0, 2, 2, 2, 0, 2, 1, 0, 1, 2, 2, 2, 2, 0, 2, 2, 2, 2, 0, 0, 0,
        0, 0, 2, 1, 2, 0, 2, 2, 0, 1, 0, 0, 2, 0, 2, 0, 2, 0, 0, 2, 0, 1, 2, 2,
        2, 0, 2, 2, 2, 0, 1, 2, 2, 2, 2, 2, 0, 0, 1, 1, 1, 0, 0, 2, 2, 2, 0, 1,
        2, 2, 0, 0, 2, 1, 2, 2, 2, 2, 0, 1, 2, 2, 0, 1, 1, 2, 1, 0, 0, 2, 0, 2,
        2, 1, 0, 2]))
(tensor([[0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        ...,
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.]]), tensor([0, 0, 2, 0, 1, 0, 0, 0, 1, 2, 2, 1, 0, 0, 2, 0, 2, 1, 2, 2, 1, 2, 2, 0,
        0, 1, 2, 0, 1, 0, 1, 2, 0, 0, 2, 0, 0, 2, 0, 2, 0, 2, 2, 2

In [10]:
train_dataloader = DataLoader(traindata, batch_size=64, shuffle=True)
test_dataloader = DataLoader(testData, batch_size=64, shuffle=True)
dev_dataloader = DataLoader(devData, batch_size=64, shuffle=True)

In [11]:
class NeuralNetwork(nn.Module):
    def __init__(self):
        super(NeuralNetwork, self).__init__()
        self.linear1 = nn.Linear(23850, 64)
        self.relu1 = nn.ReLU()
        self.linear2 = nn.Linear(64, 32)
        self.relu2 = nn.ReLU()
        self.linear3 = nn.Linear(32, 3)
        self.softmax = nn.Softmax(dim=1)

    def forward(self, x):
        x = self.linear1(x)
        x = self.relu1(x)
        x = self.linear2(x)
        x = self.relu2(x)
        x = self.linear3(x)
        x = self.softmax(x)
        return x

In [12]:
model = NeuralNetwork()
print(model)

NeuralNetwork(
  (linear1): Linear(in_features=23850, out_features=64, bias=True)
  (relu1): ReLU()
  (linear2): Linear(in_features=64, out_features=32, bias=True)
  (relu2): ReLU()
  (linear3): Linear(in_features=32, out_features=3, bias=True)
  (softmax): Softmax(dim=1)
)


In [88]:
from sklearn.metrics import confusion_matrix
def train_loop(dataloader, model, loss_fn, optimizer):
    size = len(dataloader.dataset)
    for batch, (X, y) in enumerate(dataloader):
    # Compute prediction and loss
        pred = model(X)
        loss = loss_fn(pred, y)

         # Backpropagation
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        if batch % 100 == 0:
            loss, current = loss.item(), batch * len(X)
            print(f"loss: {loss:>7f} [{current:>5d}/{size:>5d}]")
            
def test_loop(dataloader, model, loss_fn):
    size = len(dataloader.dataset)
    num_batches = len(dataloader)
    test_loss, correct = 0, 0
    true_positive, true_negative, false_positive, false_negative = 0, 0, 0, 0
    CM=0
 
    with torch.no_grad():
        for X, y in dataloader:
            pred = model(X)
            test_loss += loss_fn(pred, y).item()
            correct += (pred.argmax(1) == y).type(torch.float).sum().item()
            
            CM+=confusion_matrix(y, pred.argmax(1),labels=[0,1,2])
            
        tn=CM[0][0]
        tp=CM[1][1]
        fp=CM[0][1]
        fn=CM[1][0]
        recall=tp/(tp+fn)
        precision=tp/(tp+fp) 
        f_score = (2 * recall * precision )/(recall + precision)
        print('Confusion Matirx : ')
        print(CM)
        print('- recall : ',(tp/(tp+fn))*100)
        print('- Precision: ',(tp/(tp+fp))*100) 
        print('f_score: ', f_score*100)
            

    test_loss /= num_batches
    correct /= size
   
    
    print(f"Test Error: \n Accuracy: {(100*correct):>0.1f}%, Avg loss: {test_loss:>8f} \n")
   

In [89]:
loss_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(model.parameters(), lr=0.1)

for epochs in range(0, 10):
    train_loop(train_dataloader, model, loss_fn, optimizer)
    test_loop(test_dataloader,model,loss_fn)

loss: 0.833301 [    0/ 9684]
loss: 0.734077 [ 6400/ 9684]
Confusion Matirx : 
[[ 713  121  641]
 [ 166   69  324]
 [ 283  118 1112]]
- recall :  29.361702127659573
- Precision:  36.31578947368421
f_score:  32.470588235294116
Test Error: 
 Accuracy: 53.4%, Avg loss: 1.004964 

loss: 0.634094 [    0/ 9684]
loss: 0.751834 [ 6400/ 9684]
Confusion Matirx : 
[[ 604  112  759]
 [ 124   58  377]
 [ 177  102 1234]]
- recall :  31.868131868131865
- Precision:  34.11764705882353
f_score:  32.95454545454546
Test Error: 
 Accuracy: 53.5%, Avg loss: 1.005954 

loss: 0.840187 [    0/ 9684]
loss: 0.725610 [ 6400/ 9684]
Confusion Matirx : 
[[ 761  130  584]
 [ 183   68  308]
 [ 322  132 1059]]
- recall :  27.091633466135455
- Precision:  34.34343434343434
f_score:  30.28953229398664
Test Error: 
 Accuracy: 53.2%, Avg loss: 1.007666 

loss: 0.731895 [    0/ 9684]
loss: 0.752137 [ 6400/ 9684]
Confusion Matirx : 
[[815 169 491]
 [200  92 267]
 [401 194 918]]
- recall :  31.506849315068493
- Precision:  35

In [None]:
#Accuracy
#Accuracy : 56.6%
#recall : 36,2%
#precision: 34,04%