In [3]:
import pandas as pd
import numpy as np
from tqdm import tqdm

import torch
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from transformers import CamembertForSequenceClassification, CamembertTokenizer, AdamW
import os
import pickle
import matplotlib.pyplot as plt
import torch.nn as nn
from transformers import AutoModel, AutoTokenizer, BertTokenizer

# Guard flag: the cells wrapped in `if first_ex:` (loading the raw dataset and
# splitting it into sub-files) only do their work while this is True.
first_ex = True

In [5]:
# NOTE(review): the splitting cell reads 'data/tweetDownloadBE.csv' while this cell
# reads from 'Tweets_data/' — confirm which location is the current one.
#data/tweetDownloadBE.csv
if first_ex:
    # Load the complete dataset (read as a tab-separated file)
    df = pd.read_csv('Tweets_data/tweetDownloadBE.csv', sep='\t')

In [6]:
df

Unnamed: 0,Unnamed: 1,Unnamed: 2,Unnamed: 3,date\ttext\tretweet\tlike
2020-12-25,@Angele070 Goeiemorgen Angèle ☕,0.0,1.0,
2020-12-25,Je suis à la page 109 de L'Eschylliade - Aux apparences ne te fieras par Pierre-François Kettler https://t.co/rfg0SFxvl4 via @simplementpro,0.0,2.0,
2020-12-25,« Impact - New Stages » est lauréat de l’appel à Projet @St_art_invest « Rayonnement Wallonie » - de quoi poursuivre et approfondir toutes les démarches entreprises au Théâtre de Liège en matière d’innovation 🤩🙏,1.0,4.0,
2020-12-25,@DocWissam Faudra quand meme analyser les chiffres et comment nous les avons recolte,0.0,3.0,
2020-12-25,"@1908Winko Je ne sais pas, je suis partagé, est ce que sport pro veut dire rendre imaginable ce qui se passe dans le monde du travail (des congés à des moments parfois inhabituels au nom du bien être du travailleur) ? Ok, ils gagnent très bien leur vie mais...",0.0,4.0,
...,...,...,...,...
2020-12-31,@SophieID5 Une très bonne année à vous et tous vos proches,0.0,0.0,
2020-12-31,Il attendait sagement le retour de papa. Thank God pour cette année qui se termine en beauté. I’m blessed 🙏🏾,1.0,49.0,
2020-12-31,Suite d’un très beau repas 🥰😉 à Les plus belles cuvées https://t.co/Thw8msa7JZ,0.0,2.0,
2020-12-31,@TinaSalama2 Merci beaucoup et meilleurs vœux 🙏,0.0,1.0,


In [3]:
if first_ex:
    # Split the full dataset into tab-separated sub-files of at most
    # `sub_file_size` tweets each (data/sub_tweet_0.csv, data/sub_tweet_1.csv, ...).
    sub_file_size = 100000
    sub_file_pattern = 'data/sub_tweet_{}.csv'

    # Header row written at the top of every sub-file. The final '\n' element is
    # joined with a tab like the other elements, so every row ends with an empty
    # sixth column. That layout is kept on purpose: the cell that loads a
    # sub-file renames the resulting 'Unnamed: 5' column.
    headers = '\t'.join(['idx', 'date', 'text', 'retweets', 'likes', '\n'])

    # Read all lines up front so that tqdm knows the total and can show progress
    with open('data/tweetDownloadBE.csv', 'r', encoding='Latin-1') as reader:
        lines = reader.readlines()

    file_idx = 0    # index of the sub-file currently being written
    total_idx = 0   # number of rows written so far; also used as the row id
    skipped = 0     # lines that could not be exported

    # Open in 'w' mode so that re-running the cell regenerates the sub-files
    # instead of appending a second header and duplicate rows to existing ones.
    file = open(sub_file_pattern.format(file_idx), 'w', encoding='Latin-1')
    try:
        file.write(headers)
        for line_no, line in enumerate(tqdm(lines)):
            # Skip the header row of the source file
            if line_no == 0:
                continue

            # Drop the line terminator before splitting; otherwise it stays glued
            # to the last field and every record is followed by a stray line.
            fields = line.rstrip('\r\n').split('\t')
            if len(fields) < 4:
                skipped += 1
                continue

            # Rotate only when a row is actually about to be written. Checking
            # earlier would rotate twice if a skipped line falls on a chunk
            # boundary, leaving a sub-file that contains nothing but the header.
            if total_idx != 0 and total_idx % sub_file_size == 0:
                file.close()
                file_idx += 1
                file = open(sub_file_pattern.format(file_idx), 'w', encoding='Latin-1')
                file.write(headers)

            try:
                file.write('\t'.join([str(total_idx)] + fields[:4] + ['\n']))
            except UnicodeError:
                # Best effort: count rows that cannot be encoded and move on
                skipped += 1
                continue
            total_idx += 1
    finally:
        # Always release the sub-file currently open, even if the loop fails
        file.close()

    print('Skipped lines: {}'.format(skipped))
    

In [4]:
# Get the list of sub-files produced by the splitting cell.
# `os` is imported in the first cell of the notebook.
general_lst = os.listdir('data/')
# os.listdir() returns entries in arbitrary order, so sort the matches to make
# file_lst[0] the same file on every run. The order is lexicographic
# ('sub_tweet_10.csv' sorts before 'sub_tweet_2.csv').
file_lst = sorted(itm for itm in general_lst if 'sub_tweet' in itm)


In [9]:
class BERT_model(nn.Module):
    """CamemBERT encoder followed by a fully connected head with two softmax outputs.

    On construction the pre-trained 'camembert-base' tokenizer and encoder are
    loaded, previously saved weights ('Model/weights_<name>.pt') are restored if
    the file exists, and the training-history file
    ('Model/train_track_<name>.csv') is either read back or created with its
    header row.

    Args:
        device: Device that input tensors are moved to in `forward`. The module
            itself is not moved here; callers do that with `.to(...)`.
        name: Identifier used in the weight and training-history file names.
        new: If True, delete existing weight/history files for `name` first.
    """

    def __init__(self, device='cpu', name='model_B', new=False):
        super().__init__()

        weights_path = 'Model/weights_{}.pt'.format(name)
        history_path = 'Model/train_track_{}.csv'.format(name)

        # If we want to overwrite an existing model
        if new and os.path.exists(weights_path):
            print('Weights for the model {} already exist: deleting...'.format(name))
            os.remove(weights_path)
        if new and os.path.exists(history_path):
            print('Traininck track for model {} already exist: deleting...'.format(name))
            os.remove(history_path)

        # The name of the model, used to store data
        self.name = name

        # Import the tokenizer
        self.tokenizer = AutoTokenizer.from_pretrained('camembert-base')
        # Import the CamemBERT pre-trained model
        self.model = AutoModel.from_pretrained('camembert-base')

        # Fully connected layers mapping the 768-wide encoder output to 2 outputs
        self.fc = nn.Sequential(
            nn.Linear(768, 768),
            nn.ReLU(),
            nn.Linear(768, 500),
            nn.ReLU(),
            nn.Linear(500, 250),
            nn.ReLU(),
            nn.Linear(250, 50),
            nn.ReLU(),
            nn.Linear(50, 2),
            # Explicit dim: the head receives a 2-D (batch, 2) tensor, for which
            # the deprecated implicit choice also resolved to dim=1. Spelling it
            # out removes the warning without changing the result.
            nn.Softmax(dim=1)
        )

        # If there is already a saved model, load it
        if os.path.exists(weights_path):
            print('Existing model loading...')
            # map_location='cpu' lets weights saved from a GPU be restored on a
            # machine without CUDA; load_state_dict copies the values into the
            # existing parameters.
            # NOTE(review): only the encoder (`self.model`) is restored from this
            # file, not the `fc` head — confirm this matches how weights are saved.
            self.model.load_state_dict(torch.load(weights_path, map_location='cpu'))

        # Load training history
        self.train_history = None
        self.epoch_idx = 0
        self.total_idx = 0
        if os.path.exists(history_path):
            self.train_history = pd.read_csv(history_path, sep=';', index_col=None)
            if self.train_history.shape[0] >= 1:
                # Resume counters right after the last recorded epoch / index
                self.epoch_idx = np.max(self.train_history['epoch'].to_numpy()) + 1
                self.total_idx = np.max(self.train_history['idx'].to_numpy()) + 1
        else:
            # Write the file header (create the output directory if it is missing)
            os.makedirs('Model', exist_ok=True)
            with open(history_path, 'a') as file:
                file.write('idx;epoch;batch_idx;train_loss;test_loss;\n')

        # Dataset hyperparameters
        self.train_split = 0.8
        self.batch_size = 5

        # Dataset for training
        self.train_dataset = None
        self.test_dataset = None
        self.class_names = None
        self.train_size = None

        # Dataloaders
        self.train_loader = None
        self.test_loader = None

        # Optimizer
        self.optimizer = None
        # Loss functions
        self.loss_fn = None
        self.loss_fn_test = None

        # Device the inputs are sent to
        self.device = device

    def predictor(self, sentences, batch_size=10, max_length=280):
        """Run the model on raw sentences and return the softmax outputs.

        Args:
            sentences: Sentences to score (a single string is treated as one
                sentence).
            batch_size: Number of sentences per forward pass; lower it if memory
                is tight.
            max_length: Maximum number of tokens per sentence; longer inputs are
                truncated by the tokenizer.

        Returns:
            A float64 numpy array of shape (len(sentences), 2), one row per
            sentence in input order.
        """
        if isinstance(sentences, str):
            sentences = [sentences]
        sentences = list(sentences)

        # Nothing to tokenize: return an empty result with the expected width
        if len(sentences) == 0:
            return np.zeros((0, 2))

        # Put the encoder in evaluation mode
        self.model.eval()

        # Tokenize all input sentences at once, padded to the longest one
        encoded_batch = self.tokenizer.batch_encode_plus(
            sentences,
            add_special_tokens=True,
            max_length=max_length,
            padding=True,
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt'
        )
        # Build the dataset and a loader that keeps the input order
        dataset = TensorDataset(
            encoded_batch['input_ids'],
            encoded_batch['attention_mask']
        )
        loader = DataLoader(
            dataset,
            batch_size=batch_size,
            shuffle=False
        )

        # Store the predictions of each batch
        total_preds = []
        # Inference only: skip building the autograd graph to save memory
        with torch.no_grad():
            for input_id, attention_mask in loader:
                # forward() moves the tensors to self.device
                preds = self.forward(input_id, attention_mask)
                total_preds.append(preds.cpu().numpy())

        # Stack the per-batch outputs into one (nb_sentences, 2) float64 array
        return np.concatenate(total_preds, axis=0).astype(np.float64)

    def forward(self, input_id, attention_mask):
        """Encode the token ids and apply the fully connected head.

        Args:
            input_id: Token id tensor produced by the tokenizer.
            attention_mask: Attention mask tensor matching `input_id`.

        Returns:
            The output of `self.fc` applied to the second element of the encoder
            output (`pooler_output` for Hugging Face CamemBERT models).
        """
        out = self.model(
            input_ids=input_id.to(self.device),
            attention_mask=attention_mask.to(self.device)
        )

        return self.fc(out[1])

    
    
    
    

In [10]:
# Use the first GPU when CUDA is available, otherwise fall back to the CPU so the
# notebook still runs on machines without a GPU.
DEVICE = 'cuda:0' if torch.cuda.is_available() else 'cpu'
# `device=` tells the model where to send its inputs; `.to(...)` moves its weights.
model = BERT_model(device=DEVICE, name='model_B', new=False).to(DEVICE)

In [11]:
# Load the first sub-file of the list and rename the column that pandas labels
# 'Unnamed: 5' to 'class'.
df = pd.read_csv(
    'data/{}'.format(file_lst[0]),
    sep='\t',
    index_col=None,
).rename(columns={'Unnamed: 5': 'class'})




In [12]:
# Quick check: run the model on the first ten tweets and print its softmax outputs
txt = df['text'].head(10).tolist()
prds = model.predictor(txt)
print(prds)

[[0.46944726 0.53055269]
 [0.4694984  0.5305016 ]
 [0.46934074 0.53065932]
 [0.46942431 0.53057569]
 [0.46924147 0.53075856]
 [0.46927774 0.53072226]
 [0.46948901 0.53051102]
 [0.46934652 0.53065348]
 [0.46938014 0.5306198 ]
 [0.46924034 0.53075963]]


  input = module(input)


In [13]:
model

BERT_model(
  (model): CamembertModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(32005, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0): RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,)