In [1]:
import numpy as np
import pandas as pd

import bz2, string, re, gc

from nltk.corpus import stopwords, wordnet
from nltk.stem import SnowballStemmer, WordNetLemmatizer

import seaborn as sns
import matplotlib.pyplot as plt

from tqdm.auto import tqdm

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression

from torch.utils.data import Dataset, DataLoader
import torch.nn as nn 
import torch.optim as optim
import torch
from transformers import AutoConfig, AutoTokenizer, AutoModel, AdamW, get_linear_schedule_with_warmup

# Seed both random number generators used in this notebook: NumPy and PyTorch.
# PyTorch's generator drives dropout, DataLoader shuffling and the random
# initialisation of the classification head, so seeding NumPy alone does not
# make the fine-tuning run repeatable.
np.random.seed(42)
torch.manual_seed(42)

# Data Preparation

In [2]:
# Device selection: use the first CUDA GPU when one is visible, else the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda:0')
else:
    device = torch.device('cpu')
print(f'Using {device} device')

### Reading the bz2-compressed files

In [3]:
# Read every line of the compressed train/test files as raw bytes.
# The `with` blocks guarantee each handle is closed even if readlines()
# raises (e.g. a truncated archive or running out of memory mid-read).
with bz2.BZ2File('../input/amazonreviews/train.ft.txt.bz2') as trainFileB:
    trainFile = trainFileB.readlines()
with bz2.BZ2File('../input/amazonreviews/test.ft.txt.bz2') as testFileB:
    testFile = testFileB.readlines()
# Drop the (already closed) handle names so they do not linger in the kernel.
del trainFileB, testFileB

In [4]:
# Force a garbage-collection pass after deleting the file handles above.
gc.collect()

In [5]:
# Number of raw lines read from each file (before any sub-sampling).
print('train data {}'.format(len(trainFile)))
print('test data {}'.format(len(testFile)))

Only select 10% of the data, since the full dataset is huge

In [6]:
# Keep only the first 10% of the lines of each split (a head slice, not a
# random sample).
# NOTE(review): the slice overwrites trainFile/testFile, so re-running this
# cell shrinks the data again (10% of 10%, ...).
trainFile = trainFile[:int(len(trainFile)*0.1)]
testFile = testFile[:int(len(testFile)*0.1)]

In [7]:
# Number of lines kept in each split after the sub-sampling step.
print('train data {}'.format(len(trainFile)))
print('test data {}'.format(len(testFile)))

### Helper functions

These functions are used to clean the texts: we remove URLs, special characters (punctuation) and English stopwords. Lemmatization or stemming can optionally be added as a further step.

In [8]:
#Cleaning and Stemming part

def removePunctuation(text):
    """Replace every ASCII punctuation character in ``text`` with a space.

    Args:
        text: the string to clean.

    Returns:
        A string of the same length where each character of
        ``string.punctuation`` has been replaced by a single space.
    """
    # re.escape keeps `\`, `]`, `^` and `-` literal inside the character
    # class. Without it the `\]` pair inside string.punctuation is parsed as
    # an escaped bracket, so backslashes are never matched (and never removed).
    return re.sub('[' + re.escape(string.punctuation) + ']', ' ', text)

def removeUrl(text):
    """Blank out URL-like runs in ``text``.

    A match starts at any occurrence of ``http`` (optionally followed by
    ``s``) that has at least one more non-whitespace character after it, and
    extends to the next whitespace. Each match is replaced by a single space.
    """
    urlPattern = re.compile(r'http(s?)\S+')
    return urlPattern.sub(' ', text)

def removeStopWords(text):
    """Blank out English stop words in ``text``.

    The text is split on whitespace; every token found in NLTK's English
    stop-word list is replaced by an empty string, and the tokens are joined
    back with single spaces. A removed word therefore leaves an extra space
    behind, exactly as before.

    Args:
        text: the (already lower-cased) string to filter.

    Returns:
        The filtered string.
    """
    # The stop-word list is loaded once and cached on the function object as a
    # frozenset. Previously it was rebuilt for every review and searched as a
    # list, which dominates the runtime when mapped over a whole DataFrame.
    stopSet = getattr(removeStopWords, '_stopSet', None)
    if stopSet is None:
        stopSet = frozenset(stopwords.words('english'))
        removeStopWords._stopSet = stopSet
    return ' '.join('' if w in stopSet else w for w in text.split())

def lemmization(text):
    """Lemmatize each whitespace-separated word of ``text`` with WordNet.

    The part of speech passed to the lemmatizer is the one of the word's
    first WordNet synset. Words that have no synset at all are dropped from
    the result. The lemmas are joined back with single spaces.
    """
    lemmatizer = WordNetLemmatizer()
    lemmas = []
    for w in text.split():
        senses = wordnet.synsets(w)
        if len(senses) > 0:
            lemmas.append(lemmatizer.lemmatize(w, senses[0].pos()))
    return ' '.join(lemmas)

def stemming(text):
    """Apply the English Snowball stemmer to every whitespace-separated word.

    Returns the stems joined back with single spaces.
    """
    stemmer = SnowballStemmer('english')
    return ' '.join(map(stemmer.stem, text.split()))
            
def cleanText(text):
    """Run the full cleaning pipeline on one review.

    Steps, in order: lower-case, strip URLs, strip punctuation, blank out
    English stop words. URLs are removed before punctuation so that the
    ``://`` and dots are still present when the URL pattern is applied.
    """
    cleaned = text.lower()
    for step in (removeUrl, removePunctuation, removeStopWords):
        cleaned = step(cleaned)
    # Optional normalisation steps, currently disabled:
    #     cleaned = stemming(cleaned)
    #     cleaned = lemmization(cleaned)
    return cleaned

Creating the dataframes to be used later in the Torch Datasets. The texts are parsed, then a map is used to clean the text

In [9]:
def parseInput(text):
    """Split one raw dataset line into ``(review_text, label)``.

    The line is decoded as UTF-8 and split once on whitespace: the first
    token carries the label, the remainder is the review. The label is the
    last character of the first token minus one (presumably the tokens are
    ``__label__1`` / ``__label__2``, giving 0 / 1; verify against the data).

    Args:
        text: one line of the file, as ``bytes``.

    Returns:
        Tuple ``(data, label)`` where ``data`` is the review text (trailing
        whitespace/newline preserved) and ``label`` is an ``int``. A line that
        contains only the label yields an empty ``data`` string.

    Raises:
        IndexError: if the line is empty or whitespace-only (no label token).
        ValueError: if the last character of the first token is not a digit.
    """
    parts = text.decode('utf-8').split(maxsplit=1)
    label = int(parts[0][-1]) - 1
    # A label with no review after it used to raise IndexError here.
    data = parts[1] if len(parts) > 1 else ''
    return data, label

# Wrap the raw byte lines in single-column frames (the column is named 0).
trainData = pd.DataFrame(trainFile)
testData = pd.DataFrame(testFile)

# parseInput returns a (text, label) pair per row; zip(*...) transposes those
# pairs into one tuple of texts and one tuple of labels for the new columns.
trainData['text'], trainData['target'] = zip(*trainData[0].map(parseInput))
testData['text'], testData['target'] = zip(*testData[0].map(parseInput))

In [10]:
# Preview the parsed frame: raw line (column 0), 'text' and 'target'.
trainData.head()

In [11]:
# The raw line lists are no longer needed once the DataFrames are built.
del trainFile
del testFile

In [12]:
# Force a garbage-collection pass after deleting the raw line lists.
gc.collect()

In [13]:
# Apply the cleaning pipeline to every review; the 'text' column is
# overwritten, so the un-cleaned text is not kept.
trainData['text'] = trainData['text'].map(cleanText)
testData['text'] = testData['text'].map(cleanText)

In [14]:
# Drop the raw, undecoded line (column 0) now that 'text' and 'target' exist.
# The test frame is cleaned as well so it does not carry the bytes column
# around; errors='ignore' keeps the cell safe to re-run (the in-place drop
# raised KeyError on a second execution).
trainData = trainData.drop(columns=[0], errors='ignore')
testData = testData.drop(columns=[0], errors='ignore')

In [15]:
# Summary statistics for every column (text and numeric alike).
trainData.describe(include='all')

In [16]:
# Preview the frame after cleaning.
trainData.head()

In [17]:
# Hold out 20% of the training rows for validation, stratified on the label
# and with a fixed random_state, then renumber both parts from 0.
# NOTE(review): trainData is rebound to the 80% part, so re-running this cell
# splits the already-reduced frame again.
trainData, validData = (
    part.reset_index(drop=True)
    for part in train_test_split(trainData,
                                 train_size=0.8,
                                 random_state=42,
                                 stratify=trainData['target'])
)

### Small EDA: showing the length distribution of the tokenized texts and the label distribution

In [18]:
# Whitespace-token count of every training review, sorted ascending and
# re-indexed from 0, drawn as a curve (x = rank, y = length).
sns.lineplot(data=trainData['text'].str.split().str.len().sort_values(ignore_index=True))

In [19]:
# Density estimate of the whitespace-token count per cleaned training review.
sns.kdeplot(trainData['text'].str.split().str.len(), fill=True)

In [20]:
# Number of training rows per label value.
sns.countplot(x=trainData['target'])

# Logistic Regression with Tf-Idf Vectorizer

In [21]:
# TF-IDF features over 1- to 3-grams, capped at 1500 features.
# NOTE(review): stop_words='english' applies scikit-learn's own stop-word list
# on top of the NLTK filtering already done in cleanText.
vectorizer = TfidfVectorizer(max_features = 1500, 
                             stop_words = 'english', 
                             ngram_range = (1,3))

# The vocabulary and IDF weights are fitted on the training split only; the
# validation split is transformed with that fitted vectorizer.
X = vectorizer.fit_transform(trainData['text'])
X_valid = vectorizer.transform(validData['text'])
y = trainData['target']
y_valid = validData['target']

In [22]:
# Baseline classifier: logistic regression with library-default
# hyper-parameters, trained on the TF-IDF features.
lr_model = LogisticRegression()
lr_model.fit(X, y)

In [23]:
# Predict on both splits and draw the two confusion matrices side by side.
y_pred = lr_model.predict(X)
y_pred_valid = lr_model.predict(X_valid)
# confusion_matrix(y_true, y_pred): rows are true labels, columns predictions.
conf_mat = confusion_matrix(y, y_pred)
conf_mat_valid = confusion_matrix(y_valid, y_pred_valid)
fig, ax = plt.subplots(1,2, figsize=(15,5))
# fmt='d' prints the integer counts as-is; the default annotation format
# ('.2g') would render large counts in scientific notation (e.g. 1.4e+05).
sns.heatmap(conf_mat, annot=True, fmt='d', cmap = 'Blues', ax=ax[0])
ax[0].set_title('Training Confusion Matrix')
ax[0].set(xlabel='Predicted label', ylabel='True label')
sns.heatmap(conf_mat_valid, annot=True, fmt='d', cmap = 'Blues', ax=ax[1])
ax[1].set_title('Validation Confusion Matrix')
ax[1].set(xlabel='Predicted label', ylabel='True label')

In [24]:
# Accuracy / precision / recall / F1 of the baseline on the training split,
# reported for the positive class (label 1).
# scikit-learn lays the matrix out as rows = true label, columns = predicted
# label, so for labels {0, 1}:
#   conf_mat[1][1] = true positives    conf_mat[1][0] = false negatives
#   conf_mat[0][1] = false positives   conf_mat[0][0] = true negatives
# The previous formulas used conf_mat[0][0] as "true positives", which scored
# the negative class instead.
tot_sum = np.sum(conf_mat.ravel())
accuracy = (conf_mat[0][0] + conf_mat[1][1]) / tot_sum
recall = (conf_mat[1][1])/(conf_mat[1][1] + conf_mat[1][0])
precision = (conf_mat[1][1])/(conf_mat[1][1] + conf_mat[0][1])
f1_score = 2*precision*recall / (precision + recall)
# Printed order: accuracy - precision - recall - F1.
print(f'Training scores: {accuracy:.2f} - {precision:.2f} - {recall:.2f} - {f1_score:.2f}')

### Creating the Datasets

In [25]:
class CustomDataset(Dataset):
    """Map-style dataset that tokenizes one review per item.

    Args:
        df: DataFrame with a 'text' column and, unless ``test`` is True, a
            'target' column.
        model: name of the pretrained tokenizer to load.
        max_len: maximum number of tokens per item (longer texts are
            truncated by the tokenizer).
        test: when True the frame is treated as unlabeled and items carry no
            'target' entry.

    Each item is a dict with 1-D tensors 'input_ids' and 'attention_mask' of
    equal length (no padding is applied here) and, for labeled data, a scalar
    float tensor 'target'.
    """

    def __init__(self, df, model='distilbert-base-uncased', max_len = 100, test = False):
        # Rows are ordered by whitespace-token count so that neighbouring
        # indices hold texts of similar length; the index is renumbered 0..n-1.
        self.df = df.sort_values(by='text',
                                 key=lambda x : x.str.split().str.len(),
                                 ignore_index = True)
        self.max_len = max_len
        self.tokenizer = AutoTokenizer.from_pretrained(model)
        self.test = test

    def __len__(self):
        """Return the number of rows in the dataset."""
        return len(self.df)

    def __getitem__(self, idx):
        """Tokenize and return the item at position ``idx``.

        Raises:
            IndexError: if ``idx`` is out of range. Raising IndexError (the
                label-based lookup used before raised KeyError) is what lets
                ``for item in dataset`` stop cleanly after the last row.
        """
        if not -len(self.df) <= idx < len(self.df):
            raise IndexError(f'index {idx} is out of range for dataset of size {len(self.df)}')

        text = self.df['text'].iat[idx]
        tokenized_text = self.tokenizer.encode_plus(text,
                                                    max_length=self.max_len,
                                                    padding=False,
                                                    truncation=True,
                                                    add_special_tokens=True,
                                                    return_tensors='pt')

        # return_tensors='pt' adds a leading batch axis of size 1; strip it
        # from BOTH tensors so ids and mask always have the same 1-D shape.
        item = {'input_ids': tokenized_text['input_ids'].squeeze(0),
                'attention_mask': tokenized_text['attention_mask'].squeeze(0)}

        # Unlabeled (test=True) data simply has no 'target' key; building a
        # tensor from None, as before, raised an error.
        if not self.test:
            item['target'] = torch.tensor(self.df['target'].iat[idx], dtype=torch.float)

        return item
        
        

In [26]:
# One dataset per split, all with the default tokenizer and max_len; each
# instance loads its own tokenizer. The test split is built with labels
# (test=False) so its targets stay available for evaluation.
trainDataset = CustomDataset(trainData)
validDataset = CustomDataset(validData)
testDataset = CustomDataset(testData)

In [27]:
# Sanity check: inspect the first (shortest) tokenized training item.
trainDataset[0]

In [28]:
# Force a garbage-collection pass after building the datasets.
gc.collect()

Here we define the collate function that is applied to every batch. It performs dynamic padding: each sequence is padded only up to the longest sequence found in its own batch, rather than to a global maximum length. Since the dataset is sorted by sequence length, neighbouring samples have similar lengths, which keeps the amount of padding small. This is useful to speed up BERT (and its variants).

In [29]:
def batch_padding(batch):
    """Collate function: pad every item to the longest sequence in the batch.

    Args:
        batch: list of dicts with tensor entries 'input_ids' and
            'attention_mask' (1-D, or with a leading axis of size 1) and an
            optional scalar 'target'.

    Returns:
        Dict with 'input_ids' and 'attention_mask' as (batch, max_len) integer
        tensors, right-padded with 0, plus 'target' as a (batch,) tensor when
        every item provides one.
    """
    # reshape(-1) flattens to 1-D whatever the incoming shape is. The former
    # .squeeze() turned a one-token mask into a 0-d tensor, whose .tolist()
    # is a bare number that cannot be concatenated with the padding list.
    id_rows = [b['input_ids'].reshape(-1).tolist() for b in batch]
    mask_rows = [b['attention_mask'].reshape(-1).tolist() for b in batch]
    max_len = max(len(row) for row in id_rows)

    batch_padded_texts = []
    batch_attention_mask = []
    for ids, mask in zip(id_rows, mask_rows):
        pad = (max_len - len(ids)) * [0]
        batch_padded_texts.append(ids + pad)
        batch_attention_mask.append(mask + pad)

    padded = {'input_ids': torch.tensor(batch_padded_texts),
              'attention_mask': torch.tensor(batch_attention_mask)}

    # Targets are optional so unlabeled batches can be collated too.
    if all('target' in b for b in batch):
        padded['target'] = torch.stack([torch.as_tensor(b['target']) for b in batch])

    return padded

### Define the Model and DataLoader configurations

In [30]:
# Batch sizes handed to the training / validation DataLoaders.
TRAIN_BATCH_SIZE = 200
VALID_BATCH_SIZE = 100
# Number of passes over the training data; also sizes the LR schedule.
EPOCHS = 2
# Dropout probability applied before the prediction layer of CustomModel.
DROPOUT_PROB = 0.3
# Learning rate and epsilon passed to the AdamW optimizer.
LR = 2e-5
EPS = 1e-6
# Warm-up steps of the linear schedule (0 = decay starts immediately).
WARMUP_STEPS = 0

### Create the Dataloaders

In [31]:
# batch_padding pads each batch to its own longest sequence.
# NOTE(review): with shuffle=True the training batches are drawn in random
# order, so a batch can mix short and long reviews even though the dataset is
# stored sorted by length; the validation loader keeps the sorted order.
trainDataLoader = DataLoader(trainDataset, batch_size=TRAIN_BATCH_SIZE, shuffle=True, pin_memory=True, collate_fn=batch_padding)
validDataLoader = DataLoader(validDataset, batch_size=VALID_BATCH_SIZE, shuffle=False, pin_memory=True, collate_fn=batch_padding)

In [32]:
class CustomModel(nn.Module):
    """Pretrained transformer encoder with a one-unit sigmoid head.

    Args:
        model: name of the pretrained checkpoint to load as the encoder.

    ``forward`` returns a (batch, 1) tensor of probabilities, suitable for
    ``nn.BCELoss``.
    """

    def __init__(self, model='distilbert-base-uncased'):
        super(CustomModel, self).__init__()
        config = AutoConfig.from_pretrained(model)
        self.model = AutoModel.from_pretrained(model, config=config)
        # Width of the encoder's hidden states. `hidden_size` is the generic
        # config attribute; `dim` is the DistilBERT-specific name and is kept
        # as a fallback, so checkpoints other than DistilBERT also work.
        hidden_dim = getattr(self.model.config, 'hidden_size', None)
        if hidden_dim is None:
            hidden_dim = self.model.config.dim
        # `dense` and `relu` are not used by forward() at the moment (see the
        # disabled lines there); they are kept so the module's parameters and
        # state_dict layout stay unchanged.
        self.dense = nn.Linear(hidden_dim, hidden_dim)
        self.prediction = nn.Linear(hidden_dim, 1)
        self.relu = nn.ReLU()
        self.sigmoid = nn.Sigmoid()
        self.dropout = nn.Dropout(DROPOUT_PROB)

    def forward(self, seq, attn_mask):
        """Return the positive-class probability for each sequence.

        Args:
            seq: token ids, passed to the encoder as ``input_ids``.
            attn_mask: attention mask, passed as ``attention_mask``.
        """
        output = self.model(input_ids = seq, attention_mask = attn_mask)
        output = output[0] #Take the hidden states of the last layer (batch_size, seq_len, hidden_dim)
        output = output[:,0] #Take the hidden state returned by the CLS token
        # Optional extra hidden layer, currently disabled:
#         output = self.dense(output)
#         output = self.relu(output)
        output = self.dropout(output)
        output = self.prediction(output)
        output = self.sigmoid(output)
        return output

In [33]:
# Build the model with the default checkpoint and move it to the chosen device.
model = CustomModel().to(device)

In [34]:
#Training Loop
optimizer = AdamW(model.parameters(), lr=LR, eps=EPS) # ALERT: this is the transformers library implementation one
scheduler = get_linear_schedule_with_warmup(optimizer = optimizer,
                                           num_warmup_steps = WARMUP_STEPS,
                                           num_training_steps = EPOCHS*len(trainDataLoader))
loss = nn.BCELoss().to(device)

def training_loop(trainDL, validDL, epochs, model, loss, optimizer, scheduler):
    """Fine-tune ``model`` and report loss/accuracy after every epoch.

    Args:
        trainDL: DataLoader yielding training batches (dicts with
            'input_ids', 'attention_mask', 'target').
        validDL: DataLoader yielding validation batches of the same form.
        epochs: number of passes over ``trainDL``.
        model: module called as ``model(input_ids, attention_mask)`` and
            returning (batch, 1) probabilities.
        loss: loss function applied to ``(predictions, targets)``.
        optimizer: optimizer stepped after every training batch.
        scheduler: learning-rate scheduler stepped after every training batch.

    Returns:
        List with one dict per epoch holding 'epoch', 'training_loss',
        'training_accuracy', 'validation_loss' and 'validation_accuracy'
        (all plain Python numbers, averaged over batches).

    Tensors are moved to the module-level ``device``.
    """
    loss_history=[]
    for epoch in tqdm(range(1,epochs+1), unit='epoch', desc='Epoch '):
        training_loss = 0.0
        mean_acc = 0.0
        valid_loss = 0.0
        mean_acc_valid = 0.0

        model.train() #Make sure dropout is active even on the first epoch

        for X in tqdm(trainDL, unit='batch', desc='Training progress batch: ', leave=False): #For each batch
            y_pred = model(X['input_ids'].to(device), X['attention_mask'].to(device))
            loss_pred = loss(y_pred, X['target'].unsqueeze(-1).to(device))
            loss_pred.backward()
            optimizer.step()
            scheduler.step()
            optimizer.zero_grad()
            # .item() yields a plain float. Accumulating the loss tensor
            # itself keeps every batch's autograd graph alive until the end
            # of the epoch and steadily eats (GPU) memory.
            training_loss += loss_pred.item()
            mean_acc += accuracy_score(X['target'].int(),
                                       torch.round(y_pred.detach().to(torch.device('cpu'))).int())
            del y_pred, loss_pred

        model.eval() #Set evaluation mode for all the dropout and BN layers

        with torch.no_grad():
            for V in tqdm(validDL, unit='batch', desc='Valid progress batch: ', leave=False):
                y_valid_pred = model(V['input_ids'].to(device),
                                     V['attention_mask'].to(device))
                loss_valid_pred = loss(y_valid_pred, V['target'].unsqueeze(-1).to(device))
                valid_loss += loss_valid_pred.item()
                mean_acc_valid += accuracy_score(V['target'].int(),
                                                 torch.round(y_valid_pred.detach().to(torch.device('cpu'))).int())
                del y_valid_pred, loss_valid_pred

        model.train() #Reset the training switch

        # Per-epoch figures are means of the per-batch values.
        mean_acc/=len(trainDL)
        training_loss/=len(trainDL)
        valid_loss/=len(validDL)
        mean_acc_valid/=len(validDL)

        loss_history.append({'epoch': epoch,
                             'training_loss': training_loss,
                             'training_accuracy': mean_acc,
                             'validation_loss': valid_loss,
                             'validation_accuracy': mean_acc_valid})

        print(f"Epoch {epoch}:")
        print(f"\tTraining Loss {training_loss} - Accuracy {mean_acc}")
        print(f"\tValidation Loss {valid_loss} - Accuracy {mean_acc_valid}")

    return loss_history

In [35]:
# Fine-tune the model for EPOCHS epochs, validating after each one.
training_loop(trainDataLoader,validDataLoader, EPOCHS, model, loss, optimizer, scheduler)

In [36]:
# Force a garbage-collection pass after training.
gc.collect()

In [1]:
# Collect the model's probability and the true label for every test sample.
y_test_pred = []
y_test = []

model.eval()

# no_grad: inference only. Without it every stored prediction keeps its
# autograd graph alive and memory grows with each sample.
with torch.no_grad():
    # Index explicitly instead of iterating the dataset object: plain
    # iteration relies on __getitem__ raising IndexError past the last row.
    for i in tqdm(range(len(testDataset)), desc='Evaluating test samples'):
        x = testDataset[i]
        # reshape(1, -1) gives both tensors the same (1, seq_len) shape
        # whether the dataset returns them 1-D or with a leading axis of 1.
        input_ids = x['input_ids'].reshape(1, -1).to(device)
        attention_mask = x['attention_mask'].reshape(1, -1).to(device)
        y_test_pred.append(model(input_ids, attention_mask).cpu())
        y_test.append(x['target'])
    