In [1]:
import pandas as pd
import pickle
import numpy as np
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
import torch
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

In [2]:
def _read_pickle(file_name):
    return pd.read_pickle(file_name)

def read_data(file_name):
    
    df = _read_pickle(file_name)
    a = [] 
    for t in df:
        a.append(np.squeeze(t.numpy(), 0))

    return a


# Data Loading

In [16]:
train = torch.load('jigsaw_toxic-comment_train_embedded_CLS_english_bert.pkl')
#train = pd.read_csv('jigsaw_toxic-comment_train_embedded.pkl')
#train = pickle.load( open( 'jigsaw_toxic-comment_train_embedded.pkl', "rb" ) )

In [17]:
len(train)

223549

In [18]:
#val = torch.load('validation_es_only_text.pkl')
#val = pickle.load( open( 'validation_es_only_text.pkl', "rb" ) )
val = torch.load('es_ds_translated_aws_english_bert_distilbert-base_uncased.pkl')
val = [x.numpy() for x in val]

In [19]:
train_path = '../data/jigsaw-toxic-comment-train.csv'
val_path = '../data/val_es_toxic.csv' 

In [20]:
train_set = pd.read_csv(train_path)
val_set = pd.read_csv(val_path)

In [22]:
from sklearn.model_selection import train_test_split
X_train_eng, X_test_eng, y_train_eng, y_test_eng = train_test_split(train, train_set.toxic, test_size=0.33, random_state=42)

In [23]:
X_train_eng = np.squeeze(np.stack([x.numpy() for x in X_train_eng]))
X_test_eng = np.squeeze(np.stack([x.numpy() for x in X_test_eng]))
#val = np.squeeze(np.stack([x.numpy() for x in val]))

# Logistic regression

### Train - English only

In [24]:
%%time
pipe = make_pipeline(StandardScaler(), LogisticRegression( class_weight='balanced', max_iter=10000))
#pipe = make_pipeline(StandardScaler(), LogisticRegression( max_iter=1000))

pipe.fit(X_train_eng, y_train_eng)
#clf = LogisticRegression(random_state=0).fit(train[0:1000],list(val_set.toxic[0:1000]) )


CPU times: user 3min 7s, sys: 3.61 s, total: 3min 11s
Wall time: 50.5 s


Pipeline(steps=[('standardscaler', StandardScaler()),
                ('logisticregression',
                 LogisticRegression(class_weight='balanced', max_iter=10000))])

In [25]:
import joblib

joblib.dump(pipe, "models/Logistic_Regression/logistic_regression_LABSE_embedded.pkl") 
joblib.dump(X_train_eng, "models/X_and_y/X_train_eng.pkl") 
joblib.dump(X_test_eng, "models/X_and_y/X_test_eng.pkl") 
joblib.dump(y_train_eng, "models/X_and_y/y_train_eng.pkl")
joblib.dump(y_test_eng, "models/X_and_y/y_test_eng.pkl")


['models/X_and_y/y_test_eng.pkl']

# MLP Classifier

### Train - English only

In [26]:
%%time

from sklearn.neural_network import MLPClassifier

clf = MLPClassifier(hidden_layer_sizes=512, random_state=1, max_iter=300)

pipe = make_pipeline(StandardScaler(), clf)
pipe.fit(X_train_eng, y_train_eng)

CPU times: user 49min 14s, sys: 2min 6s, total: 51min 20s
Wall time: 12min 52s


Pipeline(steps=[('standardscaler', StandardScaler()),
                ('mlpclassifier',
                 MLPClassifier(hidden_layer_sizes=512, max_iter=300,
                               random_state=1))])

In [27]:
joblib.dump(pipe, "models/MLP_Classifier/MLP_Classifier_LABSE_embedded.pkl") 


FileNotFoundError: [Errno 2] No such file or directory: 'models/MLP_Classifier/MLP_Classifier_LABSE_embedded.pkl'

# Neural Network

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.optim import lr_scheduler
import numpy as np
import torchvision
from torchvision import datasets, models

class net(nn.Module):

    def __init__(self):

        super(net, self).__init__()
        # dropout layer
        self.dropout = nn.Dropout(0.1)
        # relu activation function
        self.relu =  nn.ReLU()
        # dense layer 1
        self.fc1 = nn.Linear(768,512)
        # dense layer 2 (Output layer)
        self.fc2 = nn.Linear(512,2)
        #softmax activation function
        self.softmax = nn.LogSoftmax(dim=1)
    
    #define the forward pass
    def forward(self, in_vec):
        
        #pass the inputs to the model  
        x = self.fc1(in_vec)
        x = self.relu(x)
        x = self.dropout(x)
        # output layer
        x = self.fc2(x)
        # apply softmax activation
        x = self.softmax(x)

        return x

In [None]:
from sklearn.utils.class_weight import compute_class_weight

#compute the class weights
class_weights = compute_class_weight('balanced', np.unique(y_train_eng), y_train_eng)

print("Class Weights:",class_weights)



In [None]:
import time
import copy

train_loss = []
train_con = []

eval_loss = []
eval_con = []

def train_model(model, dataloaders, criterion, optimizer, scheduler, num_epochs=25):
    since = time.time()

    # Init variables that will save info about the best model
    best_model_wts = copy.deepcopy(model.state_dict())
    best_acc = 0.0

    for epoch in range(num_epochs):
        print('Epoch {}/{}'.format(epoch, num_epochs - 1))
        print('-' * 10)

        # Each epoch has a training and validation phase
        for phase in ['train', 'val']:
            if phase == 'train':
                # Set model to training mode. 
                model.train()  
            else:
                # Set model to evaluate mode. In evaluate mode, we don't perform backprop and don't need to keep the gradients
                model.eval()   

            running_loss = 0.0
            running_corrects = 0

            # Iterate over data
            i = 0 
            for inputs, labels in dataloaders[phase]:
                # Prepare the inputs for GPU/CPU
                inputs = inputs.to(device)
                labels = labels.to(device)
                
                # zero the parameter gradients
                optimizer.zero_grad()

                # ===== forward pass ======
                with torch.set_grad_enabled(phase=='train'):
                    # If we're in train mode, we'll track the gradients to allow back-propagation
                    outputs = model(inputs) # apply the model to the inputs. The output is the softmax probability of each class
                    _, preds = torch.max(outputs, 1) # 
                    #print(outputs, labels)

                    loss = criterion(outputs, labels)

                    # ==== backward pass + optimizer step ====
                    # This runs only in the training phase
                    if phase == 'train':
                        loss.backward() # Perform a step in the opposite direction of the gradient
                        optimizer.step() # Adapt the optimizer


                # Collect statistics
                running_loss += loss.item() * inputs.size(0)
                running_corrects += torch.sum(preds == labels.data)

                
            if phase == 'train':
                # Adjust the learning rate based on the scheduler
                scheduler.step()
 

            epoch_loss = running_loss / dataset_sizes[phase]
            epoch_acc = running_corrects.double() / dataset_sizes[phase]
            #writer.add_histogram("conv1.weight_hist", model.conv1.weight, epoch)
            #writer.add_histogram("conv1.weight_avg", model.conv1.weight.mean(), epoch)
            #writer.add_histogram("model_loss", loss.item(), epoch)

            if phase == 'train':
                train_loss.append(epoch_loss)
                train_con.append(epoch_acc)
                i += 1
                #writer.add_scalar('training loss', running_loss / 1000, epoch * len(dataloaders['train']) + i) 
            if phase == 'val':
                eval_loss.append(epoch_loss)
                eval_con.append(epoch_acc)
                i += 1
                #writer.add_scalar('eval loss', running_loss / 1000, epoch * len(dataloaders['train']) + i) 
                #print("pred {0} lable: {1}".format(preds, labels.data))

            print(f'{phase} Loss: {epoch_loss:.4f} Acc: {epoch_acc:.4f}')

            # Keep the results of the best model so far
            if phase == 'val' and epoch_acc > best_acc:
                best_acc = epoch_acc
                # deepcopy the model
                best_model_wts = copy.deepcopy(model.state_dict())

        print()

    time_elapsed = time.time() - since
    print(f'Training complete in {(time_elapsed // 60):.0f}m {(time_elapsed % 60):.0f}s')
    print(f'Best val Acc: {best_acc:4f}')

    # load best model weights
    model.load_state_dict(best_model_wts)
    torch.save(model.state_dict(), 'saved_weights.pt')
    return model, train_loss, train_con, eval_loss, eval_con

In [None]:
from transformers import AdamW

model = net()
model.cuda()
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
#device = "cpu"
# converting list of class weights to a tensor
weights= torch.tensor(class_weights,dtype=torch.float)

# push to GPU
weights = weights.to(device)

# define the loss function
cross_entropy  = nn.NLLLoss(weight=weights) 

# define the optimizer
optimizer = AdamW(model.parameters(),
                  lr = 1e-5)          # learning rate

my_lr_scheduler = lr_scheduler.StepLR(optimizer, step_size=7, gamma=0.1)

In [None]:
import torch.utils.data as data_utils

tensor_x_train = torch.from_numpy(np.array(X_train_eng)) # transform to torch tensor
tensor_y_train = torch.from_numpy(np.array(y_train_eng))

tensor_x_val = torch.from_numpy(np.array(X_test_eng)) # transform to torch tensor
tensor_y_val = torch.from_numpy(np.array(y_test_eng))

datasets = {'train':data_utils.TensorDataset(tensor_x_train, tensor_y_train) ,'val':data_utils.TensorDataset(tensor_x_val, tensor_y_val)  }


dataloaders_my = {
    'train': torch.utils.data.DataLoader(datasets['train'], batch_size=16,
                                             shuffle=True, num_workers=2),
    'val': torch.utils.data.DataLoader(datasets['val'], batch_size=16,
                                          shuffle=False, num_workers=2)
  }

# Train - English only

In [None]:
%%time

dataset_sizes = {x: len(datasets[x]) for x in ['train', 'val']}

train_loss = []
train_con = []

eval_loss = []
eval_con = []

num_epochs=10
#inputs, labels = inputs.to(device), labels.to(device)

model, train_loss, train_con, eval_loss, eval_con  = train_model(model, 
                    dataloaders_my,
                       cross_entropy, 
                       optimizer, 
                       my_lr_scheduler,
                       num_epochs=num_epochs)

In [None]:
plt.figure(1)
plt.plot(train_loss, label='train')
plt.plot(eval_loss, label='eval')
plt.ylabel('Loss')
plt.xlabel('epoch')
plt.legend()

plt.figure(2)
plt.plot(train_con, label='train')
plt.plot(eval_con, label='eval')
plt.ylabel('accuracy')
plt.xlabel('epoch')

plt.legend()
plt.show()