In [None]:
cd /content/drive/MyDrive/tophd/snowman-application-tasks-ay21-22/dataset

/content/drive/MyDrive/tophd/snowman-application-tasks-ay21-22/dataset


In [None]:
import pandas as pd
# Load the training split. The path is relative, so it resolves against the
# working directory selected by the `cd` cell above; the CSV is decoded as cp1252.
df_train = pd.read_csv("train.csv", encoding='cp1252')

In [None]:
# Load the test split (relative path, decoded as cp1252 like train.csv).
df_test = pd.read_csv("test.csv",encoding = 'cp1252')

In [None]:
#preprocessing
import nltk
nltk.download("stopwords")
from nltk.corpus import stopwords

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [None]:
import re
def text_preprocessing(s):
  """Normalize a raw tweet into lower-case ASCII words separated by single spaces.

  Processing order:
    1. stringify and lower-case the input (so ``None`` becomes ``"none"``),
    2. turn every punctuation character, including ``_``, into a space,
    3. delete whatever is left that is not an ASCII letter, digit or whitespace
       (this also deletes ``?`` and accented letters, without inserting a space),
    4. mask runs of 2+ digits with ``#`` (runs of 5 or more collapse to ``#####``;
       a single digit is left as is),
    5. collapse whitespace and strip both ends.

  Args:
    s: the text to clean; any object is accepted and converted with ``str``.

  Returns:
    The cleaned string.
  """
  # Lower-case
  s = str(s).lower()
  # Isolate common punctuation with spaces so the words around it stay separate
  s = re.sub(r'([\'\"\.\(\)\!\?\\\/\,\^\*])', r' \1 ', s)
  # Replace punctuation with a space. '_' is listed explicitly because \w matches it;
  # '?' survives this step but is deleted by the ASCII filter below.
  s = re.sub(r'[^\w\s\?]|_', ' ', s)
  # Remove some special characters
  s = re.sub(r'([\;\:\|•«\n])', ' ', s)
  # Keep only ASCII letters, digits and whitespace.
  # (The range used to be 'A-z', which also matches [ \ ] ^ _ ` and let '_' through.)
  s = re.sub(r'[^a-zA-Z0-9\s]', '', s)
  # Mask digit runs, longest first, so every run is rewritten exactly once
  s = re.sub('[0-9]{5,}', '#####', s)
  s = re.sub('[0-9]{4}', '####', s)
  s = re.sub('[0-9]{3}', '###', s)
  s = re.sub('[0-9]{2}', '##', s)
  # Collapse repeated whitespace and trim the ends
  s = re.sub(r'\s+', ' ', s).strip()

  return s

In [None]:
# NOTE: plain assignment only binds a second name to the same DataFrame object
# (no copy is made), so in-place changes made through `train` / `test` are also
# visible through `df_train` / `df_test`.
train = df_train
test = df_test

In [None]:
# Build the cleaned frames from the raw ones instead of overwriting the column
# in place. `assign` returns a new DataFrame, so df_train / df_test keep the
# original text and re-running this cell always starts from raw tweets
# (cleaning already-cleaned text would strip the '#' digit masks).
train = df_train.assign(Tweets=df_train['Tweets'].apply(text_preprocessing))
test = df_test.assign(Tweets=df_test['Tweets'].apply(text_preprocessing))

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 10% of the labelled tweets for validation. The split is stratified on
# the label column and uses a fixed random_state so it is repeatable.
X_train, X_val, y_train, y_val = train_test_split(train['Tweets'], train['Label'],
                                                    stratify=train['Label'], 
                                                    test_size=0.1, random_state = 24)

In [None]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
## Fit the vocabulary on the training tweets only (capped with num_words=20000)
tokenizer = Tokenizer(num_words=20000)
tokenizer.fit_on_texts(list(X_train))

## Turn each split into integer id sequences, padded/truncated to 64 ids per tweet
X_train, X_val = (
    pad_sequences(tokenizer.texts_to_sequences(split), maxlen=64)
    for split in (X_train, X_val)
)

In [None]:
#Encode Label
from sklearn.preprocessing import LabelEncoder
# Learn the label -> integer mapping from the training labels only,
# then encode both splits with that same mapping.
le = LabelEncoder().fit(y_train.values)
y_train, y_val = le.transform(y_train.values), le.transform(y_val.values)

In [None]:
le.classes_

array(['none', 'racism', 'sexism'], dtype=object)

In [None]:
## FUNCTIONS TAKEN FROM https://www.kaggle.com/gmhost/gru-capsule
import numpy as np
max_features = 20000
def load_glove(word_index, embedding_file='./glove.840B.300d.txt'):
    """Build an embedding matrix for ``word_index`` from a GloVe-style text file.

    The file is streamed line by line and only vectors for words that can end
    up in the matrix (a word itself or its capitalized form) are kept, so the
    full embedding vocabulary is never held in memory.

    Args:
        word_index: mapping ``word -> integer id`` (e.g. ``tokenizer.word_index``).
            Entries whose id is >= ``max_features`` are ignored.
        embedding_file: path of the embedding file. Each line is a token
            followed by its coefficients, separated by single spaces.

    Returns:
        Array of shape ``(min(max_features, len(word_index) + 1), dim)`` where
        ``dim`` is the vector length found in the file (capped at 300). Row
        ``i`` holds the vector of the word with id ``i``; rows without a
        pretrained vector keep their random normal initialisation.

    Raises:
        ValueError: if the file contains no embedding vector at all.
    """
    emb_mean, emb_std = -0.005838499, 0.48782197

    # Tokens worth keeping: every in-range word plus its capitalized fallback.
    wanted = set()
    for word, i in word_index.items():
        if i < max_features:
            wanted.add(word)
            wanted.add(word.capitalize())

    embeddings_index = {}
    embed_size = None
    # Context manager closes the file; explicit encoding avoids locale surprises.
    with open(embedding_file, encoding='utf-8') as fh:
        for line in fh:
            # Split on a single space only, exactly like the original parser.
            token, *coefs = line.rstrip().split(" ")
            if not coefs:
                continue  # blank or malformed line
            if embed_size is None:
                embed_size = min(len(coefs), 300)
            if token in wanted:
                embeddings_index[token] = np.asarray(coefs, dtype='float32')[:300]

    if embed_size is None:
        raise ValueError('no embedding vectors found in {!r}'.format(embedding_file))

    nb_words = min(max_features, len(word_index) + 1)
    embedding_matrix = np.random.normal(emb_mean, emb_std, (nb_words, embed_size))
    for word, i in word_index.items():
        if i >= max_features:
            continue
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is None:
            # Fall back to the capitalized spelling of the word.
            embedding_vector = embeddings_index.get(word.capitalize())
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector
    return embedding_matrix

In [None]:
#load glove
embedding_matrix = load_glove(tokenizer.word_index)

  exec(code_obj, self.user_global_ns, self.user_ns)


**CNN Model**

In [None]:
import torch.nn as nn
import torch.nn.functional as F
embed_size = 300 # size of each word vector
class CNN_Classifier(nn.Module):
  def __init__(self):
    super(CNN_Classifier, self).__init__()
    filter_sizes = [1,2,3,5]
    num_filters = 36
    n_classes = len(le.classes_)
    self.embedding = nn.Embedding(max_features, embed_size)
    self.embedding.weight = nn.Parameter(torch.tensor(embedding_matrix, dtype=torch.float32))
    self.embedding.weight.requires_grad = False
    self.convs1 = nn.ModuleList([nn.Conv2d(1, num_filters, (K, embed_size)) for K in filter_sizes])
    self.dropout = nn.Dropout(0.1)
    self.fc1 = nn.Linear(len(filter_sizes)*num_filters, n_classes)

  def forward(self, x):
    x = self.embedding(x)  
    x = x.unsqueeze(1)  
    x = [F.relu(conv(x)).squeeze(3) for conv in self.convs1] 
    x = [F.max_pool1d(i, i.size(2)).squeeze(2) for i in x]  
    x = torch.cat(x, 1)
    x = self.dropout(x)  
    logit = self.fc1(x) 
    return logit

In [None]:
import torch
# Use the GPU when CUDA is available, otherwise fall back to the CPU
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(device) 

cuda


In [None]:
#call model
model = CNN_Classifier()
model.to(device)

CNN_Classifier(
  (embedding): Embedding(20000, 300)
  (convs1): ModuleList(
    (0): Conv2d(1, 36, kernel_size=(1, 300), stride=(1, 1))
    (1): Conv2d(1, 36, kernel_size=(2, 300), stride=(1, 1))
    (2): Conv2d(1, 36, kernel_size=(3, 300), stride=(1, 1))
    (3): Conv2d(1, 36, kernel_size=(5, 300), stride=(1, 1))
  )
  (dropout): Dropout(p=0.1, inplace=False)
  (fc1): Linear(in_features=144, out_features=3, bias=True)
)

In [None]:
#loss function and optimizer
# reduction='sum' adds up the per-sample losses of a batch instead of averaging
# them, so the loss values reported during training are per-batch sums.
loss_fn = nn.CrossEntropyLoss(reduction='sum')
# Only parameters that still require gradients are handed to Adam.
optimizer = torch.optim.Adam(filter(lambda p: p.requires_grad, model.parameters()), lr=1e-3)

In [None]:
# Convert the padded id sequences and the encoded labels to int64 tensors
X_train, y_train, X_valid, y_valid = (
    torch.tensor(arr, dtype=torch.long)
    for arr in (X_train, y_train, X_val, y_val)
)

In [None]:
# Place all four tensors on the selected device
X_train, y_train, X_valid, y_valid = (
    tensor.to(device) for tensor in (X_train, y_train, X_valid, y_valid)
)

In [None]:
# Pair each feature tensor with its label tensor
train_dataset, val_dataset = (
    torch.utils.data.TensorDataset(features, labels)
    for features, labels in ((X_train, y_train), (X_valid, y_valid))
)

In [None]:
# Batch both datasets; only the training batches are reshuffled every epoch
batch_size = 512
loader_options = dict(batch_size=batch_size)
train_dataloader = torch.utils.data.DataLoader(train_dataset, shuffle=True, **loader_options)
val_dataloader = torch.utils.data.DataLoader(val_dataset, shuffle=False, **loader_options)

In [None]:
import time

n_epochs = 5
train_loss = []
valid_loss = []

for epoch in range(n_epochs):
  # ---- training pass ----
  model.train()
  avg_loss = 0.
  for x_batch, y_batch in train_dataloader:
    # Predict
    y_pred = model(x_batch)
    # Compute loss
    loss = loss_fn(y_pred, y_batch)
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    # Running mean over batches of whatever loss_fn returns for one batch
    # (this notebook builds loss_fn with reduction='sum', i.e. a per-batch sum)
    avg_loss += loss.item() / len(train_dataloader)

  # ---- validation pass ----
  model.eval()
  avg_val_loss = 0.
  val_preds = np.zeros((len(X_val), len(le.classes_)))

  # No gradients are needed for evaluation; no_grad avoids building the graph
  with torch.no_grad():
    for i, (x_batch, y_batch) in enumerate(val_dataloader):
      y_pred = model(x_batch)
      avg_val_loss += loss_fn(y_pred, y_batch).item() / len(val_dataloader)
      # Store class probabilities. The slice positions rely on val_dataloader
      # yielding batches in dataset order; dim=1 normalises over the class axis.
      val_preds[i * batch_size:(i + 1) * batch_size] = F.softmax(y_pred, dim=1).cpu().numpy()

  # Fraction of validation tweets whose arg-max class matches the label
  val_accuracy = (val_preds.argmax(axis=1) == y_val).mean()
  train_loss.append(avg_loss)
  valid_loss.append(avg_val_loss)
  print('Epoch {} \t loss={:.2f} \t val_loss={:.2f}  \t val_acc={:.2f}'.format(epoch + 1, avg_loss, avg_val_loss, val_accuracy))



Epoch 1 	 loss=342.77 	 val_loss=225.72  	 val_acc=0.78
Epoch 2 	 loss=231.25 	 val_loss=203.87  	 val_acc=0.80
Epoch 3 	 loss=200.71 	 val_loss=184.16  	 val_acc=0.82
Epoch 4 	 loss=181.90 	 val_loss=179.49  	 val_acc=0.83
Epoch 5 	 loss=164.73 	 val_loss=171.56  	 val_acc=0.84


In [None]:
# Persist the trained CNN. The module object itself is passed to torch.save,
# so the whole object is serialised rather than only its weights.
# NOTE(review): loading this file presumably needs the CNN_Classifier class to be
# defined/importable; saving model.state_dict() is the usual alternative. Confirm.
torch.save(model,'CNN_Classifier.pt')

In [None]:
#predict

def predict(x):
    """Return the predicted class name for one preprocessed tweet.

    Uses the notebook globals ``tokenizer``, ``model``, ``le`` and ``device``.

    Args:
        x: a single tweet as a string, already cleaned the same way as the
            training text.

    Returns:
        The entry of ``le.classes_`` with the highest predicted probability.
    """
    # Tokenize and pad exactly like the training data (maxlen=64)
    seqs = tokenizer.texts_to_sequences([x])
    padded = pad_sequences(seqs, maxlen=64)
    batch = torch.tensor(padded, dtype=torch.long).to(device)

    # Make sure dropout is disabled and skip gradient tracking for inference
    model.eval()
    with torch.no_grad():
        # dim=1 is the class axis; stating it avoids the implicit-dim deprecation warning
        probs = F.softmax(model(batch), dim=1).cpu().numpy()

    labels = le.classes_[probs.argmax(axis=1)]
    return labels[0]

In [None]:
# Classify every test tweet one at a time with the currently bound `model`
y_CNN_pred = [predict(sent) for sent in test.Tweets.values]

  if sys.path[0] == '':


In [None]:
y_true = df_test['Label']

In [None]:
from sklearn.metrics import classification_report
print(classification_report(y_true, y_CNN_pred))

              precision    recall  f1-score   support

        none       0.76      0.98      0.86      2186
      racism       1.00      0.62      0.77       387
      sexism       0.69      0.18      0.29       633

    accuracy                           0.78      3206
   macro avg       0.82      0.59      0.64      3206
weighted avg       0.78      0.78      0.73      3206



**LSTM Model**

In [None]:
class BiLSTM_Classifier(nn.Module):    
  def __init__(self):
    super(BiLSTM_Classifier, self).__init__()
    self.hidden_size = 64
    drp = 0.1 #dropout
    n_classes = len(le.classes_)
    self.embedding = nn.Embedding(max_features, embed_size)
    self.embedding.weight = nn.Parameter(torch.tensor(embedding_matrix, dtype=torch.float32))
    self.embedding.weight.requires_grad = False
    self.lstm = nn.LSTM(embed_size, self.hidden_size, bidirectional=True, batch_first=True)
    self.linear = nn.Linear(self.hidden_size*4 , 64)
    self.relu = nn.ReLU()
    self.dropout = nn.Dropout(drp)
    self.out = nn.Linear(64, n_classes)

  def forward(self, x):
    h_embedding = self.embedding(x)
    #_embedding = torch.squeeze(torch.unsqueeze(h_embedding, 0))
    h_lstm, _ = self.lstm(h_embedding)
    avg_pool = torch.mean(h_lstm, 1)
    max_pool, _ = torch.max(h_lstm, 1)
    conc = torch.cat(( avg_pool, max_pool), 1)
    conc = self.relu(self.linear(conc))
    conc = self.dropout(conc)
    out = self.out(conc)
    return out

In [None]:
# Instantiate the BiLSTM classifier; this rebinds the global `model`, so code that
# reads `model` from here on uses the BiLSTM.
model = BiLSTM_Classifier()
# reduction='sum' makes the loss a per-batch sum rather than a per-sample mean.
loss_fn = nn.CrossEntropyLoss(reduction='sum')
# A new optimizer is built over the new model's parameters that require gradients.
optimizer = torch.optim.Adam(filter(lambda p: p.requires_grad, model.parameters()), lr=1e-3)
model.to(device)

BiLSTM_Classifier(
  (embedding): Embedding(20000, 300)
  (lstm): LSTM(300, 64, batch_first=True, bidirectional=True)
  (linear): Linear(in_features=256, out_features=64, bias=True)
  (relu): ReLU()
  (dropout): Dropout(p=0.1, inplace=False)
  (out): Linear(in_features=64, out_features=3, bias=True)
)

In [None]:
n_epochs = 10
train_loss = []
valid_loss = []

for epoch in range(n_epochs):
  # ---- training pass ----
  model.train()
  avg_loss = 0.
  for x_batch, y_batch in train_dataloader:
    # Predict
    y_pred = model(x_batch)
    # Compute loss
    loss = loss_fn(y_pred, y_batch)
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    # Running mean over batches of whatever loss_fn returns for one batch
    # (this notebook builds loss_fn with reduction='sum', i.e. a per-batch sum)
    avg_loss += loss.item() / len(train_dataloader)

  # ---- validation pass ----
  model.eval()
  avg_val_loss = 0.
  val_preds = np.zeros((len(X_val), len(le.classes_)))

  # No gradients are needed for evaluation; no_grad avoids building the graph
  with torch.no_grad():
    for i, (x_batch, y_batch) in enumerate(val_dataloader):
      y_pred = model(x_batch)
      avg_val_loss += loss_fn(y_pred, y_batch).item() / len(val_dataloader)
      # Store class probabilities. The slice positions rely on val_dataloader
      # yielding batches in dataset order; dim=1 normalises over the class axis.
      val_preds[i * batch_size:(i + 1) * batch_size] = F.softmax(y_pred, dim=1).cpu().numpy()

  # Fraction of validation tweets whose arg-max class matches the label
  val_accuracy = (val_preds.argmax(axis=1) == y_val).mean()
  train_loss.append(avg_loss)
  valid_loss.append(avg_val_loss)
  print('Epoch {} \t loss={:.2f} \t val_loss={:.2f}  \t val_acc={:.2f}'.format(epoch + 1, avg_loss, avg_val_loss, val_accuracy))



Epoch 1 	 loss=436.72 	 val_loss=349.30  	 val_acc=0.68
Epoch 2 	 loss=392.77 	 val_loss=308.08  	 val_acc=0.68
Epoch 3 	 loss=304.67 	 val_loss=233.01  	 val_acc=0.78
Epoch 4 	 loss=233.05 	 val_loss=199.82  	 val_acc=0.81
Epoch 5 	 loss=203.55 	 val_loss=189.96  	 val_acc=0.82
Epoch 6 	 loss=184.70 	 val_loss=178.99  	 val_acc=0.82
Epoch 7 	 loss=167.72 	 val_loss=174.72  	 val_acc=0.83
Epoch 8 	 loss=153.17 	 val_loss=173.39  	 val_acc=0.84
Epoch 9 	 loss=140.82 	 val_loss=177.06  	 val_acc=0.84
Epoch 10 	 loss=135.06 	 val_loss=164.93  	 val_acc=0.85


In [None]:
# Persist the trained BiLSTM. The module object itself is passed to torch.save,
# so the whole object is serialised rather than only its weights.
# NOTE(review): loading this file presumably needs the BiLSTM_Classifier class to be
# defined/importable; saving model.state_dict() is the usual alternative. Confirm.
torch.save(model,'LSTM_Classifier.pt')

In [None]:
#predict
def predict(x):
    """Return the predicted class name for one preprocessed tweet.

    Uses the notebook globals ``tokenizer``, ``model``, ``le`` and ``device``;
    ``model`` is looked up at call time, so the currently bound model is used.

    Args:
        x: a single tweet as a string, already cleaned the same way as the
            training text.

    Returns:
        The entry of ``le.classes_`` with the highest predicted probability.
    """
    # Tokenize and pad exactly like the training data (maxlen=64)
    seqs = tokenizer.texts_to_sequences([x])
    padded = pad_sequences(seqs, maxlen=64)
    batch = torch.tensor(padded, dtype=torch.long).to(device)

    # Make sure dropout is disabled and skip gradient tracking for inference
    model.eval()
    with torch.no_grad():
        # dim=1 is the class axis; stating it avoids the implicit-dim deprecation warning
        probs = F.softmax(model(batch), dim=1).cpu().numpy()

    labels = le.classes_[probs.argmax(axis=1)]
    return labels[0]

In [None]:
# Classify every test tweet one at a time with the currently bound `model`
y_LSTM_pred = [predict(sent) for sent in test.Tweets.values]

  # This is added back by InteractiveShellApp.init_path()


In [None]:
from sklearn.metrics import classification_report
print(classification_report(y_true, y_LSTM_pred))

              precision    recall  f1-score   support

        none       0.80      0.93      0.86      2186
      racism       1.00      0.62      0.76       387
      sexism       0.64      0.42      0.51       633

    accuracy                           0.79      3206
   macro avg       0.81      0.66      0.71      3206
weighted avg       0.79      0.79      0.78      3206

