In [1]:
import numpy as np
import pandas as pd


from keras.preprocessing import sequence
from keras.preprocessing.text import Tokenizer

from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import DataLoader
from torch.utils.data import Dataset


In [2]:
class TweetClassifier(nn.Module):
	"""Binary classifier: embedding -> stacked LSTM -> two dense layers -> sigmoid.

	Args:
		args: configuration object exposing ``batch_size``, ``hidden_dim``,
			``lstm_layers`` and ``max_words``.

	``forward`` takes a batch of integer token ids of shape (batch, seq_len)
	and returns probabilities of shape (batch, 1).
	"""

	def __init__(self, args):
		super(TweetClassifier, self).__init__()

		self.batch_size = args.batch_size
		self.hidden_dim = args.hidden_dim
		self.LSTM_layers = args.lstm_layers
		# Number of rows in the embedding table, i.e. the vocabulary size
		# (the embedding dimension itself is hidden_dim).
		self.input_size = args.max_words

		self.dropout = nn.Dropout(0.5)
		# Index 0 is reserved for padding and maps to a fixed zero vector.
		self.embedding = nn.Embedding(self.input_size, self.hidden_dim, padding_idx=0)
		self.lstm = nn.LSTM(input_size=self.hidden_dim, hidden_size=self.hidden_dim, num_layers=self.LSTM_layers, batch_first=True)
		self.fc1 = nn.Linear(in_features=self.hidden_dim, out_features=257)
		self.fc2 = nn.Linear(257, 1)

	def forward(self, x):
		"""Return the positive-class probability for every sequence in ``x``."""
		# Allocate the initial states next to the input and in the model's
		# dtype, so the module also works after being moved off the CPU.
		state_shape = (self.LSTM_layers, x.size(0), self.hidden_dim)
		weight = self.embedding.weight
		h = torch.zeros(state_shape, dtype=weight.dtype, device=x.device)
		c = torch.zeros(state_shape, dtype=weight.dtype, device=x.device)

		# Random initial states are kept as a training-time perturbation only;
		# in eval mode the states stay at zero so inference is deterministic.
		if self.training:
			torch.nn.init.xavier_normal_(h)
			torch.nn.init.xavier_normal_(c)

		out = self.embedding(x)
		out, (hidden, cell) = self.lstm(out, (h, c))
		out = self.dropout(out)
		# Only the output of the last time step feeds the classifier head.
		out = torch.relu_(self.fc1(out[:, -1, :]))
		out = self.dropout(out)
		out = torch.sigmoid(self.fc2(out))

		return out

In [3]:
class Preprocessing:
	"""Loads the tweet CSV, splits it and converts text to padded token ids.

	Args:
		args: configuration object exposing ``file_name``, ``max_len``,
			``max_words`` and ``test_size``. An optional ``random_state``
			attribute seeds the train/test split; when absent the split is
			unseeded, as before.

	Call order: ``load_data`` -> ``prepare_tokens`` -> ``sequence_to_token``.
	"""

	def __init__(self, args):
		self.data = args.file_name
		self.max_len = args.max_len
		self.max_words = args.max_words
		self.test_size = args.test_size
		# Optional so that existing config objects without it keep working.
		self.random_state = getattr(args, 'random_state', None)

	def load_data(self):
		"""Read the CSV and store the split as x_train/x_test/y_train/y_test."""
		df = pd.read_csv(self.data)

		# Only these two columns are used, so the other columns are simply
		# ignored rather than dropped (dropping failed on CSVs without them).
		X = df['text'].values
		Y = df['target'].values

		self.x_train, self.x_test, self.y_train, self.y_test = train_test_split(
			X, Y, test_size=self.test_size, random_state=self.random_state)

	def prepare_tokens(self):
		"""Fit the tokenizer on the training texts only (requires load_data)."""
		self.tokens = Tokenizer(num_words=self.max_words)
		self.tokens.fit_on_texts(self.x_train)

	def sequence_to_token(self, x):
		"""Map texts to integer sequences and pad them with maxlen=max_len."""
		sequences = self.tokens.texts_to_sequences(x)
		return sequence.pad_sequences(sequences, maxlen=self.max_len)

In [4]:
class DatasetMaper(Dataset):
  '''
  Pairs a sequence of inputs with a sequence of labels so that a
  DataLoader can index them sample by sample.
  '''
  def __init__(self, x, y):
    self.x, self.y = x, y

  def __len__(self):
    # The number of samples is defined by the inputs.
    n_samples = len(self.x)
    return n_samples

  def __getitem__(self, idx):
    features = self.x[idx]
    label = self.y[idx]
    return features, label
    

class Execute:
  '''
  Runs the experiment: builds the preprocessed train/test arrays, creates
  the TweetClassifier model, and provides the training / evaluation loop.
  '''

  def __init__(self, args):
    self.__init_data__(args)

    self.args = args
    self.batch_size = args.batch_size

    self.model = TweetClassifier(args)

  def __init_data__(self, args):
    '''
    Load the raw dataset, split it into training and test parts and convert
    the texts of both parts into padded sequences of token indices.
    '''
    self.preprocessing = Preprocessing(args)
    self.preprocessing.load_data()
    self.preprocessing.prepare_tokens()

    raw_x_train = self.preprocessing.x_train
    raw_x_test = self.preprocessing.x_test

    self.y_train = self.preprocessing.y_train
    self.y_test = self.preprocessing.y_test

    self.x_train = self.preprocessing.sequence_to_token(raw_x_train)
    self.x_test = self.preprocessing.sequence_to_token(raw_x_test)

  def train(self):
    '''
    Train the model for self.args.epochs epochs and print, per epoch, the
    loss of the last mini-batch plus train and test accuracy.
    '''
    training_set = DatasetMaper(self.x_train, self.y_train)
    test_set = DatasetMaper(self.x_test, self.y_test)

    # The training loader is deliberately not shuffled: the predictions
    # collected below are compared position by position with self.y_train.
    self.loader_training = DataLoader(training_set, batch_size=self.batch_size)
    self.loader_test = DataLoader(test_set, batch_size=self.batch_size)

    # Hyper-parameters come from the instance, not from a notebook global.
    optimizer = optim.RMSprop(self.model.parameters(), lr=self.args.learning_rate)
    for epoch in range(self.args.epochs):

      predictions = []

      self.model.train()

      for x_batch, y_batch in self.loader_training:

        x = x_batch.long()
        y = y_batch.float()
        y_pred = self.model(x)[:, 0]
        loss = F.binary_cross_entropy(y_pred, y)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # y_pred is already 1-D; squeezing it would turn a single-sample
        # batch into a 0-d array that cannot be converted to a list.
        predictions.extend(y_pred.detach().cpu().numpy().tolist())

      test_predictions = self.evaluation()

      train_accuracy = self.calculate_accuray(self.y_train, predictions)
      test_accuracy = self.calculate_accuray(self.y_test, test_predictions)

      print("Epoch: %d, loss: %.5f, Train accuracy: %.5f, Test accuracy: %.5f" % (epoch+1, loss.item(), train_accuracy, test_accuracy))

  def evaluation(self):
    '''
    Predict on self.loader_test (created by train()) with the model in eval
    mode and return the probabilities as a flat list of floats.
    '''
    predictions = []
    self.model.eval()
    with torch.no_grad():
      for x_batch, _ in self.loader_test:
        x = x_batch.long()
        y_pred = self.model(x)
        # Flatten (batch, 1) to (batch,) so each prediction is a plain float.
        predictions.extend(y_pred.reshape(-1).cpu().numpy().tolist())

    return predictions

  @staticmethod
  def calculate_accuray(grand_truth, predictions):
    '''
    Fraction of samples classified correctly at a 0.5 threshold.

    A prediction of exactly 0.5 counts as wrong for either label.
    Returns 0.0 when grand_truth is empty.
    '''
    total = len(grand_truth)
    if total == 0:
      return 0.0

    correct = sum(
      1
      for true, pred in zip(grand_truth, predictions)
      if (pred > 0.5 and true == 1) or (pred < 0.5 and true == 0)
    )

    return correct / total

In [5]:
class Args:
  '''
  Plain container for the experiment's hyper-parameters and input file.
  Every constructor argument is stored under an attribute of the same name.
  '''
  def __init__(self, epochs=10, learning_rate=0.01, hidden_dim=128,
               lstm_layers=2, batch_size=64, test_size=0.2, max_len=20,
               max_words=1000, file_name='tweets.csv'):
    # Optimisation settings.
    self.epochs, self.learning_rate = epochs, learning_rate
    # Model size.
    self.hidden_dim, self.lstm_layers = hidden_dim, lstm_layers
    # Batching and train/test split.
    self.batch_size, self.test_size = batch_size, test_size
    # Text preprocessing limits.
    self.max_len, self.max_words = max_len, max_words
    # Input CSV.
    self.file_name = file_name

In [6]:
# Seed the random number generators before any data is split or any weight is
# created, so that Restart & Run All reproduces the same run: NumPy's global
# RNG drives scikit-learn's split when no random_state is passed, and
# PyTorch's RNG drives weight initialisation and dropout.
SEED = 42
np.random.seed(SEED)
torch.manual_seed(SEED)

args = Args()
execute = Execute(args)

In [7]:
# Run the training loop; it prints the loss and the train/test accuracy once per epoch.
execute.train()

Epoch: 1, loss: 0.68749, Train accuracy: 0.56782, Test accuracy: 0.57058
Epoch: 2, loss: 0.87345, Train accuracy: 0.61593, Test accuracy: 0.57124
Epoch: 3, loss: 0.35815, Train accuracy: 0.71675, Test accuracy: 0.76428
Epoch: 4, loss: 0.40028, Train accuracy: 0.79343, Test accuracy: 0.76428
Epoch: 5, loss: 0.16108, Train accuracy: 0.82824, Test accuracy: 0.78858
Epoch: 6, loss: 0.17638, Train accuracy: 0.84844, Test accuracy: 0.77544
Epoch: 7, loss: 0.09451, Train accuracy: 0.86897, Test accuracy: 0.78398
Epoch: 8, loss: 0.12907, Train accuracy: 0.88654, Test accuracy: 0.78135
Epoch: 9, loss: 0.06452, Train accuracy: 0.90509, Test accuracy: 0.78792
Epoch: 10, loss: 0.06108, Train accuracy: 0.91560, Test accuracy: 0.77544
Epoch: 11, loss: 0.05256, Train accuracy: 0.92627, Test accuracy: 0.78332
Epoch: 12, loss: 0.06796, Train accuracy: 0.93251, Test accuracy: 0.76953
Epoch: 13, loss: 0.02582, Train accuracy: 0.93826, Test accuracy: 0.76559
Epoch: 14, loss: 0.00743, Train accuracy: 0.945