In [80]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import pytorch_lightning as pl

from sklearn.metrics import roc_auc_score
from sklearn.metrics import accuracy_score

import numpy as np
import pandas as pd
import random
import os

from keras.preprocessing import sequence
from keras.preprocessing.text import Tokenizer

from sklearn.model_selection import train_test_split

from torch.utils.data import Dataset
from torch.utils.data import DataLoader

In [77]:
# Fetch the dataset repository; Preprocessing reads `all_labels.csv` from the resulting `lstm-data` directory.
!git clone https://github.com/gerdiedoo/lstm-data

Cloning into 'lstm-data'...


In [76]:
# temp_df = pd.read_csv('lstm-data\\all_labels.csv')
# for i in temp_df.columns:
#     if i == 'filename' or i == 'code':
#         continue
#     temp_df[['filename', 'code', i]].rename(columns={i:'target'}).to_csv(f'{i}.csv', index=False)

In [81]:
# def seed_everything(seed=42):
#     """
#     Seed every random number generator used here (random, NumPy, PyTorch, Lightning).
#     """
#     random.seed(seed)
#     os.environ['PYTHONHASHSEED'] = str(seed)
#     np.random.seed(seed)
#     torch.manual_seed(seed)
#     torch.cuda.manual_seed(seed)
#     torch.cuda.manual_seed_all(seed)
#     torch.backends.cudnn.deterministic = True
#     pl.seed_everything(seed)

# # Set the RNG
# seed_everything(1729)

Global seed set to 1729


In [87]:
class Preprocessing:
	"""Load labelled code snippets and convert them to padded token-id sequences.

	Typical use: ``load_data()`` -> ``prepare_tokens()`` -> ``sequence_to_token(x)``.

	Parameters
	----------
	max_len : int
		Length every sequence is padded/truncated to (passed as ``maxlen`` to
		``pad_sequences``).
	max_words : int
		Passed as ``num_words`` to the Keras ``Tokenizer``.
	test_size : float
		Passed as ``test_size`` to ``train_test_split``.
	data_path : str or None
		CSV file to read. Defaults to ``lstm-data/all_labels.csv`` built with
		``os.path.join`` so the same notebook runs on Windows and POSIX.
	random_state : int or None
		Passed to ``train_test_split``; ``None`` keeps the original unseeded split.
	"""

	def __init__(self, max_len = 512, max_words = 10000, test_size = 0.20, data_path = None, random_state = None):
		if data_path is None:
			# A hard-coded 'lstm-data\\all_labels.csv' only resolves on Windows.
			data_path = os.path.join('lstm-data', 'all_labels.csv')
		self.data = data_path
		self.max_len = max_len
		self.max_words = max_words
		self.test_size = test_size
		self.random_state = random_state

	def load_data(self, which_label='Quicksort'):
		"""Read the CSV and split it into train/test arrays.

		Keeps the ``code`` column and the ``which_label`` column (renamed to
		``target``), drops rows with missing values, and stores the split as
		``self.x_train``, ``self.x_test``, ``self.y_train``, ``self.y_test``.

		Raises ``KeyError`` (from pandas) if ``code`` or ``which_label`` is not a
		column of the file.
		"""
		df = pd.read_csv(self.data)
		df = df[['code', which_label]].rename(columns={which_label:'target'})
		df = df.dropna()
		X = df['code'].values
		Y = df['target'].values

		self.x_train, self.x_test, self.y_train, self.y_test = train_test_split(
			X, Y, test_size=self.test_size, random_state=self.random_state
		)

	def prepare_tokens(self):
		"""Fit the tokenizer on the training texts only (requires ``load_data`` first)."""
		self.tokens = Tokenizer(num_words=self.max_words)
		self.tokens.fit_on_texts(self.x_train)

	def sequence_to_token(self, x):
		"""Map texts to integer sequences and pad/truncate them to ``self.max_len``.

		Requires ``prepare_tokens`` to have been called. Padding side and
		truncation side are the ``pad_sequences`` defaults, since no other
		options are passed.
		"""
		sequences = self.tokens.texts_to_sequences(x)
		return sequence.pad_sequences(sequences, maxlen=self.max_len)

In [83]:
class classifier(nn.Module):
	"""Binary text classifier: embedding -> stacked LSTM -> two dense layers -> sigmoid.

	Parameters
	----------
	batch_size : int
		Stored on the instance for API compatibility; the model itself does not use it.
	hidden_dim : int
		Size of both the embedding vectors and the LSTM hidden state.
	lstm_layers : int
		Number of stacked LSTM layers.
	max_words : int
		Vocabulary size, i.e. number of rows of the embedding table. Token ids fed
		to ``forward`` must be in ``[0, max_words)``; id 0 is the padding index.

	Notes
	-----
	Subclasses ``nn.Module`` rather than ``nn.ModuleList``: the model is not a list of
	layers, and an empty ``ModuleList`` has ``len() == 0`` which makes the model falsy.
	"""

	def __init__(self, batch_size=64, hidden_dim=128, lstm_layers=2, max_words=10000):
		super(classifier, self).__init__()

		self.batch_size = batch_size
		self.hidden_dim = hidden_dim
		self.LSTM_layers = lstm_layers
		self.input_size = max_words # vocabulary size (rows of the embedding table)

		self.dropout = nn.Dropout(0.5)
		self.embedding = nn.Embedding(self.input_size, self.hidden_dim, padding_idx=0)
		self.lstm = nn.LSTM(input_size=self.hidden_dim, hidden_size=self.hidden_dim, num_layers=self.LSTM_layers, batch_first=True)
		self.fc1 = nn.Linear(in_features=self.hidden_dim, out_features=self.hidden_dim*2)
		self.fc2 = nn.Linear(self.hidden_dim*2, 1)

	def forward(self, x):
		"""Return one probability per sample.

		``x`` is an integer tensor of token ids, indexed as (batch, sequence) because
		the LSTM is built with ``batch_first=True``. The result has shape (batch, 1)
		with values in [0, 1] (sigmoid output).
		"""
		out = self.embedding(x)
		# No explicit (h0, c0): nn.LSTM then starts from zero states on the input's
		# device. Drawing fresh random states on every call made predictions
		# non-deterministic even in eval mode and pinned the states to the CPU.
		out, _ = self.lstm(out)
		out = self.dropout(out)
		# Only the output at the last time step feeds the dense head.
		out = torch.relu(self.fc1(out[:,-1,:]))
		out = self.dropout(out)
		out = torch.sigmoid(self.fc2(out))

		return out

In [84]:
class DatasetMaper(Dataset):
	'''
	Pairs an indexable collection of inputs with its labels so a DataLoader
	can batch them. Item ``idx`` is the tuple ``(x[idx], y[idx])``.

	Raises ValueError when ``x`` and ``y`` differ in length, so a misaligned
	dataset fails at construction instead of part-way through an epoch.
	'''
	def __init__(self, x, y):
		if len(x) != len(y):
			raise ValueError(
				f"x and y must have the same length, got {len(x)} and {len(y)}"
			)
		self.x = x
		self.y = y

	def __len__(self):
		# Number of samples; x and y are the same length by construction.
		return len(self.x)

	def __getitem__(self, idx):
		return self.x[idx], self.y[idx]

In [85]:
def evaluation(model, loader=None):
	"""Run ``model`` over a data loader in eval mode and collect its outputs.

	Parameters
	----------
	model : callable torch module
		Called as ``model(x)`` on each batch after ``model.eval()``.
	loader : iterable of ``(x_batch, y_batch)`` or None
		Batches to score. ``None`` falls back to the notebook-level
		``loader_test``, which is what the original single-argument call used.

	Returns
	-------
	list
		One NumPy array per sample (the rows of each batch output), in loader order.

	The model is left in eval mode; callers that keep training must call
	``model.train()`` again.
	"""
	if loader is None:
		# Backward-compatible default: resolved at call time from the notebook globals.
		loader = loader_test

	predictions = []
	model.eval()
	with torch.no_grad():
		for x_batch, _ in loader:
			x = x_batch.type(torch.LongTensor)
			y_pred = model(x)
			# .cpu() is a no-op for CPU tensors and keeps .numpy() valid otherwise.
			predictions.extend(y_pred.detach().cpu().numpy())

	return predictions
			
def calculate_accuracy(grand_truth, predictions):
    """Fraction of samples whose thresholded prediction matches the 0/1 label.

    A prediction counts as positive when it is strictly greater than 0.5 and as
    negative otherwise, so a score of exactly 0.5 is scored as class 0 instead
    of always being counted as wrong.

    Parameters
    ----------
    grand_truth : sequence of 0/1 labels
    predictions : sequence of scores (floats or single-element arrays), aligned
        with ``grand_truth``.

    Returns
    -------
    float
        ``correct / len(grand_truth)``; ``0.0`` when ``grand_truth`` is empty
        (previously a ZeroDivisionError).
    """
    total = len(grand_truth)
    if total == 0:
        return 0.0

    correct = 0
    for true, pred in zip(grand_truth, predictions):
        # bool() also unwraps single-element arrays.
        predicted_positive = bool(pred > 0.5)
        if predicted_positive and true == 1:
            correct += 1
        elif not predicted_positive and true == 0:
            correct += 1

    return correct / total

In [None]:
# df = pd.read_csv("lstm-data\\all_labels.csv")
# df.columns

In [92]:
# Model with default hyper-parameters, plus the preprocessing pipeline (20% held out).
m = classifier()
pr = Preprocessing(test_size=0.20)

# Label names (presumably the target columns of all_labels.csv):
# ['Quicksort', 'Mergesort', 'Selectionsort',
#  'Insertionsort', 'Bubblesort', 'Linear search', 'Binary Search',
#  'Linked List', 'Hashmap']
pr.load_data(which_label='Binary Search')
pr.prepare_tokens()

# Raw text splits and their labels, exposed as notebook-level names.
raw_x_train, raw_x_test = pr.x_train, pr.x_test
y_train, y_test = pr.y_train, pr.y_test

# Padded token-id matrices for both splits (train first, then test).
x_train, x_test = (pr.sequence_to_token(texts) for texts in (raw_x_train, raw_x_test))




In [93]:
training_set = DatasetMaper(x_train, y_train)
test_set = DatasetMaper(x_test, y_test)

# The training loader is deliberately not shuffled: train accuracy below pairs
# the collected predictions with y_train by position, which only holds while
# batches arrive in dataset order.
loader_training = DataLoader(training_set, batch_size=64)
loader_test = DataLoader(test_set)

optimizer = optim.RMSprop(m.parameters(), lr=0.01)

for epoch in range(10):
	predictions = []
	m.train()

	for x_batch, y_batch in loader_training:

		x = x_batch.type(torch.LongTensor)
		# Targets as a float column vector to match the (batch, 1) model output.
		y = y_batch.type(torch.FloatTensor).unsqueeze(-1)
		y_pred = m(x)

		loss = F.binary_cross_entropy(y_pred, y)
		optimizer.zero_grad()
		loss.backward()
		optimizer.step()

		# squeeze(-1), not squeeze(): a final batch holding a single sample would
		# otherwise collapse to a 0-d array and list() would raise TypeError.
		predictions += list(y_pred.squeeze(-1).detach().numpy())

	test_predictions = evaluation(m)

	train_accuracy = calculate_accuracy(y_train, predictions)
	test_accuracy = calculate_accuracy(y_test, test_predictions)

	# NOTE: the reported loss is that of the last mini-batch of the epoch only.
	print(f"epoch = {epoch+1}, loss = {loss.item()}, train accuracy = {train_accuracy}, test accuracy = {test_accuracy}") 

epoch = 1, loss = 3.7723281383514404, train accuracy = 0.9104788341429563, test accuracy = 0.9473684210526315
epoch = 2, loss = 6.060606002807617, train accuracy = 0.9354614850798056, test accuracy = 0.9473684210526315
epoch = 3, loss = 0.2953565716743469, train accuracy = 0.9104788341429563, test accuracy = 0.9473684210526315
epoch = 4, loss = 0.2842632532119751, train accuracy = 0.9417071478140181, test accuracy = 0.9473684210526315
epoch = 5, loss = 0.24915003776550293, train accuracy = 0.9417071478140181, test accuracy = 0.9473684210526315
epoch = 6, loss = 0.22019147872924805, train accuracy = 0.9410131852879945, test accuracy = 0.9473684210526315
epoch = 7, loss = 0.20812703669071198, train accuracy = 0.9417071478140181, test accuracy = 0.9473684210526315
epoch = 8, loss = 0.21073630452156067, train accuracy = 0.9417071478140181, test accuracy = 0.9473684210526315
epoch = 9, loss = 0.24200496077537537, train accuracy = 0.9403192227619709, test accuracy = 0.9473684210526315
epoch 