In [1]:
import os
import sys
import argparse
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from torch import optim
from torch.autograd import Variable
from torch.utils.data import TensorDataset, DataLoader
import random
from torch.utils import data


data = pd.read_csv("outputfile.csv") 

X_train = []
y_train = []
X_test = []
y_test = []

# splitting train/test data
for i,row in data.iterrows():
    if data['Train/test'][i] == 0:
        X_train.append(row)
        y_train.append(data['Authors'][i])
    else:
        X_test.append(row)
        y_test.append(data['Authors'][i])

X_train = np.array(X_train)
y_train = np.array(y_train)
X_test = np.array(X_test)
y_test = np.array(y_test)

# convert train/test data to tensors
X_train = torch.from_numpy(X_train).float()
y_train = torch.from_numpy(y_train).float()
X_test = torch.from_numpy(X_test).float()
y_test = torch.from_numpy(y_test).float()



In [4]:
class DocPredict(nn.Module): 
    """
    Instantiating the nn.Linear module
    """
    
    def __init__(self, input_size, num_classes):
        super(DocPredict, self).__init__()      
        self.linear = nn.Linear(input_size, num_classes)

    def forward(self, x):
        x = x.squeeze(dim=-1)

        # using a sigmoid as an activation function
        y_pred = torch.sigmoid(self.linear(x)) 
        
        return y_pred
    

class DocFFNN():
    """ Instantiating the feed forward NN)"""
    def __init__(self, X_train, y_train, epochs=200, lr=0.01):
        super(DocFFNN, self).__init__()
        self.epochs = epochs
        self.lr = lr
        
        
    def get_samples(self, X_train, y_train):
        samples = []
        self.documents = X_train
        self.authors = y_train
        a1 = []
        a0 = []
        
        # randomly select a document[i]
        i = np.random.randint(0, len(X_train))

        for i in range(i, len(X_train)-1):
            
            # flipping a coin so that k is either 1 or 0
            k = np.random.randint(0, 2)

            for j in range(0, len(X_train)-1):
                
                self.doc1 = self.documents[i]
                self.doc2 = self.documents[j]

                # check if documents have same or different authors and split them in 2 groups
                if self.authors[i] == self.authors[j]:
                    a1.append(self.doc2)   
                    
                else:
                    a0.append(self.doc2)
                    
            # if k is 1 and d1 has the same author with d2, select the d2, elif k is 0 choose a d2 with a different author        
            if k == 1:
                x = np.random.randint(0, len(a1))
                self.doc2 = a1[x]
                samples.append([self.doc1, self.doc2, k])  

            else:
                x = np.random.randint(0, len(a0))
                self.doc2 = a0[x]
                samples.append([self.doc1, self.doc2, k])
        return samples        


        
    def my_model(self, instance):
        input_features = len(instance)

        self.model = DocPredict(input_features, 1)
    
    def train(self, inputs):
        """
        The training loop.
        """
        samples = self.get_samples(X_train, y_train)
        self.my_model(samples)
        
        # get samples
        criterion = nn.BCELoss()
        optimizer = optim.SGD(self.model.parameters(), lr=self.lr)

        for z in range(self.epochs):
            print("Running epoch {}...".format(z))
            for i in range(len(samples)):
                
                optimizer.zero_grad()

                # create instance
                instance = Variable(torch.cat((samples[i][0], samples[i][1])))
                
                # get instance label
                label = Variable(torch.Tensor([samples[i][2]])) 
                                
                self.my_model(instance)
                
                train_outputs = self.model(instance)
                loss = criterion(train_outputs, label)
                
                loss.backward()
                optimizer.step()
                
        
            
ffnn = DocFFNN(X_train, y_train)
print(ffnn.train(X_train))



Running epoch 0...
tensor(0.6651, grad_fn=<BinaryCrossEntropyBackward>)
Running epoch 1...
tensor(0.6810, grad_fn=<BinaryCrossEntropyBackward>)
Running epoch 2...
tensor(0.7295, grad_fn=<BinaryCrossEntropyBackward>)
Running epoch 3...
tensor(0.7081, grad_fn=<BinaryCrossEntropyBackward>)
Running epoch 4...
tensor(0.6929, grad_fn=<BinaryCrossEntropyBackward>)
Running epoch 5...
tensor(0.6572, grad_fn=<BinaryCrossEntropyBackward>)
Running epoch 6...
tensor(0.6914, grad_fn=<BinaryCrossEntropyBackward>)
Running epoch 7...
tensor(0.6775, grad_fn=<BinaryCrossEntropyBackward>)
Running epoch 8...
tensor(0.7040, grad_fn=<BinaryCrossEntropyBackward>)
Running epoch 9...
tensor(0.6982, grad_fn=<BinaryCrossEntropyBackward>)
Running epoch 10...
tensor(0.6752, grad_fn=<BinaryCrossEntropyBackward>)
Running epoch 11...
tensor(0.7139, grad_fn=<BinaryCrossEntropyBackward>)
Running epoch 12...
tensor(0.6839, grad_fn=<BinaryCrossEntropyBackward>)
Running epoch 13...
tensor(0.6859, grad_fn=<BinaryCrossEntrop

tensor(0.7022, grad_fn=<BinaryCrossEntropyBackward>)
Running epoch 113...
tensor(0.7109, grad_fn=<BinaryCrossEntropyBackward>)
Running epoch 114...
tensor(0.7196, grad_fn=<BinaryCrossEntropyBackward>)
Running epoch 115...
tensor(0.7064, grad_fn=<BinaryCrossEntropyBackward>)
Running epoch 116...
tensor(0.7078, grad_fn=<BinaryCrossEntropyBackward>)
Running epoch 117...
tensor(0.6984, grad_fn=<BinaryCrossEntropyBackward>)
Running epoch 118...
tensor(0.7047, grad_fn=<BinaryCrossEntropyBackward>)
Running epoch 119...
tensor(0.6856, grad_fn=<BinaryCrossEntropyBackward>)
Running epoch 120...
tensor(0.6955, grad_fn=<BinaryCrossEntropyBackward>)
Running epoch 121...
tensor(0.6854, grad_fn=<BinaryCrossEntropyBackward>)
Running epoch 122...
tensor(0.6664, grad_fn=<BinaryCrossEntropyBackward>)
Running epoch 123...
tensor(0.7045, grad_fn=<BinaryCrossEntropyBackward>)
Running epoch 124...
tensor(0.6762, grad_fn=<BinaryCrossEntropyBackward>)
Running epoch 125...
tensor(0.6928, grad_fn=<BinaryCrossEnt