In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import os
from gensim.models import Word2Vec

from sklearn.model_selection import train_test_split

import torch
import torch.nn as nn
import torch.nn.functional as F

In [2]:
#Switching directories for easy access to the data
data = '/Users/alexchoe/Desktop/Capstone/m2py-master/data/all_abstracts_model/'
os.chdir(data)

In [3]:
#Opening contents of Word2Vec model
model = Word2Vec.load("all_abstract_model.model")
vocabulary = list(model.wv.vocab)

In [6]:
model.wv.__getitem__('study')

array([  4.51539   ,   4.103919  ,  -1.8394607 ,  -6.114477  ,
         6.1308594 ,   6.0427675 ,  -1.1978918 ,   1.855273  ,
        -4.160244  ,  -3.742103  ,  -0.5794198 ,   9.883354  ,
        -4.3117127 ,   4.3782854 ,   3.5364165 ,  -4.9079742 ,
        -2.0147028 ,   5.517852  ,  -2.5264072 ,   0.60432535,
        -0.90231055,   2.0508857 ,  -1.0533296 ,  -4.2132974 ,
         4.2740326 ,  -1.8519586 ,   2.927174  ,  -1.760242  ,
       -13.595478  ,   6.390037  ,   5.449058  ,   1.0620196 ,
         1.1603653 ,  -1.8674527 ,  -7.909827  ,   3.757318  ,
        -0.7085782 ,  -1.8104229 ,  -0.7972807 ,   0.24420857,
        -0.02632353,  -5.917606  ,  10.481008  ,  -3.1318085 ,
        -1.7313521 ,   2.7741985 ,   3.4522643 ,  -4.4935923 ,
         1.5952134 ,   0.79401314], dtype=float32)

In [4]:
data = '/Users/alexchoe/Desktop/Capstone/BETO2020-master/data/carbon/'
os.chdir(data)

In [10]:
#Taking in data as a dataframe for easy pre-processing
df = pd.read_excel('Carbon_SynAntList_Full_Refined.xlsx', skiprows = 1, nrows=2000)
carbon_df = df.rename(columns = {'Unnamed: 0':'index', 0:'word 1', 1:'word 2', 2:'relationship', 'Unnamed: 4':'label'})
carbon_df = carbon_df.fillna(0)

In [6]:
#Mask to only keep strong word pair relationships
condition = carbon_df['label'] != 0
keep = (condition)
carbon_df = carbon_df[keep]

In [11]:
carbon_df

Unnamed: 0,index,word 1,word 2,relationship,label
0,0,carbon,original,ant,0
1,1,carbon,graphite,syn,1
2,2,carbon,soot,syn,1
3,3,carbon,imitate,syn,0
4,4,carbon,paint,syn,0
...,...,...,...,...,...
1995,1995,infinite,ephemeral,ant,0
1996,1996,infinite,finite,ant,0
1997,1997,infinite,intermittent,ant,0
1998,1998,infinite,limited,ant,0


In [35]:
carbon_df['word 1'].iloc[1]

'carbon'

In [48]:
#Restructuring the dataframe
for i in range(len(carbon_df)):
    carbon_df['word 1'].iloc[i] = model.wv.__getitem__(str(carbon_df['word 1'].iloc[i]))
    carbon_df['word 2'].iloc[i] = model.wv.__getitem__(str(carbon_df['word 2'].iloc[i]))
    
    if carbon_df['relationship'].iloc[i] == str('syn') & carbon_df['label'].iloc[i] == 1:
        carbon_df['relationship'].iloc[i] = 1
    else: 
        carbon_df['relationship'].iloc[i] = 0

TypeError: 'int' object is not iterable

In [None]:
#Hyper parameters
num_epochs = 100
batch_size = 50
learning_rate = 0.008

In [None]:
X = W2V_df[['Word 1', 'Word 2']] #Input features used to make predictions
Y = W2V_df[['Relationship']] #Target features to be predicted 

x_train, x_test, y_train, y_test = train_test_split(X,Y, test_size = 0.2, shuffle = True) #split dataset into separate testing and training datasets

x_train_tensor = torch.tensor(x_train.values.astype(np.float32)) #convert pd.DataFrame -> np.ndarray -> torch.tensor
syn_train_tensor = torch.tensor(syn_train.values.astype(np.float32))
ant_train_tensor = torch.tensor(syn_train.values.astype(np.float32))

#create tensor with features and targets
train_tensor = torch.utils.data.TensorDataset(x_train_tensor, syn_train_tensor, nonsyn_train_tensor)
#create iterable dataset with batches
training_data_set = torch.utils.data.DataLoader(dataset = train_tensor, batch_size = batch_size, shuffle = True)

x_test_tensor = torch.tensor(x_test.values.astype(np.float32))
syn_test_tensor = torch.tensor(syn_test.values.astype(np.float32))
nonsyn_test_tensor = torch.tensor(nonsyn_test.values.astype(np.float32))

test_tensor = torch.utils.data.TensorDataset(x_test_tensor, syn_test_tensor)
testing_data_set = torch.utils.data.DataLoader(dataset = test_tensor, batch_size = batch_size, shuffle = True)

In [None]:
#Defining the neural network
class syn_NN(nn.Module):
    
    def __init__(self, in_dims, out_dims):
        
        #embedding layer
        self.em_layer = nn.Linear(in_dims, out_dims)

        #hidden layers
        self.h_layer1 = nn.Linear(out_dims, 32)
        self.h_layer2 = nn.Linear(32, 16)
        
        #output layer
        self.o_layer = nn.Linear(16, 2)
        
    def forward(self, x):
        #data enters embedding layer
        out = self.em_layer(x)
        
        #embedded data is passed to hidden layers
        out = self.h_layer1(out)
        out = self.h_layer2(out)
        
        #embedded data is passed to output layers
        syn_out = self.o_layer(out)
        
        return syn_out

In [None]:
def train_model(model, training_data_set, optimizer):
    train_epoch_loss = []
    syn_train_epoch_loss = []
    
    syn_losses = []
    train_total = 0
    
    #switch model to training mode
    model.train()
    syn_criterion = nn.MSELoss()
    
    for features, labels in training_data_set:
        
        model.zero_grad() #zero out any gradients from prior loops 
        syn_out = model(features) #gather model predictions for this loop
        
        #calculate error in the predictions
        syn_loss = syn_criterion(syn_out, labels)
        
        total_loss = syn_loss
        
        #BACKPROPAGATE LIKE A MF
        torch.autograd.backward([syn_loss])
        optimizer.step()
        
        #save loss for this batch
        train_losses.append(total_loss.item())
        train_total+=1
        
        syn_train_losses.append(pce_loss.item())
        
    #calculate and save total error for this epoch of training
    epoch_loss = sum(train_losses)/train_total
    train_epoch_loss.append(epoch_loss)
    
    syn_train_epoch_loss.append(sum(ff_train_losses)/train_total)
    
    #update progress bar
    print(f"Total Epoch Training Loss = {train_epoch_loss}")
    
    return train_epoch_loss, syn_train_epoch_loss

In [None]:
def eval_model(model, testing_data_set, optimizer):
    #evaluate the model
    model.eval()
    
    syn_criterion = nn.MSELoss(lower = 0, upper = 6)
    #accuracy = PhysLoss.MAPE()

    #don't update nodes during evaluation b/c not training
    with torch.no_grad():
        test_losses = []
        syn_test_losses = []
        syn_test_acc_list = []
        
        test_total = 0

        for inputs in testing_data_set:
            inputs = inputs.to(device)
            syn_labels = syn_labels.to(device)

            syn_out = model(inputs)

            # calculate loss per batch of testing data
            syn_test_loss = syn_criterion(syn_out)
            
            test_loss = pce_test_loss + voc_test_loss + jsc_test_loss + ff_test_loss
            
            test_losses.append(test_loss.item())
            syn_test_losses.append(syn_test_loss.item())
            test_total += 1 

            #syn_acc = accuracy(syn_out)
            #syn_test_acc_list.append(syn_acc.item())

        test_epoch_loss = sum(test_losses)/test_total
        syn_test_epoch_loss = sum(syn_test_losses)/test_total
        
        #syn_epoch_acc = sum(syn_test_acc_list)/test_total

        print(f"Total Epoch Testing Loss = {test_epoch_loss}")
        #print(f"Epoch MAPE: Syn = {syn_epoch_acc}")
    return test_epoch_loss, syn_test_epoch_loss #syn_epoch_acc