In [1]:
import random
import pandas as pd
import nltk
import re
from nltk.corpus import treebank
from sklearn.model_selection import train_test_split
from nltk.tokenize import word_tokenize
nltk.download('punkt')
nltk.download('treebank')
nltk.download('stopwords')
from nltk.corpus import stopwords
import torch 
import torch.nn as nn
import torch.nn.functional as F 
from collections import OrderedDict
from torch.utils.data import TensorDataset
from torch.utils.data import DataLoader
from torch import optim
import numpy as np 
# nltk.download('punkt')
description_df = pd.read_csv('../dataset/binary_classifiers/description.csv')
installation_df = pd.read_csv('../dataset/binary_classifiers/installation.csv')
invocation_df = pd.read_csv('../dataset/binary_classifiers/invocation.csv')
citation_df = pd.read_csv('../dataset/binary_classifiers/citation.csv')

In [2]:
def lower_stopwords(x):
    x = re.sub(r'[^a-zA-Z\s]', '', x, re.I|re.A)
    x = x.lower()
    x = x.strip()
    text_tokens = [word for word in word_tokenize(x) if word not in stopwords.words()]
    return " ".join(text_tokens)

In [3]:
print(description_df["excerpt"][6])

The original implementation is based on our internal Mxnet version. There are slight differences in the final accuracy and running time due to the plenty details in platform switch.


In [4]:
neg_quant = int(len(description_df) * .375)
treebank_background = pd.DataFrame(list(map(lambda sent: ' '.join(sent), random.sample(list(treebank.sents()), neg_quant))), columns=["excerpt"]).assign(description=False)
description_corpus = pd.concat([description_df.assign(description=True), installation_df.sample(neg_quant).assign(description=False), invocation_df.sample(neg_quant).assign(description=False), citation_df.sample(neg_quant).assign(description=False),treebank_background], sort=False)
description_corpus.drop('URL', 1, inplace=True)
description_corpus.dropna(0, inplace=True)
description_corpus.reset_index(drop=True, inplace=True)
description_corpus["excerpt"] = description_corpus["excerpt"].apply(lower_stopwords)

In [5]:
print(description_corpus.groupby(by = "description").count())

             contributor  excerpt
description                      
False                600      600
True                 545      545


In [6]:
X, y = description_corpus.excerpt, description_corpus.description
X_train, X_test, y_train, y_test = train_test_split(X, y)

In [7]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import confusion_matrix

In [8]:
def curtail(array,threshold = 0.0):
    return array * (array>=threshold)

In [9]:
# from sklearn.externals import joblib
import joblib
vectorizer = CountVectorizer()
X_vect_train = vectorizer.fit_transform(X_train).toarray()
joblib.dump(vectorizer, "../model/vectorizer.m")
X_vect_test =  vectorizer.transform(X_test).toarray()

y_hot_train = pd.get_dummies(y_train.values)
y_hot_test = pd.get_dummies(y_test.values)

class FFN(nn.Module):
    
    def __init__(self,layer_arch,input_size,output_size,bias = True):
        super(FFN,self).__init__()
        self.layer_arch = layer_arch
        self.input_size = input_size
        self.output_size = output_size
        self.bias = bias
        self.build_model()
        
    def build_model(self):
        model_arch = []
        unit = self.input_size
        for i,num in enumerate(self.layer_arch):
            model_arch.append(("dense_"+str(i), nn.Linear(unit,num,bias = self.bias)))
            model_arch.append(("nonlinear_"+str(i), nn.ReLU()))
            if(i==1):
                model_arch.append(("dropout_"+str(i), nn.Dropout()))
            unit = num
        model_arch.append(("dense_final",nn.Linear(unit,self.output_size,bias=self.bias)))
        model_arch.append(("act_final",nn.Sigmoid()))
        self.model = nn.Sequential(OrderedDict(model_arch))
    def forward(self,inputs):
        return self.model(inputs)
    

        

In [10]:
model = FFN([1024,2048,1024,512,256],X_vect_train.shape[1],2)
epoch_num = 20
bs = 300
train_ds = TensorDataset(torch.tensor(X_vect_train).float(), torch.tensor(y_hot_train.values).float())
train_dl = DataLoader(train_ds, batch_size=bs)
# opt = optim.SGD(model.parameters(),lr = 0.001)
opt = optim.Adam(model.parameters())
Fs = [0]
for i in range(epoch_num):
    for xb,yb in train_dl:
        target = torch.argmax(yb,dim = 1,keepdim=False)
        pred = model(xb)
        loss_fun = nn.CrossEntropyLoss()
        loss = loss_fun(pred,target)
        loss.backward()
        opt.step()
        opt.zero_grad()        
    #### test metrics #####
    test_pred = model(torch.tensor(X_vect_test).float())
    test_pred = torch.argmax(test_pred,dim=1,keepdim=False)
    ground_truth = torch.argmax(torch.tensor(y_hot_test.values).float(),dim=1,keepdim=False)
    tn, fp, fn, tp = confusion_matrix(test_pred,ground_truth).ravel()

    ### precision ###
    precision = (tp/(tp+fp))*100
    ### recall ###
    recall = (tp/(tp+fn))*100
    ### F-measure ###
    F_measure = (2*precision*recall)/(precision+recall)
    ### accuracy ###
    accuracy = (torch.true_divide((torch.sum((test_pred-ground_truth)==0)),test_pred.shape[0]))*100
    if F_measure > max(Fs):
        torch.save(model.state_dict(), '../model/description.pt')
    Fs.append(F_measure)
    print("test accuracy is {}".format(accuracy))
    print("test precision is {}".format(precision))
    print("test recall is {}".format(recall))
    print("test F-measure is {}".format(F_measure))
    print("************************")


test accuracy is 74.91289520263672
test precision is 69.06474820143885
test recall is 76.8
test F-measure is 72.72727272727273
************************
test accuracy is 77.70034790039062
test precision is 74.10071942446042
test recall is 78.62595419847328
test F-measure is 76.29629629629629
************************
test accuracy is 82.22996520996094
test precision is 91.36690647482014
test recall is 76.50602409638554
test F-measure is 83.27868852459017
************************
test accuracy is 83.27526092529297
test precision is 93.5251798561151
test recall is 76.92307692307693
test F-measure is 84.41558441558442
************************
test accuracy is 80.83623504638672
test precision is 96.40287769784173
test recall is 72.82608695652173
test F-measure is 82.97213622291022
************************
test accuracy is 81.1846694946289
test precision is 95.68345323741008
test recall is 73.48066298342542
test F-measure is 83.125
************************
test accuracy is 80.48780822753906
t

In [11]:
print(max(Fs))

84.41558441558442


In [12]:
X_vect_train.shape[1]

3196