In [4]:
import glob
import pandas as pd
import numpy as np
from sklearn.metrics import classification_report

In [5]:
not_wanted = set(["Greater", "Equal", "Pre", "Post", "Extrinsic-Pre", "Extrinsic-Post", "Assign", "Less", "OperationName", "Comparison"])

In [6]:
# This function builds a dictionary with all the words in the ann file as values and indices as their 
# corresponding variable names

def builddict(file):
    d = {}
    f = open(file, 'r')
    lines = f.readlines()
    for line in lines:
        line = line.strip()
        if line.startswith("T"):
                line = line.split("\t")[1:]
                entity = line[0].split(" ")[0]
                if entity not in not_wanted:
                    _, si, ei = line[0].split(" ")
                    if si not in d:
                        d[si] = []
                        d[si].append(str(entity)) 
                        d[si].append(str(line[1]))
    return d

In [7]:
# This function builds a listt of list for a given txt file, with the entries of the inner list being
# the words in the txt file and its corresponding tag (var/value)

def build_dataframe(file, d):
    df = []
    with open(file) as f:
        cindex = 0
        seen = False
        end = 0
        nonzerocount = 0
        temp = f.readlines()
        flag = False
        
        for line in temp:
            tempstr = ""
            
            if "DESCRIPTION" in line:
                flag = True
            for i,char in enumerate(line):
                i_inner = end+i
                if str(cindex) in d or seen and flag:
                    seen = True
                    tempstr += char
                    
                    if seen and str(cindex) in d and i_inner == cindex+int(len(d[str(cindex)][1])):
                        seen = False
                        # add to df
                        # check if var or val 
                        if d[str(cindex)][0] != "Value":
#                             df.append([tempstr[:-1], d[str(cindex)][0]])
                            df.append([tempstr[:-1], "Variable"])
                        else:
                            df.append([tempstr[:-1], d[str(cindex)][0]])
                        tempstr = ""
                        cindex = i_inner
                        nonzerocount += 1

                else:
                    if flag:
                        if char == " ":
                            df.append([tempstr, '0'])
                            tempstr = ""
                        else:
                            tempstr += char

                    cindex = i_inner

            end += len(line)
    return df,nonzerocount

In [8]:
# This cell generates the training data per file and appends to the data list

import os
from collections import defaultdict

EXTENSIONS = {'.ann', '.txt'}

directory = 'only ann'

grouped_files = defaultdict(int)

for f in os.listdir(directory):
    name, ext = os.path.splitext(os.path.join(directory, f))
    if ext in EXTENSIONS:
        grouped_files[name] += 1
data = []
totalcount = 0
tc2 = 0
for name in grouped_files:
    if grouped_files[name] == len(EXTENSIONS):
        filetxt = '{}.txt'.format(name)
        fileann = '{}.ann'.format(name)
#         print "*"*10,filetxt,fileann
        d = builddict(fileann)
        totalcount += len(d)
        df,nzc = build_dataframe(filetxt, d)
        tc2 += nzc
        data += df

In [9]:
# Converting the data list to pandas dataframe
dataframe = pd.DataFrame(data, columns=['entity', 'tag'])

In [10]:
# Run this cell to get rid of empty entries
dataframe['entity'].replace('', np.nan, inplace=True)
dataframe.dropna(inplace=True)

In [11]:
vvData = dataframe[dataframe['tag']!='0'] 

In [12]:
# Writing train data to file
dataframe.to_csv("train_data.csv")

In [17]:
from sklearn import preprocessing

le = preprocessing.LabelEncoder()
le.fit(["Variable", "Value"])

vvData.head()
#X = vvData['entity']
#Y = le.transform(vvData['tag'])
#Y = vvData['tag']

X = dataframe['entity']
Y = dataframe['tag']


from sklearn.model_selection import train_test_split

X_train, X_test, Y_train, Y_test = train_test_split(X, Y, random_state=0)

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB

count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(X_train)
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)

clf = MultinomialNB().fit(X_train_tfidf, Y_train)
#print(clf.predict(count_vect.transform(["current filehandle"])))

# pred = clf.predict(count_vect.transform(X_test))
# for p in pred:
#     if p != pred[0]:
#         print(p)


print('Accuracy of NB classifier on training set: {:.2f}'
     .format(clf.score(X_train_tfidf, Y_train)))
print('Accuracy of NB classifier on test set: {:.2f}\n'
     .format(clf.score(count_vect.transform(X_test), Y_test)))

pred = clf.predict(count_vect.transform(X_test))
print(classification_report(Y_test, pred, target_names=['0','Value','Variable']))

print("==================================================================")

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC


LSVC = LinearSVC()
LSVC.fit(X_train_tfidf, Y_train)

print('Accuracy of SVM classifier on training set: {:.2f}'
     .format(LSVC.score(X_train_tfidf, Y_train)))
print('Accuracy of SVM classifier on test set: {:.2f}\n'
     .format(LSVC.score(count_vect.transform(X_test), Y_test)))

pred = LSVC.predict(count_vect.transform(X_test))
print(classification_report(Y_test, pred, target_names=['0','Value','Variable']))

print("==================================================================")


logreg = LogisticRegression()
logreg.fit(X_train_tfidf, Y_train)

print('Accuracy of Log Reg classifier on training set: {:.2f}'
     .format(logreg.score(X_train_tfidf, Y_train)))
print('Accuracy of Log Reg classifier on test set: {:.2f}\n'
     .format(logreg.score(count_vect.transform(X_test), Y_test)))

pred = logreg.predict(count_vect.transform(X_test))
print(classification_report(Y_test, pred, target_names=['0','Value','Variable']))

print("==================================================================")

rForest = RandomForestClassifier(n_estimators=200, max_depth=3, random_state=0)
rForest.fit(X_train_tfidf, Y_train)

print('Accuracy of Random Forest classifier on training set: {:.2f}'
     .format(rForest.score(X_train_tfidf, Y_train)))
print('Accuracy of Random Forest classifier on test set: {:.2f}'
     .format(rForest.score(count_vect.transform(X_test), Y_test)))

pred = rForest.predict(count_vect.transform(X_test))
print(classification_report(Y_test, pred, target_names=['0','Value','Variable']))

print("==================================================================")

Accuracy of NB classifier on training set: 0.97
Accuracy of NB classifier on test set: 0.97

              precision    recall  f1-score   support

           0       0.97      1.00      0.98      1961
       Value       0.00      0.00      0.00        22
    Variable       0.80      0.32      0.46        50

   micro avg       0.97      0.97      0.97      2033
   macro avg       0.59      0.44      0.48      2033
weighted avg       0.96      0.97      0.96      2033

Accuracy of SVM classifier on training set: 0.98
Accuracy of SVM classifier on test set: 0.97

              precision    recall  f1-score   support

           0       0.98      0.99      0.99      1961
       Value       0.55      0.27      0.36        22
    Variable       0.70      0.42      0.53        50

   micro avg       0.97      0.97      0.97      2033
   macro avg       0.74      0.56      0.62      2033
weighted avg       0.97      0.97      0.97      2033

Accuracy of Log Reg classifier on training set: 0.