In [33]:
# Improved model for POS tagger

In [10]:
import pandas as pd
import numpy as np
import sklearn as sk
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import KFold
from sklearn.metrics import precision_recall_fscore_support, f1_score, precision_score, recall_score, accuracy_score
from sklearn.feature_extraction import DictVectorizer
from sklearn.model_selection import cross_validate, cross_val_score
from sklearn import preprocessing
from sklearn.dummy import DummyClassifier
import string


In [11]:
# NOTE(review): absolute, machine-specific path — consider a path relative to the notebook.
DATA_PATH = "/Users/emilyroller/Downloads/assignment-3-emroller-main/src/DATA/pos-eng-5000.data.csv"

# Every punctuation character except the apostrophe is deleted from each cell.
punctuation_to_strip = string.punctuation.replace("'","")
strip_table = str.maketrans('', '', punctuation_to_strip)

# All columns are cast to str before translating, so every cell ends up a string.
df = pd.read_csv(DATA_PATH).apply(lambda column: column.astype(str).str.translate(strip_table))
#.apply(lambda x: x.astype(str).str.lower())
# Notes: lowercasing made performance worse; stripping punctuation moved results
# in both directions with no significant change on average.

df.head(n=20)

Unnamed: 0,a1,a2,a3,a4,a5,a6,a7,class
0,,,,The,cafeteria,remains,closed,DT
1,,,The,cafeteria,remains,closed,PERIOD,NN
2,,The,cafeteria,remains,closed,PERIOD,,VBZ
3,The,cafeteria,remains,closed,PERIOD,,,JJ
4,cafeteria,remains,closed,PERIOD,,,,PERIOD
5,,,,Some,analysts,argued,that,DT
6,,,Some,analysts,argued,that,there,NNS
7,,Some,analysts,argued,that,there,wo,VBD
8,Some,analysts,argued,that,there,wo,nSQt,IN
9,analysts,argued,that,there,wo,nSQt,be,EX


In [12]:
#os.getcwd()
# Report file written to by the cells below; it is left open here and closed in the last cell.
report_header = "Performance table , where negative performance indicates an improvement: \n"
f_out = open("../answer.txt", mode='w+')
f_out.write(report_header)

In [13]:
# Replace the immediate context words (columns a3 and a5) with POS tags:
# for every row except the first and the last, a3 takes the 'class' value of the
# previous row and a5 takes the 'class' value of the next row.
#
# Written with .loc instead of the chained form df['a3'][r] = ..., which raises
# SettingWithCopyWarning and does not modify df at all when pandas Copy-on-Write
# is enabled. The 'class' column is never modified, so the shifted slices below
# give the same result as the original row-by-row loop.
#
# NOTE(review): this improves the scores, but the values written are the gold tags
# of the neighbouring rows (including rows that later land in a test fold, and rows
# from a different sentence), so the gain presumably comes from label leakage —
# confirm whether that is acceptable for this experiment.
if len(df) > 2:
    class_tags = df['class'].to_numpy()
    interior_rows = df.index[1:-1]
    df.loc[interior_rows, 'a3'] = class_tags[:-2]  # tag of the previous row
    df.loc[interior_rows, 'a5'] = class_tags[2:]   # tag of the next row


In [14]:
# Integer-encode the target: each distinct value in 'class' becomes one category code.
tag_codes = df['class'].astype("category").cat.codes
labels = np.asarray(tag_codes.tolist())

In [15]:
# Classifiers to compare; names[i] is the report label for classifiers[i].
# NOTE: classify() has to exist before this cell runs. In this notebook it is
# defined in a later cell, so on a fresh kernel that cell must be executed first.
names = ['Dummy', 'Naive_Bayes', 'Decision_Tree', 'Random_Forest']
classifiers = [
    DummyClassifier(),
    GaussianNB(),
    DecisionTreeClassifier(random_state=0),
    # Seeded like the decision tree so repeated runs write the same report.
    RandomForestClassifier(max_features=5, random_state=0),
]
for name, clf in zip(names, classifiers):
    f_out.write("CLASSIFYING WITH " + name + "\n")
    classify(name, clf)
    f_out.write("\n----------------------\n")

['BASE', 'a1', 'a2', 'a3', 'a4', 'a5', 'a6', 'a7', 'class']
['BASE', 'a1', 'a2', 'a3', 'a4', 'a5', 'a6', 'a7', 'class']
['BASE', 'a1', 'a2', 'a3', 'a4', 'a5', 'a6', 'a7', 'class']
['BASE', 'a1', 'a2', 'a3', 'a4', 'a5', 'a6', 'a7', 'class']


In [16]:
# f_out.write("CLASSIFYING BASELINE: \n")
# classify("BASE")
# f_out.write("\n------------------\n")

# for feature in df.columns:
#     if feature != 'class' and feature != 'a4':  # don't drop the target column
#         f_out.write("DROPPING FEATURE "+ feature + ":\n")
#         print("....\n")
#         classify(feature)
#         f_out.write("\n------------------\n")

In [17]:
def buildClassifiers(clf, xtrain, xtest, ytrain, ytest):
    """Fit ``clf`` on the training split and score it on the test split.

    Returns a 4-tuple ``(f1, precision, recall, accuracy)`` computed from the
    predictions on ``xtest`` against ``ytest``; f1, precision and recall are
    micro-averaged.
    """
    clf.fit(xtrain, ytrain)
    predicted = clf.predict(xtest)

    # Same scorers, same order as the returned tuple: f1, precision, recall.
    micro_f1, micro_precision, micro_recall = (
        scorer(ytest, predicted, average="micro")
        for scorer in (f1_score, precision_score, recall_score)
    )

    return micro_f1, micro_precision, micro_recall, accuracy_score(ytest, predicted)

In [20]:
def classify(name, clf):
    """Run a leave-one-feature-out ablation for one classifier and log the results.

    The classifier is first scored with all feature columns ("BASE"), then once
    per column with that column dropped. Two columns are never dropped:
    ``'class'`` (the target) and ``'a4'`` (apparently the token being tagged —
    TODO confirm). Each configuration is scored with 5-fold cross-validation
    via ``buildClassifiers`` and the fold-averaged metrics are written to the
    module-level ``f_out``: the averages themselves for BASE, and
    ``base - dropped`` differences for every other configuration.

    Reads the module-level ``df``, ``labels`` and ``f_out``; sets the
    module-level ``base_f1``, ``base_precision``, ``base_recall`` and
    ``base_accuracy`` when the BASE configuration is scored.

    Parameters
    ----------
    name
        Label interpolated into the report lines.
    clf
        Classifier object handed to ``buildClassifiers`` for every fold.
    """
    global base_f1
    global base_precision
    global base_accuracy
    global base_recall

    # Order matches the tuple returned by buildClassifiers.
    metric_names = ("f1", "precision", "recall", "accuracy")

    # "BASE" goes first so the baselines exist before any difference is computed.
    features = list(df.columns)
    features.insert(0, "BASE")

    for feature in features:
        if feature in ('class', 'a4'):
            continue

        if feature == "BASE":
            f_out.write("BASE PERFORMANCE (no features dropped): \n")
            dropped_columns = ['class']
        else:
            f_out.write("DROPPING FEATURE "+ feature + ":\n")
            dropped_columns = [feature, 'class']

        x_vals = df.drop(columns=dropped_columns).values

        # One encoder over all cells, so a given string gets the same integer in
        # every column. Reshape back to the original 2-D shape rather than a
        # hard-coded row count, so this works for any number of rows.
        le = preprocessing.LabelEncoder()
        x = le.fit_transform(x_vals.ravel()).reshape(x_vals.shape)

        # fold the data 5 times
        kf = KFold(n_splits = 5)
        fold_scores = [
            buildClassifiers(clf, x[train_i], x[test_i], labels[train_i], labels[test_i])
            for train_i, test_i in kf.split(x)
        ]

        # Transpose the per-fold tuples into per-metric sequences and average each.
        means = {
            metric: np.mean(scores)
            for metric, scores in zip(metric_names, zip(*fold_scores))
        }

        if feature == "BASE":
            base_f1 = means["f1"]
            base_precision = means["precision"]
            base_recall = means["recall"]
            base_accuracy = means["accuracy"]

            for metric in metric_names:
                f_out.write(f"\tAverage {metric} for {name}:\t\t" + str(means[metric]) + "\n")

        else:
            baseline = {
                "f1": base_f1,
                "precision": base_precision,
                "recall": base_recall,
                "accuracy": base_accuracy,
            }
            # A negative difference means dropping the feature improved the metric.
            for metric in metric_names:
                f_out.write(
                    f"\tDifference in {metric} for {name}: " + str(baseline[metric])
                    + " - " + str(means[metric])
                    + " = \t\t" + str(baseline[metric] - means[metric]) + "\n"
                )


In [19]:
# Close the report file opened earlier in the notebook.
f_out.close()