In [1]:
# Baseline model for POS tagger

In [10]:
import os
import string

import numpy as np
import pandas as pd
import sklearn as sk
from sklearn import preprocessing
from sklearn.dummy import DummyClassifier
from sklearn.feature_extraction import DictVectorizer
from sklearn.metrics import precision_recall_fscore_support, f1_score, precision_score, recall_score, accuracy_score
from sklearn.model_selection import KFold
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier


In [11]:
# read in the data file
# The CSV location can be overridden with the POS_DATA_CSV environment variable so the
# notebook is not tied to a single machine; the fallback is the original hard-coded path.
DATA_CSV_PATH = os.environ.get(
    "POS_DATA_CSV",
    "/Users/emilyroller/Downloads/assignment-3-emroller-main/src/DATA/pos-eng-5000.data.csv",
)
df = pd.read_csv(DATA_CSV_PATH)

In [12]:
# Preview the first 20 rows of the loaded frame (token columns plus the 'class' tag column)
df.head(20)

Unnamed: 0,a1,a2,a3,a4,a5,a6,a7,class
0,_,_,_,The,cafeteria,remains,closed,DT
1,_,_,The,cafeteria,remains,closed,PERIOD,NN
2,_,The,cafeteria,remains,closed,PERIOD,_,VBZ
3,The,cafeteria,remains,closed,PERIOD,_,_,JJ
4,cafeteria,remains,closed,PERIOD,_,_,_,PERIOD
5,_,_,_,Some,analysts,argued,that,DT
6,_,_,Some,analysts,argued,that,there,NNS
7,_,Some,analysts,argued,that,there,wo,VBD
8,Some,analysts,argued,that,there,wo,nSQt,IN
9,analysts,argued,that,there,wo,nSQt,be,EX


In [13]:
# get POS labels
# Each distinct tag in the 'class' column is replaced by its integer category code.
tag_codes = df["class"].astype("category").cat.codes
labels = np.asarray(tag_codes.tolist())

In [14]:
# drop the 'class' column - don't use it as a feature
# NOTE(review): every remaining column is encoded as a feature; if the CSV carries a
# saved index column (the preview header shows "Unnamed: 0"), it is included too - confirm.
x_vals = df.drop(columns=['class']).values

# Initialize the encoder
# A single encoder is fitted over all columns flattened together, so a given token
# receives the same integer code whichever column it appears in.
# NOTE(review): scikit-learn documents LabelEncoder as an encoder for target values (y);
# the integer codes impose an arbitrary numeric order on tokens - confirm this is the
# intended baseline behaviour before comparing against other feature encodings.
le = preprocessing.LabelEncoder()
x = le.fit_transform(x_vals.ravel())

# Restore the (rows, columns) layout from the data itself instead of assuming 5000 rows.
x = x.reshape(x_vals.shape)

In [15]:
# build the classifier (clf param) and train it on the xtrain ytrain data
def buildClassifiers(clf, xtrain, xtest, ytrain, ytest, average="micro"):
    """Fit ``clf`` on the training split and score its predictions on the test split.

    Parameters
    ----------
    clf : estimator exposing ``fit(X, y)`` and ``predict(X)``
        Fitted in place by this call.
    xtrain, ytrain : training features and labels passed to ``clf.fit``.
    xtest, ytest : held-out features and labels used for scoring.
    average : str, default "micro"
        Averaging mode forwarded to the F1, precision and recall metrics
        (for example "macro" to weight every class equally).

    Returns
    -------
    tuple
        ``(f1, precision, recall, accuracy)`` computed on the test split.

    Notes
    -----
    With single-label multiclass data, micro-averaged precision, recall and F1
    are all equal to accuracy, so the four values coincide under the default.
    """
    clf.fit(xtrain, ytrain)
    ypred = clf.predict(xtest)

    # Use the test data to calculate the performance metrics
    f1 = f1_score(ytest, ypred, average=average)
    precision = precision_score(ytest, ypred, average=average)
    recall = recall_score(ytest, ypred, average=average)
    accuracy = accuracy_score(ytest, ypred)

    return f1, precision, recall, accuracy

In [16]:
# Baseline model only implements these three
names = ['Dummy', 'Naive_Bayes', 'Decision_Tree']
classifiers = [DummyClassifier(), GaussianNB(), DecisionTreeClassifier(random_state=0)]

# Fold the data five times. The splitter is created once and reused for every
# classifier; it is built without shuffling, so each classifier sees the same folds.
kf = KFold(n_splits=5)

for name, clf in zip(names, classifiers):
    print("Now classifying", name)

    # Per-fold scores for the current classifier
    f1_scores, precision_scores, recall_scores, accuracy_scores = [], [], [], []

    # Split the data so that a different portion is used for testing and training each time
    for train_i, test_i in kf.split(x):
        xtrain, xtest = x[train_i], x[test_i]
        ytrain, ytest = labels[train_i], labels[test_i]

        f1, precision, recall, accuracy = buildClassifiers(clf, xtrain, xtest, ytrain, ytest)
        f1_scores.append(f1)
        precision_scores.append(precision)
        recall_scores.append(recall)
        accuracy_scores.append(accuracy)

    # Just print the performance - we won't use this for the table
    print("\tAverage f1 for {}:\t\t".format(name), np.mean(f1_scores))
    print("\tAverage precision for {}:\t".format(name), np.mean(precision_scores))
    print("\tAverage recall for {}:\t\t".format(name), np.mean(recall_scores))
    print("\tAverage accuracy for {}:\t".format(name), np.mean(accuracy_scores))


Now classifying Dummy
	Average f1 for Dummy:		 0.1296
	Average precision for Dummy:	 0.1296
	Average recall for Dummy:		 0.1296
	Average accuracy for Dummy:	 0.1296
Now classifying Naive_Bayes
	Average f1 for Naive_Bayes:		 0.3298
	Average precision for Naive_Bayes:	 0.3298
	Average recall for Naive_Bayes:		 0.3298
	Average accuracy for Naive_Bayes:	 0.3298
Now classifying Decision_Tree
	Average f1 for Decision_Tree:		 0.6085999999999999
	Average precision for Decision_Tree:	 0.6085999999999999
	Average recall for Decision_Tree:		 0.6085999999999999
	Average accuracy for Decision_Tree:	 0.6085999999999999
