In [1]:
import math
import pandas as pd
import preprocess
import feature_extractor

from sklearn import metrics
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_curve, auc

from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier

In [2]:
# Load the tab-separated sentence file into a DataFrame.
raw_df = pd.read_csv("filtered_sentences.tsv", delimiter="\t")

In [3]:
# Run the project's preprocessing step over the raw DataFrame.
# The result is consumed below through `data.values()`, with each value
# indexed by 'SENTENCE', 'PROCESSED' and 'LABEL'.
# NOTE(review): presumably a dict keyed by sentence id -- confirm against
# preprocess.process_data.
data = preprocess.process_data(raw_df)

In [4]:
# Pull each field out of the preprocessed records as a real list.
# `map` is avoided on purpose: on Python 3 it returns a one-shot iterator,
# which supports neither `len()` nor slicing, and both are needed when the
# data is split into train and test sets.
raw_sentences = [record['SENTENCE'] for record in data.values()]
processed_sentences = [record['PROCESSED'] for record in data.values()]
labels = [record['LABEL'] for record in data.values()]

In [5]:
# Hold out the trailing `test_split` fraction of the items for evaluation.
test_split = 0.3
num_items = len(processed_sentences)
num_test_items = int(math.ceil(test_split * num_items))

# Slice at an explicit index rather than with negative offsets. When
# `num_test_items` is 0 (e.g. `test_split = 0`), `seq[:-0]` is empty and
# `seq[-0:]` is the whole sequence, which would silently swap the two sets.
split_index = num_items - num_test_items

train_sentences = processed_sentences[:split_index]
train_labels = labels[:split_index]
test_sentences = processed_sentences[split_index:]
test_labels = labels[split_index:]

In [6]:
# Turn the train/test sentences and labels into model inputs.
# NOTE(review): presumably TF-IDF vectors fitted on the training sentences
# only -- confirm in feature_extractor.get_tfidf_features.
Xtrn, Xtst, Ytrn, Ytst = feature_extractor.get_tfidf_features(
    train_sentences,
    test_sentences,
    train_labels,
    test_labels,
)

In [7]:
# Fixed seed so the randomised ensembles give the same scores on every run.
RANDOM_STATE = 42

clf_names = ['Nearest Neighbors', 'Multinomial Naive Bayes', 'Logistic Regression', 'RandomForestClassifier', 'AdaBoostClassifier']
clfs = [
    KNeighborsClassifier(),
    MultinomialNB(),
    LogisticRegression(),
    RandomForestClassifier(n_estimators=100, n_jobs=2, random_state=RANDOM_STATE),
    AdaBoostClassifier(n_estimators=100, random_state=RANDOM_STATE),
]

# Fit every classifier on the training features and score it on the test set.
results = {}
for clf_name, clf_ in zip(clf_names, clfs):
    clf = clf_.fit(Xtrn, Ytrn)
    preds = clf.predict(Xtst)

    precision = metrics.precision_score(Ytst, preds)
    recall = metrics.recall_score(Ytst, preds)
    f1 = metrics.f1_score(Ytst, preds)
    accuracy = accuracy_score(Ytst, preds)

    # Deliberately not named `data`: that name holds the preprocessed dataset,
    # and rebinding it here would break a re-run of the earlier cells.
    clf_metrics = {'precision': precision,
                   'recall': recall,
                   'f1_score': f1,
                   'accuracy': accuracy,
                   'y_predicted': preds}

    results[clf_name] = clf_metrics

# Show only the scalar scores: one column per classifier, one row per metric.
cols = ['precision', 'recall', 'f1_score', 'accuracy']
pd.DataFrame(results).T[cols].T

Unnamed: 0,AdaBoostClassifier,Logistic Regression,Multinomial Naive Bayes,Nearest Neighbors,RandomForestClassifier
precision,0.6666667,0.6666667,0.6666667,0.6666667,0.6666667
recall,1.0,1.0,1.0,1.0,1.0
f1_score,0.8,0.8,0.8,0.8,0.8
accuracy,0.6666667,0.6666667,0.6666667,0.6666667,0.6666667
