In [1]:
%matplotlib inline

# General libraries.
import re
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import nltk

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import *
from sklearn.metrics import accuracy_score, precision_score, recall_score
from sklearn.naive_bayes import BernoulliNB
from sklearn import svm
from sklearn.linear_model import LogisticRegression

In [2]:
# Load the train and test CSV files from the local data/ directory.
df_train, df_test = (
    pd.read_csv(csv_path) for csv_path in ("data/train.csv", "data/test.csv")
)

In [4]:
# Row counts of the train and test frames, one per line.
print(len(df_train), len(df_test), sep="\n")

647
162


In [7]:
# Pull the "X" and "y" columns out of each frame as plain Python lists.
X_train, y_train = df_train["X"].tolist(), df_train["y"].tolist()
X_test, y_test = df_test["X"].tolist(), df_test["y"].tolist()

# Bag Of Words Count Vectorizer and Training

1) Train Naive Bayes, Linear SVM and Logistic Regression using BOW

2) (TBD) Use word2vec

3) (TBD) CNN with word2vec

4) (TBD) RNN + LSTM with word2vec

5) (TBD) BERT with word2vec

In [8]:
# Bag-of-words vectorizer: lowercases text, strips accents, drops English stop
# words, and only keeps tokens that contain at least one character matched by
# [a-z] (so purely numeric tokens are discarded).
cv = CountVectorizer(
    stop_words='english',
    token_pattern=u'(?ui)\\b\\w*[a-z]+\\w*\\b',
    lowercase=True,
    strip_accents='unicode',
)
# The vocabulary is learned from the training texts (left side, evaluated
# first) and then reused unchanged to encode the test texts.
X_train_cv, X_test_cv = cv.fit_transform(X_train), cv.transform(X_test)

In [9]:
# check training data: total count of each vocabulary term across the training set
# CountVectorizer.get_feature_names() was removed in scikit-learn 1.2; use its
# replacement when available and fall back to the old name on older releases.
if hasattr(cv, "get_feature_names_out"):
    feature_names = cv.get_feature_names_out()
else:
    feature_names = cv.get_feature_names()
word_freq_df = pd.DataFrame(X_train_cv.toarray(), columns=feature_names)
# Sum each term's column, then sort so the most frequent terms come first.
top_words_df = pd.DataFrame(word_freq_df.sum()).sort_values(0, ascending=False)
top_words_df.head()

Unnamed: 0,0
years,8636
sentence,7285
court,7070
imprisonment,5096
offence,4862


In [10]:
def do_some_training(model):
    """Fit ``model`` on the bag-of-words training data and print test metrics.

    Uses the notebook-level ``X_train_cv``/``y_train`` for fitting and
    ``X_test_cv``/``y_test`` for evaluation. Prints the model itself, then
    accuracy, precision and recall (each rounded to 3 decimals), then a
    separator line. Returns nothing.
    """
    model.fit(X_train_cv, y_train)
    predictions = model.predict(X_test_cv)
    print(model)
    # Each metric is reported as "<label> <rounded score>".
    metric_table = (
        ('Accuracy: ', accuracy_score),
        ('Precision: ', precision_score),
        ('Recall: ', recall_score),
    )
    for label, metric_fn in metric_table:
        print(label, round(metric_fn(y_test, predictions), 3))
    print("\n------------------------\n")

In [11]:
# Train and report each baseline classifier on the bag-of-words features.
do_some_training(BernoulliNB())
do_some_training(svm.SVC(kernel='linear', C = 1.0))
# The liblinear solver is limited to one-vs-rest, so the `multi_class` argument
# (deprecated in scikit-learn 1.5) is left at its default to avoid the warning.
do_some_training(LogisticRegression(solver='liblinear'))

BernoulliNB(alpha=1.0, binarize=0.0, class_prior=None, fit_prior=True)
Accuracy:  0.543
Precision:  0.6
Recall:  0.587

------------------------

SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='linear',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)
Accuracy:  0.414
Precision:  0.483
Recall:  0.457

------------------------

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='ovr', n_jobs=None, penalty='l2',
                   random_state=None, solver='liblinear', tol=0.0001, verbose=0,
                   warm_start=False)
Accuracy:  0.469
Precision:  0.533
Recall:  0.522

------------------------

