In [93]:
import sys
from sklearn.feature_extraction import text
from sklearn import pipeline
from sklearn import linear_model
import numpy
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import ShuffleSplit

In [1]:
# Hacker Rank Link:
# https://www.hackerrank.com/challenges/document-classification/problem

In [46]:
def load_training_data(filename):
    df = pd.read_fwf(filename, delim_whitespace = True, header = None)
    df = df.rename(columns = {df.columns[0]:'text'})
    df = df[['text']][1:]
    df = pd.DataFrame(df['text'].str.split(" ", 1).tolist(), columns = ['category','text'])
    train_set, validation_set = train_test_split(df, test_size=.2)
    y_train = train_set['category']
    x_train = train_set['text']
    y_validation = validation_set['category']
    x_validation = validation_set['text']
    return x_train, y_train, x_validation, y_validation

In [47]:
def load_new_input_data(filename):
    df = pd.read_fwf(filename, delim_whitespace = True, header = None)
    df = df.rename(columns = {df.columns[0]:'text'})
    df = df[['text']][1:]
    x = df['text']
    return x

In [48]:
def vectorize_training_data(x):
        vectorizer = TfidfVectorizer(stop_words='english', ngram_range=(1, 1),
                              strip_accents='ascii', lowercase=True)
        x_vectorized = vectorizer.fit_transform(x)
        return x_vectorized, vectorizer   

In [49]:
def vectorize_testing_data(x_test, vectorizer):
    x_test_vectorized = vectorizer.transform(x_test)
    return x_test_vectorized

In [50]:
def train_model(model,x_vectorized, y):
        classifier = linear_model.SGDClassifier(class_weight='balanced')
        targets = y
        classifier.fit(x_vectorized, y)
        return classifier

In [51]:
def evaluate(model, x_validation_vectorized, y_validation):
        predictions = model.predict(x_validation_vectorized)
        print (classification_report(y_validation, predictions))
        print ("The accuracy score is {:.2%}".format(accuracy_score(y_validation, predictions)))

In [96]:
def find_best_parameter(input_model, param_grid, x_train_vectorized, y_train):
    cv_sets = ShuffleSplit(n_splits = 2, test_size = .33, random_state = 1)
    grid_search = GridSearchCV(estimator=input_model, param_grid=param_grid, cv = cv_sets, scoring='accuracy')
    grid_search.fit(x_train_vectorized, y_train)
    best_model = grid_search.best_estimator_
    return best_model

In [53]:
x_train, y_train, x_validation, y_validation = load_training_data('trainingdata.txt')

x_train_vectorized, vectorizer = vectorize_training_data(x_train)

x_validation_vectorized = vectorize_testing_data(x_validation, vectorizer)

In [100]:
model1 = LogisticRegression()
model = train_model(model1,x_train_vectorized, y_train)
evaluate(model, x_validation_vectorized, y_validation)

              precision    recall  f1-score   support

           1       1.00      0.98      0.99       545
           2       0.96      0.99      0.97       329
           3       0.98      0.97      0.98        66
           4       0.96      0.88      0.92        26
           5       1.00      1.00      1.00         5
           6       0.96      0.96      0.96        49
           7       0.78      0.97      0.86        29
           8       0.96      0.90      0.92        48

   micro avg       0.97      0.97      0.97      1097
   macro avg       0.95      0.96      0.95      1097
weighted avg       0.98      0.97      0.97      1097

The accuracy score is 97.36%


In [101]:
# grid search for best C
param_grid={'C': [0.01, 0.1, 1, 10]}
model = find_best_parameter(model1, param_grid, x_train_vectorized, y_train)
evaluate(model, x_validation_vectorized, y_validation)

              precision    recall  f1-score   support

           1       0.98      0.98      0.98       545
           2       0.94      0.98      0.96       329
           3       1.00      0.94      0.97        66
           4       0.95      0.73      0.83        26
           5       1.00      0.80      0.89         5
           6       0.98      0.88      0.92        49
           7       0.76      0.97      0.85        29
           8       0.97      0.79      0.87        48

   micro avg       0.96      0.96      0.96      1097
   macro avg       0.95      0.88      0.91      1097
weighted avg       0.96      0.96      0.96      1097

The accuracy score is 95.99%


In [102]:
model2 = linear_model.SGDClassifier()
model = train_model(model2,x_train_vectorized, y_train)
evaluate(model, x_validation_vectorized, y_validation)

              precision    recall  f1-score   support

           1       0.99      0.98      0.99       545
           2       0.96      0.98      0.97       329
           3       1.00      0.97      0.98        66
           4       0.96      0.88      0.92        26
           5       1.00      1.00      1.00         5
           6       0.96      0.98      0.97        49
           7       0.78      0.97      0.86        29
           8       0.93      0.85      0.89        48

   micro avg       0.97      0.97      0.97      1097
   macro avg       0.95      0.95      0.95      1097
weighted avg       0.97      0.97      0.97      1097

The accuracy score is 97.17%


In [103]:
model3 = MultinomialNB()
model = train_model(model3,x_train_vectorized, y_train)
evaluate(model, x_validation_vectorized, y_validation)

              precision    recall  f1-score   support

           1       0.99      0.98      0.99       545
           2       0.96      0.98      0.97       329
           3       0.97      0.98      0.98        66
           4       0.92      0.88      0.90        26
           5       1.00      0.80      0.89         5
           6       0.94      0.96      0.95        49
           7       0.71      1.00      0.83        29
           8       1.00      0.81      0.90        48

   micro avg       0.97      0.97      0.97      1097
   macro avg       0.94      0.92      0.92      1097
weighted avg       0.97      0.97      0.97      1097

The accuracy score is 96.81%


In [104]:
x_test = load_new_input_data('stdin.txt')

x_test_vectorized = vectorize_testing_data(x_test, vectorizer) 

In [82]:
for line in model.predict(x_test_vectorized):
    print(line)

2
2
2


In [5]:

# # when we are running in hacker rank:
# import fileinput 

# def load_hacker_rank_input():
#     temp = []  
    
#     for f in fileinput.input(): 
#         temp.append(f)
    
#     df = pd.DataFrame(temp)
#     df = df.rename(columns = {df.columns[0]:'text'})
#     df['text'] = df['text'].str.replace('\n', '')
#     df = df[['text']][1:]
#     x = df['text']
#     return x

# x_test = load_hacker_rank_input()

# x_test_vectorized = vectorize_testing_data(x_test, vectorizer)

# for line in model.predict(x_test_vectorized):
#     print(line)