# Document Classification
The problem description may be seen <a href="https://www.hackerrank.com/challenges/document-classification/problem">here</a> and may be summarized as follows:
1. We have a stack of preprocessed documents, and our task is to assign the documents to a category, labeled 1 through 8 inclusive.
2. We are given training data which includes the correct classification number and some sample text.
3. We will use the natural language tool kit, specifically PorterStemmer, to do the text processing. The text feature extraction is handled using Tf-idf term weighting (term-frequency inverse document frequency), which allows us to weight common words such as "a", "is", etc. less.

## Summary
Linear support vector machine algorithm yields an accuracy of roughly 97% on the test set.

In [60]:
import pandas as pd
import numpy as np
import re as re
from nltk.stem.porter import PorterStemmer
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, log_loss

In [61]:
# Read in the data
train_df = pd.read_csv('train.txt', delimiter='\t')
test_df = pd.read_csv('test.txt', header = None, delimiter="\n")

In [62]:
print("Training shape =", train_df.shape, "\nTesting shape =", 
      test_df.shape)

Training shape = (5484, 2) 
Testing shape = (2189, 1)


In [63]:
#Naming the columns in train and test set
train_df.columns = ["label","text"]
test_df.columns = ["text"]
test_df.head()

Unnamed: 0,text
0,asian exporters fear damage from u s japan rif...
1,china daily says vermin eat pct grain stocks a...
2,australian foreign ship ban ends but nsw ports...
3,sumitomo bank aims at quick recovery from merg...
4,amatil proposes two for five bonus share issue...


In [64]:
#We will use Porter Stemmer natural language tk
stemmer = PorterStemmer()

def stem_tokens(tokens, stemmer):
    """Perform the word stemming"""
    stemmed_text = []
    for item in tokens:
        stemmed_text.append(stemmer.stem(item))
    return stemmed_text

def tokenize(text):
    """Tokenize the text, remove any non-word characters"""
    text = re.sub("[^a-zA-Z]", " ", text)
    tokens = text.split(" ")
    stems = stem_tokens(tokens, stemmer)
    return stems

In [65]:
# Text feature extraction:
vectorizer = TfidfVectorizer(analyzer='word',\
                             tokenizer=tokenize,\
                             ngram_range=(1,3),\
                             lowercase=True,\
                             stop_words ='english',\
                             max_features =1100)

vectorized_features = vectorizer.fit_transform(train_df.text.tolist() + test_df.text.tolist())

#Convert the document term matrix to numpy nd array
vectorized_features_nd = (vectorized_features.toarray())
print (vectorized_features_nd.shape)

(7673, 1100)


In [66]:
# Models:
clf = LinearSVC(penalty = 'l2', dual = True, C=1.0, loss='hinge')
#clf = KNeighborsClassifier()
#clf = MultinomialNB()
#clf = RandomForestClassifier(n_estimators=20, n_jobs=-1, max_features='sqrt')

In [67]:
# Predictions
X_train = vectorized_features_nd[0:len(train_df)]
Y_train = train_df.label
X_test = vectorized_features_nd[len(train_df):]

# split in to train and test set
txt_train, txt_valid, label_train, label_valid = \
    train_test_split(X_train, Y_train, test_size=0.2, random_state=5)

#print (len(txt_train), len(txt_valid), len(txt_train) + len(txt_valid))

# Fit model to train subset and predict on validation set
clf = clf.fit(txt_train,label_train)
pred_valid = clf.predict(txt_valid)
score = accuracy_score(label_valid, pred_valid)*100.0
print ('accuracy of validation set: %1.3f%%' % score)

label_test = []

# read true labels from file
foput = open("test_labels.txt","r")
for m in foput :
    m = str(m).strip()
    label_test.append(int(m))

# fit entire training set and make prediction on test set
clf = clf.fit(X_train,Y_train)
pred_test = clf.predict(X_test)
score = accuracy_score(label_test, pred_test)*100.0
print ('accuracy of test set: %1.3f%%' % score)

accuracy of validation set: 97.174%
accuracy of test set: 97.396%
