<H1>Indix Classification Problem<H1>

<H6>Loading necessary libraries</H6>

In [1]:
import os
import os
import numpy as np 
import scipy as sp 
import matplotlib as mpl 
import matplotlib.cm as cm 
import matplotlib.pyplot as plt 
import pandas as pd 
import nltk
import re
import csv
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score
from sklearn.cross_validation import train_test_split
from sklearn.linear_model import SGDClassifier
from scipy.sparse import hstack
from sklearn.metrics import confusion_matrix
from time import time

pd.set_option('display.width', 500)
pd.set_option('display.max_columns', 100)
pd.set_option('display.notebook_repr_html', True)

%matplotlib inline

<H6>Loading training data</H6>

In [2]:
data = pd.DataFrame.from_csv('classification_train.tsv', sep="\t", header=None, index_col=None)

  if self.run_code(code, result):


<H6>Adding column names</H6>

In [3]:
data.columns = ['product_title','brand_id','category_id']
data = data[data.brand_id != 'bid']

<H6>Converting 'brand id' and 'category id' columns to integer</H6>

In [4]:
data['brand_id'] = data['brand_id'].apply(lambda x: int(x))
data['category_id'] = data['category_id'].apply(lambda x: int(x))

<H6>Dropping duplicate entries</H6>

In [5]:
data = data.drop_duplicates()

<H6>Identifying duplicated product titles</H6>

In [6]:
sum(data['product_title'].duplicated())

12571

<H6>Deleting duplicated product titles</H6>

In [7]:
data = data.drop_duplicates(subset = ['product_title'], keep = False)

In [8]:
data.shape

(877396, 3)

<H6>Randomly shuffling the dataset</H6>

In [9]:
data = data.iloc[np.random.permutation(len(data))]

<H6>Getting brand counts and category counts</H6>

In [10]:
data['brand_count'] =  data.groupby('brand_id')['brand_id'].transform(lambda s: s.count())
data['category_count'] =  data.groupby('category_id')['category_id'].transform(lambda s: s.count())

<H6>Splitting dataset for training and validation</H6>

In [11]:
train, valid = train_test_split(data, test_size=0.1, random_state=44)

<H6>Subsetting training dataset</H6>
<p>Removed categories and brands whose representation are low in the dataset for better accuracy</p> 

In [12]:
train = train[(data.category_count > 500) & (data.brand_count>20)]
train = train.reset_index()
valid = valid.reset_index()

  if __name__ == '__main__':


In [13]:
print train.shape
print len(set(train['brand_id']))
print len(set(train['category_id']))

(697690, 6)
3071
135


<H6>Defining a function to extract words from product title and combining it with category id</H6>

<p>The function performs the following:</p>
<p>- Select first 5 words in product title as brand name is most likely to be found in the first few words</p>
<p>- Keep only alphabetic characters</p>
<p>- Convert the words to lower case</p>
<p>- Remove stop words</p>
<p>- Add category id to the word list</p>

In [14]:
def title_to_words(title, category):
    if (len(re.findall(r'\w+', title)) > 5):
        title = title.split()[:5]
        title = " ".join(title)
    letters_only = re.sub("[^a-zA-Z]", " ", title) 
    words = letters_only.lower().split()                             
    stops = set(stopwords.words("english"))                  
    meaningful_words = [w for w in words if not w in stops]
    meaningful_words.append(category)
    return( " ".join( meaningful_words ))  

In [15]:
def data_cleansing(table):
    time_0 = time()
    num_tables = table['product_title'].size
    print "Cleaning and parsing the table data...\n"
    clean_title = []
    for i in xrange(0, num_tables):
        if((i+1)%50000 == 0):
            print "Cleaning %d th row of %d rows\n" % (i+1, num_tables) 
        clean_title.append(title_to_words(table['product_title'][i], str(table['category_id'][i])))
    print "Returned clean data"
    time_taken = time() - time_0
    print "Time taken: %f sec" % (time_taken)
    return (clean_title)

<H6>Function to vectorize the cleaned data</H6>

In [16]:
def vectorize_data(clean_title, vectorizer, vectorize = False, table = None):
    time_0 = time()
    print "Vectorizing cleaned data...\n"
    if (vectorize == True):
        X_features = vectorizer.fit_transform(clean_title)
        X_features = X_features.toarray()
    else:
        X_features = vectorizer.transform(clean_title)
        X_features = X_features.toarray()        
    if (table is not None and 'brand_id' in table.columns):
        y_variable = np.asarray(table['brand_id'])
        print "Returned X and y"
        time_taken = time() - time_0
        print "Time taken: %f sec" % (time_taken)
        return (X_features, y_variable, vectorizer)
    else:
        time_taken = time() - time_0
        print "Returned X"
        print "Time taken: %f sec" % (time_taken)
        return(X_features, vectorizer)

<H6>Function to make prediction</H6>

In [17]:
def model_predict(model, test, true_y = None):
    time_0 = time()
    predict_y = model.predict(test)
    if (true_y is not None):
        accu = accuracy_score(true_y,predict_y)
        print "Accuracy score: %f" % (accu)
        time_taken = time() - time_0
        print "Returned X"
        print "Time taken: %f sec" % (time_taken)
        return (predict_y, accu)    
    else:
        return (predict_y)

<p>Selecting 4000 words for base model</p>

In [18]:
vectorizer = CountVectorizer(analyzer = "word", max_features = 4000)

<H6>Cleaning training dataset</H6>

In [19]:
clean_train = data_cleansing(train)

Cleaning and parsing the table data...

Cleaning 50000 th row of 697690 rows

Cleaning 100000 th row of 697690 rows

Cleaning 150000 th row of 697690 rows

Cleaning 200000 th row of 697690 rows

Cleaning 250000 th row of 697690 rows

Cleaning 300000 th row of 697690 rows

Cleaning 350000 th row of 697690 rows

Cleaning 400000 th row of 697690 rows

Cleaning 450000 th row of 697690 rows

Cleaning 500000 th row of 697690 rows

Cleaning 550000 th row of 697690 rows

Cleaning 600000 th row of 697690 rows

Cleaning 650000 th row of 697690 rows

Returned clean data
Time taken: 183.557000 sec


<H6>Vectorizing the training set</H6>

In [20]:
X_train, y_train, vectorizer = vectorize_data(clean_title=clean_train, vectorizer=vectorizer, vectorize=True, table = train)

Vectorizing cleaned data...

Returned X and y
Time taken: 8.687000 sec


<H6>Fitting the base model</H6>

In [21]:
time_0 = time()
model = MultinomialNB()
model.fit(X_train, y_train)
time_taken = time() - time_0
print "Time taken: %f sec" % (time_taken)

Time taken: 104.700000 sec


<p>Training accuracy</p>

In [22]:
model_predict(model = model, test = X_train, true_y = y_train)

Accuracy score: 0.672776
Returned X
Time taken: 119.903000 sec


(array([ 9974, 26003, 20539, ..., 30503, 21468,  8015], dtype=int64),
 0.67277553319411976)

<H6>Cleaning and predicting validation set</H6>

In [23]:
clean_valid = data_cleansing(valid)
X_valid, y_valid, vectorizer = vectorize_data(clean_title=clean_valid, vectorizer=vectorizer, vectorize=False, table=valid)
predict_valid, accur = model_predict(model = model, test = X_valid, true_y = y_valid)

Cleaning and parsing the table data...

Cleaning 50000 th row of 87740 rows

Returned clean data
Time taken: 22.922000 sec
Vectorizing cleaned data...

Returned X and y
Time taken: 1.093000 sec
Accuracy score: 0.589138
Returned X
Time taken: 17.187000 sec


<H6>Defining a function to tune the max_features parameter</H6>

In [24]:
def parameter_tuning(table, clean_data, param_list, algorithm = MultinomialNB):
    time_0 = time()
    no_iter = len(param_list)
    score = np.zeros(no_iter)
    for i in xrange(0,no_iter):
        vectorizer = CountVectorizer(analyzer = "word", max_features = param_list[i])
        X_tune,y_tune,vectorizer=vectorize_data(clean_title=clean_data,vectorizer=vectorizer,vectorize=True, table = table)
        val_len = len(X_tune)/5
        X_tune_train = X_tune[val_len:,]
        X_tune_test = X_tune[:val_len,]
        y_tune_train = y_tune[val_len:]
        y_tune_test = y_tune[:val_len]
        algorithm.fit(X_tune_train, y_tune_train)
        _ , score[i] = model_predict(model = algorithm, test = X_tune_test, true_y = y_tune_test)
        print "Score for parameter %d: %f" % (param_list[i], score[i])
    best_score = np.argmax(score)
    best_param = param_list[best_score]
    time_taken = time() - time_0
    print "Time taken: %f sec" % (time_taken)
    print "Best parameter is: %d" % (best_param)

<H6>Tuning the parameter</H6>

<p>Got memory error while running the tuning function. When run separately, the best feature for max_features is found out to be 4000. So, keeping base model as the final model</p>

In [27]:
param_list = [2000, 3000, 5000, 6000]
parameter_tuning(table = train, clean_data = clean_train, param_list = param_list, algorithm = MultinomialNB())

Vectorizing cleaned data...

Returned X and y
Time taken: 8.250000 sec


MemoryError: 

<H6>Loading test set</H6>

In [22]:
blindset = pd.DataFrame.from_csv('classification_blind_set_corrected.tsv',sep="\t",header=None,index_col=None)
blindset.columns  = ['product_title','category_id']

<H6>Cleaning and predicting test set</H6>

In [23]:
clean_test = data_cleansing(blindset)
X_test, vectorizer = vectorize_data(clean_title = clean_test, vectorizer = vectorizer, vectorize = False, table = None)
y_test = model_predict(model = model, test = X_test)

Cleaning and parsing the table data...

Cleaning 50000 th row of 619240 rows

Cleaning 100000 th row of 619240 rows

Cleaning 150000 th row of 619240 rows

Cleaning 200000 th row of 619240 rows

Cleaning 250000 th row of 619240 rows

Cleaning 300000 th row of 619240 rows

Cleaning 350000 th row of 619240 rows

Cleaning 400000 th row of 619240 rows

Cleaning 450000 th row of 619240 rows

Cleaning 500000 th row of 619240 rows

Cleaning 550000 th row of 619240 rows

Cleaning 600000 th row of 619240 rows

Returned clean data
Time taken: 166.636000 sec
Vectorizing cleaned data...

Returned X
Time taken: 8.953000 sec


MemoryError: 

In [26]:
del X_train, y_train, data, train, valid

In [29]:
import gc
gc.collect()

64

In [30]:
y_test = model_predict(model = model, test = X_test)

<H6>Writing the result to a csv file</H6>

In [31]:
result = y_test.tolist()
with open('result_0530.csv', "w") as final:
    writer = csv.writer(final, lineterminator='\n')
    for val in result:
        writer.writerow([val])  

<H4>Model Behaviour</H4>
<p>Model predicts well for frequently repeated items in the training set (for e.g. brand ids 42835 and 6584). Model was built only on roughly 85% of training data after deleting records with less frequent categories and brands. One way to overcome poor prediction for less frequent brands is to build separate models and use other techniques such as KNN and SVM.</p>