# Text Processing on Haptik Dataset
#### We import all the packages required for all the processing required like data cleaning data prerocessing,building model

In [1]:
#future imports
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals

#Data Cleaning packages
from pandas import read_csv, Series
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics

import nltk
from nltk.tokenize import TreebankWordTokenizer
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from wordcloud import WordCloud
%matplotlib inline

In [4]:
#timeit Decorator
import time                                                

def timeit(method):
    
    '''defined a function timeit
    for finding the time taken to 
    execute the function'''
    
    def timed(*args, **kw):
        ts = time.time()
        result = method(*args, **kw)
        te = time.time()
        if 'log_time' in kw:
            name = kw.get('log_name', method.__name__.upper())
            kw['log_time'][name] = int((te - ts) * 1000)
        else:
            print('%r  %2.2f ms' % \
                  (method.__name__, (te - ts) * 1000))
        return result
    return timed

In [6]:
@timeit
def load_dataset(filepath):
    
    '''defined a function load_dataset
    for loading the csv files from
    location '''
    
    dataset = read_csv(filepath, encoding = 'UTF-8' )
    
    return dataset

train = load_dataset('./haptik/haptik_train_data.csv')

test = load_dataset('./haptik/haptik_test_data.csv') 

'load_dataset'  313.93 ms
'load_dataset'  32.34 ms


In [7]:
#separating feature and target variable
@timeit
def feature_target(df):
    
    '''defined a function feature_target
    for separating the features and the 
    target variables from our dataset '''
    
    feature = df.iloc[:,0]
    target = df.iloc[:,1:]
    return feature, target

X_train, y_train = feature_target(train)
X_test, y_test = feature_target(test)


'feature_target'  0.65 ms
'feature_target'  0.51 ms


In [8]:
# One hot encoding for changing the categorical target variable to binary target variable(numerical)
#do this on both train and test target variable i.e y_train ,y_test
@timeit
def encode(target):
    
    '''defined a function encode
    to perform one hot encoding  on
    the target categorical variable'''
    
    target = target.astype(str).replace({'T':1, 'F':0}, axis =1)
    target = target.idxmax(axis = 1)
    return target

y_train = encode(y_train)
y_test = encode(y_test)



  warn('the "axis" argument is deprecated and will be removed in'


'encode'  544.93 ms
'encode'  99.68 ms


In [11]:
#label Encoding for creating the array of labeled target variable
@timeit
def labelEncode(y_train):
    
    '''defined a function labelEncode
    for labeling the target variable'''
    
    lab_en = LabelEncoder()
    return (lab_en.fit_transform(y_train))
    

y_train = labelEncode(y_train)
y_test = labelEncode(y_test)

'labelEncode'  2.66 ms
'labelEncode'  8.39 ms


In [23]:
def vect_model(X_test,y_test,X_train,y_train):
    
    tokenizer = RegexpTokenizer(r'\w+')
    
    vect = CountVectorizer(tokenizer=tokenizer.tokenize,      
                           stop_words='english',             
                           ngram_range=(1, 2), 
                           max_df=0.5,                        
                           min_df=2,
                          )
    
    vect.fit(X_train)
    train_dtm = vect.transform(X_train)
    test_dtm  = vect.transform(X_test)
    
    nb = MultinomialNB(alpha=0.25,fit_prior=False)
    nb.fit(train_dtm, y_train)
    Y_pred_class = nb.predict(test_dtm)
    
    Y_pred_class1 = nb.predict(test_dtm)
    
    return metrics.accuracy_score(Y_pred_class1,y_test)

In [24]:
accuracy = vect_model(X_test, y_test, X_train, y_train)
accuracy

0.78910000000000002