<H1>Code for Indix Datamonster2 Challenge</H1>
<H4>Change working directory</H4>

In [1]:
import os
# Raw string: keeps every backslash in the Windows path literal. Without the
# r-prefix the path only works as long as no segment happens to start with an
# escape sequence, and '\U' is rejected outright as a bad unicode escape by
# Python 3.
# NOTE(review): this is a machine-specific absolute path; anyone else running
# the notebook has to point it at their own copy of the data.
os.chdir(r'C:\Users\ernestkirubakarans\Desktop\Materials\hacks\indix')

<H4>Import libraries</H4>

In [2]:
%matplotlib inline

import numpy as np
import scipy as sp
import matplotlib as mpl
import matplotlib.cm as cm
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import re
from bs4 import BeautifulSoup
from nltk.corpus import stopwords
import nltk
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import RandomForestClassifier

# pandas display settings applied for the whole notebook, one (option, value)
# pair per line.
for option_name, option_value in [
    ('display.width', 500),
    ('display.max_columns', 100),
    ('display.notebook_repr_html', True),
]:
    pd.set_option(option_name, option_value)

<H4>Reading Data</H4>

In [3]:
# Both CSVs are read from the current working directory.
# `data` is the labelled set (its `label` column is used below to build the target);
# `blindset` is the set that predictions are produced for at the end.
data = pd.read_csv(filepath_or_buffer='data.csv')
blindset = pd.read_csv(filepath_or_buffer='blindset_table_out.csv')

In [4]:
# Binary target column: 1 where the label is 'yes', 0 for every other value
# (including missing labels, which compare unequal to 'yes').
# Built with a single vectorised comparison instead of the deprecated `.ix`
# indexer, so re-running the cell always yields the same column.
data['y'] = (data['label'] == 'yes').astype('int64')

<H2>Data Cleansing</H2>
<p>Defining a function for:</p>
<ul>
<li>Removing HTML tags</li>
<li>Keeping only alphabetic characters</li>
<li>Converting all words to lower case</li>
<li>Splitting the words</li>
<li>Removing English stopwords</li>
</ul>

In [5]:
# Cache for the stopword set so the NLTK corpus is read once rather than on
# every call (table_to_words is invoked once per row).
_STOPWORD_CACHE = {}


def _english_stopwords():
    """Return NLTK's English stopwords as a frozenset, loading them only once."""
    if "english" not in _STOPWORD_CACHE:
        _STOPWORD_CACHE["english"] = frozenset(stopwords.words("english"))
    return _STOPWORD_CACHE["english"]


def table_to_words( raw_text ):
    """Turn one raw HTML table string into a cleaned, space-joined word string.

    Steps: strip HTML markup, replace every non-letter character with a space,
    lower-case, split on whitespace, and drop English stopwords.

    Parameters
    ----------
    raw_text : str
        Raw (HTML) text of a single table. ``None`` and NaN (what pandas
        produces for an empty CSV cell) are treated as empty text.

    Returns
    -------
    str
        The remaining words joined by single spaces; ``""`` when nothing is
        left or the input was missing.
    """
    # Missing cell: there is nothing to parse, and BeautifulSoup would raise
    # on a non-string value. NaN is the only float that differs from itself.
    if raw_text is None or (isinstance(raw_text, float) and raw_text != raw_text):
        return ""

    # Name the parser explicitly: otherwise bs4 picks whichever parser happens
    # to be installed, so the extracted text can differ between machines.
    review_text = BeautifulSoup(raw_text, "html.parser").get_text()
    letters_only = re.sub("[^a-zA-Z]", " ", review_text)
    words = letters_only.lower().split()
    stops = _english_stopwords()
    return " ".join(w for w in words if w not in stops)

<H4>Cleaning and Parsing the column 'table-text' in training dataset</H4>

In [6]:
# Clean the 'table-text' value of every training row with table_to_words,
# collecting the results in row order and printing a progress line after
# each block of 10,000 rows.
num_tables = data["table-text"].size
print("Cleaning and parsing the table data...\n")
clean_tables = []
for row_number in range(1, num_tables + 1):  # 1-based count of rows handled
    if row_number % 10000 == 0:
        print("Review %d of %d\n" % (row_number, num_tables))
    clean_tables.append(table_to_words(data["table-text"][row_number - 1]))

Cleaning and parsing the table data...

Review 10000 of 91845

Review 20000 of 91845

Review 30000 of 91845

Review 40000 of 91845

Review 50000 of 91845

Review 60000 of 91845

Review 70000 of 91845

Review 80000 of 91845

Review 90000 of 91845



<H4>Using the 1000 most frequently occurring words in the 'table-text' column as features for the model</H4>

In [7]:
# Bag-of-words features: word-level counts, vocabulary capped at 1000 entries
# (max_features). The fitted `vectorizer` is kept so the same vocabulary can
# be applied to other text later.
vectorizer = CountVectorizer(
    analyzer="word",
    tokenizer=None,
    preprocessor=None,
    stop_words=None,
    max_features=1000,
)
# fit_transform returns a sparse matrix; convert it to a dense array.
data_features = vectorizer.fit_transform(clean_tables).toarray()

<H2>Training a Random Forest Model</H2>
<H4>Rebuilding the bag-of-words features from the cleaned training text</H4>

In [20]:
# NOTE(review): this cell repeats the earlier feature-extraction step with the
# same settings, refitting the vectorizer on the same `clean_tables`.
vectorizer = CountVectorizer(
    max_features=1000,
    analyzer="word",
    stop_words=None,
    tokenizer=None,
    preprocessor=None,
)
data_features = vectorizer.fit_transform(clean_tables)
# Dense array of word counts, one row per cleaned table.
data_features = data_features.toarray()

<H4>Building a Random Forest Classifier with 50 trees</H4>

In [21]:
# 50-tree random forest fitted on the word-count features against the binary
# target `y`. The seed is fixed so that refitting gives the same forest (and
# therefore the same predictions); scores recorded from earlier unseeded runs
# may differ slightly.
forest = RandomForestClassifier(n_estimators=50, random_state=42)
forest = forest.fit(data_features, data['y'])

<H4>Verifying F1 score for training set (in-sample, so an optimistic estimate)</H4>

In [12]:
from sklearn.metrics import f1_score

In [22]:
# Predict on the same feature matrix the forest was fitted on, then compute
# the F1 score of those predictions against the true target column.
train_result = forest.predict(data_features)
f1_score(y_true=data['y'], y_pred=train_result)

0.96937164508999063

<H2>Making Predictions</H2>
<H4>Cleaning and Parsing the column 'table-text' in blind set</H4>

In [23]:
# Re-point the shared names at the blind set: its row count, and a fresh
# empty list that will receive the cleaned blind-set texts.
num_tables = len(blindset["table-text"])
clean_tables = list()

In [24]:
# Clean each blind-set row's 'table-text' with table_to_words, appending in
# row order; a progress line is printed after each block of 10,000 rows.
for position in range(1, num_tables + 1):  # 1-based count of rows handled
    if position % 10000 == 0:
        print("Review %d of %d\n" % (position, num_tables))
    clean_tables.append(table_to_words(blindset["table-text"][position - 1]))

Review 10000 of 11842



In [25]:
# Apply the already-fitted vectorizer (transform only, no refit) to the
# cleaned blind-set text and convert the sparse result to a dense array.
blindset_features = vectorizer.transform(clean_tables).toarray()

<H4>Predicting for blind set</H4>

In [26]:
# Predicted class for every blind-set row.
result = forest.predict(blindset_features)
# Label list with one entry per row, initialised to 'no' throughout.
output = num_tables * ['no']

In [27]:
# Overwrite the default 'no' with 'yes' for every row predicted as class 1.
for i in range(num_tables):
    if result[i] != 1:
        continue
    output[i] = 'yes'

<H4>Creating the submission file</H4>

In [28]:
import csv
# Write the labels to 'rf_0510.csv', one label per line and no header row.
with open('rf_0510.csv', "w") as final:
    # Each label becomes its own single-column row, terminated by '\n'.
    csv.writer(final, lineterminator='\n').writerows([val] for val in output)