In [1]:
import pandas as pd
import numpy as np
import seaborn as sn 
import os, glob, re
import math

"""
N = 100

# Simulation
classes, labels = list(np.random.normal(-1,0.6,N)) + list(np.random.normal(1,0.9,N)),\
                   list(np.repeat(1,N)) + list(np.repeat(0,N))

data = pd.DataFrame({'x': list(map(lambda x: round(x,0), classes)), 'y': labels}); data.head()
"""

"\nN = 100\n\n# Simulation\nclasses, labels = list(np.random.normal(-1,0.6,N)) + list(np.random.normal(1,0.9,N)),                   list(np.repeat(1,N)) + list(np.repeat(0,N))\n\ndata = pd.DataFrame({'x': list(map(lambda x: round(x,0), classes)), 'y': labels}); data.head()\n"

## Speed Performance Test

In [2]:
from sklearn import tree

tree = tree.DecisionTreeClassifier()
%timeit tree.fit(data.x.reshape((N*2, 1)), data.y)

1000 loops, best of 3: 248 µs per loop


In [3]:
from sklearn.naive_bayes import GaussianNB

nb = GaussianNB()
%timeit nb.fit(data.x.reshape((N*2, 1)), data.y)

The slowest run took 17.80 times longer than the fastest. This could mean that an intermediate result is being cached.
1000 loops, best of 3: 435 µs per loop


In [4]:
from sklearn.cross_validation import cross_val_score

np.mean(cross_val_score(tree, data.x.reshape((N*2, 1)), data.y, cv=10))

0.88500000000000012

In [8]:
np.mean(cross_val_score(nb, data.x.reshape((N*2, 1)), data.y, cv=5))

0.82499999999999996

The winner is Decision tree which appears to perform better and faster than naive bayes.

### Reusable Code Base

In [2]:
import textmining

# TFD function
def term_document_dataframe(test_df):
    """ Transforms a column with text into a set of
        dummy variables for each word in a dictionary.
    """
    
    # Initialize class to create term-document matrix
    tdm = textmining.TermDocumentMatrix()
    
    # Get word frequency in each document
    for doc in test_df:
        tdm.add_doc(doc)
        
    # Sets tdm into list data structure
    tdmMatrix = list(tdm.rows(cutoff=1))
    
    return pd.DataFrame(tdmMatrix[1:], columns=tdmMatrix[0])

# Number replacer 
def number_replacer(x):
    """ Finds digit strings and replaces it 
        with alphabetical character values """
    
    return re.sub('\d', ' digitexist ', x)

# Remove missing values
def missing_value_remover(x):
    """ Replaces missing value with an empty string """
    
    if type(x).__name__ != 'str':
        if math.isnan(x):
            x = ''     
    
    return x

## Load Data

In [2]:
os.chdir('/Users/daniellee/Desktop/Kaggle/jobs_code_demo/data/stackoverflow_data/')

train = pd.read_csv('cleaned/topic_model_df_train.csv'); train.head(3)
test = pd.read_csv('cleaned/topic_model_df_test.csv'); train.head(3)

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,Unnamed: 0.1.1,id,title,content,tags,category,combined,content_size,title_size,tags_size,TitleNTags,ContentNTags,title_pred,content_pred,combined_pred,title_nouns,content_nouns
0,0,0,0,1,criticality ribosome binding site relative sta...,prokaryotic translation critical efficient tra...,"['ribosome', 'binding-sites', 'translation', '...",biology,criticality ribosome binding site relative sta...,24,9,4,2,2,criticality ribosome prokaryotic codon,translation 7b observable prokaryotic,translation prokaryotic ribosome codon,criticality ribosome binding site relative sta...,prokaryotic translation critical efficient tra...
1,1,1,1,2,rnase contamination rna based experiments prev...,anyone suggestions prevent rnase contamination...,"['rna', 'biochemistry']",biology,rnase contamination rna based experiments prev...,21,6,2,1,1,prevented rnase contamination experiments,rnase depc pipette degradation,rnase contamination rna depc,rnase contamination rna based experiment preve...,doe anyone suggestion prevent rnase contaminat...
2,2,2,2,3,lymphocyte sizes clustered two groups,tortora writes principles anatomy physiology l...,"['immunology', 'cell-biology', 'hematology']",biology,lymphocyte sizes clustered two groups tortora ...,31,5,3,0,0,clustered lymphocyte groups sizes,lymphocytes diameter tortora 14,lymphocytes clustered groups sizes,lymphocyte size clustered group,tortora principle anatomy physiology lymphocyt...


## Feature Engineering

In [32]:
train = train[['tags','title_nouns','content_nouns','category']]
train.columns = ['tags','title','content','category']

In [39]:
# Replace missing values

for col in ['title','content']:
    train[col] = train[col].map(missing_value_remover)
    test[col] = test[col].map(missing_value_remover)

In [40]:
# Combine content

train['combined'] = train['title'] + ' ' + train['content']
test['combined'] = test['title'] + ' ' + test['content']

In [43]:
# Replce numeric characters with the word digitexist

train['title'] = train['title'].map(number_replacer)
test['title'] = test['title'].map(number_replacer)

In [28]:
index = np.random.choice(train.shape[0], int(train.shape[0]*0.10), False)
trainX = train.ix[index, 'combined'].map(number_replacer)
trainTestX = train.

array([ 0.91617694,  0.94941367,  0.1693934 , ...,  0.01336172,
        0.84560524,  0.36253842])

In [None]:
# Create TFD matrices from train and test data
trainX = term_document_dataframe(train['title'])

In [21]:
import time, datetime

start = time.clock()
term_document_dataframe(train['combined'][:5000])
end = time.clock()
value = end - start
timestamp = datetime.datetime.fromtimestamp(value)
print(timestamp.strftime('%Y-%m-%d %H:%M:%S'))

1969-12-31 19:00:39


In [None]:
testX  = term_document_dataframe(test['combined'])

In [None]:
# Derive a dummy set from topic column
trainTopics = pd.get_dummies(train.category)

### Tier 1 Prediction: Topic Prediction 

In [None]:
# Train decision tree

from sklearn.tree import DecisionTreeClassifier()
from operator import itemgetter

# Initialize and train model
tree = DecisionTreeClassifier()

# Train Data
topicmodels = {}
for topic in train.category.unique():
    topicmodels[topic] = tree.fit(trainX, trainTopics[train]) 
    
# Predict Topics
topicPredList = [list(sorted([(topic, topicmodels[topic].predict_proba(sentence)) for topic in train.category.unique()] \
            , key=itemgetter(1), reverse=True))[0] for sentence in test]
        
# Predict topic for each test document
test['topic_pred'] = topicPredList 