In [1]:
import pandas as pd
import numpy as np
import random 

from scipy.stats import zscore
from numpy import linalg

from sklearn.metrics import accuracy_score, precision_score, recall_score
from sklearn import tree
from sklearn.svm import SVC

In [4]:
# Read the training CSV into a DataFrame
data = pd.read_csv(filepath_or_buffer='../data/train.csv')

In [5]:
# Attach a random number to every row, drawn one label group at a time
## Start from a float column of zeros so the per-label assignment has a target
data['random'] = 0.0
## For each distinct label, overwrite that group's rows with fresh draws
## from np.random.rand (one value per row in the group)
## TODO: this could be better stratified, but the groups are almost equal
for label in data['label'].unique():
    in_group = data['label'] == label
    data.loc[in_group, 'random'] = np.random.rand(int(in_group.sum()))

In [6]:
# Standardize the pixel features
## Boolean mask over the columns: True where the column name contains 'pixel'
pixel_cols = data.columns.str.contains('pixel')
## Set the two non-pixel columns aside while the frame is rebuilt
label_columns = data[['random', 'label']]
## Rebind `data` to the z-scored pixel columns (zscore applied per column),
## then put the saved columns back at the end: label first, random second
data = data.loc[:, pixel_cols].apply(zscore, axis=0)
data['label'] = label_columns['label']
data['random'] = label_columns['random']
## The temporary copy is no longer needed
del label_columns

In [7]:
# Drop every column that contains at least one missing value
data = data.dropna(axis='columns')

In [8]:
# Three-way split driven by one random draw per row:
#   draw < 0.2 -> test, draw > 0.8 -> validation, anything in between -> train
randarray = np.random.rand(len(data))
test_msk = randarray < .2
validate_msk = randarray > .8
# Training rows are exactly those claimed by neither of the other two masks
train_msk = ~(test_msk | validate_msk)
# Slice the frame with each mask
train_data = data[train_msk]
test_data = data[test_msk]
validate_data = data[validate_msk]
# Report the size of each partition and their sum
print(len(validate_data), ' in the validation set')
print(len(test_data), ' in the testing set')
print(len(train_data), ' in the training set')
print(len(validate_data) + len(test_data) + len(train_data), ' total')

8472  in the validation set
8474  in the testing set
25054  in the training set
42000  total


In [9]:
# Random feature search: 500 rounds, each fitting a decision tree on 10
# randomly chosen column positions and scoring it on the test split
lastset = 0  # not read anywhere in this cell
results = []
for idx in range(500):
    # Candidate positions exclude the final two columns of `data`
    cols = random.sample(range(data.shape[1] - 2), 10)
    clf = tree.DecisionTreeClassifier().fit(train_data.iloc[:, cols], train_data.label)
    predictions = clf.predict(test_data.iloc[:, cols])
    # One record per round: [accuracy, col_0, ..., col_9]
    results.append([accuracy_score(test_data.label, predictions), *cols])

In [10]:
# Use the results to extract important columns
## Each row of clf_results is one trial: [accuracy, col_0, ..., col_9]
clf_results = pd.DataFrame(results)
mean_result = clf_results.iloc[:, 0].mean()
## Keep only the trials that scored above the mean accuracy. The selection is
## done on the frame itself rather than with iterrows(): iterrows() hands back
## each mixed float/int row as a single float Series, which turned the column
## positions into floats (e.g. 511.0) before they were later used with .iloc
above_mean = clf_results.iloc[:, 0] > mean_result
## Flatten the selected trials' column positions row by row, as plain ints
important_cols = clf_results.loc[above_mean].iloc[:, 1:].values.astype(int).ravel().tolist()
## Count how often each column position shows up in an above-average trial;
## value_counts() orders the result from most to least frequent
important_cols_result = pd.Series(important_cols, dtype=int).value_counts()

In [11]:
# Use dtree classifier to fit a new model
## Take the 61 most frequently selected column positions. They are cast to int
## because .iloc is positional and the value_counts() index may hold floats
cols = important_cols_result.index[0:61].astype(int)
clf = tree.DecisionTreeClassifier()
clf.fit(train_data.iloc[:, cols], train_data.label)
## Score on the validation split, which played no part in picking the columns
predictions = clf.predict(validate_data.iloc[:, cols])
print(accuracy_score(validate_data.label, predictions))
## Left disabled: if `label` has more than two classes, precision_score and
## recall_score need an explicit `average=` argument (the default is 'binary')
#print(precision_score(validate_data.label, predictions))
#print(recall_score(validate_data.label, predictions))

0.803234183192


In [12]:
# Use SVM classifier to fit a new model, it works much better
## Same 61 most frequently selected column positions as for the decision tree,
## cast to int because .iloc is positional and the index may hold floats
cols = important_cols_result.index[0:61].astype(int)
clf = SVC()
clf.fit(train_data.iloc[:, cols], train_data.label)
## Score on the validation split; the accuracy is computed once and printed
## (a second, discarded accuracy_score call was removed)
predictions = clf.predict(validate_data.iloc[:, cols])
print(accuracy_score(validate_data.label, predictions))
## Left disabled: if `label` has more than two classes, precision_score and
## recall_score need an explicit `average=` argument (the default is 'binary')
#print(precision_score(validate_data.label, predictions))
#print(recall_score(validate_data.label, predictions))

0.941690273843


In [13]:
# Display the names of the selected columns
## A zero-row slice is enough here: only the column labels are shown
train_data.iloc[:0, cols].columns

Index(['pixel511', 'pixel569', 'pixel515', 'pixel128', 'pixel636', 'pixel429',
       'pixel147', 'pixel461', 'pixel638', 'pixel179', 'pixel295', 'pixel463',
       'pixel236', 'pixel464', 'pixel216', 'pixel378', 'pixel457', 'pixel455',
       'pixel350', 'pixel325', 'pixel459', 'pixel751', 'pixel291', 'pixel328',
       'pixel541', 'pixel436', 'pixel545', 'pixel466', 'pixel465', 'pixel108',
       'pixel408', 'pixel454', 'pixel346', 'pixel400', 'pixel187', 'pixel186',
       'pixel372', 'pixel655', 'pixel442', 'pixel381', 'pixel68', 'pixel441',
       'pixel435', 'pixel431', 'pixel430', 'pixel403', 'pixel685', 'pixel656',
       'pixel157', 'pixel721', 'pixel514', 'pixel575', 'pixel241', 'pixel557',
       'pixel555', 'pixel259', 'pixel129', 'pixel267', 'pixel274', 'pixel130',
       'pixel289'],
      dtype='object')