In [76]:
import pandas as pd
import numpy as np
import csv

from sklearn.decomposition import PCA
from sklearn.lda import LDA
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier

from sklearn.metrics import accuracy_score, precision_score, recall_score

In [77]:
def separate_labels(data):
    """Partition a frame into its label column(s) and its feature columns.

    Columns are picked by case-sensitive pattern match on their names:
    names containing ``digit`` or ``label`` are labels; names containing
    ``pixel``, ``Solidity``, ``AspectRatio``, ``Perimeter``, ``Area`` or
    ``Angle`` are features. A column matching neither pattern appears in
    neither result.

    Returns:
        A ``(label_column, data)`` tuple; both are column subsets of the
        input and keep its rows.
    """
    column_names = data.columns
    is_label = column_names.str.contains('digit|label')
    is_feature = column_names.str.contains(
        'pixel|Solidity|AspectRatio|Perimeter|Area|Angle'
    )
    return data.loc[:, is_label], data.loc[:, is_feature]

def process_pixel(row):
    """Map a single numeric value onto one of three intensity buckets.

    Despite the parameter name, ``row`` is one scalar value.

    Returns:
        0 when the value is below 85, 1 when it lies in [85, 170],
        and 2 otherwise (which includes values that fail both
        comparisons, such as NaN).
    """
    if row < 85:
        return 0
    # Anything reaching this point is not below 85; only the upper bound
    # still has to be checked.
    return 1 if row <= 170 else 2
    
def process_pixel_col(col):
    """Bucket every value in ``col`` with ``process_pixel``.

    ``col`` must expose ``.apply``; ``prep_data`` passes one row at a time.

    Returns:
        Whatever ``col.apply`` yields: the element-wise bucketed values.
    """
    bucketed = col.apply(process_pixel)
    return bucketed

def prep_data(data):
    """Bucket the feature columns of ``data`` and re-attach its label column.

    Steps:
      1. Split label and feature columns with ``separate_labels``.
      2. Replace every feature value with its bucket via
         ``process_pixel_col`` (applied row by row).
      3. Re-attach the label as a ``label`` column when exactly one label
         column was found.
      4. Drop every column that contains a missing value.

    Args:
        data: DataFrame of feature columns, optionally with a label column.

    Returns:
        A new DataFrame of bucketed features, plus ``label`` when available.
    """
    import warnings

    try:
        labels, features = separate_labels(data)
    except Exception as err:
        # Best-effort fallback: if the columns cannot be split (for example
        # the column names are not strings), bucket every column and skip
        # the label. ``Exception`` rather than a bare ``except`` so that
        # KeyboardInterrupt/SystemExit still propagate, and a warning so
        # the fallback is visible instead of silent.
        warnings.warn(
            "prep_data: could not separate labels (%r); "
            "treating every column as a feature" % (err,)
        )
        labels, features = None, data

    features = features.apply(process_pixel_col, axis=1)

    # Reinsert the label only when it is unambiguous. Zero or several
    # matching label columns cannot be stored in a single 'label' column.
    if labels is not None and labels.shape[1] == 1:
        # Positional assignment: the rows are in the same order as the
        # input, and this avoids index alignment on duplicated indices.
        features['label'] = labels.iloc[:, 0].values

    # Drop the columns that contain missing values
    features = features.dropna(axis=1)
    return features

def split_data(data, seed=None):
    """Randomly split ``data`` into a training set and a validation set.

    Every row draws a uniform number in [0, 1). Rows whose draw lies in
    [0.2, 0.8] go to the training set (about 60%), rows whose draw is above
    0.8 go to the validation set (about 20%). Rows whose draw is below 0.2
    (about 20%) are in neither returned set.

    Args:
        data: DataFrame (or any sized object that supports boolean-mask
            indexing) to split.
        seed: Optional seed for a private ``RandomState`` so the split is
            reproducible. With the default ``None`` the draws come from
            NumPy's global random state, exactly as before.

    Returns:
        A ``(training_data, validation_data)`` tuple.
    """
    n_rows = len(data)
    if seed is None:
        draws = np.random.rand(n_rows)
    else:
        draws = np.random.RandomState(seed).rand(n_rows)
    # Create the training and validation masks
    train_msk = (draws >= .2) & (draws <= .8)
    validate_msk = draws > .8
    # Apply the masks and return data
    return data[train_msk], data[validate_msk]

In [78]:
# Load the training file and the test file.
data = pd.read_csv('../data/train.csv')
testing_data = pd.read_csv('../data/test.csv')
# Tag the test rows with the sentinel label 11 so they can travel through
# the same preprocessing and be separated again afterwards.
testing_data['label'] = 11
# DataFrame.append was removed in pandas 2.0; pd.concat stacks the frames
# the same way and likewise keeps each frame's original row index.
data = pd.concat([data, testing_data])

In [79]:
# Bucket the feature values and re-attach the label column (see prep_data).
# Not idempotent: `data` is overwritten, and running this cell again would
# bucket the already-bucketed values a second time.
data = prep_data(data)

In [80]:
# Remove empty columns
# A column is kept only when its maximum value is above zero.
column_max = data.max()
data = data.loc[:, column_max[column_max > 0].index.tolist()]

In [81]:
# Split into training and testing data
# Rows labelled below 11 stay in `data`; rows carrying the label 11 become
# the test frame.
label_values = data['label']
data, testing_data = data[label_values < 11], data[label_values == 11]

In [82]:
# Split into training and validation data
# split_data assigns rows at random, so the split changes between runs
# unless NumPy's global random state is seeded beforehand.
training_data, validation_data = split_data(data)

In [83]:
# Split labels
# separate_labels returns (labels, features); each frame is overwritten with
# its feature-only version.
training_labels, training_data = separate_labels(training_data)
validation_labels, validation_data = separate_labels(validation_data)
# The label part of the test frame is not needed and is discarded.
_, testing_data = separate_labels(testing_data)

In [128]:
# PCA transformation building
# Ask for the 110 components at construction time. n_components only takes
# effect when the estimator is fitted, and the fit itself happens in the
# fit_transform call of the next step, so no (expensive, full-rank) fit is
# done here.
pca = PCA(n_components=110)  # use only 110 of the components
# Second name for the same estimator; the following cells use training_pca.
training_pca = pca

In [129]:
# PCA transformation fitting
# fit_transform fits the PCA on the training data and projects it in one call.
training_transformed = training_pca.fit_transform(training_data)
# Validation and test data are only projected, using the estimator fitted on
# the training data.
validation_transformed = training_pca.transform(validation_data)
testing_transformed = training_pca.transform(testing_data)

In [None]:
# Training Model
# Support vector classifier with an RBF kernel and fixed hyperparameters.
clf = SVC(kernel="rbf", C=10, gamma=0.01)
# training_labels is a DataFrame; .values.ravel() flattens it to a 1-D array
# of targets.
clf.fit(training_transformed, training_labels.values.ravel())

In [None]:
# Validating Model
# Predict on the PCA-projected validation data. (The variable name is
# misspelled but kept as-is, since it is part of the notebook's state.)
predicitons = clf.predict(validation_transformed)
# The accuracy is the cell's last expression, so it is displayed as output.
# NOTE(review): the labels are passed as validation_labels.values, not
# flattened with .ravel() as in training; confirm accuracy_score accepts
# that shape in the installed scikit-learn.
accuracy_score(validation_labels.values, predicitons)

In [75]:
# Predict Test set
# Build the export table: the predicted label first, then a 1-based ImageId
# derived from the row position.
final_export = pd.DataFrame(
    {'label': clf.predict(testing_transformed)}
).assign(ImageId=lambda frame: frame.index + 1)
# Write the table without the index column.
final_export.to_csv('test14.csv', index=False)