In [12]:
import pandas as pd
import numpy as np
training_val_df = pd.read_csv('./data/Training_set_values.csv', index_col='id')
training_label_df = pd.read_csv('./data/Training_set_labels.csv', index_col='id')
training_df = training_val_df.join(training_label_df, how='inner')

In [13]:
# data cleaning

# remove columns with identical column
# region = region_cd
rm_cols = ['region']
for col in rm_cols:
    training_df.drop(col, axis=1, inplace=True)

# remove columns with no uniqueness
training_cols = list(training_df.columns.values)
for col in training_cols:
    unique_elements = len(training_df[col].unique())
    if unique_elements == 1:
        print('Removed column with constant value: ', col)
        training_df.drop(col, axis=1, inplace=True)

# deal with null values
null_values = [0, 'none' 'None', 'na', 'NA', 'Na', 'Unknown', 'Not Known', '', ' ', 'unknown']
for null_val in null_values:
    training_df.replace(null_val, np.nan)
        
# drop columns that are completely NA
training_df = training_df.dropna(axis=1, how='all')

Removed column with constant value:  recorded_by


In [14]:
# clean up free form columns
ff_cols = ['funder', 'installer', 'wpt_name', 'scheme_name', 'longitude', 'latitude', 'date_recorded', 'num_private']
free_form_df = training_df[ff_cols]
for col in ff_cols:
    training_df.drop(col, axis=1, inplace=True)

In [15]:
# convert remaining non-numeric cols to categorical
# pd.to_datetime(training_df['date_recorded'])
training_cols = list(training_df.columns.values)
for col in training_cols:
    if training_df[col].dtype == np.object:
        training_df[col] = training_df[col].astype('category')

In [16]:
training_cols = list(training_df.columns.values)
print(training_cols)
training_df['construction_year'] = training_df['construction_year'].astype('category')
training_df['district_code'] = training_df['district_code'].astype('category')
training_df['region_code'] = training_df['region_code'].astype('category')
dummy_col = training_cols.remove('status_group')
print(training_df.dtypes)

['amount_tsh', 'gps_height', 'basin', 'subvillage', 'region_code', 'district_code', 'lga', 'ward', 'population', 'public_meeting', 'scheme_management', 'permit', 'construction_year', 'extraction_type', 'extraction_type_group', 'extraction_type_class', 'management', 'management_group', 'payment', 'payment_type', 'water_quality', 'quality_group', 'quantity', 'quantity_group', 'source', 'source_type', 'source_class', 'waterpoint_type', 'waterpoint_type_group', 'status_group']
amount_tsh                float64
gps_height                  int64
basin                    category
subvillage               category
region_code              category
district_code            category
lga                      category
ward                     category
population                  int64
public_meeting           category
scheme_management        category
permit                   category
construction_year        category
extraction_type          category
extraction_type_group    category
extraction_t

In [11]:
# convert to numerical vector of 1,0 for all categorical values
vect_training_df = pd.get_dummies(training_df, columns=dummy_col, sparse=True)
vect_training_df.to_pickle('./data/vect_training.pickle')

In [1]:
import pandas as pd
import numpy as np
vect_df = pd.read_pickle('./data/vect_training.pickle')
training_label_df = pd.read_csv('./data/Training_set_labels.csv', index_col='id')

In [2]:
repl_val = ['functional', 'functional needs repair', 'non functional']
y_df = training_label_df[['status_group']]
y_df = y_df.replace(repl_val, [0.0, 1.0, 2.0])
y = y_df.as_matrix().astype(np.float)

In [3]:
X = vect_df.as_matrix().astype(np.float)

# This is important
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X = scaler.fit_transform(X)

In [4]:
print("Feature space holds %d observations and %d features" % X.shape)
print("Unique target labels:", np.unique(y))

In [5]:
from sklearn.cross_validation import KFold


def run_cv(X, y, clf_class, **kwargs):
    # Construct a kfolds object
    kf = KFold(len(y),n_folds=5,shuffle=True)
    y_pred = y.copy()

    # Iterate through folds
    for train_index, test_index in kf:
        X_train, X_test = X[train_index], X[test_index]
        y_train = y[train_index]
        # Initialize a classifier with key word arguments
        clf = clf_class(**kwargs)
        clf.fit(X_train,y_train)
        y_pred[test_index] = clf.predict(X_test)
    return y_pred

In [None]:
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier as RF
from sklearn.neighbors import KNeighborsClassifier as KNN

def accuracy(y_true,y_pred):
    # NumPy interprets True and False as 1. and 0.
    return np.mean(y_true == y_pred)

In [None]:
print("Support vector machines:")
print("%.3f" % accuracy(y, run_cv(X,y,SVC)))

In [None]:
print("Random forest:")
print("%.3f" % accuracy(y, run_cv(X,y,RF)))