In [1]:
from sklearn import ensemble, svm

In [2]:
# Add pipeline_src to include path
import sys
path = "../pipeline_src/"
if not path in sys.path:
    sys.path.insert(1, path)
del path

from experiment import Experiment

Vendor:  Continuum Analytics, Inc.
Package: iopro
Message: trial mode expires in 24 days


In [3]:
import numpy as np
import pandas as pd

In [4]:
exp = Experiment()

In [5]:
exp.hdf_file = '/mnt/scratch/master_loan_features_v41.comp.h5'
exp.load_data_hdf(train_range=(2008, 2012), 
                  test_range=(2012, 2013), 
                  past=True, 
                  all_years=True)

Loading training data from file /mnt/scratch/master_loan_features_v41.comp.h5...
Training Data loaded.
Loading Testing data...
Data loading complete
.x_train.shape = (3401389, 598)
.y_train.shape = (3401389,)
.x_test.shape = (1570992, 598)
.y_test.shape = (1570992,)


In [6]:
regimen_train = exp.x_train['regimen']
regimen_test = exp.x_test['regimen']

In [7]:
colonias_train = exp.x_train['coloniaid'] 
colonias_test = exp.x_test['coloniaid']

In [8]:
cve_train = exp.x_train['cve']
cve_test = exp.x_test['cve']

In [9]:
#### Create a list of columns that won't be used in prediction to drop
non_features = ['cv_credito',
                'coloniaid',
                'nom_mun',
                'mun_region',
                'mun_geo_zone',
                'year_granted',
                'cve',
                'abandoned',
                'abandoned_y',
                'abandoned_ever_y',
                'abandon_year',
                'abandon_month',
                'cur_year',
                'past',
                'regimen',
                'loan_has_subsudy',
                'loan_voluntary_contrib_bool']

non_quantize_features = ['years_since_granted', 
                         'personal_married', 
                         'personal_gender',
                         'loan_joint'] 

encode_features = ['personal_gender',
                   'personal_married',
                   'loan_building_type']

In [10]:
# Drop non-feature columns by name
exp.drop_cols(non_features)

# Drop all columns that have any NA's in them
# Can also pass 'row' to drop all rows with NA's in them
exp.drop_nas('row')

# Drop non-feature columns by name again incase they came back.....
exp.drop_cols(non_features)

Original shape before dropping columns:
.x_train.shape = (3401389, 598)
.y_train.shape = (3401389,)
.x_test.shape = (1570992, 598)
.y_test.shape = (1570992,)
Column "abandoned_y" not found in x_train... passing...
Column "abandoned_y" not found in x_test... passing...
New shape after dropping:
.x_train.shape = (3401389, 582)
.y_train.shape = (3401389,)
.x_test.shape = (1570992, 582)
.y_test.shape = (1570992,)
Original shape of dataframes before dropping NA's:
.x_train.shape = (3401389, 582)
.y_train.shape = (3401389,)
.x_test.shape = (1570992, 582)
.y_test.shape = (1570992,)
New shape after dropping:
.x_train.shape = (1284875, 583)
.y_train.shape = (1284875,)
.x_test.shape = (930424, 583)
.y_test.shape = (930424,)
Original shape before dropping columns:
.x_train.shape = (1284875, 583)
.y_train.shape = (1284875,)
.x_test.shape = (930424, 583)
.y_test.shape = (930424,)
Column "cv_credito" not found in x_train... passing...
Column "cv_credito" not found in x_test... passing...
Column "col

In [None]:
# Convert all features to quantiles
features_to_quantize = [feature for feature in list(exp.x_train) if not feature in non_quantize_features]

# Get names of the new features
quantile_features = [feature + "_quantile" for feature in features_to_quantize]

# Make quantiles (5 buckets)
exp.make_quantile_columns(features_to_quantize, 7)

In [12]:
exp.print_shape()

.x_train.shape = (1284875, 1150)
.y_train.shape = (1284875,)
.x_test.shape = (930424, 1150)
.y_test.shape = (930424,)


In [None]:
# # note to self: Try svm w/ these interactions...
# multiplicitive_features = ['personal_risk_index',
#                            'personal_daily_wage',
#                            'personal_age']

# print "Quantile features to be interacted with:"
# print quantile_features

# print "Multiplicative features:"
# print multiplicitive_features

# for q_feature in quantile_features:
#   for m_feature in multiplicitive_features:
#     if m_feature in q_feature:
#       continue

#     new_feature = q_feature + "_x_" + m_feature +  "_interaction"

#     try:
#         exp.x_train[new_feature] = exp.x_train[q_feature] * exp.x_train[m_feature]
#         exp.x_test[new_feature] = exp.x_test[q_feature] * exp.x_test[m_feature]
#         print "Created feature: {}".format(new_feature)
#     except KeyError:
#         print "Quantile feature doesnt exist. Probably didnt get made. Skipping interaction"

In [14]:
rfc = ensemble.RandomForestClassifier(n_estimators=2000, 
                                      max_depth=15, 
                                      n_jobs=30,
                                      class_weight='subsample',
                                      max_features='auto')

In [None]:
rfc = rfc.fit(exp.x_train, exp.y_train)

In [19]:
exp.test_model(rfc)

Testing model on x_test...
Predicted probabilities for each test case saved in exp.y_test_pred_prob.


In [31]:
pred_real = pd.merge(pd.DataFrame(exp.y_test_pred_prob), pd.DataFrame(exp.y_test), left_index=True, right_index=True)

In [33]:
pred_real.columns = ['abandoned_pred', 'abandoned_y']

In [None]:
from sklearn import tree
from sklearn.externals.six import StringIO
from sklearn.tree import DecisionTreeRegressor

# Compute difference between error and prediction
y_diff = exp.y_test - exp.y_test_pred_prob

# Train a range of decision trees on the errors
for depth in [1, 2, 4, 6, 8, 10, 15, 20]:
    print "Training Decision tree of depth {} on errors...".format(depth)
    dc_err = DecisionTreeRegressor(max_depth=depth)

    dc_err = dc_err.fit(exp.x_test, y_diff)

    with open("dc_err_depth_{}.dot".format(depth), 'w') as f:
        f = tree.export_graphviz(dc_err, out_file=f)
        
# FINALLY: type "dot -Tpng dc_err_depth_XXX.dot -o tree_XXX.png"
#          in terminal to convert dot to png

In [37]:
pred_real_colonia = pd.merge(pred_real, pd.DataFrame(colonias_test), left_index=True, right_index=True)
# pred_real_regimen = pd.merge(pred_real, regimen_test, left_index=True, right_index=True)
# pred_real_cve = pd.merge(pred_real, cve_test, left_index=True, right_index=True)

In [121]:
pred_real_by_colonia = pred_real_colonia.groupby('coloniaid')

In [105]:
a = pred_real_by_colonia['abandoned_y'].sum() / pred_real_by_colonia.size()

In [106]:
b = pred_real_by_colonia['abandoned_pred'].sum() / pred_real_by_colonia.size()

In [120]:
pred_real['abandoned_y'].sum() / pred_real.count()

abandoned_pred    0.01429
abandoned_y       0.01429
dtype: float64

In [107]:
a.sort(ascending=False)
b.sort(ascending=False)

In [118]:
(pred_real_by_colonia.size() == 1).sum()

362

In [22]:
rfc.score(exp.x_train, exp.y_train)

1.0

In [23]:
rfc.score(exp.x_test, exp.y_test)

0.53651845518027652

In [None]:
exp.eval_model(rfc, "./model_results/RFC_med_main.html")

Writing evaluation html document for RandomForestClassifier
Evaluation document written to ./model_results/RFC_big_main_20150818_222014.html