# Modeling using modular pipeline

In [1]:
#Imports
from IPython.display import display, HTML 
from sklearn import ensemble, linear_model, svm
import numpy as np
import pandas as pd
import re
import sys
# Make the local pipeline sources importable (added once, right after the
# first sys.path entry), then drop the temporary name
path = "../../pipeline_src/"
if path not in sys.path:
    sys.path.insert(1, path)
del path

from experiment import Experiment

Vendor:  Continuum Analytics, Inc.
Package: iopro
Message: trial mode expires in 28 days


## Configure parameters for this notebook

In [2]:
#Every feature whose name starts with one of these prefixes is dropped
#before training (see the prefix-matching cell in the Pipeline section).
#Full list of municipality prefixes, including 'mun_region':
#['mun_pop', 'mun_nat', 'mun_car', 'mun_nmn', 'mun_region', 'mun_geo']
prefixes_to_delete = ['mun_pop', 'mun_nat', 'mun_car', 'mun_nmn', 'mun_geo']
#The following columns are dropped by exact name just before training
#(another candidate, currently not dropped: 'colonia_num_abd')
features_to_delete = ['coloniaid', 'colonia_num_loans','abandoned', 'nom_mun', 'cve',
                      'cv_credito', 'cur_year', 'past', 'year_granted', 'abandon_year', 'abandon_month']
#Number of cores, passed as n_jobs to the estimators that accept it
cores = 25
#class_weight value for the linear and SVM models
cw = 'auto'
#class_weight value for the tree based models (ExtraTrees, RandomForest)
cw_tree = 'subsample'
#Number of estimators (n_estimators) for the tree ensembles
n_est = 100
#Run the Experiment feature scaling steps before fitting?
feature_scaling = False
#Criteria handed to exp.drop_nas when dropping NAs
drop_criteria = 'col'
#Ratio handed to exp.subsample_training; None skips subsampling
subsample_ratio = None
#Save results to a SQL table? (forwarded to exp.eval_model)
save_to_sql = True
#Set a common name for the models run here.
#Report files are named by filename_for_model as
#<prefix>_<ModelClassName>.html
prefix = "lrcv_cw_auto_only_ecuve_region"

## Declare models to train

In [3]:
#Models with class_weight – try auto and subsample
#Every estimator below is instantiated with the values from the
#configuration cell, but only the ones placed in `models` at the end of
#this cell are actually fitted.

#ExtraTreesClassifier
#http://scikit-learn.org/stable/modules/generated/sklearn.ensemble.ExtraTreesClassifier.html#sklearn.ensemble.ExtraTreesClassifier
et = ensemble.ExtraTreesClassifier(n_estimators=n_est, class_weight=cw_tree, n_jobs=cores)

#RandomForestClassifier
#http://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestClassifier.html#sklearn.ensemble.RandomForestClassifier
rf = ensemble.RandomForestClassifier(n_estimators=n_est, class_weight=cw_tree, n_jobs=cores)

#RidgeClassifier
#http://scikit-learn.org/stable/modules/generated/sklearn.linear_model.RidgeClassifier.html#sklearn.linear_model.RidgeClassifier
rc = linear_model.RidgeClassifier(class_weight=cw)

#SGDClassifier
#http://scikit-learn.org/stable/modules/generated/sklearn.linear_model.SGDClassifier.html#sklearn.linear_model.SGDClassifier
sgd = linear_model.SGDClassifier(class_weight=cw, n_jobs=cores)

#LinearSVC
#http://scikit-learn.org/stable/modules/generated/sklearn.svm.LinearSVC.html#sklearn.svm.LinearSVC
lsvc = svm.LinearSVC(class_weight=cw)

#SVC
#http://scikit-learn.org/stable/modules/generated/sklearn.svm.SVC.html#sklearn.svm.SVC
svc = svm.SVC(class_weight=cw)

#LogisticRegressionCV
#http://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegressionCV.html#sklearn.linear_model.LogisticRegressionCV
lrcv = linear_model.LogisticRegressionCV(class_weight=cw, n_jobs=cores)

#LogisticRegression
#http://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html#sklearn.linear_model.LogisticRegression
lr = linear_model.LogisticRegression(class_weight=cw)

#Create the list of models to fit; this run trains LogisticRegressionCV only
#(alternative selection kept for reference)
#models = [rf, lsvc, lrcv]
models = [lrcv]

## Helper functions

In [4]:
#Given a model, build the report file name from the name of its class
def filename_for_model(model, extension, name_prefix=None):
    """Return '<prefix>_<ClassName>.<extension>' for a model instance.

    model: any object; only the name of its class is used.
    extension: file extension without the leading dot (an empty string
        yields a name that ends in '.').
    name_prefix: optional override for the leading part of the name;
        when omitted, the notebook-level `prefix` setting is used.
    """
    if name_prefix is None:
        name_prefix = prefix
    # type(model).__name__ is the bare class name, so there is no need to
    # parse it out of str(type(model)) with a regular expression
    class_name = type(model).__name__
    return name_prefix+"_"+class_name+"."+extension
#Given the name of a file, return the contents of it
def render_html(html_file):
    """Read the file at path `html_file` and return its whole content.

    The file is opened in text mode and closed before returning, even if
    reading fails.
    """
    with open(html_file, 'r') as report:
        return report.read()
#Given a feature name and a list of prefixes, tell whether the name
#starts with at least one of the prefixes
def contains_prefix(s, prefixes_list):
    """Return True if string `s` starts with any entry of `prefixes_list`.

    An empty `prefixes_list` gives False. The check stops at the first
    matching prefix.
    """
    # The loop name is deliberately not `prefix`, which is also the
    # notebook-level setting used to name the report files
    return any(s.startswith(candidate) for candidate in prefixes_list)
def list_features():
    """Return the feature names obtained by iterating over exp.x_train."""
    return [feature for feature in exp.x_train]
def list_feature_groups():
    """Return the set of feature groups found in exp.x_train.

    The group of a feature is the part of its name that comes before the
    first underscore.
    """
    groups = set()
    for feature in list(exp.x_train):
        groups.add(feature.split("_")[0])
    return groups

## Data loading

In [5]:
#Create the experiment and load its data from the HDF file
exp = Experiment()
exp.load_data_hdf()
#Report the ranges used for training and for testing
for label, year_range in (('Training on: ', exp.train_range),
                          ('Testing on: ', exp.test_range)):
    print(label+str(year_range))

Loading training data from file /mnt/data/infonavit/master_loan_features/master_loan_features_v4.h5...
Opening /mnt/data/infonavit/master_loan_features/master_loan_features_v4.h5 in read-only mode
Training Data loaded.
Loading Testing data...
Opening /mnt/data/infonavit/master_loan_features/master_loan_features_v4.h5 in read-only mode
Data loading complete
.x_train.shape = (1369239, 583)
.y_train.shape = (1369239,)
.x_test.shape = (1570992, 583)
.y_test.shape = (1570992,)
Training on: (2008, 2012)
Testing on: (2012, 2013)


In [6]:
#Drop the mun features that came with the loans table
#(a new municipality table is loaded from a csv file further down)
old_mun_features = [name for name in list_features() if contains_prefix(name, ["mun"])]
exp.drop_cols(old_mun_features)

Original shape before dropping columns:
.x_train.shape = (1369239, 583)
.y_train.shape = (1369239,)
.x_test.shape = (1570992, 583)
.y_test.shape = (1570992,)
New shape after dropping:
.x_train.shape = (1369239, 541)
.y_train.shape = (1369239,)
.x_test.shape = (1570992, 541)
.y_test.shape = (1570992,)


In [7]:
#Map some features to bools
#The mapping sends 'f' to 0 and 't' to 1 (presumably the raw encoding of
#these columns - confirm against the source table).
dic = {'f':0, 't': 1}
bool_columns = ['loan_has_subsudy', 'loan_voluntary_contrib_bool']

#Each frame is mapped from its OWN columns. Assigning the training columns
#to the testing frame would overwrite the test values with training data,
#and mapping a column that is already 0/1 turns every value into NaN
#because 0 and 1 are not keys of the mapping.
#NOTE: for that same reason this cell must be run only once per data load.
for frame in (exp.x_train, exp.x_test):
    for column in bool_columns:
        frame[column] = frame[column].map(dic)

In [8]:
#Load master municipality features table and rename its key columns so
#they match the ones used in the loans data
mun_features = pd.read_csv("/mnt/scratch/master_muns_features.csv")
mun_features = mun_features.rename(columns={'mun_cve': 'cve', 'mun_year': 'cur_year'})

def _merge_mun_features(loans):
    """Return `loans` joined with mun_features on ('cve', 'cur_year').

    pd.merge performs an inner join by default: loans without a matching
    municipality/year row are dropped and duplicated keys in mun_features
    multiply rows. Either case would leave the features with a different
    number of rows than the labels kept in the experiment, so the row count
    is checked before the result is used.
    """
    merged = pd.merge(loans, mun_features, on=['cve', 'cur_year'])
    assert len(merged) == len(loans), (
        "Merging municipality features changed the number of rows from "
        "%d to %d" % (len(loans), len(merged)))
    return merged

#Add features to experiment object
exp.x_train = _merge_mun_features(exp.x_train)
exp.x_test = _merge_mun_features(exp.x_test)

In [9]:
#Number of distinct 'cve' values in the municipality table, the training
#set and the testing set (in that order)
for frame in (mun_features, exp.x_train, exp.x_test):
    print(frame['cve'].unique().shape)

(545,)
(503,)
(528,)


## Pipeline

In [10]:
#Show and then drop the columns listed in the configuration cell
print('Deleting:')
print(features_to_delete)
exp.drop_cols(features_to_delete)

Deleting:
['coloniaid', 'colonia_num_loans', 'abandoned', 'nom_mun', 'cve', 'cv_credito', 'cur_year', 'past', 'year_granted', 'abandon_year', 'abandon_month']
Original shape before dropping columns:
.x_train.shape = (1369239, 690)
.y_train.shape = (1369239,)
.x_test.shape = (1570992, 690)
.y_test.shape = (1570992,)
New shape after dropping:
.x_train.shape = (1369239, 679)
.y_train.shape = (1369239,)
.x_test.shape = (1570992, 679)
.y_test.shape = (1570992,)


In [11]:
#Drop every feature whose name starts with one of the prefixes given in
#the configuration cell
matches_prefix = [name for name in list_features()
                  if contains_prefix(name, prefixes_to_delete)]
exp.drop_cols(matches_prefix)

Original shape before dropping columns:
.x_train.shape = (1369239, 679)
.y_train.shape = (1369239,)
.x_test.shape = (1570992, 679)
.y_test.shape = (1570992,)
New shape after dropping:
.x_train.shape = (1369239, 541)
.y_train.shape = (1369239,)
.x_test.shape = (1570992, 541)
.y_test.shape = (1570992,)


In [12]:
#Drop NAs using the criteria set in the configuration cell
#(this run uses 'col'; the output below lists the columns that were dropped)
exp.drop_nas(drop_criteria)

Original shape of dataframes before dropping NA's:
.x_train.shape = (1369239, 541)
.y_train.shape = (1369239,)
.x_test.shape = (1570992, 541)
.y_test.shape = (1570992,)
NA's found in the following columns. Dropping: set(['colonia_avg_water_escore', 'sustainability_escore_relative', 'colonia_avg_transportation_escore', 'schools_escore_relative', 'water_escore', 'parks_escore_relative', 'colonia_avg_hospital_escore', 'sustainability_escore', 'colonia_avg_power_escore', 'water_escore_relative', 'colonia_avg_community_escore', 'colonia_avg_markets_escore', 'hospital_escore_relative', 'digital_escore_relative', 'equip_escore_relative', 'roads_transport_escore_relative', 'colonia_avg_digital_escore', 'personal_risk_index', 'satisfaction_escore_relative', 'loan_subaccount_value_relative', 'colonia_avg_validitiy_index_escore', 'colonia_avg_references_escore', 'power_escore_relative', 'references_escore_relative', 'roads_transport_escore', 'colonia_avg_sustainability_escore', 'transportation_es

In [13]:
#Count NAs per column
#is_na = exp.x_train.isnull()
#nas_count = is_na.apply(sum, axis=0)
#Filter for columns with more than 0 nas
#col_names = nas_count[nas_count > 0]
#print "These columns have nas: "+str(col_names)

In [14]:
#Perform feature scaling if needed
#(enabled by feature_scaling in the configuration cell; the training and
# testing sets are scaled through separate Experiment calls)
if feature_scaling:
    exp.scale_features_training()
    exp.scale_features_testing()

In [None]:
# Subsample the training set into exp.x_train_sub with the specified ratio
# (skipped when subsample_ratio is None or any other falsy value).
# NOTE(review): the fitting cell trains on exp.x_train / exp.y_train, so if
# the subsample really is stored in exp.x_train_sub it is not what gets
# fitted - confirm against Experiment.subsample_training.
if subsample_ratio:
    exp.subsample_training(subsample_ratio)

In [None]:
#Fit every model in `models` on the training data, keeping what each
#fit() call returns
fits = []
for model in models:
    fits.append(model.fit(exp.x_train, exp.y_train))

In [None]:
#Model evaluation
#train_scores = [fit.score(exp.x_train, exp.y_train) for fit in fits]
#test_scores =  [fit.score(exp.x_test, exp.y_test) for fit in fits]
#print train_scores, test_scores
#print train_score, test_score

## Model evaluation and report generation

In [None]:
#Test and evaluate each fitted model, collecting what exp.eval_model
#returns for every HTML report
htmls = []
for trained in fits:
    print("Evaluating model "+filename_for_model(trained, ""))
    exp.test_model(trained)
    report_path = "./"+filename_for_model(trained, "html")
    htmls.append(exp.eval_model(trained, report_path, save_to_sql=save_to_sql))

print(htmls)

## Report rendering

In [None]:
#Render html (IPython notebook)
#Each entry of htmls is read with render_html (presumably the report path
#returned by exp.eval_model - confirm) and shown inline through the
#display/HTML helpers imported at the top of the notebook, instead of
#invoking the HTML cell magic through get_ipython()
for html in htmls:
    display(HTML(render_html(html)))