# Andrew's model work for final deliverables

In [26]:
import itertools as it
import matplotlib
import matplotlib.cm as cmx
import matplotlib.colors as colors
import matplotlib.pyplot as plt
import numpy as np
import os.path as op
import pandas as pd
import scipy as sp
import sklearn.preprocessing as Preprocessing
import datetime

from itertools import combinations
from sklearn.cross_validation import KFold as kfold
from sklearn.decomposition import TruncatedSVD as tSVD
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis as QDA
from sklearn.ensemble import AdaBoostClassifier as Boost
from sklearn.ensemble import RandomForestClassifier as RFC
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LinearRegression as Lin_Reg
from sklearn.linear_model import LogisticRegression as Log_Reg
from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score
from sklearn.neighbors import KNeighborsClassifier as KNN
from sklearn.svm import SVC
from scipy.io import mmread
from sklearn.dummy import DummyClassifier
from sklearn import ensemble

%matplotlib inline
plt.style.use('ggplot') 
from IPython.display import display, HTML

In [2]:
### specify processed data files to generate - full/partial, partial %, and train/test
### Note: this cell is the same in both notebooks

# load and clean full dataset?
#load_full = False
load_full = True  # AMG

# if not loading and cleaning full dataset, what sample percentage?
sample_percent = 10

if load_full:
    pct_str = ""
else: # not load_full
    pct_str = str(sample_percent) + "_pct"
    
# use training or testing data to generate minor files?
minor_use_train = True
if minor_use_train:
    mode_str = "train"
else: # not minor_use_train
    mode_str = "test"
    
### set intermediate file names
dir_str = "./intermediate_files/"

processed_data_train_file = dir_str + "processed_data_" + "train" + pct_str + ".json"
processed_data_test_file = dir_str + "processed_data_" + "test" + pct_str + ".json"

nlp_data_file = dir_str + "nlp_data_" + mode_str + pct_str + ".json"
nlp_data_train_file = dir_str + "nlp_data_" + mode_str + pct_str + ".json"
term_freqs_file = dir_str + "term_freqs_" + mode_str + pct_str + ".mtx"
diff_terms_file = dir_str + "diff_terms_" + mode_str + pct_str + ".json"

In [3]:
processed_data_train_file

'./intermediate_files/processed_data_train.json'

In [4]:
### load processed data
data = pd.read_json(processed_data_train_file)
data_nlp = pd.read_json(nlp_data_file)
desc_matrix_coo = mmread(term_freqs_file)
desc_matrix = sp.sparse.csr_matrix(desc_matrix_coo)
count_cols_df = pd.read_json(diff_terms_file)

count_cols_bool = count_cols_df.values > 0.0

In [5]:
print len(data)

173805


In [6]:
data_filtered = data[data.loan_term == 36]
data_filtered = data_filtered[pd.to_datetime(data_filtered.issue_date).dt.year.isin([2011,2012,2013])]
print len(data_filtered)

79052


In [7]:
# Get a more manageable sample
np.random.seed(1729)
indexes = np.array(range(len(data_filtered)))
np.random.shuffle(indexes)
print "Indexes computed"                   
data_sampled = data_filtered.iloc[indexes[0:len(indexes)/4], :]
print len(data_sampled)

Indexes computed
19763


In [8]:
data_remaining = data_filtered.iloc[indexes[len(indexes)/4:], :]
print len(data_remaining)

59289


In [9]:
x = data_sampled.drop('loan_status', 1)
y = data_sampled['loan_status']

x_nlp = data_nlp.drop('loan_status', 1)
y_nlp = data_nlp['loan_status']

In [10]:
x_test = data_remaining.drop('loan_status', 1)
y_test = data_remaining['loan_status']

In [11]:
y_test.value_counts()

False    50281
True      9008
Name: loan_status, dtype: int64

In [12]:
x.loan_term.value_counts()

36    19763
Name: loan_term, dtype: int64

# Explore and clean up and standardize data

In [13]:
pd.to_datetime(x.issue_date).dt.year.value_counts()

2013    9682
2012    7591
2011    2490
Name: issue_date, dtype: int64

In [14]:
# del x['verif_status']
x.describe().T



Unnamed: 0,count,mean,std,min,25%,50%,75%,max
annual_income,19763.0,69813.15,80329.39,7200.0,42000.0,60000.0,84000.0,7141778.0
cpi,19763.0,0.0213014,0.0004214194,0.02069612,0.02092458,0.02132454,0.02166718,0.02224093
delinq_2_yrs,19763.0,0.1867631,0.4847464,0.0,0.0,0.0,0.0,2.0
desc_len,9697.0,249.7785,212.1872,1.0,,,,4544.0
dti,19763.0,0.002418883,0.00113384,0.0,0.001555584,0.002375341,0.00324189,0.005386393
gdp,19763.0,0.9997263,3.296536e-05,0.9996063,0.9997035,0.9997311,0.9997524,0.9997853
id,19763.0,3602569.0,2742526.0,364693.0,1338386.0,2375759.0,5844894.0,10224660.0
inquiry_6_mos,19763.0,0.7960836,0.9562914,0.0,0.0,1.0,1.0,3.0
installment,19763.0,398.9107,246.265,30.42,219.14,343.39,514.64,1408.13
interest_rate,19763.0,12.82315,3.9964,5.42,9.91,12.99,15.61,25.89


In [15]:
# earliest_credit is not really a good indicator -- we want to know how long has elapsed since then
# See http://stackoverflow.com/questions/17414130/pandas-datetime-calculate-number-of-weeks-between-dates-in-two-columns
x['months_since_earliest_credit'] = (
    (pd.to_datetime(x.issue_date) - pd.to_datetime(x.earliest_credit))/np.timedelta64(1,'M')
).round()
x_test['months_since_earliest_credit'] = (
    (pd.to_datetime(x_test.issue_date) - pd.to_datetime(x_test.earliest_credit))/np.timedelta64(1,'M')
).round()

In [16]:
x.columns

Index([               u'address_state',                u'annual_income',
                                u'cpi',                 u'delinq_2_yrs',
                           u'desc_len',                  u'description',
                                u'dti',              u'earliest_credit',
                      u'employ_length',                 u'employ_title',
                                u'gdp',                   u'home_owner',
                                 u'id',          u'initial_list_status',
                      u'inquiry_6_mos',                  u'installment',
                      u'interest_rate',                          u'ipr',
                         u'issue_date',                  u'loan_amount',
                       u'loan_purpose',                u'loan_subgrade',
                          u'loan_term',     u'months_since_last_record',
                      u'open_accounts',                   u'recoveries',
                         u'revol_util',            

In [17]:
def expand_x(x, x_orig):
    x_expanded = pd.DataFrame()
    for colname in x_orig.columns:
        if colname in ('description', 'verif_status', 'loan_subgrade', 'id', 'interest_rate', 
                       'loan_term',
                       'index', 'recoveries', 'issue_date', 'earliest_credit'):
            continue
        print colname, x_orig[colname].dtype
        if x_orig[colname].dtype == 'object':
            values = x[colname].fillna('MISSING')
            value_columns = x_orig[colname].fillna('MISSING').value_counts().index
            if len(value_columns) > 50:
                value_columns = value_columns[:50]
            for val in value_columns:
                x_expanded[colname + '__' + val.replace(' ', '_')] = (values == val).astype(int)
        else:
            values = x[colname].fillna(x[colname].median())
            sd = np.nanstd(x_orig[colname])
            if sd < 1e-10:
                sd = 1
            x_expanded[colname] = (values - np.nanmean(x_orig[colname]))/sd
    return x_expanded

In [18]:
x_expanded = expand_x(x, x)

address_state object
annual_income float64
cpi float64
delinq_2_yrs int64
desc_len float64
dti float64
employ_length object
employ_title object
gdp float64
home_owner object
initial_list_status object
inquiry_6_mos int64
installment float64
ipr float64
loan_amount int64
loan_purpose object
months_since_last_record float64
open_accounts int64
revol_util float64
rir float64
total_accounts int64
unemploy float64
months_since_earliest_credit float64


In [19]:
x_expanded.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
address_state__CA,19763.0,1.766432e-01,0.381376,0.000000,0.000000,0.000000,0.000000,1.000000
address_state__NY,19763.0,8.844811e-02,0.283953,0.000000,0.000000,0.000000,0.000000,1.000000
address_state__TX,19763.0,7.665840e-02,0.266055,0.000000,0.000000,0.000000,0.000000,1.000000
address_state__FL,19763.0,7.311643e-02,0.260334,0.000000,0.000000,0.000000,0.000000,1.000000
address_state__IL,19763.0,3.951829e-02,0.194829,0.000000,0.000000,0.000000,0.000000,1.000000
address_state__NJ,19763.0,3.678591e-02,0.188241,0.000000,0.000000,0.000000,0.000000,1.000000
address_state__PA,19763.0,3.111876e-02,0.173643,0.000000,0.000000,0.000000,0.000000,1.000000
address_state__OH,19763.0,3.096696e-02,0.173233,0.000000,0.000000,0.000000,0.000000,1.000000
address_state__GA,19763.0,3.056216e-02,0.172133,0.000000,0.000000,0.000000,0.000000,1.000000
address_state__VA,19763.0,3.015736e-02,0.171024,0.000000,0.000000,0.000000,0.000000,1.000000


In [20]:
x_expanded.columns

Index([             u'address_state__CA',              u'address_state__NY',
                    u'address_state__TX',              u'address_state__FL',
                    u'address_state__IL',              u'address_state__NJ',
                    u'address_state__PA',              u'address_state__OH',
                    u'address_state__GA',              u'address_state__VA',
       ...
               u'loan_purpose__vacation',            u'loan_purpose__house',
       u'loan_purpose__renewable_energy',       u'months_since_last_record',
                        u'open_accounts',                     u'revol_util',
                                  u'rir',                 u'total_accounts',
                             u'unemploy',   u'months_since_earliest_credit'],
      dtype='object', length=145)

In [21]:
x_test_expanded = expand_x(x_test, x)

address_state object
annual_income float64
cpi float64
delinq_2_yrs int64
desc_len float64
dti float64
employ_length object
employ_title object
gdp float64
home_owner object
initial_list_status object
inquiry_6_mos int64
installment float64
ipr float64
loan_amount int64
loan_purpose object
months_since_last_record float64
open_accounts int64
revol_util float64
rir float64
total_accounts int64
unemploy float64
months_since_earliest_credit float64


In [22]:
# be prepared to split stuff up by year of issue
years = pd.to_datetime(x.issue_date).dt.year

In [23]:
years_test = pd.to_datetime(x_test.issue_date).dt.year

# Start evaluating models

In [43]:
def eval_model_all_years(model_factory, columns=None, poly_degree=None, prob_threshold=0.5):
    k = 5
    np.random.seed(1729)
    
    if columns is None:
        x_local = x_expanded
        x_local_test = x_test_expanded
    else:
        x_local = x_expanded[columns]
        x_local_test = x_test_expanded[columns]
        
    if poly_degree is not None:
        poly_xform = Preprocessing.PolynomialFeatures(degree=poly_degree, include_bias=False)
        x_local = pd.DataFrame(poly_xform.fit_transform(x_local))
        x_local_test = pd.DataFrame(poly_xform.fit_transform(x_local_test))
        
    if True: # for yr in [2011, 2012, 2013]: # set(years.values):
        indexes = range(len(years))
        np.random.shuffle(indexes)

        cm_accum = np.zeros((2, 2))
        f1_accum = 0
        score = 0
        weighted_score = 0

        # k-fold cross-validation
        for i in range(k):
            train_indexes = list(indexes[0:len(indexes)*i/k]) + list(indexes[len(indexes)*(i+1)/k:])
            test_indexes = indexes[len(indexes)*i/k:len(indexes)*(i+1)/k]
        
            #print "TRAIN ", train_indexes
            #print 'TEST', test_indexes
            #print "Y", y.iloc[test_indexes]
            
            # model = model_factory().fit(x_expanded[years==yr], y[years==yr])
            # score = model.score(x_expanded[years==yr], y[years==yr]) / k
            model = model_factory().fit(x_local.iloc[train_indexes,:], y.iloc[train_indexes])
            yhat = model.predict(x_local)
            score += model.score(x_local.iloc[test_indexes], y.iloc[test_indexes]) / k
            yhat_weighted = (model.predict_proba(x_local)[:,0] > prob_threshold)[test_indexes]
            weighted_score += (y.iloc[test_indexes][yhat_weighted]).mean() / k
            cm_accum += confusion_matrix(y.iloc[test_indexes], yhat[test_indexes])
            f1_accum += f1_score(y.iloc[test_indexes], yhat[test_indexes], pos_label = 1) / k
        
        # but also test against the x_test
        test_yhat = (model.predict_proba(x_local_test)[:,0] > prob_threshold)
        test_score = (y_test == test_yhat).mean()
        test_precision = 1- y_test[test_yhat].mean()
        test_f1 = f1_score(y_test, test_yhat, pos_label = 1) / k

        print "      score: %.3f  baseline: %.3f   wscore: %.3f   f1: %.3f  | test score %.3f  1-prec %.3f f1 %.3f"  % (
            score, 1-y.mean(), 1-weighted_score, f1_accum, test_score, test_precision, test_f1)
        if score > .95 * (1-y.mean()):
            print "^^^^^^^^^^^^^^"
        

# TODO: Confusion matrix (right now, we're not doing well enough to worry about that)
# TODO: Pretty-print
# TODO: Store results to allow side-by-side

In [25]:
def eval_model_by_year(model_factory, columns=None, prob_threshold=0.5):
    eval_model_all_years(model_factory, columns, prob_threshold)
    k = 5
    np.random.seed(1729)
    
    if columns is None:
        x_local = x_expanded
        x_local_test = x_test_expanded
    else:
        x_local = x_expanded[columns]
        x_local_test = x_test_expanded[columns]
        
    for yr in [2011, 2012, 2013]: # set(years.values):
        indexes = np.where(years==yr)[0]
        np.random.shuffle(indexes)

        cm_accum = np.zeros((2, 2))
        f1_accum = 0
        score = 0
        weighted_score = 0

        # k-fold cross-validation
        for i in range(k):
            train_indexes = list(indexes[0:len(indexes)*i/k]) + list(indexes[len(indexes)*(i+1)/k:])
            test_indexes = indexes[len(indexes)*i/k:len(indexes)*(i+1)/k]
        
            #print "TRAIN ", train_indexes
            #print 'TEST', test_indexes
            #print "Y", y.iloc[test_indexes]
            
            # model = model_factory().fit(x_expanded[years==yr], y[years==yr])
            # score = model.score(x_expanded[years==yr], y[years==yr]) / k
            model = model_factory().fit(x_local.iloc[train_indexes,:], y.iloc[train_indexes])
            yhat = model.predict(x_local)
            score += model.score(x_local.iloc[test_indexes], y.iloc[test_indexes]) / k
            yhat_weighted = (model.predict_proba(x_local)[:,0] > prob_threshold)[test_indexes]
            weighted_score += (y.iloc[test_indexes][yhat_weighted]).mean() / k
            cm_accum += confusion_matrix(y.iloc[test_indexes], yhat[test_indexes])
            f1_accum += f1_score(y.iloc[test_indexes], yhat[test_indexes], pos_label = 1) / k
        
        # but also test against the x_test
        test_score = model.score(x_local_test[years_test == yr], y_test[years_test == yr])
        test_yhat = (model.predict_proba(x_local_test[years_test == yr])[:,0] > prob_threshold)
        test_precision = 1- y_test[years_test == yr][test_yhat].mean()

        print "%d  score: %.3f  baseline: %.3f   wscore: %.3f   f1: %.3f  | test score %.3f  1-prec %.3f"  % (
            yr, score, 1-y[years==yr].mean(), 1-weighted_score, f1_accum, test_score, test_precision)

# TODO: Confusion matrix (right now, we're not doing well enough to worry about that)
# TODO: Pretty-print
# TODO: Store results to allow side-by-side

In [113]:
def eval_model_with_threshold(model_factory, columns=None):
    k = 5
    np.random.seed(1729)
    if columns is None:
        x_local = x_expanded
    else:
        x_local = x_expanded[columns]

    if True: # because old indent for loop
        indexes = range(len(y))
        np.random.shuffle(indexes)

        probs = np.ones_like(y) * -1

        for i in range(k):
            train_indexes = list(indexes[0:len(indexes)*i/k]) + list(indexes[len(indexes)*(i+1)/k:])
            test_indexes = indexes[len(indexes)*i/k:len(indexes)*(i+1)/k]
        
            model = model_factory().fit(x_local.iloc[train_indexes,:], y.iloc[train_indexes])
            probs_test = (model.predict_proba(x_local)[:,0]) #[test_indexes]
            probs = np.where([ii in test_indexes for ii in range(len(y))],  # slow but the only one I've found that works!
                             probs_test, probs)
            # print i, (probs == -1).sum(), (probs > 0).sum()
            
    thresholds = np.arange(0, 1, 0.05)
    plt.plot(thresholds,
             [1-y[probs > t].mean() for t in thresholds])
    plt.show()

    return probs

In [34]:
for C in [1, 100, 10000]:
    eval_model_all_years(lambda: Log_Reg(class_weight='balanced', C=C))

0  score: 0.630  baseline: 0.846   wscore: 0.903   f1: 0.342  | test score 0.378  1-prec 0.902 f1 0.031
0  score: 0.629  baseline: 0.846   wscore: 0.903   f1: 0.342  | test score 0.379  1-prec 0.903 f1 0.031
0  score: 0.629  baseline: 0.846   wscore: 0.903   f1: 0.342  | test score 0.379  1-prec 0.903 f1 0.031


In [32]:
for col in x.columns:
    columns=[c for c in x_expanded.columns
             if c==col or c.startswith(col+'_') ]
    if len(columns):
        print col
        eval_model_all_years(lambda: Log_Reg(class_weight='balanced', C=10000), 
                             columns = columns                          
                         )

address_state
0  score: 0.566  baseline: 0.846   wscore: 0.858   f1: 0.249  | test score 0.432  1-prec 0.856 f1 0.046
annual_income
0  score: 0.498  baseline: 0.846   wscore: 0.888   f1: 0.293  | test score 0.505  1-prec 0.887 f1 0.034
cpi
0  score: 0.532  baseline: 0.846   wscore: 0.871   f1: 0.273  | test score 0.472  1-prec 0.872 f1 0.039
delinq_2_yrs
0  score: 0.747  baseline: 0.846   wscore: 0.847   f1: 0.154  | test score 0.246  1-prec 0.850 f1 0.051
desc_len
0  score: 0.322  baseline: 0.846   wscore: 0.879   f1: 0.272  | test score 0.686  1-prec 0.870 f1 0.031
dti
0  score: 0.557  baseline: 0.846   wscore: 0.871   f1: 0.274  | test score 0.451  1-prec 0.871 f1 0.040
employ_length
0  score: 0.654  baseline: 0.846   wscore: 0.851   f1: 0.215  | test score 0.328  1-prec 0.852 f1 0.049
employ_title
0  score: 0.800  baseline: 0.846   wscore: 0.851   f1: 0.146  | test score 0.200  1-prec 0.852 f1 0.051
gdp
0  score: 0.563  baseline: 0.846   wscore: 0.860   f1: 0.253  | test score 0.43

In [36]:
for col1, col2 in combinations(x.columns, 2):
    columns=[c for c in x_expanded.columns
             if c==col1 
             or c.startswith(col1+'_') 
             or c==col2 
             or c.startswith(col2+'_') 
            ]
    if len(columns):
        print col1, col2
        eval_model_all_years(lambda: Log_Reg(class_weight='balanced', C=10000), 
                             columns = columns                          
                         )

address_state annual_income
0  score: 0.526  baseline: 0.846   wscore: 0.884   f1: 0.292  | test score 0.481  1-prec 0.882 f1 0.036
address_state cpi
0  score: 0.528  baseline: 0.846   wscore: 0.866   f1: 0.265  | test score 0.476  1-prec 0.867 f1 0.041
address_state delinq_2_yrs
0  score: 0.547  baseline: 0.846   wscore: 0.857   f1: 0.249  | test score 0.447  1-prec 0.857 f1 0.045
address_state desc_len
0  score: 0.524  baseline: 0.846   wscore: 0.861   f1: 0.257  | test score 0.519  1-prec 0.861 f1 0.042
address_state description
0  score: 0.566  baseline: 0.846   wscore: 0.858   f1: 0.249  | test score 0.432  1-prec 0.856 f1 0.046
address_state dti
0  score: 0.566  baseline: 0.846   wscore: 0.873   f1: 0.278  | test score 0.447  1-prec 0.871 f1 0.040
address_state earliest_credit
0  score: 0.566  baseline: 0.846   wscore: 0.858   f1: 0.249  | test score 0.432  1-prec 0.856 f1 0.046
address_state employ_length
0  score: 0.557  baseline: 0.846   wscore: 0.859   f1: 0.251  | test score

In [41]:
for col1, col2 in combinations(x.columns, 2):
    columns=[c for c in x_expanded.columns
             if c==col1 
             or c.startswith(col1+'_') 
             or c==col2 
             or c.startswith(col2+'_') 
            ]
    if len(columns):
        print col1, col2
        eval_model_all_years(lambda: Log_Reg(), # class_weight='balanced', C=10000), 
                             columns = columns,
                             poly_degree = 2
                         )

address_state annual_income
0  score: 0.846  baseline: 0.846   wscore: 0.846   f1: 0.000  | test score 0.152  1-prec 0.848 f1 0.053
address_state cpi
0  score: 0.846  baseline: 0.846   wscore: 0.846   f1: 0.000  | test score 0.152  1-prec 0.848 f1 0.053
address_state delinq_2_yrs
0  score: 0.846  baseline: 0.846   wscore: 0.846   f1: 0.000  | test score 0.152  1-prec 0.848 f1 0.053
address_state desc_len
0  score: 0.846  baseline: 0.846   wscore: 0.846   f1: 0.000  | test score 0.152  1-prec 0.848 f1 0.053
address_state description
0  score: 0.846  baseline: 0.846   wscore: 0.846   f1: 0.000  | test score 0.152  1-prec 0.848 f1 0.053
address_state dti
0  score: 0.846  baseline: 0.846   wscore: 0.846   f1: 0.000  | test score 0.152  1-prec 0.848 f1 0.053
address_state earliest_credit
0  score: 0.846  baseline: 0.846   wscore: 0.846   f1: 0.000  | test score 0.152  1-prec 0.848 f1 0.053
address_state employ_length
0  score: 0.846  baseline: 0.846   wscore: 0.846   f1: 0.001  | test score

In [44]:
for col1, col2 in combinations(x.columns, 2):
    columns1=[c for c in x_expanded.columns
              if c==col1 
              or c.startswith(col1+'_') 
             ]
    columns2=[c for c in x_expanded.columns
              if c==col2 
              or c.startswith(col2+'_') 
            ]
    if len(columns1) and len(columns2):
        print col1, col2
        eval_model_all_years(lambda: Log_Reg(class_weight='balanced', C=10000), 
                             columns = columns1 + columns2,
                             poly_degree = 2
                         )

address_state annual_income
      score: 0.537  baseline: 0.846   wscore: 0.883   f1: 0.291  | test score 0.467  1-prec 0.881 f1 0.037
address_state cpi
      score: 0.491  baseline: 0.846   wscore: 0.864   f1: 0.262  | test score 0.521  1-prec 0.868 f1 0.039
address_state delinq_2_yrs
      score: 0.529  baseline: 0.846   wscore: 0.857   f1: 0.250  | test score 0.531  1-prec 0.858 f1 0.042
address_state desc_len
      score: 0.536  baseline: 0.846   wscore: 0.860   f1: 0.255  | test score 0.460  1-prec 0.858 f1 0.045
address_state dti
      score: 0.555  baseline: 0.846   wscore: 0.870   f1: 0.272  | test score 0.459  1-prec 0.870 f1 0.040
address_state employ_length
      score: 0.540  baseline: 0.846   wscore: 0.851   f1: 0.238  | test score 0.463  1-prec 0.857 f1 0.045
address_state employ_title
      score: 0.575  baseline: 0.846   wscore: 0.863   f1: 0.258  | test score 0.425  1-prec 0.859 f1 0.045
address_state gdp
      score: 0.555  baseline: 0.846   wscore: 0.862   f1: 0.258 