In [1]:
import pandas as pd

In [2]:
loans = pd.read_csv('loanstats.csv')

## Remove the columns that reflect Lending Club's actual clusters

We are effectively predicting the grade and sub_grade columns, the clusters Lending Club uses to assign interest rates.

In [3]:
loans.drop('grade', axis=1, inplace=True)
loans.drop('sub_grade', axis=1, inplace=True)
loans.drop('set', axis=1, inplace=True)
loans.drop('emp_title', axis=1, inplace=True)
loans.drop('timestamp', axis=1, inplace=True)
loans.drop('issue_d', axis=1, inplace=True)
loans.drop('last_credit_pull_d', axis=1, inplace=True)
loans.drop('title', axis=1, inplace=True)
loans.drop('purpose', axis=1, inplace=True)
loans.drop('next_pymnt_d', axis=1, inplace=True)
loans.drop('zip_code', axis=1, inplace=True)
loans.drop('last_fico_range_high', axis=1, inplace=True)
loans.drop('fico_range_high', axis=1, inplace=True)
loans.drop('last_fico_range_low', axis=1, inplace=True)
loans.drop('installment', axis=1, inplace=True)

In [4]:
loans['term'] = loans['term'].map(lambda a: int(a.strip(' months')))
loans['application_type'] = loans['application_type'].map(lambda a: 1 if a=='Joint App' else 0)

def elength(a):
    if (a=='n/a'):
        return 0
    elif (a=='10+ years'):
        return 10
    elif (a=='1 year'):
        return 1
    elif (a=='< 1 year'):
        return 0.5
    else:
        return float(a.strip(' years'))
    
loans['emp_length'] = loans['emp_length'].map(lambda a: elength(a))

loans['revol_util'] = loans['revol_util'].map(lambda a: float(a.strip('%')))

homes = pd.get_dummies(loans['home_ownership'], prefix='home')
loans = loans.join(homes)
loans.drop('home_ownership', axis=1, inplace=True)

states = pd.get_dummies(loans['addr_state'], prefix='st')
loans = loans.join(states)
loans.drop('addr_state', axis=1, inplace=True)

In [5]:
pd.options.display.max_columns = 101

loans.head()

Unnamed: 0,id,loan_amnt,term,int_rate,emp_length,annual_inc,dti,delinq_2yrs,fico_range_low,mths_since_last_delinq,open_acc,revol_bal,revol_util,total_acc,collections_12_mths_ex_med,policy_code,application_type,tot_cur_bal,bc_util,chargeoff_within_12_mths,mo_sin_old_il_acct,mo_sin_rcnt_tl,mort_acc,mths_since_recent_bc,mths_since_recent_inq,num_accts_ever_120_pd,num_actv_bc_tl,num_actv_rev_tl,num_bc_tl,num_il_tl,num_rev_accts,num_rev_tl_bal_gt_0,num_sats,num_tl_30dpd,num_tl_90g_dpd_24m,num_tl_op_past_12m,percent_bc_gt_75,pub_rec_bankruptcies,tax_liens,total_bc_limit,home_ANY,home_MORTGAGE,home_NONE,home_OTHER,home_OWN,home_RENT,st_AK,st_AL,st_AR,st_AZ,st_CA,st_CO,st_CT,st_DC,st_DE,st_FL,st_GA,st_HI,st_IA,st_ID,st_IL,st_IN,st_KS,st_KY,st_LA,st_MA,st_MD,st_ME,st_MI,st_MN,st_MO,st_MS,st_MT,st_NC,st_ND,st_NE,st_NH,st_NJ,st_NM,st_NV,st_NY,st_OH,st_OK,st_OR,st_PA,st_RI,st_SC,st_SD,st_TN,st_TX,st_UT,st_VA,st_VT,st_WA,st_WI,st_WV,st_WY
0,10149342,27050.0,36,10.99,10.0,55000.0,22.87,0.0,730.0,195.0,14.0,36638.0,61.2,27.0,0.0,1.0,0,114834.0,53.9,0.0,117.0,6.0,4.0,16.0,8.0,0.0,2.0,4.0,8.0,8.0,15.0,4.0,14.0,0.0,0.0,1.0,25.0,0.0,0.0,35700.0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,10149488,4800.0,36,10.99,2.0,39600.0,2.49,0.0,755.0,195.0,3.0,4136.0,16.1,8.0,0.0,1.0,0,4136.0,16.1,0.0,104.0,25.0,0.0,25.0,3.0,0.0,2.0,2.0,4.0,1.0,7.0,2.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,25700.0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0
2,10119623,12000.0,36,11.99,10.0,130000.0,13.03,0.0,715.0,195.0,9.0,10805.0,67.0,19.0,0.0,1.0,0,327264.0,93.0,0.0,173.0,4.0,3.0,85.0,4.0,0.0,3.0,5.0,4.0,8.0,8.0,5.0,9.0,0.0,0.0,3.0,1.0,0.0,0.0,10700.0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,10129506,20800.0,36,13.53,10.0,81500.0,16.73,0.0,685.0,64.0,29.0,23473.0,54.5,41.0,0.0,1.0,0,23473.0,54.6,0.0,115.0,0.0,0.0,0.0,0.0,1.0,8.0,24.0,17.0,1.0,40.0,24.0,29.0,0.0,0.0,3.0,50.0,0.0,0.0,15000.0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,10159611,10000.0,36,9.67,7.0,102000.0,15.55,2.0,670.0,11.0,9.0,9912.0,44.4,22.0,0.0,1.0,0,39143.0,89.4,0.0,243.0,8.0,0.0,25.0,8.0,1.0,3.0,4.0,6.0,9.0,13.0,4.0,9.0,0.0,0.0,1.0,66.7,0.0,0.0,9200.0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


## K Means

In [6]:
from sklearn import cluster

k_means=cluster.KMeans(n_clusters=7)

In [7]:
loans_y = loans['int_rate']
loans_x = loans.drop('int_rate', axis=1)

In [8]:
k_means.fit(loans_x)

KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=300,
    n_clusters=7, n_init=10, n_jobs=1, precompute_distances='auto',
    random_state=None, tol=0.0001, verbose=0)

In [9]:
labels7 = k_means.labels_

In [10]:
k_means

KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=300,
    n_clusters=7, n_init=10, n_jobs=1, precompute_distances='auto',
    random_state=None, tol=0.0001, verbose=0)

In [11]:
import pickle
from sklearn.externals import joblib

#pickle.dump(k_means, open('kmeans.pkl', 'wb'))
with open('/Users/emily-air13/Desktop/code/INFO-7390-ADS-Fall-17-TeamNo.4/Assignment 2/kmeans.pkl', 'wb') as f:
    joblib.dump(k_means, f)

In [12]:
loans2 = pd.read_csv('loanstats.csv')

In [13]:
loans2['kcluster7'] = labels7

In [15]:
loans2['kcluster7'].value_counts()

3    279997
5    235023
2    234444
0    233309
6    224882
4    186810
1    118418
Name: kcluster7, dtype: int64

In [16]:
loans2['grade'].value_counts()

C    455309
B    445905
A    249686
D    216608
E    101833
F     33678
G      9864
Name: grade, dtype: int64

In [17]:
pd.crosstab(loans2.grade,loans2.kcluster7)

kcluster7,0,1,2,3,4,5,6
grade,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
A,47925,18730,40753,36148,33485,33411,39234
B,70153,29857,69577,87655,49472,74040,65151
C,64011,32778,80493,94882,52861,65832,64452
D,28943,22087,27128,39810,29272,37453,31915
E,15294,10710,10653,14767,16065,16249,18095
F,5498,3291,3380,5273,4564,6642,5030
G,1485,965,2460,1462,1091,1396,1005


## K Means Attempt to Recreate Sub Grades

In [18]:
k_means35=cluster.KMeans(n_clusters=35)
k_means35.fit(loans_x)

KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=300,
    n_clusters=35, n_init=10, n_jobs=1, precompute_distances='auto',
    random_state=None, tol=0.0001, verbose=0)

In [19]:
loans2['kcluster35'] = k_means35.labels_

In [20]:
pd.crosstab(loans2.sub_grade,loans2.kcluster35)

kcluster35,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34
sub_grade,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1
A1,2201,859,2557,1687,657,987,1492,1183,591,1531,600,638,1109,1085,1638,7015,1862,1468,1139,2174,550,590,792,2385,2022,2244,1625,781,779,1283,762,1621,621,1551,1191
A2,1739,834,1793,1338,950,1171,1102,1139,797,1021,632,836,702,919,1413,2212,1480,1150,1050,1466,704,945,764,1343,1275,1795,962,436,1138,1306,546,1189,694,1150,860
A3,1529,778,1671,1236,1379,1092,899,1047,735,906,638,816,663,928,1209,2159,1892,1481,1024,1392,1142,956,659,1167,1272,1534,907,496,1009,1387,503,1172,771,1155,826
A4,1563,1017,1984,1392,2088,1722,3876,1200,1142,1034,1211,1092,748,1175,1696,2810,1784,2203,1473,1892,1691,1481,743,1233,1410,1533,963,848,1663,1384,1032,1545,1087,1297,1198
A5,2639,1010,2942,1842,2379,2393,1965,2144,1600,1478,1444,1505,1001,1577,2593,3103,2478,3001,2117,2829,1949,2138,1167,1963,1864,2769,1302,1026,2384,2083,1151,2486,1202,1707,1694
B1,3152,1037,3469,3858,2503,1818,2352,2454,1353,2252,1424,1310,1701,2612,2886,3669,3537,2764,1807,3055,2047,1675,2132,2269,3235,3314,2184,1085,1791,4305,1392,2623,973,3704,1820
B2,2988,1060,3801,3097,2781,1879,2413,2202,1451,1706,1597,1295,1215,2974,2649,3949,3328,4549,1812,3181,2158,1752,2044,2361,2439,3228,1519,1146,1897,4744,1442,2473,1110,2575,1950
B3,3274,1248,3983,3183,3132,2262,2840,2398,1560,1942,1820,1420,1258,3276,2775,4585,3490,5365,2107,3716,2521,1975,2105,2612,2513,3500,1639,1260,2066,4974,1699,2683,1234,2716,2097
B4,2894,1223,3973,3961,3009,2477,2874,2336,1732,1975,1973,1606,1426,4066,2638,4508,2933,5032,2136,3654,2367,2081,2746,2592,2999,2988,1817,1228,2408,5123,1839,2598,1295,3384,2064
B5,3379,1177,3778,4710,2722,2142,3016,2106,1632,2611,2056,1552,2226,4472,2503,4435,3595,2909,1937,3452,2684,1943,3026,2523,4077,3585,2792,1196,2137,2342,1654,2583,1270,4169,2004


## K Means with Selected Features

In [21]:
application = ['fico_range_low','dti','loan_amnt','term']
k_means7=cluster.KMeans(n_clusters=7)
k_means7.fit(loans_x[application])

KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=300,
    n_clusters=7, n_init=10, n_jobs=1, precompute_distances='auto',
    random_state=None, tol=0.0001, verbose=0)

In [22]:
loans2['kclusters7'] = k_means7.labels_

In [23]:
pd.crosstab(loans2.grade,loans2.kclusters7)

kclusters7,0,1,2,3,4,5,6
grade,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
A,41329,14661,54244,34297,74242,22794,8119
B,71488,21816,112092,58086,134592,30097,17734
C,77244,28491,100578,62911,121863,36926,27296
D,38463,16654,42032,31458,51626,20014,16361
E,18814,10130,12928,17428,20299,11704,10530
F,6403,4188,3217,6106,5727,4209,3828
G,1914,1384,534,1805,1614,1256,1357


## Attempt at Reproducing "Grades"

In [24]:
def manual_clusters(fico, dti, amnt, term):
    if fico > 700 and dti < 30 and amnt < 20000:
        return 'Group1'
    elif (fico > 780 and dti < 30 and amnt < 20000 and term==' 36 months') or (fico > 700 and term==' 60 months'):
        return 'Group2'
    elif dti < 30 and amnt < 20000 and term==' 36 months': 
        return 'Group3'
    elif dti < 30 and term==' 36 months': # amnt < 25000 
        return 'Group4'
    elif fico > 670 and dti < 30 and amnt < 30000:
        return 'Group5'
    elif dti < 30:
        return 'Group6'
    else:
        return 'Group7'

loans2['manual_cat'] = loans2.apply(lambda x: manual_clusters(x['fico_range_low'], x['dti'], x['loan_amnt'], x['term']), axis=1)

In [25]:
pd.crosstab(loans2.grade, loans2.manual_cat)

manual_cat,Group1,Group2,Group3,Group4,Group5,Group6,Group7
grade,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
A,119503,4538,54232,61273,1231,96,8813
B,92241,27984,209915,63078,17985,5702,29000
C,50738,34290,178094,50144,55022,40434,46587
D,14703,13406,71978,21781,29494,32286,32960
E,5109,7595,19620,5986,19962,26215,17346
F,1493,2273,4017,981,7343,11837,5734
G,305,613,710,322,2177,3882,1855


## Group by Credit Score

In [26]:
def manual_clusters(fico):
    if fico > 720:
        return 'Group1'
    elif fico > 700:
        return 'Group2'
    elif fico > 690: 
        return 'Group3'
    elif fico > 680: # amnt < 25000 
        return 'Group4'
    elif fico > 670:
        return 'Group5'
    else:
        return 'Group6'

loans2['manual_cat2'] = loans2.apply(lambda x: manual_clusters(x['fico_range_low']), axis=1)

In [27]:
pd.crosstab(loans2.grade, loans2.manual_cat2)

manual_cat2,Group1,Group2,Group3,Group4,Group5,Group6
grade,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
A,117173,59007,26135,22643,14908,9820
B,72469,83718,56164,61101,67496,104957
C,38872,66303,53631,67209,82843,146451
D,11193,24129,22494,31345,42095,85352
E,4342,10076,10259,15132,20504,41520
F,1208,2829,3077,4715,6889,14960
G,290,702,801,1329,2049,4693
