In [54]:
import pandas as pd
import numpy as np

This is a short example of the pandas `crosstab` function. It is essentially a form of pivot table: the rows are the values of the 'grade' column, the columns are the values of the 'status' column, and each cell counts the loans with that combination. Passing 'margins=True' adds an 'All' row and an 'All' column holding the subtotals.

In [55]:
# Load the loans CSV into a DataFrame and preview its first five rows.
df = pd.read_csv(filepath_or_buffer='lc_loans.csv')
df.head(n=5)

Unnamed: 0,status,grade
0,Fully Paid,B
1,Charged Off,C
2,Fully Paid,C
3,Fully Paid,C
4,Current,B


In [63]:
# Count loans for every (grade, status) combination.
# margins=True appends an 'All' row and an 'All' column holding the subtotals.
df_cross = pd.crosstab(
    index=df['grade'],
    columns=df['status'],
    margins=True,
)
df_cross

status,Charged Off,Current,Fully Paid,Late,All
grade,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
A,1562,50051,20408,469,72490
B,5302,93852,31160,2056,132370
C,6023,88928,23147,2777,120875
D,5007,53281,13681,2308,74277
E,2842,24639,5949,1374,34804
F,1526,8444,2328,606,12904
G,409,1990,643,199,3241
All,22671,321185,97316,9789,450961


Passing normalize=True (or normalize='all') divides each value by the grand total of all values. Using normalize='index' instead normalizes by row, dividing each value by its row total. For example, for Charged Off and Grade A the value would be 1562/72490 ≈ 0.0215, where 72490 is the Grade A subtotal from the 'All' column. Note that the row-normalized result below has no 'All' column.

In [60]:
# Same grade-by-status table, but normalize='index' divides every count by its
# row total, so each row shows the status mix within that grade.
df_cross_1 = pd.crosstab(
    index=df['grade'],
    columns=df['status'],
    margins=True,
    normalize='index',
)
df_cross_1

status,Charged Off,Current,Fully Paid,Late
grade,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
A,0.021548,0.690454,0.281528,0.00647
B,0.040054,0.709013,0.235401,0.015532
C,0.049828,0.735702,0.191495,0.022974
D,0.06741,0.717328,0.184189,0.031073
E,0.081657,0.707936,0.170929,0.039478
F,0.118258,0.654371,0.180409,0.046962
G,0.126196,0.614008,0.198396,0.061401
All,0.050273,0.712223,0.215797,0.021707


The grade entries of the new 'All' column, and the status entries of the 'All' row, should each sum to 1.0 if you have done it correctly. The added column shows each grade's share of all loans, so Grade A makes up 0.16 of all loans, Grade B makes up 0.29, and so on.

In [61]:
# Add each grade's share of all loans: its row total divided by the grand total.
# The grand total is looked up by label. Indexing the string-labelled Series with
# `[-1]` relies on the deprecated positional fallback of Series.__getitem__.
grand_total = df_cross.loc['All', 'All']
df_cross_1['All'] = df_cross['All'] / grand_total
df_cross_1

status,Charged Off,Current,Fully Paid,Late,All
grade,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
A,0.021548,0.690454,0.281528,0.00647,0.160746
B,0.040054,0.709013,0.235401,0.015532,0.293529
C,0.049828,0.735702,0.191495,0.022974,0.268039
D,0.06741,0.717328,0.184189,0.031073,0.164708
E,0.081657,0.707936,0.170929,0.039478,0.077177
F,0.118258,0.654371,0.180409,0.046962,0.028614
G,0.126196,0.614008,0.198396,0.061401,0.007187
All,0.050273,0.712223,0.215797,0.021707,1.0


Checking that the proportions truly add up to 1.0. The grade entries of the 'All' column sum to 1.0. The sum of the 'All' row equals 2.0 because it also includes the 1.0 from the 'All' column; if we subtract that, we end up with 1.0.

In [76]:
# Grade shares in the 'All' column, excluding the 'All' total row itself.
# Dropping by label avoids hard-coding the number of grades.
print(df_cross_1['All'].drop('All').sum())
# Sum across the 'All' row; this includes the value in the 'All' column as well.
print(df_cross_1.loc['All'].sum())

1.0
2.0
