### Importing Modules

In [1]:
%matplotlib inline
import pandas as pd
import numpy as np
# Load the loans dataset directly from the remote CSV.
loans = pd.read_csv('https://spark-public.s3.amazonaws.com/dataanalysis/loansData.csv')
# NOTE(review): this binds a second name to the same DataFrame object (no copy is made),
# so the in-place column edits made through `loansData` below are also visible via `loans`.
loansData = loans

In [2]:
import statistics
# statistics can be used for IQR but scipy has it built in--why reinvent wheel?
from scipy import stats

## Cleaning

### Reformatting, typecasting

In [3]:
loansData.describe()

Unnamed: 0,Amount.Requested,Amount.Funded.By.Investors,Monthly.Income,Open.CREDIT.Lines,Revolving.CREDIT.Balance,Inquiries.in.the.Last.6.Months
count,2500.0,2500.0,2499.0,2498.0,2498.0,2498.0
mean,12406.5,12001.573236,5688.931321,10.075661,15244.559648,0.906325
std,7801.544872,7745.320754,3963.118185,4.508644,18308.549795,1.231036
min,1000.0,-0.01,588.5,2.0,0.0,0.0
25%,6000.0,6000.0,3500.0,7.0,5585.75,0.0
50%,10000.0,10000.0,5000.0,9.0,10962.0,0.0
75%,17000.0,16000.0,6800.0,13.0,18888.75,1.0
max,35000.0,35000.0,102750.0,38.0,270800.0,9.0


In [4]:
loansData.head()

Unnamed: 0,Amount.Requested,Amount.Funded.By.Investors,Interest.Rate,Loan.Length,Loan.Purpose,Debt.To.Income.Ratio,State,Home.Ownership,Monthly.Income,FICO.Range,Open.CREDIT.Lines,Revolving.CREDIT.Balance,Inquiries.in.the.Last.6.Months,Employment.Length
81174,20000,20000.0,8.90%,36 months,debt_consolidation,14.90%,SC,MORTGAGE,6541.67,735-739,14.0,14272.0,2.0,< 1 year
99592,19200,19200.0,12.12%,36 months,debt_consolidation,28.36%,TX,MORTGAGE,4583.33,715-719,12.0,11140.0,1.0,2 years
80059,35000,35000.0,21.98%,60 months,debt_consolidation,23.81%,CA,MORTGAGE,11500.0,690-694,14.0,21977.0,1.0,2 years
15825,10000,9975.0,9.99%,36 months,debt_consolidation,14.30%,KS,MORTGAGE,3833.33,695-699,10.0,9346.0,0.0,5 years
33182,12000,12000.0,11.71%,36 months,credit_card,18.78%,NJ,RENT,3195.0,695-699,11.0,14469.0,0.0,9 years


In [5]:
# Parse the loan term in months out of strings such as '36 months'.
# The first whitespace-separated token is used rather than a fixed two-character
# slice, so terms that are not exactly two digits are read correctly; going
# through str() also lets this cell be re-run when the column already holds ints.
lengths = [int(str(loan).split()[0]) for loan in loansData['Loan.Length']]

In [6]:
# Overwrite the 'NN months' strings in place with the parsed integer month counts;
# the original text values are no longer available in loansData after this cell.
loansData['Loan.Length'] = lengths

In [7]:
loansData.head()

Unnamed: 0,Amount.Requested,Amount.Funded.By.Investors,Interest.Rate,Loan.Length,Loan.Purpose,Debt.To.Income.Ratio,State,Home.Ownership,Monthly.Income,FICO.Range,Open.CREDIT.Lines,Revolving.CREDIT.Balance,Inquiries.in.the.Last.6.Months,Employment.Length
81174,20000,20000.0,8.90%,36,debt_consolidation,14.90%,SC,MORTGAGE,6541.67,735-739,14.0,14272.0,2.0,< 1 year
99592,19200,19200.0,12.12%,36,debt_consolidation,28.36%,TX,MORTGAGE,4583.33,715-719,12.0,11140.0,1.0,2 years
80059,35000,35000.0,21.98%,60,debt_consolidation,23.81%,CA,MORTGAGE,11500.0,690-694,14.0,21977.0,1.0,2 years
15825,10000,9975.0,9.99%,36,debt_consolidation,14.30%,KS,MORTGAGE,3833.33,695-699,10.0,9346.0,0.0,5 years
33182,12000,12000.0,11.71%,36,credit_card,18.78%,NJ,RENT,3195.0,695-699,11.0,14469.0,0.0,9 years


In [8]:
def perc_to_float(s):
    """Convert a percentage such as ``'8.90%'`` to its numeric value ``8.9``.

    Surrounding whitespace and a trailing ``%`` sign are removed before
    parsing; the number is NOT divided by 100. Input without a percent sign
    (including a value that is already numeric) is parsed as-is, so applying
    the function to an already-converted value returns that value unchanged.

    Parameters
    ----------
    s : str or number
        Percentage text (e.g. ``'14.30%'``) or an already-numeric value.

    Returns
    -------
    float
        The numeric part of the percentage.

    Raises
    ------
    ValueError
        If what remains after removing the ``%`` sign is not a valid number.
    """
    # rstrip only removes '%' characters, so digits are never cut off when the
    # sign is missing (a blind s[:-1] would turn '12' into 1.0).
    return float(str(s).strip().rstrip('%'))

# Convert the two percentage-string columns to plain floats.
# Each column is converted on its own and written straight back, which also
# works for a frame with zero rows.
intrates = [perc_to_float(rate) for rate in loansData['Interest.Rate']]
DIratio = [perc_to_float(ratio) for ratio in loansData['Debt.To.Income.Ratio']]

# (interest rate, debt-to-income ratio) pairs in row order.
newVals = list(zip(intrates, DIratio))

loansData['Interest.Rate'] = intrates
loansData['Debt.To.Income.Ratio'] = DIratio

In [9]:
# Split every 'low-high' FICO range label into its two integer bounds.
fico = loansData['FICO.Range']
bounds = [label.split(sep='-') for label in fico]

# (low, high) tuple per row, plus the lower bound on its own.
intranges = [(int(parts[0]), int(parts[1])) for parts in bounds]
intscore = [low for low, _high in intranges]

In [10]:
# Add the lower bound of each FICO range as a numeric column.
loansData['FICO.Scoring'] = intscore

In [11]:
loansData.head()

Unnamed: 0,Amount.Requested,Amount.Funded.By.Investors,Interest.Rate,Loan.Length,Loan.Purpose,Debt.To.Income.Ratio,State,Home.Ownership,Monthly.Income,FICO.Range,Open.CREDIT.Lines,Revolving.CREDIT.Balance,Inquiries.in.the.Last.6.Months,Employment.Length,FICO.Scoring
81174,20000,20000.0,8.9,36,debt_consolidation,14.9,SC,MORTGAGE,6541.67,735-739,14.0,14272.0,2.0,< 1 year,735
99592,19200,19200.0,12.12,36,debt_consolidation,28.36,TX,MORTGAGE,4583.33,715-719,12.0,11140.0,1.0,2 years,715
80059,35000,35000.0,21.98,60,debt_consolidation,23.81,CA,MORTGAGE,11500.0,690-694,14.0,21977.0,1.0,2 years,690
15825,10000,9975.0,9.99,36,debt_consolidation,14.3,KS,MORTGAGE,3833.33,695-699,10.0,9346.0,0.0,5 years,695
33182,12000,12000.0,11.71,36,credit_card,18.78,NJ,RENT,3195.0,695-699,11.0,14469.0,0.0,9 years,695


In [12]:
# Frequency table (NaN included) for every column, printed one after another.
for _name, column in loansData.items():
    print(column.value_counts(dropna=False))

10000    206
12000    151
5000     110
20000    107
6000     103
15000     98
8000      90
25000     65
7000      54
16000     53
35000     51
18000     47
3000      45
4000      45
24000     41
9000      38
30000     38
14000     37
4800      28
7200      25
7500      25
2000      24
21000     23
9600      21
4500      19
2500      19
3600      18
13000     17
17000     17
4200      16
        ... 
15575      1
25800      1
23750      1
9425       1
27825      1
30100      1
7575       1
15775      1
25475      1
34500      1
23300      1
31500      1
8975       1
3725       1
13075      1
24175      1
11050      1
23400      1
5650       1
1450       1
1550       1
7050       1
7675       1
15250      1
17900      1
13225      1
9700       1
7625       1
15300      1
16350      1
Name: Amount.Requested, dtype: int64
10000.00    164
12000.00    108
5000.00      87
6000.00      85
8000.00      69
15000.00     68
20000.00     59
7000.00      40
4000.00      35
16000.00     35
3000.00   

### Removing Outliers

In [13]:
# Interquartile range of the FICO scores (NaNs ignored via nan_policy='omit').
# NOTE(review): FICO_IQR is not referenced again in the cells shown here;
# is_outlier() computes its own IQR.
FICO_IQR = stats.iqr(x=loansData['FICO.Scoring'], nan_policy='omit')

In [14]:
# Median FICO score.
# NOTE(review): fico_med is not referenced again in the cells shown here;
# is_outlier() computes its own median.
fico_med = statistics.median(loansData['FICO.Scoring'])

In [15]:
# scrapped my code and wrote some fxn

def is_outlier(x):
    """Flag values lying more than 1.5 * IQR away from the median.

    Parameters
    ----------
    x : array-like of numbers
        Values to test (list, NumPy array or pandas Series). NaNs are
        ignored when computing the median and the IQR and are never flagged.

    Returns
    -------
    list of float
        One entry per input value, in input order: ``1.0`` for an outlier,
        ``0.0`` otherwise. An empty input gives an empty list.

    Notes
    -----
    The fences are centred on the median (``median +/- 1.5 * IQR``), which
    is never wider than the usual Tukey rule (``Q1 - 1.5 * IQR`` and
    ``Q3 + 1.5 * IQR``). NOTE(review): confirm the median-centred rule is
    intended; it is kept here so existing results do not change.
    """
    values = np.asarray(x, dtype=float)
    if values.size == 0:
        return []

    iqr = stats.iqr(values, nan_policy='omit')
    # nanmedian keeps the centre consistent with the NaN-omitting IQR above;
    # a NaN-unaware median can itself come out as NaN and hide every outlier.
    median = np.nanmedian(values)
    spread = 1.5 * iqr

    # Comparisons against NaN are False, so NaN inputs come out as 0.0.
    flagged = (values > median + spread) | (values < median - spread)
    return flagged.astype(float).tolist()

In [16]:
# Flag FICO-score outliers; the result is a plain list with one 0.0/1.0 entry per row.
outliers = is_outlier(loansData['FICO.Scoring'])
print(len(outliers))

# Sanity check: the length should equal the 2500 rows of loansData.

2500


If `outliers` is converted to a Series before being put into the DataFrame, NaN values pop up in the new column, even though the Series itself contains none:
```python
>>> outliers = pd.Series(outliers)
>>> outliers.value_counts(dropna=False)
0.0    2341
1.0     159
dtype: int64
```
but it is fine to insert a plain list, because a list is assigned by position rather than aligned on index labels...

[~~***May*** have to do with index~~](http://stackoverflow.com/questions/12555323/adding-new-column-to-existing-dataframe-in-python-pandas)

**Definitely** was an index issue -- the Series got a default 0..n-1 index instead of the DataFrame's index, so the values were misaligned on assignment

In [17]:
# Recompute the flags as a plain list (the name `outliers` is reused from the earlier cell).
outliers = is_outlier(loansData['FICO.Scoring'])

In [18]:
# Build the Series on loansData's own index so the flags line up row for row.
# A Series created without index= gets a default 0..n-1 index; column assignment
# aligns on index labels, and labels that do not match produce NaN.
loansData['outlier'] = pd.Series(outliers, index=loansData.index)
# this fixed the NaN issue described above -- it gives the Series the same index labels as loansData

In [19]:
# Show how many rows were flagged, then keep only the unflagged ones.
outlier_flags = loansData['outlier']
print(outlier_flags.value_counts(dropna=False))
df = loansData[outlier_flags.eq(False)]

0.0    2341
1.0     159
Name: outlier, dtype: int64


In [20]:

df.describe()

Unnamed: 0,Amount.Requested,Amount.Funded.By.Investors,Interest.Rate,Loan.Length,Debt.To.Income.Ratio,Monthly.Income,Open.CREDIT.Lines,Revolving.CREDIT.Balance,Inquiries.in.the.Last.6.Months,FICO.Scoring,outlier
count,2341.0,2341.0,2341.0,2341.0,2341.0,2341.0,2340.0,2340.0,2340.0,2341.0,2341.0
mean,12358.532678,11957.903755,13.42232,41.269543,15.710397,5585.047151,10.137607,15426.582906,0.916239,700.305425,0.0
std,7745.936698,7691.109984,4.04889,9.936958,7.415687,3421.286996,4.530156,17971.933138,1.237435,28.434635,0.0
min,1000.0,-0.01,5.42,36.0,0.0,588.5,2.0,0.0,0.0,640.0,0.0
25%,6000.0,6000.0,10.65,36.0,10.29,3458.33,7.0,5990.75,0.0,675.0,0.0
50%,10000.0,10000.0,13.11,36.0,15.64,5000.0,9.0,11175.5,1.0,695.0,0.0
75%,17000.0,16000.0,15.96,36.0,20.91,6666.67,13.0,19148.0,1.0,720.0,0.0
max,35000.0,35000.0,24.89,60.0,34.91,65000.0,38.0,270800.0,9.0,765.0,0.0


### Removing NaN's

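The column "Employment.Length" has several values recorded as the string "n/a". Note that the `isnull()` checks below report 0 NaN values for this column, so those entries are not treated as missing and are not removed by the NaN filtering in this section.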

Below is a formatted count.
- [formatting strings](https://docs.python.org/3/library/string.html#formatstrings)
- [formatting mini-language](https://docs.python.org/3/library/string.html#formatspec)

In [21]:
# Count missing values per column once, then report every column that has any.
nan_counts = loansData.isnull().sum()

col_has_nan = []
for col, n_missing in nan_counts.items():
    if n_missing > 0:
        col_has_nan.append(col)
        print('{:<30}'.format(col), '  NaN values:', n_missing)

Monthly.Income                   NaN values: 1
Open.CREDIT.Lines                NaN values: 2
Revolving.CREDIT.Balance         NaN values: 2
Inquiries.in.the.Last.6.Months   NaN values: 2


In [22]:
print(col_has_nan)

['Monthly.Income', 'Open.CREDIT.Lines', 'Revolving.CREDIT.Balance', 'Inquiries.in.the.Last.6.Months']


Iterating over rows to capture all instances of NaN values; if any col has a nan val, the row will be removed  
[docs](http://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.itertuples.html#pandas.DataFrame.itertuples)

example:
```python
>>> df_short = df[:3]
>>> type(df_short)
>>> df_short
>>> for row in df_short.itertuples():
...    # print(row)
...    if 0.0 in row:
...        print('out')
out
out
out
```

In [24]:
# Row-wise NaN flag for the rows of df: 1 if any of the NaN-bearing columns is
# missing in that row, else 0. Computed in one vectorised pass; the result is a
# plain list of ints in row order.
nan_df = df.loc[:, col_has_nan]
has_nan = nan_df.isnull().any(axis=1).astype(int).tolist()

In [25]:
nan_df.iloc[3].isnull().sum()

0

In [26]:
# Rebind has_nan from a list to a Series carrying df's index, so it can be used as an aligned mask on df.
has_nan = pd.Series(has_nan, index=df.index)

In [27]:
# Keep only the rows of df in which none of the checked columns is NaN.
df2 = df[has_nan==0]

In [28]:
len(df2)

2340

In [29]:
# Confirm that no column of the filtered frame still contains NaN.
for col, n_missing in df2.isnull().sum().items():
    print('{:<30}'.format(col), '  NaN values:', n_missing)

Amount.Requested                 NaN values: 0
Amount.Funded.By.Investors       NaN values: 0
Interest.Rate                    NaN values: 0
Loan.Length                      NaN values: 0
Loan.Purpose                     NaN values: 0
Debt.To.Income.Ratio             NaN values: 0
State                            NaN values: 0
Home.Ownership                   NaN values: 0
Monthly.Income                   NaN values: 0
FICO.Range                       NaN values: 0
Open.CREDIT.Lines                NaN values: 0
Revolving.CREDIT.Balance         NaN values: 0
Inquiries.in.the.Last.6.Months   NaN values: 0
Employment.Length                NaN values: 0
FICO.Scoring                     NaN values: 0
outlier                          NaN values: 0
