# Project: Predicting Credit Card Default

In [4]:
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

## Load Data

Load the data from a CSV file. It contains 15,000 records. Each record has the following values:

**Response Variable:**
* Default or Not      

**Predictor Variables:**
1. Balance divided by credit limit.
2. Age.
3. Number of times 30-59 days past due.
4. Debt divided by monthly income.
5. Monthly income.
6. Number of open credit lines and loans.
7. Number of times 90 or more days past due.
8. Number of real estate loans.
9. Number of times 60-89 days past due.
10. Number of dependents.

The goal of this project is to build a credit scoring model by predicting the probability of a future credit default.

In [5]:
import pandas as pd

# Read the training set; the first CSV column is used as the row index.
df = pd.read_csv('./data/cs-training.csv', index_col=0, sep=',')
# While exploring, df.info() and df.describe() give basic information and statistics.
df.head()

IOError: File ./data/cs-training.csv does not exist

## Preprocessing: 
### 1. Data distribution
#### 1.1. Q-Q plot

The quantile-quantile (q-q) plot is a graphical technique for determining if two data sets come from populations with a common distribution.

In [None]:
from scipy.stats import probplot
def qq_plot(x):
    """Draw a Q-Q plot of ``x`` against the normal distribution.

    The plot is rendered through the pyplot module (``plt``), i.e. onto the
    current matplotlib figure. Nothing is returned.
    """
    probplot(x, plot=plt, dist='norm')

In [None]:
# Q-Q plot of monthly income, restricted to values below 14,000.
# Missing incomes compare False against the threshold, so they are excluded too.
x = df['MonthlyIncome']
qq_plot(x.loc[x < 14000])

#### 1.2. KDE plot
In statistics, kernel density estimation (KDE) is a non-parametric way to estimate the probability density function of a random variable. Kernel density estimation is a fundamental data smoothing problem where inferences about the population are made, based on a finite data sample.

In [None]:
# Import from the public namespace: `scipy.stats.kde` is a private module whose
# direct use is deprecated.
from scipy.stats import gaussian_kde


def kde_values(x, num=50):
    """Evaluate a Gaussian kernel density estimate of ``x`` on an even grid.

    Parameters
    ----------
    x : array-like exposing ``.min()`` and ``.max()``
        Sample whose density is estimated (NumPy array or pandas Series).
        It is handed to ``gaussian_kde`` unchanged; the callers in this
        notebook drop NaNs beforehand.
    num : int, optional
        Number of grid points between ``x.min()`` and ``x.max()``. The default
        of 50 equals ``np.linspace``'s own default, which the original code
        relied on implicitly.

    Returns
    -------
    positions : ndarray
        The ``num`` evenly spaced evaluation points.
    smoothed : ndarray
        The estimated density at each position.
    """
    kde = gaussian_kde(x)
    positions = np.linspace(x.min(), x.max(), num)
    smoothed = kde(positions)
    return positions, smoothed


def kde_plot(x, num=50):
    """Plot the Gaussian KDE of ``x`` on the current matplotlib axes.

    Delegates the density computation to ``kde_values`` so both helpers
    always use the same grid and estimator.
    """
    positions, smoothed = kde_values(x, num)
    plt.plot(positions, smoothed)

In [None]:
# Density of every recorded monthly income; missing values are dropped first
# because the KDE helper is given the sample as-is.
x = df['MonthlyIncome'].dropna()
kde_plot(x)
plt.title("Distribution of all MonthlyIncome data")
plt.show()

## Preprocessing: 
### 2. Outliers detection 
Set all values greater than the 99th percentile to NaN.

In [None]:
# 99th and 1st percentiles of the current sample `x`, printed space-separated.
# A %-formatted single argument prints identically on Python 2 and Python 3;
# the bare `print a, b` statement is a SyntaxError on Python 3.
print("%s %s" % (x.quantile(0.99), x.quantile(0.01)))

In [None]:
# Zoom in on the bulk of the distribution: incomes below 25,000 only.
kde_plot(x[x<25000])
plt.title("Distribution of MonthlyIncome data which is less than 25,000")
# `plt.show` without parentheses only references the function; call it.
plt.show()

In [None]:
# For each of the ten predictor columns (positions 1..10; column 0 is the
# response), blank out every value above that column's 99th percentile.
# NOTE: this rewrites `df` in place, so re-running the cell trims the data again.
for col in range(1,11):
    column = df.iloc[:,col]
    m = column.dropna().quantile(0.99)
    # Vectorised replacement for the per-element lambda: entries where the
    # condition holds become NaN; existing NaNs compare False and stay NaN.
    df.iloc[:,col] = column.mask(column > m)

In [None]:
# Side-by-side densities of monthly income: the whole non-missing column on
# the left, only values below 50,000 on the right.
x = df['MonthlyIncome'].dropna()

plt.rcParams['figure.figsize'] = (15,5)
f, (ax1, ax2) = plt.subplots(1, 2)
panels = [
    (ax1, x, 'Monthly Income: All Data'),
    (ax2, x[x<50000], 'Monthly Income: Data after deleting outliers'),
]
for ax, sample, title in panels:
    positions, smoothed = kde_values(sample)
    ax.plot(positions, smoothed)
    ax.set_title(title)
plt.show()

In [None]:
# Side-by-side densities of balance/limit (first predictor column): the whole
# non-missing column on the left, only ratios below 1 on the right.
x = df.iloc[:,1].dropna()

plt.rcParams['figure.figsize'] = (15,5)
f, (ax1, ax2) = plt.subplots(1, 2)
panels = [
    (ax1, x, 'balance/limit: All Data'),
    (ax2, x[x<1], 'balance/limit: Data after deleting outliers'),
]
for ax, sample, title in panels:
    positions, smoothed = kde_values(sample)
    ax.plot(positions, smoothed)
    ax.set_title(title)
plt.show()

In [None]:
# NumberOfOpenCreditLinesAndLoans
#
# Column 0 is the response and the predictors follow in the order listed at
# the top of the notebook, so "open credit lines and loans" (predictor 6) sits
# at position 6. The original read position 10, which is the number of
# dependents, while labelling the plots as open credit lines.
x=df.iloc[:,6]
x=x.dropna()
m=x.quantile(0.99)  # 99th percentile, used as the cut-off for the right panel

plt.rcParams['figure.figsize'] = (15,5)
f, (ax1, ax2) = plt.subplots(1, 2)
positions, smoothed = kde_values(x)
ax1.plot(positions, smoothed)
ax1.set_title('Number Of Open Credit Lines And Loans: All Data')
positions, smoothed = kde_values(x[x<m])
ax2.plot(positions, smoothed)
ax2.set_title('Number Of Open Credit Lines And Loans: Data after deleting outliers')
plt.show()

## Preprocessing: 
### 3. Missing values
Replace all NaN values with the mean of their column.

In [None]:
# Split the frame into the response (column 0) and the predictors (columns 1+)
# as plain NumPy arrays. `.values` replaces `as_matrix()`, which was removed
# in pandas 1.0.
#
# The response is taken as-is. The original `fillna(df.mean())` on this single
# column matched the means (keyed by column name) against the row labels, so
# it filled nothing -- assuming no row label equals a column name.
y=df.iloc[:,0].values
# Each missing predictor value is replaced by the mean of its own column.
predictors=df.iloc[:,1:]
x=predictors.fillna(predictors.mean()).values
# Parenthesised single-argument prints behave the same on Python 2 and 3.
print(y.shape)
print(x.shape)



## Preprocessing: 
### 4. Normalize
Standardize every feature to zero mean and unit variance.

In [None]:
from sklearn import preprocessing

# Fit a standard scaler on the feature matrix and replace it with the scaled copy.
scaler = preprocessing.StandardScaler()
x = scaler.fit_transform(x)

## Choose One Model

# Logistic Classification

In [None]:
from sklearn.linear_model import LogisticRegression

# Fit on the full data set and score on that same data, so the reported rate
# is training accuracy, not a held-out estimate.
classify = LogisticRegression() # You can adjust some parameters here.
classify.fit(x,y)

y_pred = classify.predict(x)
classif_rate = np.mean(y_pred.ravel() == y.ravel()) * 100
print("classif_rate for %s : %f " % ('LogisticRegression', classif_rate))
# Parenthesised single-argument prints behave the same on Python 2 and 3;
# the bare print statements were a SyntaxError on Python 3.
print(classify.coef_)
print(df.columns.values[1:])

# Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Fit a 25-tree random forest on the full data set and score on that same
# data, so the reported rate is training accuracy.
classify = RandomForestClassifier(n_estimators=25) # You can adjust some parameters here.
classify.fit(x,y)

y_pred = classify.predict(x)
classif_rate = np.mean(y_pred.ravel() == y.ravel()) * 100
print("classif_rate for %s : %f " % ('RandomForestClassifier', classif_rate))
# Parenthesised single-argument print behaves the same on Python 2 and 3.
print(classify.feature_importances_)

# Gradient Boosting

In [None]:
from sklearn.ensemble import GradientBoostingClassifier

# Fit a 25-stage gradient boosting model on the full data set and score on
# that same data, so the reported rate is training accuracy.
classify = GradientBoostingClassifier(n_estimators=25) # You can adjust some parameters here.
classify.fit(x,y)

y_pred = classify.predict(x)
# This assignment was corrupted by text pasted into the middle of the variable
# name, which left `classif_rate` unassigned in this cell.
classif_rate = np.mean(y_pred.ravel() == y.ravel()) * 100
print("classif_rate for %s : %f " % ('GradientBoostingClassifier', classif_rate))
# Parenthesised single-argument print behaves the same on Python 2 and 3.
print(classify.feature_importances_)

# Decision Tree

In [None]:
from sklearn.tree import DecisionTreeClassifier

# Fit a decision tree on the full data set and score on that same data, so
# the reported rate is training accuracy.
classify = DecisionTreeClassifier() # You can adjust some parameters here.
classify.fit(x,y)

y_pred = classify.predict(x)
classif_rate = np.mean(y_pred.ravel() == y.ravel()) * 100
print("classif_rate for %s : %f " % ('DecisionTreeClassifier', classif_rate))
# Parenthesised single-argument print behaves the same on Python 2 and 3.
print(classify.feature_importances_)

# SVM

In [None]:
from sklearn.svm import SVC

# Fit a linear-kernel SVM on the full data set and score on that same data,
# so the reported rate is training accuracy.
classify = SVC(kernel='linear') # You can adjust some parameters here.
classify.fit(x,y)

y_pred = classify.predict(x)
classif_rate = np.mean(y_pred.ravel() == y.ravel()) * 100
print("classif_rate for %s : %f " % ('SVC', classif_rate))
# Parenthesised single-argument print behaves the same on Python 2 and 3.
print(classify.coef_)

# Plot Result

In [6]:
# Horizontal bar chart of per-feature weights, sorted so the largest is on top.
#
# NOTE(review): no cell in this notebook defines `scores`; the saved run of
# this cell ended in a NameError. It is presumably a dict holding one weight
# per predictor in the same order as `features` below -- confirm, and build it
# from the chosen model before running this cell.
features=np.array([ "Balance divided by credit limit",
                    "Age",
                    "Number of times that 30-59 days past due",
                    "Debt divied by monthly income",
                    "Monthly income",
                    "Number of open credit lines and loans",
                    "Number of times that 90 and more days past due",
                    "Number of real estate loans",
                    "Number of times that 60-89 days past due",
                    "Number of dependents"])
# list(): on Python 3 `dict.values()` is a view and cannot be indexed.
value=list(scores.values())
# Ascending argsort of the weights; the same order is applied to labels and values.
ind=sorted(range(len(value)),reverse=False,key=lambda k: value[k])
features=features[ind]
value=[value[k] for k in ind]
# One bar slot per weight instead of a hard-coded 10.
ind=np.arange(len(value))
plt.rcParams['figure.figsize'] = (9,7)
# The bar positions and widths are passed positionally because the `bottom=`
# keyword of barh was removed from matplotlib. align='center' is explicit so
# the tick labels can sit at `ind` whatever the library's default alignment.
plt.barh(ind,value,height=0.5,color='r',align='center')
plt.yticks(ind,features)
plt.xlabel('Weights')
plt.ylabel('Features')
plt.title('Feature Importances')
plt.tight_layout()
# To save the figure: plt.savefig('feature_importances.png', format='png', dpi=300)
plt.show()

NameError: name 'scores' is not defined