<h1 style="font-size:42px; text-align:center; margin-bottom:30px;"><span style="color:SteelBlue">Cubank:</span> Machine Learning with Python and TM1</h1><hr>

## Import Dependencies

Before we get started, plotting libraries need to be imported. All the other dependencies we can import on the fly when needed.

In [None]:
import matplotlib.pyplot as plt
import plotly.graph_objs as go
import plotly.offline as py

# Initialise plotly's offline mode for use inside the notebook.
py.init_notebook_mode()

# IPython magic: render matplotlib figures inline, below the cell.
%matplotlib inline

# Part 1: Bring TM1 Data To Python

-----

## 1.1 Query TM1 cube data

We use **TM1py** to query data from cube Loans through an **MDX** Query

In [None]:
import os

# TM1 connection settings. Each value can be overridden through an environment
# variable so that host names and credentials do not have to be edited into the
# notebook; the defaults are the demo values used throughout this notebook.
ADDRESS = os.environ.get("TM1_ADDRESS", "localhost")
PORT = int(os.environ.get("TM1_PORT", "5001"))
USER = os.environ.get("TM1_USER", "admin")
# Base64-encoded password (the notebook connects with decode_b64=True).
# NOTE: base64 is an encoding, not encryption - do not commit real credentials.
PWD = os.environ.get("TM1_PASSWORD_B64", "YXBwbGU=")
# Any of "1", "true", "yes" (case-insensitive) enables SSL.
SSL = os.environ.get("TM1_SSL", "true").strip().lower() in ("1", "true", "yes")

In [None]:
from TM1py import TM1Service

# MDX query against the [Loans] cube:
# - ROWS: cross join of the level-0 elements of every listed dimension
#   (the Loan dimension is limited to its first 20000 level-0 elements via HEAD)
#   with three Loan Status elements; NON EMPTY suppresses empty rows.
# - COLUMNS: eight elements of the [Loans Measure] dimension.
# - WHERE: fixed to the "Total Employment" and "Total Term" elements.
mdx = """
SELECT 
    NON EMPTY  
    { HEAD ( {Tm1FilterByLevel ( {Tm1SubsetAll ([Loan])} , 0 ) } , 20000 ) }  * 
    {Tm1FilterByLevel ( {Tm1SubsetAll ([LC Rating])} , 0 ) } * 
    {Tm1FilterByLevel ( {Tm1SubsetAll ([FICO Score])} , 0 ) } *
    {Tm1FilterByLevel ( {Tm1SubsetAll ([Purpose])} , 0 ) } * 
    {Tm1FilterByLevel ( {Tm1SubsetAll ([State])} , 0 ) } * 
    {Tm1FilterByLevel ( {Tm1SubsetAll ([Income To Loan Ratio])} , 0 ) } * 
    {Tm1FilterByLevel ( {Tm1SubsetAll ([Home Ownership])} , 0 ) } *
    {Tm1FilterByLevel ( {Tm1SubsetAll ([Delinquency Events])} , 0 ) } *
    {Tm1FilterByLevel ( {Tm1SubsetAll ([Time])} , 0 ) } *
    {Tm1FilterByLevel ( {Tm1SubsetAll ([Income])} , 0 ) } *
    {Tm1FilterByLevel ( {Tm1SubsetAll ([Application Type])} , 0 ) } *
    {[Loan Status].[Fully Paid], [Loan Status].[Charged Off], [Loan Status].[default]}  ON ROWS,
    {[Loans Measure].[loan_amnt], [Loans Measure].[defaulted], [Loans Measure].[int_rate],
    [Loans Measure].[num_personal_inquiries], [Loans Measure].[inquiries_in_last_12m],
    [Loans Measure].[mths_since_last_delinq], [Loans Measure].[mths_since_recent_bc_dlq], 
    [Loans Measure].[mths_since_recent_inq]} ON COLUMNS
FROM [Loans]
WHERE ([Employment].[Total Employment], [Term].[Total Term])
"""

# The service is used as a context manager so the TM1 session is released when
# the block exits. decode_b64=True: PWD is supplied base64-encoded.
with TM1Service(address=ADDRESS, port=PORT, user=USER, password=PWD, ssl=SSL, decode_b64=True) as tm1:
    # Execute the MDX and load the resulting cellset into a pandas DataFrame.
    loans_raw = tm1.cubes.cells.execute_mdx_dataframe(mdx)

Print out number of rows and columns

In [None]:
# (number of rows, number of columns) of the raw query result
loans_raw.shape

Print out the first 10 records from our data set

In [None]:
# Show the first 10 rows of the raw query result
loans_raw.head(10)

## 1.2 Preprocessing

Rearrange the dataframe into something that is more convenient for consumption

- Arrange Measures as separate columns

- Remove Value Column

- Set new index in DataFrame based on Loan-Id

In [None]:
# Work on a copy so the raw query result stays available for re-runs.
loans = loans_raw.copy()

# Arrange measures as columns.
# For every measure, keep "Value" on the rows that belong to that measure and
# NaN everywhere else. Series.where does this vectorised, which is far faster
# than a row-wise DataFrame.apply on a large result set.
for measure in ("defaulted", "loan_amnt", "num_personal_inquiries", "int_rate",
                "inquiries_in_last_12m", "mths_since_last_delinq", "mths_since_recent_bc_dlq",
                "mths_since_recent_inq"):
    loans[measure] = loans["Value"].where(loans["Loans Measure"] == measure)

# The long-format columns are no longer needed once the measures are spread out.
loans.drop(columns=["Value", "Loans Measure"], inplace=True)

columns_to_remain = ["LC Rating", "FICO Score", "Purpose", "State", "Time", "Income", "Income To Loan Ratio",
                 "Home Ownership", "Delinquency Events", "Application Type"]

# Collapse the one-row-per-measure layout into one row per loan. Summing skips
# the NaN placeholders, so each measure column ends up with its single value.
# numeric_only=True keeps leftover text columns (e.g. "Loan Status") out of the
# aggregation; without it pandas >= 2.0 would concatenate their strings.
loans = loans.groupby(["Loan"] + columns_to_remain).sum(numeric_only=True)

# Move the descriptive index levels back into regular columns, leaving "Loan"
# as the index. Done one level at a time to keep the resulting column order.
for column in columns_to_remain:
    loans.reset_index(level=column, inplace=True)

In [None]:
# (number of rows, number of columns) after preprocessing
loans.shape

In [None]:
# Show the first rows of the preprocessed dataframe
loans.head()

In [None]:
# Share of each distinct "defaulted" value (normalize=True returns fractions, not counts)
loans['defaulted'].value_counts(normalize=True)

# Part 2: Exploratory Data Analysis and Feature Selection

-----

We want to use pandas and plotly to 

- Get a high level overview of the dataset we are dealing with 
- Select relevant features and remove irrelevant features



## 2.1 LC Rating

Assigned loan grade by Lending Club


In [None]:
# Mean of "defaulted" per rating. Selecting the column before aggregating
# avoids averaging the text columns (which raises on pandas >= 2.0), and taking
# x from the grouped index guarantees that labels and values line up.
default_rate = loans.groupby(by="LC Rating")["defaulted"].mean()

bar = go.Bar(
    x=default_rate.index.tolist(),
    y=default_rate.values)

layout = go.Layout(
    barmode='stack',
    title="Default Rate by Rating")

data = [bar]
fig = go.Figure(data=data, layout=layout)
py.iplot(fig)

## 2.2 Purpose

A category provided by the borrower for the loan request

In [None]:
# Mean of "defaulted" per loan purpose. Selecting the column before aggregating
# avoids averaging the text columns (which raises on pandas >= 2.0), and taking
# x from the grouped index guarantees that labels and values line up.
default_rate = loans.groupby(by="Purpose")["defaulted"].mean()

bar = go.Bar(
    x=default_rate.index.tolist(),
    y=default_rate.values)

layout = go.Layout(
    barmode='stack',
    title="Default Rate by Purpose")

data = [bar]
fig = go.Figure(data=data, layout=layout)
py.iplot(fig)

## 2.3 State

The state provided by the borrower in the loan application

In [None]:
# Mean of "defaulted" per state, expressed in percent. The column is selected
# before aggregating so text columns are never averaged (raises on pandas >= 2.0).
loans_by_state = loans.groupby(by="State")["defaulted"].mean() * 100
# Display the states ordered from lowest to highest value.
loans_by_state.sort_values()

In [None]:
# White outline drawn around every state.
state_outline = go.choropleth.marker.Line(
    color='rgb(255,255,255)',
    width=2)

# One value per state, matched to the map through the state codes in the index.
defaults_by_state = go.Choropleth(
    locationmode='USA-states',
    locations=loans_by_state.index,
    z=loans_by_state.values,
    autocolorscale=True,
    marker=go.choropleth.Marker(line=state_outline),
    colorbar=go.choropleth.ColorBar(title="Defaults in %"))
data = [defaults_by_state]

# Restrict the map to the USA and paint lakes white.
usa_map = go.layout.Geo(
    scope='usa',
    projection=go.layout.geo.Projection(type='albers usa'),
    showlakes=True,
    lakecolor='rgb(255, 255, 255)')

layout = go.Layout(
    title=go.layout.Title(text='Percentage of Loans that default'),
    geo=usa_map)

fig = go.Figure(data=data, layout=layout)
py.iplot(fig)

## 2.4 Income To Loan Ratio

A ratio calculated using the borrower’s total monthly debt payments on the total debt obligations, excluding mortgage and the requested LC loan, divided by the borrower’s self-reported monthly income.

In [None]:
# Loans with an 'Income To Loan Ratio' below 5
df_temp = loans.loc[loans['Income To Loan Ratio'] < 5]

# Mean of "defaulted" within the selection. Series.mean() is vectorised and
# returns NaN for an empty selection instead of raising ZeroDivisionError.
df_temp["defaulted"].mean()

In [None]:
# Loans with an 'Income To Loan Ratio' above 15
df_temp = loans.loc[loans['Income To Loan Ratio'] > 15]

# Mean of "defaulted" within the selection. Series.mean() is vectorised and
# returns NaN for an empty selection instead of raising ZeroDivisionError.
df_temp["defaulted"].mean()

In [None]:
# Summary statistics of 'Income To Loan Ratio', computed separately for each value of 'defaulted'
loans.groupby('defaulted')['Income To Loan Ratio'].describe()

## 2.5 Income

The self-reported annual income provided by the borrower during registration.

In [None]:
# Mean of "defaulted" per income element. Selecting the column before
# aggregating avoids averaging the text columns (which raises on pandas >= 2.0),
# and taking x from the grouped index guarantees that labels and values line up.
default_rate = loans.groupby(by="Income")["defaulted"].mean()

bar = go.Bar(
    x=default_rate.index.tolist(),
    y=default_rate.values)

layout = go.Layout(
    barmode='stack',
    title="Default Rate by Income")

data = [bar]
fig = go.Figure(data=data, layout=layout)
py.iplot(fig)

In [None]:
# Remove the "Income" column from the working dataframe, in place.
loans.drop(columns=["Income"], inplace=True)

Income is self-reported by the borrower and therefore seems unreliable, so we remove it from the dataframe.

## 2.6 Application Type

Indicates whether the loan is an individual application or a joint application with two co-borrowers

In [None]:
# Mean of "defaulted" per application type. Selecting the column before
# aggregating avoids averaging the text columns (which raises on pandas >= 2.0),
# and taking x from the grouped index guarantees that labels and values line up.
default_rate = loans.groupby(by="Application Type")["defaulted"].mean()

bar = go.Bar(
    x=default_rate.index.tolist(),
    y=default_rate.values)

# The title names the dimension that is actually plotted (it previously read
# "Default Rate by Income", copied from the Income chart).
layout = go.Layout(
    barmode='stack',
    title="Default Rate by Application Type")

data = [bar]
fig = go.Figure(data=data, layout=layout)
py.iplot(fig)

In [None]:
# Share of each application type (normalize=True returns fractions, not counts)
loans['Application Type'].value_counts(normalize=True)

## 2.7 Time

The month the loan was funded

In [None]:
# Summary statistics of the "Time" column
loans['Time'].describe()

In [None]:
# Remove the "Time" column from the working dataframe, in place.
loans.drop(columns=["Time"], inplace=True)

## 2.8 num_personal_inquiries

Number of personal finance inquiries

In [None]:
# Mean of "defaulted" per number of personal finance inquiries. Selecting the
# column before aggregating avoids averaging the text columns (which raises on
# pandas >= 2.0), and taking x from the grouped index guarantees that labels and
# values line up.
default_rate = loans.groupby(by="num_personal_inquiries")["defaulted"].mean()

bar = go.Bar(
    x=default_rate.index.tolist(),
    y=default_rate.values)

layout = go.Layout(
    barmode='stack',
    title="Default Rate by Number of Personal Finance Inquiries")

data = [bar]
fig = go.Figure(data=data, layout=layout)
py.iplot(fig)

# Part 3: More Data Processing

-----

### Translate String columns to Numeric or Binary

Unfortunately, the classifier implementations we use in Python can only consume numeric **features**, so we need to translate string columns (e.g. State, Rating, Purpose) into numeric and binary columns.

**FROM**

| Loan  | State  |
| :-: | :-: |
| Loan 5 | CA |
| Loan 8 | WA |


**TO**

| Loan  | State_CA | State_WA |
| :-: | :-: | :-: |
| Loan 5 | 1 | 0 |
| Loan 8 | 0 | 1 |


In [None]:
import pandas as pd

# Text-valued columns that are expanded into one indicator column per value.
categorical_columns = ['LC Rating', 'Home Ownership', 'Purpose', 'State', 'Application Type']

# drop_first=True omits the indicator of the first value of every column.
loans_numeric = pd.get_dummies(loans, columns=categorical_columns, drop_first=True)

In [None]:
# (number of rows, number of columns) after one-hot encoding
loans_numeric.shape

In [None]:
# Show the first rows of the encoded dataframe
loans_numeric.head()

Split data into **Features** (e.g. Income, Rating) and **Class** (Defaulted or Fully Paid)

In [None]:
# Class to predict: the "defaulted" column.
y = loans_numeric["defaulted"]
# Features: every remaining column.
X = loans_numeric.drop(columns="defaulted")

### Calculate correlation between columns and default

Now that all columns are numeric values we can calculate Correlations between them

- Values between 0 and 0.3 (0 and -0.3) indicate a weak positive (negative) linear relationship
- Values between 0.3 and 0.7 (-0.3 and -0.7) indicate a moderate positive (negative) linear relationship
- Values between 0.7 and 1.0 (-0.7 and -1.0) indicate a strong positive (negative) linear relationship

In [None]:
# Correlation of every feature column with the class column, built in one step
# instead of growing a DataFrame cell by cell (which re-allocates on every
# insert). dtype=float keeps the result numeric even when X has no columns.
linear_dep = pd.Series(
    {col: X[col].corr(y) for col in X.columns},
    name='corr',
    dtype=float).to_frame()

# Rank by strength of the relationship regardless of its sign.
linear_dep['abs_corr'] = linear_dep['corr'].abs()
linear_dep.sort_values('abs_corr', ascending=False, inplace=True)

linear_dep.head(10)

# Part 4: Fit And Evaluate Machine Learning Model

-----

## 4.1 Scale data and split into test and training sets

Before applying Machine Learning, we need to scale our data such that each feature has the same variance

In [None]:
from sklearn.preprocessing import StandardScaler

# Standardise every feature column: learn the scaling parameters from X and
# apply them to X in a single call.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

In [None]:
# (number of rows, number of features) of the scaled feature matrix
X_scaled.shape

In [None]:
# Scaled feature values of the first record
X_scaled[0]

### Split data set into train and test sets

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Split the *unscaled* features first: 80% training, 20% test, with a fixed
# seed so the split is reproducible.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=4)

# Fit the scaler on the training rows only and reuse its parameters for the
# test rows. Fitting on the full data set before splitting would let test-set
# statistics leak into training and make the evaluation overly optimistic.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

## 4.2 KNN (K Nearest Neighbors)

Fit KNN model

In [None]:
from sklearn.neighbors import KNeighborsClassifier

# Classify each record by a vote among its 5 nearest training records.
knn_classifier = KNeighborsClassifier(n_neighbors=5)
# Fit on the training set; as the last expression, the fitted estimator is displayed.
knn_classifier.fit(X_train, y_train)

Apply KNN model on test data set

In [None]:
# Predicted class for every record of the test set
y_pred = knn_classifier.predict(X_test)

### Evaluate KNN results

Calculate confusion matrix from results on the test set and pretty-print visualization of confusion matrix as heatmap

**Top Left** - True Positives: Loan default and we predicted default

**Top Right** - False Negatives: Loan default and we predicted fine.

**Bottom Left** - False Positive: Loan fine and we predicted default.

**Bottom Right** - True Negatives: Loan fine and we predicted fine.

A perfect classifier would output a matrix with all values on the diagonal...


In [None]:
import seaborn as sns
from sklearn.metrics import confusion_matrix

# Tick labels, in the same order as labels=[1, 0] below.
class_names = ['Defaulted', 'Fully Paid']

# Rows are actual classes, columns are predicted classes.
matrix = confusion_matrix(y_true=y_test, y_pred=y_pred, labels=[1, 0])

# Draw the integer counts as an annotated heatmap on an explicit axes object.
ax = plt.subplot()
sns.heatmap(matrix, annot=True, fmt='d', linewidths=.5, cmap="YlGnBu", ax=ax)

# labels and titles
ax.set_xlabel('Predicted Values')
ax.set_ylabel('Actual Values')
ax.xaxis.set_ticklabels(class_names)
ax.yaxis.set_ticklabels(class_names)

### Print classification report

**precision** - What percentage of the predicted defaults were correct?

**recall** – What percentage of the defaults did we catch?

**f1 score** – Harmonic mean of precision and recall

**support** - number of records

In [None]:
from sklearn.metrics import classification_report

# Keep the report as a dictionary for later use ...
knn_report = classification_report(y_test, y_pred, output_dict=True)
# ... and print the human-readable text version.
print(classification_report(y_test, y_pred))

In [None]:
from sklearn.metrics import roc_curve
from sklearn.metrics import roc_auc_score

# Predicted probabilities, keeping only the column of the positive outcome.
probs = knn_classifier.predict_proba(X_test)[:, 1]

# Area under the ROC curve.
auc = roc_auc_score(y_test, probs)
print('AUC: {:.3f}'.format(auc))

# ROC curve of the model, with the dashed no-skill diagonal as reference.
fpr, tpr, thresholds = roc_curve(y_test, probs)
plt.plot([0, 1], [0, 1], linestyle='--')
plt.plot(fpr, tpr, marker='.')
plt.show()

## 4.3 Random Forest classification

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Forest of 100 trees; each split considers 20 features, trees may grow up to
# depth 100, and the seed is fixed so that runs are reproducible.
rf_classifier = RandomForestClassifier(
    n_estimators=100,
    max_features=20,
    max_depth=100,
    random_state=4)

# Fit on the training set; as the last expression, the fitted estimator is displayed.
rf_classifier.fit(X_train, y_train)

In [None]:
# Predicted class for every record of the test set
y_pred = rf_classifier.predict(X_test)

Prettyprint visualization of confusion matrix as heatmap

**Top Left** - True Positives: Loan default and we predicted default

**Top Right** - False Negatives: Loan default and we predicted fine.

**Bottom Left** - False Positive: Loan fine and we predicted default.

**Bottom Right** - True Negatives: Loan fine and we predicted fine.

In [None]:
from sklearn.metrics import confusion_matrix 
import seaborn as sns

# Tick labels, in the same order as labels=[1, 0] below.
class_names = ['Defaulted', 'Fully Paid']

# Rows are actual classes, columns are predicted classes.
matrix = confusion_matrix(y_true=y_test, y_pred=y_pred, labels=[1, 0])

# Draw the integer counts as an annotated heatmap on an explicit axes object.
ax = plt.subplot()
sns.heatmap(matrix, annot=True, fmt='d', linewidths=.5, cmap="YlGnBu", ax=ax)

# labels and titles
ax.set_xlabel('Predicted Values')
ax.set_ylabel('Actual Values')
ax.xaxis.set_ticklabels(class_names)
ax.yaxis.set_ticklabels(class_names)

### Print classification report

**precision** - What percentage of the predicted defaults were correct?

**recall** – What percentage of the defaults did we catch?

**f1 score** – Harmonic mean of precision and recall

**support** - number of records

In [None]:
from sklearn.metrics import classification_report

# Keep the report as a dictionary for later use ...
rf_report = classification_report(y_test, y_pred, output_dict=True)
# ... and print the human-readable text version.
print(classification_report(y_test, y_pred))

In [None]:
from sklearn.metrics import roc_curve
from sklearn.metrics import roc_auc_score

# Predicted probabilities, keeping only the column of the positive outcome.
probs = rf_classifier.predict_proba(X_test)[:, 1]

# Area under the ROC curve.
auc = roc_auc_score(y_test, probs)
print('AUC: {:.3f}'.format(auc))

# ROC curve of the model, with the dashed no-skill diagonal as reference.
fpr, tpr, thresholds = roc_curve(y_test, probs)
plt.plot([0, 1], [0, 1], linestyle='--')
plt.plot(fpr, tpr, marker='.')
plt.show()

### Plot features importance

In [None]:
import pandas as pd

# One importance value per feature, labelled with the feature's column name.
importance_by_feature = pd.DataFrame(
    {'importance': rf_classifier.feature_importances_},
    index=X.columns)

# Most important features first.
feature_importances = importance_by_feature.sort_values(by='importance', ascending=False)

feature_importances.head(10)

# Other

Things that didn't make it into the demo.

-----

## 2.6 Home Ownership

The home ownership status provided by the borrower during registration. 

Possible values are: RENT, OWN, MORTGAGE, OTHER.

In [None]:
# Mean of "defaulted" per home ownership status. Selecting the column before
# aggregating avoids averaging the text columns (which raises on pandas >= 2.0),
# and taking x from the grouped index guarantees that labels and values line up.
default_rate = loans.groupby(by="Home Ownership")["defaulted"].mean()

bar = go.Bar(
    x=default_rate.index.tolist(),
    y=default_rate.values)

layout = go.Layout(
    barmode='stack',
    title="Default Rate by Home Ownership Type")

data = [bar]
fig = go.Figure(data=data, layout=layout)
py.iplot(fig)

## 2.2 FICO Score

A credit score created by the Fair Isaac Corporation

In [None]:
# Row masks for the two outcomes.
is_default = loans['defaulted'] == 1
is_paid = loans['defaulted'] == 0

# FICO scores of each outcome group.
defaulted = loans.loc[is_default, 'FICO Score']
fine = loans.loc[is_paid, 'FICO Score']

hist1 = go.Histogram(x=fine, name="Fully Paid")
hist2 = go.Histogram(x=defaulted, name="Defaulted")

# Stack both histograms in a single chart.
layout = go.Layout(barmode="stack", title="Histograms on FICO Score")
data = [hist1, hist2]
fig = go.Figure(data=data, layout=layout)

py.iplot(fig)

## Logistic Regression

In [None]:
import re

import sklearn
from sklearn.linear_model import SGDClassifier

# scikit-learn 1.1 renamed the logistic-regression loss from "log" to
# "log_loss" and 1.3 removed the old spelling, so pick the name that the
# installed version understands.
_sk_version = tuple(int(part) for part in re.match(r"(\d+)\.(\d+)", sklearn.__version__).groups())
logistic_loss = "log_loss" if _sk_version >= (1, 1) else "log"

# Logistic regression trained with stochastic gradient descent and L2
# regularisation (strength alpha); the seed is fixed for reproducibility.
logrec_classifier = SGDClassifier(
    loss=logistic_loss,
    max_iter=1000,
    tol=1e-3,
    random_state=1,
    warm_start=True,
    alpha=0.01,
    penalty='l2')

# Fit on the training set; as the last expression, the fitted estimator is displayed.
logrec_classifier.fit(X_train, y_train)

In [None]:
# Predicted class for every record of the test set
y_pred = logrec_classifier.predict(X_test)

In [None]:
from sklearn.metrics import confusion_matrix 

# Rows are actual classes, columns are predicted classes, both in the order given by labels=[1, 0].
matrix = confusion_matrix(y_true=y_test, y_pred=y_pred, labels=[1, 0])
print(matrix)

### Print classification report

**precision** - What percentage of the predicted defaults were correct?

**recall** – What percentage of the defaults did we catch?

**f1 score** – Harmonic mean of precision and recall

**support** - number of records

In [None]:
from sklearn.metrics import classification_report

# Keep the report as a dictionary for later use ...
logrec_report = classification_report(y_test, y_pred, output_dict=True)
# ... and print the human-readable text version.
print(classification_report(y_test, y_pred))

In [None]:
from sklearn.metrics import roc_curve
from sklearn.metrics import roc_auc_score

# Predicted probabilities, keeping only the column of the positive outcome.
probs = logrec_classifier.predict_proba(X_test)[:, 1]

# Area under the ROC curve.
auc = roc_auc_score(y_test, probs)
print('AUC: {:.3f}'.format(auc))

# ROC curve of the model, with the dashed no-skill diagonal as reference.
fpr, tpr, thresholds = roc_curve(y_test, probs)
plt.plot([0, 1], [0, 1], linestyle='--')
plt.plot(fpr, tpr, marker='.')
plt.show()

## PCA Dimension Reduction and Visualization in 2D

_PCA is essentially a method that reduces the dimension of the feature space in such a way that new variables are orthogonal to each other (i.e. they are independent or not correlated)._


In [None]:
import pandas as pd
from sklearn.decomposition import PCA

# Column names of the two projected dimensions.
component_names = ['principal component 1', 'principal component 2']

# Project the training features onto their first two principal components.
pca = PCA(n_components=2)
principal_components = pca.fit_transform(X_train)
principal_df = pd.DataFrame(principal_components, columns=component_names)

In [None]:
# One marker per training record, positioned by its two principal components
# and coloured by its class value.
pca_scatter = go.Scatter(
    x=principal_df["principal component 1"],
    y=principal_df["principal component 2"],
    mode='markers',
    marker={'color': y_train.values})
data = [pca_scatter]

py.iplot(data)