In [None]:
import pandas as pd 
import numpy as np 
import matplotlib as plt 
import sklearn
from sklearn import svm
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
import plotly.express as px
from sklearn.metrics import roc_curve, auc
from sklearn.datasets import make_classification
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Read the accepted-loans CSV from the working directory into a DataFrame.
loans_csv = 'acceptedloansdata.csv'
data = pd.read_csv(loans_csv)

In [None]:
# Number of rows loaded.
data.shape[0]

# Loan Status

In [None]:
# Discard rows whose loan_status is one of the values listed below.
# NOTE(review): the original filtered 'In Grace Period' twice; confirm whether
# a different status was meant for the second filter.
excluded_statuses = [
    'In Grace Period',
    'Current',
    'Late (31-120 days)',
    'Late (16-30 days)',
]
# .copy() yields an independent frame, so the column assignments made in later
# cells do not operate on a slice of the original DataFrame.
data = data[~data['loan_status'].isin(excluded_statuses)].copy()

In [None]:
# Binary target: 1 where loan_status is exactly "Charged Off", otherwise 0.
is_charged_off = data['loan_status'].eq("Charged Off")
data['charged_off'] = is_charged_off.astype(int)

In [None]:
# Spearman rank correlation of every numeric/boolean column, then show each
# column's correlation with the charged_off target.
# Non-numeric columns are excluded explicitly: pandas >= 2.0 no longer drops
# them silently inside DataFrame.corr and raises on object columns instead.
numeric_data = data.select_dtypes(include=['number', 'bool'])
my_r = numeric_data.corr(method="spearman")
my_r['charged_off']

## SVM 1 - Linear Kernel

In [None]:
# Linear-kernel SVM. SVC defaults to kernel='rbf', so the kernel has to be set
# explicitly for this section to differ from the Gaussian model in "SVM 2".
# probability=True enables predict_proba, which the ROC cell relies on.
# NOTE(review): a linear SVC on unscaled features can be slow to train;
# consider scaling the inputs if fitting takes too long.
model = SVC(kernel='linear', probability=True)

### charged off

In [None]:
# Rebuild the binary target so this cell can be re-run on its own.
data['charged_off'] = (data['loan_status'] == "Charged Off").astype(int)

# Columns removed before modelling; every remaining non-object column is
# used as a feature.
excluded_columns = [
    'recoveries', 'collection_recovery_fee', 'sub_grade', 'Unnamed: 0',
    'grade', 'issue_d', 'Month', 'pymnt_plan', 'purpose', 'zip_code',
    'addr_state', 'earliest_cr_line', 'application_type', 'revol_util',
]
svmdata2 = data.drop(excluded_columns, axis='columns')
svmdata2 = svmdata2.select_dtypes(exclude=['object'])
svmdata2 = svmdata2.dropna()

x = svmdata2.drop('charged_off', axis=1)
y = svmdata2.charged_off

# random_state pins the shuffle so the reported metrics are reproducible.
x2train, x2test, y2train, y2test = train_test_split(
    x, y, test_size=0.33, random_state=42
)

# Fit once and reuse the fitted model. The original fitted the same estimator
# twice and computed a score that was never displayed; classification_report
# already includes accuracy.
model.fit(x2train, y2train)
y2pred = model.predict(x2test)
print("SVM 1 - Linear Kernel")
print(classification_report(y2test, y2pred))

In [None]:
from sklearn import metrics

# ROC points for the SVM on the held-out split.
# The target is 0/1, so the positive class is 1 (pos_label=2 matches no sample
# and yields an undefined true-positive rate). roc_curve also needs continuous
# scores rather than hard 0/1 predictions, hence predict_proba.
svm_scores = model.predict_proba(x2test)[:, 1]
fpr, tpr, thresholds = metrics.roc_curve(y2test, svm_scores, pos_label=1)

In [None]:

# ROC curve for the first SVM on the held-out split.
# Only the two names this cell needs are imported here; the estimator and
# splitting helpers are already imported in the notebook's first cell.
from sklearn.metrics import roc_curve
from matplotlib import pyplot

# Predicted probabilities; column 1 is the positive (charged-off) class.
yhat = model.predict_proba(x2test)
pos_probs = yhat[:, 1]
fpr, tpr, _ = roc_curve(y2test, pos_probs)

fig, ax = pyplot.subplots()
# Diagonal reference line: a classifier with no skill.
ax.plot([0, 1], [0, 1], linestyle='--', label='No Skill ROC Curve')
ax.plot(fpr, tpr, marker='.', label='Linear SVM')
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')
ax.set_title('ROC Curve - Linear SVM')
ax.legend()
pyplot.show()

In [None]:
# One-hot encode the `grade` column, keeping every other column as-is.
# NOTE(review): svm_grade_dummy does not appear to be used by any later cell
# visible here - confirm whether it was meant to feed the next model.
svm_grade_dummy = pd.get_dummies(data=data, columns=['grade'])

In [None]:
# Re-split the modelling frame and refit the first SVM; the models in the
# following sections reuse this train/test split.
svmdata2 = svmdata2.dropna()
x = svmdata2.drop('charged_off', axis=1)
y = svmdata2.charged_off

# random_state pins the shuffle so the reported metrics are reproducible.
x2train, x2test, y2train, y2test = train_test_split(
    x, y, test_size=0.33, random_state=42
)

# Fit once and reuse the fitted model. The original fitted the same estimator
# twice and computed a score that was never displayed; classification_report
# already includes accuracy.
model.fit(x2train, y2train)
y2pred = model.predict(x2test)
print("SVM 1 - Linear Kernel")
print(classification_report(y2test, y2pred))

## SVM 2 - Gaussian Kernel

In [None]:
# RBF (Gaussian) kernel SVM; probability=True enables predict_proba for the
# ROC curve.
non_linear_model = SVC(probability=True, kernel='rbf')

### charged off

In [None]:
# Train the Gaussian-kernel SVM, then report per-class metrics on the test split.
non_linear_model.fit(x2train, y2train)
y2pred = non_linear_model.predict(x2test)
print("SVM 2 - Gaussian Kernel")
print(classification_report(y2test, y2pred))

In [None]:

# ROC curve for the Gaussian-kernel SVM on the held-out split.
# Only the two names this cell needs are imported here; the estimator and
# splitting helpers are already imported in the notebook's first cell.
from sklearn.metrics import roc_curve
from matplotlib import pyplot

# Predicted probabilities; column 1 is the positive (charged-off) class.
yhat = non_linear_model.predict_proba(x2test)
pos_probs = yhat[:, 1]
fpr, tpr, _ = roc_curve(y2test, pos_probs)

fig, ax = pyplot.subplots()
# Diagonal reference line: a classifier with no skill.
ax.plot([0, 1], [0, 1], linestyle='--', label='No Skill')
ax.plot(fpr, tpr, marker='.', label='Gaussian SVM')
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')
ax.set_title('ROC Curve - Gaussian SVM')
ax.legend()
pyplot.show()

## SVM 3 - Polynomial Kernel

In [None]:
# Polynomial-kernel SVM. C and gamma are left at scikit-learn's defaults;
# probability=True enables predict_proba for the ROC curve.
non_linear_model_poly = SVC(probability=True, kernel='poly')

### charged off

In [None]:
# Train the polynomial-kernel SVM, then report per-class metrics on the test split.
non_linear_model_poly.fit(x2train, y2train)
y2pred = non_linear_model_poly.predict(x2test)
print("SVM 3 - Polynomial Kernel")
print(classification_report(y2test, y2pred))

In [None]:

# ROC curve for the polynomial-kernel SVM on the held-out split.
# Only the two names this cell needs are imported here; the estimator and
# splitting helpers are already imported in the notebook's first cell.
from sklearn.metrics import roc_curve
from matplotlib import pyplot

# Predicted probabilities; column 1 is the positive (charged-off) class.
yhat = non_linear_model_poly.predict_proba(x2test)
pos_probs = yhat[:, 1]
fpr, tpr, _ = roc_curve(y2test, pos_probs)

fig, ax = pyplot.subplots()
# Diagonal reference line: a classifier with no skill.
ax.plot([0, 1], [0, 1], linestyle='--', label='No Skill')
ax.plot(fpr, tpr, marker='.', label='SVM 3 - Polynomial Kernel')
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')
ax.set_title('ROC Curve - SVM 3 - Polynomial Kernel')
ax.legend()
pyplot.show()

# KNN

In [None]:
# k-nearest-neighbours classifier that votes over the 3 closest training rows.
knn = KNeighborsClassifier(n_neighbors=3)

### charged off

In [None]:
# Train the KNN classifier, then report per-class metrics on the test split.
knn.fit(x2train, y2train)
y2pred = knn.predict(x2test)
print("KNN")
print(classification_report(y2test, y2pred))

In [None]:

# ROC curve for the KNN classifier on the held-out split.
# Only the two names this cell needs are imported here; the estimator and
# splitting helpers are already imported in the notebook's first cell.
from sklearn.metrics import roc_curve
from matplotlib import pyplot

# Predicted probabilities; column 1 is the positive (charged-off) class.
yhat = knn.predict_proba(x2test)
pos_probs = yhat[:, 1]
fpr, tpr, _ = roc_curve(y2test, pos_probs)

fig, ax = pyplot.subplots()
# Diagonal reference line: a classifier with no skill.
ax.plot([0, 1], [0, 1], linestyle='--', label='No Skill')
ax.plot(fpr, tpr, marker='.', label='KNN')
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')
ax.set_title('ROC Curve - KNN')
ax.legend()
pyplot.show()

# Gradient Boosting

In [None]:
# Gradient-boosted tree classifier with scikit-learn's default hyperparameters.
clf = GradientBoostingClassifier()

### charged off

In [None]:
# Train the gradient-boosting classifier, then report per-class metrics on the
# test split.
clf.fit(x2train, y2train)
y2pred = clf.predict(x2test)
print("Gradient Boosting")
print(classification_report(y2test, y2pred))

In [None]:
# ROC curve for the gradient-boosting classifier on the held-out split.
# Only the two names this cell needs are imported here; the estimator and
# splitting helpers are already imported in the notebook's first cell.
from sklearn.metrics import roc_curve
from matplotlib import pyplot

# Score with `clf`, the gradient-boosting model. The original called
# knn.predict_proba here, so the curve labelled 'GB' was the KNN curve again.
yhat = clf.predict_proba(x2test)
# Column 1 is the positive (charged-off) class.
pos_probs = yhat[:, 1]
fpr, tpr, _ = roc_curve(y2test, pos_probs)

fig, ax = pyplot.subplots()
# Diagonal reference line: a classifier with no skill.
ax.plot([0, 1], [0, 1], linestyle='--', label='No Skill')
ax.plot(fpr, tpr, marker='.', label='GB')
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')
ax.set_title('ROC Curve - Gradient Boosting')
ax.legend()
pyplot.show()

## Logistic Regression

In [None]:
# Fit a default logistic regression on the training split and predict the
# class of every held-out row. Note that `model` is rebound here.
model = LogisticRegression()
model.fit(x2train, y2train)
y_pred = model.predict(x2test)

In [None]:
# Report the logistic-regression predictions (y_pred). The original passed
# y2pred, which at this point still holds the gradient-boosting predictions,
# so the printed report did not describe this model.
print(classification_report(y2test, y_pred))

In [None]:
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_curve
from matplotlib import pyplot

# Build a synthetic two-class dataset of 1000 samples (seeded).
# NOTE(review): this is generated data, unrelated to the loan file, and it
# rebinds `y` - confirm it is still wanted in this notebook.
X, y = make_classification(
    n_samples=1000,
    n_classes=2,
    random_state=1,
)
# Split it in half into train and test parts (seeded).
trainX, testX, trainy, testy = train_test_split(
    X,
    y,
    test_size=0.5,
    random_state=2,
)

In [None]:
# ROC curve for logistic regression on the loan data.
# The original fitted and scored the model on the synthetic
# make_classification sample, so the curve labelled 'Logistic' said nothing
# about loans. Use the same train/test split as the other models instead.
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_curve
from matplotlib import pyplot

model = LogisticRegression(solver='lbfgs')
model.fit(x2train, y2train)

# Predicted probabilities; column 1 is the positive (charged-off) class.
yhat = model.predict_proba(x2test)
pos_probs = yhat[:, 1]
fpr, tpr, _ = roc_curve(y2test, pos_probs)

fig, ax = pyplot.subplots()
# Diagonal reference line: a classifier with no skill.
ax.plot([0, 1], [0, 1], linestyle='--', label='No Skill')
ax.plot(fpr, tpr, marker='.', label='Logistic')
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')
ax.set_title('ROC Curve - Logistic Regression')
ax.legend()
pyplot.show()

# Net Annualized Return

In [None]:
import pandas as pd

# Reload the CSV so this section starts again from the unfiltered file
# contents; `data` is rebound.
source_csv = 'acceptedloansdata.csv'
data = pd.read_csv(source_csv)

In [None]:
# Inspect the raw last-payment date column before parsing it.
data.loc[:, 'last_pymnt_d']

In [None]:
# Parse last_pymnt_d with the '%b-%y' pattern (abbreviated month name, dash,
# two-digit year) into timestamps.
last_payment_dates = pd.to_datetime(data['last_pymnt_d'], format='%b-%y')
data['last_pymnt_d'] = last_payment_dates

In [None]:
# Parse issue_d into timestamps, letting pandas infer the format.
data['issue_d'] = pd.to_datetime(arg=data['issue_d'])

In [None]:
# Elapsed time between the issue date and the last payment, as a timedelta.
data['days'] = data['last_pymnt_d'].sub(data['issue_d'])

In [None]:
# Keep only rows with a strictly positive duration; rows whose duration is
# missing fail the comparison and are dropped as well.
has_positive_duration = data['days'] > pd.Timedelta(0)
data = data[has_positive_duration]

In [None]:
# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# Import seaborn and call seaborn.set() to apply its default plot styling.
import seaborn; seaborn.set()

In [None]:
# Whole days between issue and last payment. Reading them through the
# timedelta accessor avoids the str -> rstrip('days') -> int round trip, which
# depends on how pandas happens to render a Timedelta as text.
data['d'] = data['days'].dt.days

# Annualise the total return. With years = d / 365, the annualised figure is
# (total_pymnt / funded_amnt) ** (1 / years) - 1, i.e. an exponent of 365 / d.
# The previous exponent, 1 / (365 / d), equals d / 365 (the number of years),
# so it compounded the return over the holding period instead of annualising it.
growth = data['total_pymnt'] / data['funded_amnt']
data['nar'] = growth ** (365 / data['d']) - 1

In [None]:
# 1. Keep only loans with a positive funded amount.
has_funding = data['funded_amnt'].gt(0)
# 2. Columns such as issue_d, loan_status and last_pymnt_d are essential for
#    calculating the loan period; remove rows where any of them is missing.
data = data[has_funding].dropna(subset=['issue_d', 'loan_status', 'last_pymnt_d'])

In [None]:
# Return on investment: net gain (total payments minus funded amount) as a
# fraction of the funded amount.
funded = data['funded_amnt']
data['roi'] = data['total_pymnt'].sub(funded).div(funded)

In [None]:
# Inspect the computed ROI column.
data['roi']

In [None]:
# Box plot of NAR for each last-payment date.
# Observation: April 2018 decline in NAR.
fig = px.box(data_frame=data, x='last_pymnt_d', y="nar")
fig.show()

In [None]:
# Box plot of ROI for each last-payment date.
# Observation: 2018 increase in funded.
fig = px.box(data_frame=data, x='last_pymnt_d', y="roi")
fig.show()

In [None]:
# Box plot of the interest rate for each last-payment date.
# Observation: 2018 increase in interest rate.
fig = px.box(data_frame=data, x='last_pymnt_d', y="int_rate")
fig.show()

In [None]:
# Box plot of NAR for each last-payment date, coloured by loan status.
fig = px.box(data_frame=data, x='last_pymnt_d', y="nar", color="loan_status")
fig.show()

In [None]:
# Box plot of NAR for each loan grade.
fig = px.box(data_frame=data, x='grade', y="nar")
fig.show()

In [None]:
# Loan duration in years, taken as the reciprocal of 365 / days.
periods_per_year = 365 / data['d']
data['years'] = 1 / periods_per_year

In [None]:
# Histogram of loan duration in years, with a logarithmic count axis.
fig = px.histogram(data_frame=data, x='years', log_y=True)
fig.show()

In [None]:
# Remove columns that are not carried into the regression.
unused_columns = [
    'sub_grade', 'Unnamed: 0', 'issue_d', 'Month',
    'pymnt_plan', 'zip_code', 'earliest_cr_line', 'revol_util',
]
data = data.drop(columns=unused_columns)

In [None]:
# Remove the last-payment timestamp and the timedelta duration column.
data = data.drop(columns=['last_pymnt_d', 'days'])

In [None]:
from sklearn.linear_model import LinearRegression

In [None]:
# Discard every row that has at least one missing value.
data = data.dropna(axis='index', how='any')

In [None]:
# One-hot encode loan_status; every other column is passed through unchanged.
x = pd.get_dummies(data=data, columns=['loan_status'])

In [None]:
# Drop the remaining object-dtype columns from the feature frame.
x = x.select_dtypes(exclude='object')

In [None]:
# Remove the target (nar) and the columns computed alongside it, so they are
# not used as predictors.
target_derived = ['nar', 'roi', 'd', 'years']
x = x.drop(columns=target_derived)

In [None]:
# Summarise the feature frame: column names, non-null counts and dtypes.
x.info()

In [None]:
# Regression target: the nar column.
y = data['nar']

In [None]:
# Hold out 33% of the rows for evaluation. random_state pins the shuffle so
# the coefficients and error metrics are reproducible across runs.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.33, random_state=42
)

In [None]:
# Linear regression estimator with scikit-learn's default settings.
LR = LinearRegression()

In [None]:
# Fit on the training split. fit() returns the estimator itself, so `model`
# and `LR` refer to the same fitted object.
LR.fit(x_train, y_train)
model = LR

In [None]:
# Predict nar for the held-out rows and display the resulting array.
y_prediction = LR.predict(X=x_test)
y_prediction

In [None]:
# One row per feature holding its fitted coefficient, shown largest first.
cdf = pd.DataFrame({'Coefficients': model.coef_}, index=x.columns)
cdf.sort_values(by='Coefficients', ascending=False)

In [None]:
# Fitted intercept of the linear regression.
model.intercept_

In [None]:
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error

# Goodness of fit on the held-out rows: R^2, mean squared error and its root.
score = r2_score(y_test, y_prediction)
mse = mean_squared_error(y_test, y_prediction)
print('r2 socre is', score)
print('mean_sqrd_error is==', mse)
print('root_mean_squared error of is==', np.sqrt(mse))

In [None]:
import statsmodels.api as sm
from scipy import stats

In [None]:
# Refit the regression with statsmodels to get standard errors and p-values.
# Cast the features to float first: get_dummies can yield bool columns
# (pandas >= 2.0), and a mixed bool/float frame converts to an object array,
# which statsmodels rejects.
X2 = sm.add_constant(x.astype(float))
est = sm.OLS(y, X2)
est2 = est.fit()
print(est2.summary())