In [1]:
# Initial imports
import pandas as pd
from pathlib import Path
from sklearn import tree
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

## Loading and Preprocessing Loans Encoded Data

Load the `sba_loans_encoded.csv` in a pandas DataFrame called `df_loans`.

In [2]:
# Loading data
stroke_info_df = pd.read_csv("data/stroke_data.csv")
stroke_info_df

Unnamed: 0.1,Unnamed: 0,Gender,Age,Hypertension,Heart Disease,Ever Married,Work Type,Residence Type,Average Glucose Level,BMI,Smoking Status,Stroke
0,0,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
1,1,Female,61.0,0,0,Yes,Self-employed,Rural,202.21,28.9,never smoked,1
2,2,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
3,3,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
4,4,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1
...,...,...,...,...,...,...,...,...,...,...,...,...
5105,5105,Female,80.0,1,0,Yes,Private,Urban,83.75,28.9,never smoked,0
5106,5106,Female,81.0,0,0,Yes,Self-employed,Urban,125.20,40.0,never smoked,0
5107,5107,Female,35.0,0,0,Yes,Self-employed,Rural,82.99,30.6,never smoked,0
5108,5108,Male,51.0,0,0,Yes,Private,Rural,166.29,25.6,formerly smoked,0


In [3]:
# Convert categorical data to numeric with `pd.get_dummies`
stroke_info_df = pd.get_dummies(stroke_info_df)
stroke_info_df.head()

Unnamed: 0.1,Unnamed: 0,Age,Hypertension,Heart Disease,Average Glucose Level,BMI,Stroke,Gender_Female,Gender_Male,Gender_Other,...,Work Type_Never_worked,Work Type_Private,Work Type_Self-employed,Work Type_children,Residence Type_Rural,Residence Type_Urban,Smoking Status_formerly smoked,Smoking Status_never smoked,Smoking Status_smokes,Smoking Status_unknown
0,0,67.0,0,1,228.69,36.6,1,False,True,False,...,False,True,False,False,False,True,True,False,False,False
1,1,61.0,0,0,202.21,28.9,1,True,False,False,...,False,False,True,False,True,False,False,True,False,False
2,2,80.0,0,1,105.92,32.5,1,False,True,False,...,False,True,False,False,True,False,False,True,False,False
3,3,49.0,0,0,171.23,34.4,1,True,False,False,...,False,True,False,False,False,True,False,False,True,False
4,4,79.0,1,0,174.12,24.0,1,True,False,False,...,False,False,True,False,True,False,False,True,False,False


Create the target vector by assigning the values of the `Default` column from the `df_loans` DataFrame.

In [4]:
# Get the target variable. 
y = stroke_info_df["Stroke"]

In [5]:
# Get the features. 
X = stroke_info_df.drop("Stroke", axis=1)

Split the data into training and testing sets.

In [6]:
# Splitting into Train and Test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=78)

Use the `StandardScaler` to scale the features data, remember that only `X_train` and `X_testing` DataFrames should be scaled.

In [7]:
# Create the StandardScaler instance
scaler = StandardScaler()

In [8]:
# Fit the Standard Scaler with the training data
X_scaler = scaler.fit(X_train)

In [9]:
# Scale the training data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

## Fitting the Decision Tree Model

Once data is scaled, create a decision tree instance and train it with the training data (`X_train_scaled` and `y_train`).

In [10]:
# Create the decision tree classifier instance
decision_tree_model = tree.DecisionTreeClassifier()

In [11]:
# Fit the model
decision_tree_model = decision_tree_model.fit(X_train_scaled, y_train)

## Making Predictions Using the Tree Model

Validate the trained model, by predicting fraudulent loan applications using the testing data (`X_test_scaled`).

In [12]:
# Making predictions using the testing data
predictions = decision_tree_model.predict(X_test_scaled)

## Model Evaluation

Evaluate model's results, by using `sklearn` to calculate the confusion matrix, the accuracy score and to generate the classification report.

In [14]:
# Calculate the classification report
predictions = decision_tree_model.predict(X_test)

testing_report = classification_report(y_test, predictions)
print(testing_report)

              precision    recall  f1-score   support

           0       0.95      1.00      0.97      1210
           1       0.00      0.00      0.00        68

    accuracy                           0.95      1278
   macro avg       0.47      0.50      0.49      1278
weighted avg       0.90      0.95      0.92      1278



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [15]:
# Save the model
import pickle

filename = 'models/decision_tree_model.sav'
pickle.dump(decision_tree_model, open(filename, 'wb'))