In [1]:
# Import Dependencies

import numpy as np
import pandas as pd
from pprint import pprint
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV

In [3]:
# Load the dataset from the project's Resources folder.

df = pd.read_csv('./Resources/full_data.csv')

# Report the non-null count of every column, then preview the first rows.

non_null_counts = df.count()
print(non_null_counts)

df.head()

FileNotFoundError: [Errno 2] File ./Resources/full_data.csv does not exist: './Resources/full_data.csv'

In [None]:
# Keep adults only (age 18 and over).
# Children were excluded because they are unlikely to have strokes, and because
# their typical ranges for average glucose level and BMI differ from adults'.

is_adult = df["age"] >= 18
child_free_df = df[is_adult]

# Confirm the filter: per-column non-null counts, then a preview of the rows.

print(child_free_df.count())

child_free_df.head()

In [None]:
# Count the distinct values in each column of the adults-only frame.
# Useful for spotting which columns are categorical before encoding them.

child_free_df.nunique()

In [None]:
# Verify the data types: .info() lists each column's dtype and non-null count,
# which shows which columns are still object/string and need encoding.

child_free_df.info()

In [None]:
# The model can't work with object/string dtypes, so pd.get_dummies is used to
# turn the string columns into numerical indicator columns. Every indicator
# column is prefixed with the name of the column it came from, and the first
# level of each encoded column is dropped (drop_first=True).

categorical_columns = ['gender', 'ever_married', 'work_type',
                       'Residence_type', 'smoking_status']
dummy_prefixes = {column: column for column in categorical_columns}

dummy_df = pd.get_dummies(child_free_df, prefix=dummy_prefixes, drop_first=True)

dummy_df

In [None]:
# Re-check the dtypes after encoding to confirm no object/string columns remain.
dummy_df.info()

In [None]:
# Separate the target ("stroke") from the features and hold out a test split.

y = dummy_df["stroke"].values
X = dummy_df.drop(columns="stroke")
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

# Standardize the features: the scaler is fitted on the training split only and
# the same fitted transformation is then applied to both splits.

scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled, X_test_scaled = (scaler.transform(part) for part in (X_train, X_test))

In [None]:
# Fit a baseline random forest on the scaled training data, then report its
# score on the training split and on the held-out test split.

rf = RandomForestClassifier(random_state=42)
rf.fit(X_train_scaled, y_train)

train_score = rf.score(X_train_scaled, y_train)
test_score = rf.score(X_test_scaled, y_test)
print(f'Training Score: {train_score}')
print(f'Testing Score: {test_score}')

In [None]:
# Create a fresh, unfitted forest with the same seed and list the
# hyperparameter values it uses by default.

rf = RandomForestClassifier(random_state=42)
current_params = rf.get_params()

print('Parameters currently in use:\n')
pprint(current_params)

In [None]:
# Hyperparameter tuning isn't an exact science, so we pick a handful of random
# forest parameters, give each one a list of candidate values, and collect them
# in the random_grid dictionary that is fed into RandomizedSearchCV.

# Number of trees in the random forest: 200, 400, ..., 2000
n_estimators = [int(x) for x in np.linspace(start=200, stop=2000, num=10)]
# Number of features to consider at every split.
# 'auto' is deliberately not listed: for classifiers it was only an alias of
# 'sqrt', it was deprecated in scikit-learn 1.1 and removed in 1.3, where it
# makes every candidate that samples it fail to fit.
max_features = ['sqrt', 'log2']
# Maximum number of levels in a tree: 10, 20, ..., 110, plus None (unlimited)
max_depth = [int(x) for x in np.linspace(10, 110, num=11)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10, 15, 20]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
# Method of selecting samples for training each tree
bootstrap = [True, False]
# Create the random grid (parameter name -> list of candidate values)
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}
pprint(random_grid)

In [None]:
# Use The Newly Created Random Grid To Search For The Best Hyperparameters:
# - estimator: a RandomForestClassifier like the one trained above. It is created with the
#   same random_state so that the only thing changing between candidates is the hyperparameters
#   (without a seed on the estimator, the search result is not reproducible).
# - param_distributions: the random grid variable built in the previous cell.
# - n_iter: the number of candidate settings to sample (10 is the default, but 100 is preferred:
#   it allows enough iterations for the results to start levelling out. Too few iterations
#   might hinder the model.)
# - cv: the number of folds used for the cross validation (5 is the default; 3 is used here).
# - random_state: seeds the sampling of candidates; set to the same value used everywhere else.
# - n_jobs=-1: run the fits in parallel on all available cores.
#
# The search is fitted on the scaled training data, the same data every other model in this
# notebook is trained on.

# So, Let's Call The Function

rf = RandomForestClassifier(random_state=42)
rf_random = RandomizedSearchCV(estimator=rf, param_distributions=random_grid, n_iter=100, cv=3,
                               verbose=2, random_state=42, n_jobs=-1)
rf_random.fit(X_train_scaled, y_train)

# NOTE: if actually run, this takes a long time because it is a lot of fits
# (100 candidates x 3 folds = 300 fits; ~10-15 mins, depending on the machine it runs on).

In [None]:
# Show the hyperparameter combination that the randomized search selected as best.

rf_random.best_params_

In [None]:
# Finally, train another random forest classifier with the values reported by
# .best_params_ (written out by hand below) and print the new model's
# training/testing scores.

tuned_params = {
    'n_estimators': 200,
    'min_samples_split': 5,
    'min_samples_leaf': 4,
    'max_features': 'log2',
    'bootstrap': True,
    'max_depth': 20,
}
hyper_rf = RandomForestClassifier(random_state=42, **tuned_params)
hyper_rf.fit(X_train_scaled, y_train)

tuned_train_score = hyper_rf.score(X_train_scaled, y_train)
tuned_test_score = hyper_rf.score(X_test_scaled, y_test)
print(f'Training Score: {tuned_train_score}')
print(f'Testing Score: {tuned_test_score}')

In [None]:
# Rank the features by the importance the tuned forest assigned to them,
# most important first.

feature_names = list(X.columns)
importance_table = pd.DataFrame({"feature": feature_names,
                                 "importance": hyper_rf.feature_importances_})
important_features_df = importance_table.sort_values(by="importance", ascending=False)

# Display

important_features_df

In [None]:
# Visualize The Important Features By Creating A Plot

# Create The Bar Plot: one bar per feature (x axis), bar height = importance (y axis)

sns.barplot(x=important_features_df.feature, y=important_features_df.importance)

# Add Labels To Axes
# The labels must match what is plotted: feature names run along the x axis and
# the importance score is on the y axis.

plt.xlabel("Features")
plt.ylabel("Feature Importance Score")
plt.title("Visualizing Important Features")
# Rotate the feature names so the long dummy-column labels do not overlap.
plt.xticks(
    rotation=45, horizontalalignment="right", fontweight="light", fontsize="x-large"
)
# Save before plt.show(); bbox_inches='tight' keeps the rotated labels inside the image.
plt.savefig('./Images/import_feat.png', dpi=50,  bbox_inches='tight')
plt.show()


In [None]:
# Visualization of data: correlation heatmap of the original (unfiltered) dataset.

# Correlations are only defined for numeric data. Selecting the numeric/bool
# columns explicitly keeps this cell working on pandas 2.x, where DataFrame.corr()
# raises on string columns instead of silently skipping them.
correlation = df.select_dtypes(include=['number', 'bool']).corr().round(2)
plt.figure(figsize=(14, 7))
# Keep the returned Axes in correlation_map: the next cell uses it to save the figure.
correlation_map = sns.heatmap(correlation, annot=True, cmap='YlOrBr')

In [None]:
# Get the Figure that owns the heatmap Axes and write it to the Images folder as a PNG.
figure = correlation_map.get_figure()    
figure.savefig('./Images/correlation_map.png', dpi=100)