In [1]:
 # Import dependancies
from pathlib import Path
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import re

In [2]:
# Import the data
data = Path('LLCP2021XPT/selected_data.csv')
df = pd.read_csv(data)
df.head(5)

Unnamed: 0,has_primary_doc,checkup_within_last_year,skin_cancer,any_other_cancer,income,difficulty_consentrating_remembering_decisions,difficulty_walking_or_stairs,has_health_plan,30_min_physical_activity,BMI,overweight_or_obese,adult_current_smoker,bing_drinkers,eat_fruit_daily,eat_veg_daily,urban_rural
0,Yes,No,No,No,"$25,000 to < $35,000",No,No,Yes,No,1454.0,No,No,No,No,No,Urban
1,Yes,Yes,No,No,,Yes,Yes,Yes,Yes,,,No,No,No,No,Urban
2,Yes,Yes,No,No,"$15,000 to < $20,000",No,No,Yes,No,2829.0,Yes,No,No,No,Yes,Urban
3,Yes,Yes,No,No,"$50,000 to < $75,000",No,No,Yes,Yes,3347.0,Yes,No,Yes,No,No,Urban
4,Yes,Yes,No,No,"$20,000 to < $25,000",No,Yes,Yes,Yes,2873.0,Yes,No,No,No,No,Urban


In [3]:
df.shape

(438693, 16)

In [4]:
# Count NAs
for i in df:
    if df[i].isna().sum() >0:
        
        print(f'{i} has {df[i].isna().sum()} NAs')

has_primary_doc has 3705 NAs
checkup_within_last_year has 5905 NAs
skin_cancer has 1441 NAs
any_other_cancer has 1364 NAs
income has 94413 NAs
difficulty_consentrating_remembering_decisions has 19980 NAs
difficulty_walking_or_stairs has 19830 NAs
has_health_plan has 17397 NAs
30_min_physical_activity has 928 NAs
BMI has 46852 NAs
overweight_or_obese has 46852 NAs
adult_current_smoker has 24970 NAs
bing_drinkers has 35322 NAs
eat_fruit_daily has 51087 NAs
eat_veg_daily has 60127 NAs
urban_rural has 7054 NAs


In [5]:
# Drop income & BMI

df = df.drop(columns = ["BMI","income", "difficulty_consentrating_remembering_decisions", "difficulty_walking_or_stairs" ], axis = 1)

In [6]:
# Drop NAs
df = df.dropna()
df.shape

(317251, 12)

In [7]:
df.columns

Index(['has_primary_doc', 'checkup_within_last_year', 'skin_cancer',
       'any_other_cancer', 'has_health_plan', '30_min_physical_activity',
       'overweight_or_obese', 'adult_current_smoker', 'bing_drinkers',
       'eat_fruit_daily', 'eat_veg_daily', 'urban_rural'],
      dtype='object')

In [14]:
df.head()

Unnamed: 0,has_primary_doc,checkup_within_last_year,skin_cancer,any_other_cancer,has_health_plan,30_min_physical_activity,overweight_or_obese,adult_current_smoker,bing_drinkers,eat_fruit_daily,eat_veg_daily,urban_rural
0,Yes,No,No,No,Yes,No,No,No,No,No,No,Urban
2,Yes,Yes,No,No,Yes,No,Yes,No,No,No,Yes,Urban
3,Yes,Yes,No,No,Yes,Yes,Yes,No,Yes,No,No,Urban
4,Yes,Yes,No,No,Yes,Yes,Yes,No,No,No,No,Urban
5,Yes,Yes,No,No,Yes,No,No,No,No,Yes,Yes,Rural


In [16]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
df_ml = df.copy()

clmns = ['has_primary_doc', 'checkup_within_last_year', 'skin_cancer',
       'any_other_cancer', 'has_health_plan', '30_min_physical_activity',
       'overweight_or_obese', 'adult_current_smoker', 'bing_drinkers',
       'eat_fruit_daily', 'eat_veg_daily', 'urban_rural']

df_ml[clmns] = df_ml[clmns].apply(le.fit_transform)







In [17]:
df_ml.head()

Unnamed: 0,has_primary_doc,checkup_within_last_year,skin_cancer,any_other_cancer,has_health_plan,30_min_physical_activity,overweight_or_obese,adult_current_smoker,bing_drinkers,eat_fruit_daily,eat_veg_daily,urban_rural
0,1,0,0,0,1,0,0,0,0,0,0,1
2,1,1,0,0,1,0,1,0,0,0,1,1
3,1,1,0,0,1,1,1,0,1,0,0,1
4,1,1,0,0,1,1,1,0,0,0,0,1
5,1,1,0,0,1,0,0,0,0,1,1,0


In [18]:
#Target
y = df_ml["any_other_cancer"]

#Features
X = df_ml.drop(columns="any_other_cancer")

In [19]:
# Check the balance of our target values
y.value_counts()

0    285387
1     31864
Name: any_other_cancer, dtype: int64

In [20]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, 
                                                    y, 
                                                    random_state=1, 
                                                    train_size=0.80)
X_train.shape

(253800, 11)

In [21]:
# Resample the training data with the BalancedRandomForestClassifier
from imblearn.ensemble import BalancedRandomForestClassifier

brf = BalancedRandomForestClassifier(n_estimators=100, random_state=1)

In [22]:
# Fitting the model

brf.fit(X_train, y_train)

BalancedRandomForestClassifier(random_state=1)

In [23]:
#Make Prediction
predictions = brf.predict(X_test)

In [26]:
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import confusion_matrix
from imblearn.metrics import classification_report_imbalanced

# Calculating the confusion matrix
cm = confusion_matrix(y_test, predictions)
cm_df = pd.DataFrame(
    cm, index=["Actual 0", "Actual 1"], columns=["Predicted 0", "Predicted 1"]
)

# Calculating the balanced accuracy score
acc_score = balanced_accuracy_score(y_test, predictions)

In [27]:
# Display the confusion matrix
print("Confusion Matrix")
display(cm_df)
print(f"Accuracy Score : {acc_score}")

Confusion Matrix


Unnamed: 0,Predicted 0,Predicted 1
Actual 0,31875,25194
Actual 1,2229,4153


Accuracy Score : 0.6046354259872424


In [28]:
# Print the imbalanced classification report

print("Classification Report")
print(classification_report_imbalanced(y_test, predictions))

Classification Report
                   pre       rec       spe        f1       geo       iba       sup

          0       0.93      0.56      0.65      0.70      0.60      0.36     57069
          1       0.14      0.65      0.56      0.23      0.60      0.37      6382

avg / total       0.85      0.57      0.64      0.65      0.60      0.36     63451



In [29]:
# List the features sorted in descending order by feature importance
importances = brf.feature_importances_
importances

array([0.13704626, 0.16437326, 0.31366272, 0.04718174, 0.0705737 ,
       0.03349131, 0.03490985, 0.10206999, 0.03390198, 0.03146598,
       0.03132321])