In [None]:
###Data Modeling###
#Import what I need for modeling
from sklearn.pipeline import Pipeline
from sklearn.ensemble import AdaBoostClassifier, GradientBoostingClassifier, BaggingClassifier, RandomForestClassifier
import xgboost as xgb
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import silhouette_score
from sklearn.cluster import KMeans
from sklearn.cluster import DBSCAN
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import KFold
from sklearn.metrics import balanced_accuracy_score
from sklearn.dummy import DummyClassifier
from sklearn.model_selection import train_test_split
from imblearn.metrics import classification_report_imbalanced

In [None]:
import pandas as pd

data = pd.read_csv('./Data_Files/final_data.csv')

In [None]:
#My list of all columns with dummy variables
data.columns

In [None]:
#I want to standardize the data so make the mean 0 and the standard deviation 1

ss = StandardScaler()
scaled = ss.fit_transform(data)

corr_data = pd.DataFrame(scaled, columns=data.columns)

In [None]:
#I decided to not use PCA or visualizations and use Tableau instead
import matplotlib.pyplot as plt
import seaborn as sns

#plt.figure(figsize=(20,15))
#heatmap = sns.heatmap(corr_data.corr(), vmin=0, annot=True, fmt='.2f', cmap='YlGnBu')

#plt.title('Correlations on Scaled Data')

In [None]:
#This of course is inaccurate but I wanted to see if my variables generally made sense.
#Based on my research, it makes sense. High cholestral, smoking, and worse health increases cancer rates
#Managerial jobs cause cancer because of lack of exercise and increased health problems
data.corr()['cancer_count'].sort_values(ascending=False).head(15)

In [None]:
#PCA - This is how I would have used PCA
#import seaborn as sns
#g = sns.PairGrid(data)
#g = g.map_lower(sns.regplot)    # Regression plots in lower triangle.
#g = g.map_upper(sns.kdeplot, cmap="Blues", shade=True, shade_lowest=False)  # KDE plots in upper triangle.
#g = g.map_diag(plt.hist)        # Histograms along diagonal.

#plt.show()

In [None]:
#I reran models multiple times, dropped unimportant variables using Tableau as feature selection
#Selected variables that had the best accuracy and highest metrics. Kept about 20 columns

In [None]:
#feature selection - train/test data 
X = data.drop(columns=['cancer_count'])
y = data['cancer_count']

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1, stratify=y)

In [None]:
#Baseline using dummy regressor - sensitivity/recall
dummy_regr = DummyClassifier()
dummy_regr.fit(X_train, y_train)

dummy_regr.predict(X_test)

In [None]:
#Baseline will be 0.5 using balanced accuracy score - 
#I avoided using regular accuracy score since my data is so imbalanced 
y_pred = dummy_regr.predict(X_test)
balanced_accuracy_score(y_test, y_pred)

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import recall_score
from imblearn.over_sampling import SMOTE

In [None]:
#SMOTE - only use dummy variables processing categorical data (X-values)
#Only use smote on training test not testing data to prevent data leakage
#Testing data represents real life so I won't use smote
#Make training data 50/50 to balance it out

from imblearn.over_sampling import ADASYN
ada = ADASYN(random_state=100)
X_train, y_train = ada.fit_resample(X_train, y_train)

In [None]:
y_test.shape

In [None]:
#train data now 50/50
y_train.value_counts(normalize=True)

In [None]:
#testing data still the same, very imbalanced
y_test.value_counts(normalize=True)

In [None]:
#Used pipeline to fit standardscaler and various model types
#I tried using polynomial features but it keeps crashing from overload
pipe_ada = Pipeline([
    ('ss', StandardScaler()),
    ('abc', AdaBoostClassifier(learning_rate=0.5, n_estimators=50, random_state=1))
])

pipe_grad = Pipeline([
    ('ss', StandardScaler()),
    ('gbc', GradientBoostingClassifier(random_state=1))
])

pipe_km = Pipeline([
    ('ss', StandardScaler()),
    ('km', KMeans(random_state=1))
])

pipe_dbs = Pipeline([
    ('ss', StandardScaler()),
    ('dbs', DBSCAN())
])

pipe_xgb = Pipeline([
    ('ss', StandardScaler()),
    ('xgb', xgb.XGBClassifier(n_estimators = 200))
])

pipe_log = Pipeline([
#    ("vectorizer", CountVectorizer())
    ('ss', StandardScaler()),
    ('log', LogisticRegression())
])

#n_estimators=70, random_state=42, max_features=none, min_samples_leaf=10
pipe_forest = Pipeline([
    ('ss', StandardScaler()),
    ('forest', RandomForestClassifier(n_estimators=70, random_state=42, min_samples_leaf=10))
])

pipe_svc = Pipeline([
    ('ss', StandardScaler()),
    ('svc', RandomForestClassifier())
])

pipe_dectree = Pipeline([
    ('ss', StandardScaler()),
    ('dec', DecisionTreeClassifier())
])

pipe_svm = Pipeline([
    ('ss', StandardScaler()),
    ('dec', SVC())
])

In [None]:
pipe2 = Pipeline([  # giving the pipeline a list of things to do!
    ('pf', PolynomialFeatures(include_bias=False)),  # inside the tuple: a name for the step, and the class we want to use
    ('ss', StandardScaler()),
    ('dec', DecisionTreeClassifier())              # last step: estimator
])

In [None]:
pd.DataFrame(zip(pipe['pf'].get_feature_names_out(), pipe['dec'].coef_)).sort_values(by=1, ascending=False).head(10)

In [None]:
#Logarithmic regression has the highest accuracy

pipe_log.fit(X_train, y_train)

pipe_log.score(X_train, y_train), pipe_log.score(X_test, y_test)

In [None]:
#Used balanced accuracy score to compensate for imbalanced y-value
y_pred = pipe_log.predict(X_test)
balanced_accuracy_score(y_test, y_pred)

In [20]:
pipe_ada.fit(X_train, y_train)

pipe_ada.score(X_train, y_train), pipe_ada.score(X_test, y_test)

(0.9605483630380035, 0.9267498227663773)

In [21]:
y_pred = pipe_ada.predict(X_test)
balanced_accuracy_score(y_test, y_pred)

0.7279498520428997

In [22]:
pipe_grad.fit(X_train, y_train)

pipe_grad.score(X_train, y_train), pipe_grad.score(X_test, y_test)

(0.9773878842404156, 0.9571573642965262)

In [23]:
y_pred = pipe_grad.predict(X_test)
balanced_accuracy_score(y_test, y_pred)

0.5388275856277444

In [24]:
pipe_xgb.fit(X_train, y_train)

pipe_xgb.score(X_train, y_train), pipe_xgb.score(X_test, y_test)

(0.9816588015344533, 0.959418290509858)

In [25]:
y_pred = pipe_xgb.predict(X_test)
balanced_accuracy_score(y_test, y_pred)

0.5126622988093101

In [26]:
pipe_km.fit(X_train, y_train)

pipe_km.score(X_train, y_train), pipe_km.score(X_test, y_test)

(-13787654.725806171, -2537975.8882192723)

In [27]:
pipe_dectree.fit(X_train, y_train)

pipe_dectree.score(X_train, y_train), pipe_dectree.score(X_test, y_test)

(1.0, 0.933168553965243)

In [28]:
y_pred = pipe_dectree.predict(X_test)
balanced_accuracy_score(y_test, y_pred)

0.5950478721785016

In [29]:
pipe_svc.fit(X_train, y_train)

pipe_svc.score(X_train, y_train), pipe_svc.score(X_test, y_test)

(0.9999799330432543, 0.9566400337222892)

In [41]:
y_pred = pipe_svc.predict(X_test)
balanced_accuracy_score(y_test, y_pred)

NotFittedError: This StandardScaler instance is not fitted yet. Call 'fit' with appropriate arguments before using this estimator.

In [31]:
pipe_forest.fit(X_train, y_train)

pipe_forest.score(X_train, y_train), pipe_forest.score(X_test, y_test)

(0.9795317041194118, 0.9605104328332471)

In [40]:
y_pred = pipe_forest.predict(X_test)
balanced_accuracy_score(y_test, y_pred)

NotFittedError: This StandardScaler instance is not fitted yet. Call 'fit' with appropriate arguments before using this estimator.

In [33]:
##I used metrics such as F1 score, preceision, and recall
#For cancer, true positives and true negatives are good because accuracy is important

#False positives - false alarms, when people test positive for cancer when they don't have cancer
#False positives result in costly medical procedures or an extra scan which are unnecessary

#False negative - very deadly, person test negative for cancer when they don't have cancer
#Avoid at all costs - could result in lives lost

#accuracy - important measure to see how correct your model is
#precision - if you want to be more confident of your true positives
#recall - useful if false positives are far better than false negatives (I chose method, would rather get false alarms than lives lost)
#specificity - useful if you don't want any false alarms

#decision tree is second favorite model. High recall scores and high accuracy
y_test = data['cancer_count']  
predict_grad = pipe_dectree.predict(data.drop(columns = ['cancer_count']))
print(classification_report_imbalanced(y_test, predict_grad, target_names=['No', 'Yes']))

                   pre       rec       spe        f1       geo       iba       sup

         No       0.99      0.99      0.81      0.99      0.90      0.82    200558
        Yes       0.78      0.81      0.99      0.79      0.90      0.79      8206

avg / total       0.98      0.98      0.82      0.98      0.90      0.81    208764



In [34]:
#Logarithmic regression is my preferred model. I want false positives to be as low as possible
y_test = data['cancer_count']  
predict_grad = pipe_log.predict(data.drop(columns = ['cancer_count']))
print(classification_report_imbalanced(y_test, predict_grad, target_names=['No', 'Yes']))

                   pre       rec       spe        f1       geo       iba       sup

         No       1.00      0.83      0.90      0.91      0.87      0.75    200558
        Yes       0.18      0.90      0.83      0.30      0.87      0.76      8206

avg / total       0.96      0.84      0.90      0.88      0.87      0.75    208764



In [35]:
y_test = data['cancer_count']  
predict_grad = pipe_ada.predict(data.drop(columns = ['cancer_count']))
print(classification_report_imbalanced(y_test, predict_grad, target_names=['No', 'Yes']))

                   pre       rec       spe        f1       geo       iba       sup

         No       0.98      0.94      0.51      0.96      0.69      0.50    200558
        Yes       0.27      0.51      0.94      0.35      0.69      0.46      8206

avg / total       0.95      0.93      0.52      0.94      0.69      0.50    208764



In [36]:
y_test = data['cancer_count']  
predict_grad = pipe_grad.predict(data.drop(columns = ['cancer_count']))
print(classification_report_imbalanced(y_test, predict_grad, target_names=['No', 'Yes']))

                   pre       rec       spe        f1       geo       iba       sup

         No       0.96      0.99      0.08      0.98      0.28      0.09    200558
        Yes       0.33      0.08      0.99      0.13      0.28      0.07      8206

avg / total       0.94      0.96      0.12      0.94      0.28      0.09    208764



In [37]:
y_test = data['cancer_count']  
predict_grad = pipe_xgb.predict(data.drop(columns = ['cancer_count']))
print(classification_report_imbalanced(y_test, predict_grad, target_names=['No', 'Yes']))

                   pre       rec       spe        f1       geo       iba       sup

         No       0.96      1.00      0.10      0.98      0.31      0.10    200558
        Yes       0.82      0.10      1.00      0.17      0.31      0.09      8206

avg / total       0.96      0.96      0.13      0.95      0.31      0.10    208764



In [40]:

y_test = data['cancer_count']  
predict_grad = pipe_forest.predict(data.drop(columns = ['cancer_count']))
print(classification_report_imbalanced(y_test, predict_grad, target_names=['No', 'Yes']))

                   pre       rec       spe        f1       geo       iba       sup

         No       0.96      1.00      0.01      0.98      0.11      0.01    200558
        Yes       0.63      0.01      1.00      0.03      0.11      0.01      8206

avg / total       0.95      0.96      0.05      0.94      0.11      0.01    208764



In [48]:
#predict for decision trees
y_pred = pipe_forest.predict(X_test)
df = pd.DataFrame(y_pred)
df

Unnamed: 0,0
0,0
1,0
2,0
3,0
4,0
...,...
52186,0
52187,0
52188,0
52189,0


In [55]:
#R squared for decision trees


In [39]:
#I wanted to use polynomialfeatures but comptuer kept crashing

In [2]:
##Neural Network 
import pandas as pd
data = pd.read_csv('./Data_Files/final_data.csv')

In [None]:
#train/test split
from sklearn.model_selection import train_test_split
X = data.drop(columns=['cancer_count'])
y = data['cancer_count']

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0, stratify=y)

In [None]:
#standardize data
ss = StandardScaler()

X_train = ss.fit_transform(X_train)
X_test = ss.transform(X_test)

In [None]:
#SMOTE - only use dummy variables processing categorical data (X-values)
#Only use smote on training test not training data, prevent data leakage, test represent real life

from imblearn.over_sampling import ADASYN
ada = ADASYN(random_state=100)
X_train_ada, y_train_ada = ada.fit_resample(X_train, y_train)

In [None]:
#Use neural networks, balance classes using smote or oversample from cancer class
#Look at metrics - add more
#When look at onehotencoded variables - correlation is not accurate (are important but could be random luck)
#Visualizations - delete variables before onehotencoding
#Boxplot to visualize features

#balanced accuracy, I did NOT balance my data as I ran out of time.
#WOULD NOT RECOMMEND as low accuracy

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras import Input

from tensorflow.keras.regularizers import l2
from tensorflow.keras.layers import Dropout

model = Sequential()
#kernel regularizer did not work, dropout does not work, binaryaccuracy, dropout not work
# hidden w/ 60 nodes, two layers

#loss kept increasing for val_accuracy
model.add(Input(shape = X_train.shape[1]))
model.add(Dense(256, activation='relu'))
model.add(Dense(128, activation='relu'))
model.add(Dense(64, activation='relu'))
model.add(Dense(32, activation='relu'))
model.add(Dense(16, activation='relu'))
model.add(Dense(8, activation='relu'))
model.add(Dense(4, activation='relu'))
model.add(Dense(1, activation='sigmoid'))
#use metrics - precision, recall
model.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=['Accuracy'])
history = model.fit(
    X_train_ada, y_train_ada,
    validation_data=(X_test, y_test),
    epochs=15,
    verbose=1)

In [None]:
#Struggled with accuracy, high losses for testing
#Tries multiple methods but ran out of time
train_loss = history.history['loss']
test_loss = history.history['val_loss']

plt.plot(train_loss, label='Training Loss', color='navy')
plt.plot(test_loss, label='Testing Loss', color='skyblue')
plt.legend();

In [None]:
#Binary accuracy only slightly helped

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras import Input

from tensorflow.keras.regularizers import l2
from tensorflow.keras.layers import Dropout

model = Sequential()
#kernel regularizer did not work, dropout does not work, binaryaccuracy, dropout not work
# hidden w/ 60 nodes, two layers

#loss kept increasing for val_accuracy
model.add(Input(shape = X_train.shape[1]))
model.add(Dense(256, activation='relu'))
model.add(Dense(128, activation='relu'))
model.add(Dense(64, activation='relu'))
model.add(Dense(32, activation='relu'))
model.add(Dense(16, activation='relu'))
model.add(Dense(8, activation='relu'))
model.add(Dense(4, activation='relu'))
model.add(Dense(1, activation='sigmoid'))
#use metrics - precision, recall
model.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=['BinaryAccuracy'])
history = model.fit(
    X_train_ada, y_train_ada,
    validation_data=(X_test, y_test),
    epochs=15,
    verbose=1)