# Multi-model
Data file: https://raw.githubusercontent.com/vjavaly/Baruch-CIS-STA-3920/main/data/Framingham_4000.csv

## Requirements
* Load and examine data
* Prepare data for model training
  * Perform the necessary steps that we learned during the semester
* Train 3 separate models
  * From the various Classification algorithms that we learned during the semester, train 3 different Classification algorithms
* Print accuracy of each model

In [1]:
from datetime import datetime

# Spelled-out equivalent of "%D %T". Those shorthand directives are C-library
# extensions rather than portable strftime codes, so they can fail on some
# platforms (notably Windows); this form yields the same MM/DD/YY HH:MM:SS text.
RUN_TIME_FORMAT = "%m/%d/%y %H:%M:%S"
print(f'Run time: {datetime.now().strftime(RUN_TIME_FORMAT)}')

Run time: 02/13/24 16:12:09


### Import libraries

In [2]:
import pandas as pd
# Add all other necessary imports below
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score

### Load data

The Framingham_4000.csv dataset is used to predict whether a patient has a 10-year risk of future coronary heart disease (CHD).  
The dataset contains:
* 4,000 records
* 15 features (independent variables)
* the target variable (dependent variable) is 'TenYearCHD'

In [3]:
# Read the Framingham CSV straight from GitHub into the raw frame.
# NOTE(review): the data-file link at the top of this notebook points at the
# Baruch-CIS-STA-3920 repository, while this cell reads from Baruch-CIS-9660 —
# confirm which location is the canonical one.
DATA_URL = 'https://raw.githubusercontent.com/vjavaly/Baruch-CIS-9660/main/data/Framingham_4000.csv'
df = pd.read_csv(DATA_URL)

### Examine data

In [4]:
# Remove pandas' cap on displayed rows so long frames are shown in full.
pd.options.display.max_rows = None

In [5]:
# (rows, columns) of the raw frame as loaded.
df.shape

(4000, 16)

In [6]:
# Peek at the first five records of the raw frame.
df.head(n=5)

Unnamed: 0,male,age,education,currentSmoker,cigsPerDay,BPMeds,prevalentStroke,prevalentHyp,diabetes,totChol,sysBP,diaBP,BMI,heartRate,glucose,TenYearCHD
0,0,37,4.0,0,0.0,0.0,0,0,0,169.0,104.0,66.0,20.84,70.0,72.0,0
1,0,54,3.0,0,0.0,0.0,0,1,0,227.0,168.0,94.0,22.7,75.0,70.0,0
2,0,50,2.0,0,0.0,1.0,0,1,0,241.0,132.0,85.0,23.81,55.0,84.0,0
3,0,52,3.0,0,0.0,0.0,0,0,0,325.0,119.5,86.0,24.56,64.0,,1
4,1,45,3.0,1,30.0,0.0,0,1,0,233.0,147.0,101.0,24.32,75.0,99.0,0


### Prepare data for model training

In [7]:
# Count missing values per column.
df.isna().sum()

male                 0
age                  0
education            0
currentSmoker        0
cigsPerDay           0
BPMeds               0
prevalentStroke      0
prevalentHyp         0
diabetes             0
totChol             47
sysBP                0
diaBP                0
BMI                 18
heartRate            0
glucose            372
TenYearCHD           0
dtype: int64

In [8]:
# Show every record that has at least one missing value.
rows_with_gaps = df.isna().any(axis='columns')
df[rows_with_gaps]

Unnamed: 0,male,age,education,currentSmoker,cigsPerDay,BPMeds,prevalentStroke,prevalentHyp,diabetes,totChol,sysBP,diaBP,BMI,heartRate,glucose,TenYearCHD
3,0,52,3.0,0,0.0,0.0,0,0,0,325.0,119.5,86.0,24.56,64.0,,1
10,0,53,3.0,0,0.0,0.0,0,0,0,218.0,125.0,80.0,24.96,72.0,,1
57,0,50,3.0,0,0.0,0.0,0,1,0,,165.0,100.0,24.59,75.0,,1
58,1,61,1.0,0,0.0,0.0,0,0,0,239.0,143.0,80.0,25.74,48.0,,0
63,0,58,2.0,0,0.0,0.0,0,1,0,345.0,188.0,102.0,28.89,95.0,,0
95,0,61,2.0,0,0.0,1.0,0,1,0,209.0,133.0,93.0,,80.0,,1
107,0,59,1.0,0,0.0,0.0,0,1,0,,153.5,89.5,26.08,71.0,113.0,0
131,1,64,1.0,0,0.0,0.0,0,1,0,217.0,147.0,87.0,29.73,77.0,,0
140,0,51,2.0,1,30.0,0.0,0,1,0,295.0,176.0,99.0,26.27,82.0,,0
157,1,45,1.0,1,3.0,0.0,0,0,0,,126.0,85.0,28.24,72.0,,0


In [9]:
# Imputer configured to replace NaN entries using the 'mean' strategy.
imp_mean = SimpleImputer(
    strategy='mean',
    missing_values=np.nan,
)

In [10]:
# Restrict imputation to the three columns selected here and fit the imputer on them.
# NOTE(review): the imputer is fitted on all rows, before the train/test split
# performed later in the notebook, so test rows contribute to the fill values.
cols_with_gaps = ['totChol', 'BMI', 'glucose']
tmp_1 = df[cols_with_gaps]
imp_mean.fit(tmp_1)

In [11]:
# Apply the already-fitted imputer to the selected columns.
tmp_2 = imp_mean.transform(tmp_1)

In [12]:
# Same operation done in one call (fit + transform); tmp_3 is the array used downstream.
tmp_3 = imp_mean.fit_transform(tmp_1)

In [13]:
# Sanity check: the two-step and one-step imputation results are identical.
np.array_equal(tmp_2, tmp_3)

True

In [14]:
# Display the imputed values.
tmp_3

array([[169.        ,  20.84      ,  72.        ],
       [227.        ,  22.7       ,  70.        ],
       [241.        ,  23.81      ,  84.        ],
       ...,
       [219.        ,  21.63      ,  68.        ],
       [200.        ,  31.44      ,  74.        ],
       [225.        ,  24.36      ,  81.99807056]])

In [15]:
# Wrap the imputed array back into a labelled frame. Column labels and the row
# index are taken from the imputer's input (tmp_1) instead of being retyped, so
# columns assigned from df afterwards line up row-for-row even if df does not
# have a default 0..n-1 index.
df2 = pd.DataFrame(tmp_3, columns=tmp_1.columns, index=tmp_1.index)
df2.head(100)

Unnamed: 0,totChol,BMI,glucose
0,169.0,20.84,72.0
1,227.0,22.7,70.0
2,241.0,23.81,84.0
3,325.0,24.56,81.998071
4,233.0,24.32,99.0
5,195.0,23.22,85.0
6,173.0,21.98,79.0
7,306.0,25.38,84.0
8,243.0,30.24,85.0
9,221.0,25.97,78.0


### Display first 100 rows of final dataframe used for model training

In [16]:
# Copy across, in this order, the columns that were not imputed, plus the target.
untouched_cols = [
    'male', 'age', 'education', 'currentSmoker', 'cigsPerDay', 'BPMeds',
    'prevalentStroke', 'prevalentHyp', 'diabetes', 'sysBP', 'diaBP',
    'heartRate', 'TenYearCHD',
]
for col_name in untouched_cols:
    df2[col_name] = df[col_name]
df2.head(100)

Unnamed: 0,totChol,BMI,glucose,male,age,education,currentSmoker,cigsPerDay,BPMeds,prevalentStroke,prevalentHyp,diabetes,sysBP,diaBP,heartRate,TenYearCHD
0,169.0,20.84,72.0,0,37,4.0,0,0.0,0.0,0,0,0,104.0,66.0,70.0,0
1,227.0,22.7,70.0,0,54,3.0,0,0.0,0.0,0,1,0,168.0,94.0,75.0,0
2,241.0,23.81,84.0,0,50,2.0,0,0.0,1.0,0,1,0,132.0,85.0,55.0,0
3,325.0,24.56,81.998071,0,52,3.0,0,0.0,0.0,0,0,0,119.5,86.0,64.0,1
4,233.0,24.32,99.0,1,45,3.0,1,30.0,0.0,0,1,0,147.0,101.0,75.0,0
5,195.0,23.22,85.0,0,45,3.0,1,8.0,0.0,0,0,0,111.0,79.0,86.0,0
6,173.0,21.98,79.0,0,42,2.0,1,10.0,0.0,0,0,0,105.0,70.0,60.0,0
7,306.0,25.38,84.0,0,56,2.0,0,0.0,0.0,0,0,0,120.0,87.0,82.0,0
8,243.0,30.24,85.0,0,55,1.0,0,0.0,0.0,0,1,0,142.0,92.0,70.0,0
9,221.0,25.97,78.0,1,49,3.0,0,0.0,0.0,0,1,0,175.0,107.5,63.0,0


In [17]:
# Confirm no missing values remain in the assembled frame.
df2.isna().sum()

totChol            0
BMI                0
glucose            0
male               0
age                0
education          0
currentSmoker      0
cigsPerDay         0
BPMeds             0
prevalentStroke    0
prevalentHyp       0
diabetes           0
sysBP              0
diaBP              0
heartRate          0
TenYearCHD         0
dtype: int64

In [18]:
# (rows, columns) of the assembled frame.
df2.shape

(4000, 16)

### Separate independent and dependent variables
    Independent variables: All except TenYearCHD
    Dependent variable: 'TenYearCHD'

In [19]:
# Target goes to y; every other column becomes a feature in X.
target_col = "TenYearCHD"
y = df2[target_col]
X = df2.drop(columns=[target_col])

### Standardize the data

In [20]:
# Standardize every feature column, keeping the original labels and row index.
# NOTE(review): the scaler is fitted on all rows, before the train/test split
# performed later in the notebook, so test rows influence the statistics used here.
scaler = StandardScaler()
X = pd.DataFrame(scaler.fit_transform(X), columns=X.columns, index=X.index)
X.head()

Unnamed: 0,totChol,BMI,glucose,male,age,education,currentSmoker,cigsPerDay,BPMeds,prevalentStroke,prevalentHyp,diabetes,sysBP,diaBP,heartRate
0,-1.545534,-1.213161,-0.4330224,-0.866341,-1.464521,1.985175,-0.980686,-0.756244,-0.176617,-0.077693,-0.673415,-0.163383,-1.285127,-1.416191,-0.488334
1,-0.219713,-0.756993,-0.5196436,-0.866341,0.524284,1.003872,-0.980686,-0.756244,-0.176617,-0.077693,1.484967,-0.163383,1.616925,0.934699,-0.075035
2,0.100312,-0.484764,0.08670475,-0.866341,0.05633,0.02257,-0.980686,-0.756244,5.661965,-0.077693,1.484967,-0.163383,-0.01548,0.179056,-1.728233
3,2.020466,-0.300826,-6.154805e-16,-0.866341,0.290307,1.003872,-0.980686,-0.756244,-0.176617,-0.077693,-0.673415,-0.163383,-0.582287,0.263016,-0.984294
4,-0.082559,-0.359686,0.7363637,1.15428,-0.528613,1.003872,1.019694,1.759746,-0.176617,-0.077693,1.484967,-0.163383,0.664689,1.522422,-0.075035


In [21]:
# Peek at the first few target values.
y.head()

0    0
1    0
2    0
3    1
4    0
Name: TenYearCHD, dtype: int64

### Define global variables

In [22]:
# Shared settings used by the train/test split and the seeded models.
# No `global` statement is needed: names assigned at the top level of a
# notebook cell are already module-level, so the declaration had no effect.
TEST_SIZE = 0.3      # value passed as test_size to train_test_split
RANDOM_STATE = 42    # seed passed as random_state to the split and the seeded models

### Split data into training and test sets

In [23]:
# Split features and target into train and test portions using the shared
# test-size and seed settings.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=TEST_SIZE,
    random_state=RANDOM_STATE,
)

### Train Classification model 1
    Logistic Regression model

In [24]:
# Build a seeded logistic regression classifier and fit it on the training split.
LogReg_model = LogisticRegression(random_state=RANDOM_STATE)
LogReg_model.fit(X_train, y_train)

### Evaluate Classification model 1 performance

In [25]:
# Predict on the held-out rows and report accuracy as a percentage.
LogReg_predictions = LogReg_model.predict(X_test)
LogReg_accuracy = accuracy_score(y_true=y_test, y_pred=LogReg_predictions)
LogReg_accuracy_pct = round(LogReg_accuracy * 100, 5)
print("Logistic regression accuracy = ", LogReg_accuracy_pct, "%")

Logistic regression accuracy =  86.83333 %


### Train Classification model 2
    Random Forest Classifier model

In [26]:
# Build a seeded random forest classifier and fit it on the training split.
RF_model = RandomForestClassifier(random_state=RANDOM_STATE)
RF_model.fit(X_train, y_train)

### Evaluate Classification model 2 performance

In [27]:
# Predict on the held-out rows and report accuracy as a percentage.
RF_predictions = RF_model.predict(X_test)
RF_accuracy = accuracy_score(y_true=y_test, y_pred=RF_predictions)
RF_accuracy_pct = round(RF_accuracy * 100, 5)
print("Random forest accuracy = ", RF_accuracy_pct, "%")

Random forest accuracy =  86.08333 %


### Train Classification model 3
    Naïve Bayes 

In [28]:
# Build a Gaussian Naive Bayes classifier and fit it on the training split.
model = GaussianNB()
model.fit(X_train, y_train)

In [29]:
# Class predictions for the held-out rows.
# NOTE(review): `predict` is not read again in the cells shown; the evaluation
# cell calls model.score directly.
predict = model.predict(X_test)

### Evaluate Classification model 3 performance

In [30]:
# Report Naive Bayes accuracy on both splits, to four decimal places.
train_acc = model.score(X_train, y_train)
test_acc = model.score(X_test, y_test)
print(f'Accuracy on training set: {train_acc:.4f}')
print(f'Accuracy on test set: {test_acc:.4f}')

Accuracy on training set: 0.8186
Accuracy on test set: 0.8375
