In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectFromModel
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.preprocessing import StandardScaler

# Load Data Set One
df1 = pd.read_csv('data.csv', header=None)
column_names = ['age', 'sex', 'cp', 'trestbps', 'chol', 'fbs', 'restecg', 'thalach', 'exang', 'oldpeak', 'slope', 'ca', 'thal', 'num']
df1.columns = column_names
print("Data Set One - First 5 Rows:")
print(df1.head())

# Load Data Set Two
df2 = pd.read_csv('heart.dat', header=None, sep=' ')
df2.columns = column_names
print("\nData Set Two - First 5 Rows:")
print(df2.head())

# Function to clean data
def clean_data(df):
    df.replace('?', np.nan, inplace=True)
    df.dropna(inplace=True)
    return df

df1 = clean_data(df1)
df2 = clean_data(df2)

print("\nData Set One - After Cleaning - First 5 Rows:")
print(df1.head())
print("\nData Set Two - After Cleaning - First 5 Rows:")
print(df2.head())


Data Set One - First 5 Rows:
    age  sex   cp  trestbps   chol  fbs  restecg  thalach  exang  oldpeak  \
0  63.0  1.0  1.0     145.0  233.0  1.0      2.0    150.0    0.0      2.3   
1  67.0  1.0  4.0     160.0  286.0  0.0      2.0    108.0    1.0      1.5   
2  67.0  1.0  4.0     120.0  229.0  0.0      2.0    129.0    1.0      2.6   
3  37.0  1.0  3.0     130.0  250.0  0.0      0.0    187.0    0.0      3.5   
4  41.0  0.0  2.0     130.0  204.0  0.0      2.0    172.0    0.0      1.4   

   slope   ca thal  num  
0    3.0  0.0  6.0    0  
1    2.0  3.0  3.0    2  
2    2.0  2.0  7.0    1  
3    3.0  0.0  3.0    0  
4    1.0  0.0  3.0    0  

Data Set Two - First 5 Rows:
    age  sex   cp  trestbps   chol  fbs  restecg  thalach  exang  oldpeak  \
0  70.0  1.0  4.0     130.0  322.0  0.0      2.0    109.0    0.0      2.4   
1  67.0  0.0  3.0     115.0  564.0  0.0      2.0    160.0    0.0      1.6   
2  57.0  1.0  2.0     124.0  261.0  0.0      0.0    141.0    0.0      0.3   
3  64.0  1.0  

In [2]:
def transform_data(df):
    X = df.iloc[:, :-1].apply(pd.to_numeric, errors='coerce')
    y = pd.to_numeric(df.iloc[:, -1], errors='coerce')
    X.fillna(X.mean(), inplace=True)
    return X, y

X1, y1 = transform_data(df1)
X2, y2 = transform_data(df2)

print("\nData Set One - Features (First 5 Rows):")
print(X1.head())
print("\nData Set One - Target (First 5 Values):")
print(y1.head())

print("\nData Set Two - Features (First 5 Rows):")
print(X2.head())
print("\nData Set Two - Target (First 5 Values):")
print(y2.head())



Data Set One - Features (First 5 Rows):
    age  sex   cp  trestbps   chol  fbs  restecg  thalach  exang  oldpeak  \
0  63.0  1.0  1.0     145.0  233.0  1.0      2.0    150.0    0.0      2.3   
1  67.0  1.0  4.0     160.0  286.0  0.0      2.0    108.0    1.0      1.5   
2  67.0  1.0  4.0     120.0  229.0  0.0      2.0    129.0    1.0      2.6   
3  37.0  1.0  3.0     130.0  250.0  0.0      0.0    187.0    0.0      3.5   
4  41.0  0.0  2.0     130.0  204.0  0.0      2.0    172.0    0.0      1.4   

   slope   ca  thal  
0    3.0  0.0   6.0  
1    2.0  3.0   3.0  
2    2.0  2.0   7.0  
3    3.0  0.0   3.0  
4    1.0  0.0   3.0  

Data Set One - Target (First 5 Values):
0    0
1    2
2    1
3    0
4    0
Name: num, dtype: int64

Data Set Two - Features (First 5 Rows):
    age  sex   cp  trestbps   chol  fbs  restecg  thalach  exang  oldpeak  \
0  70.0  1.0  4.0     130.0  322.0  0.0      2.0    109.0    0.0      2.4   
1  67.0  0.0  3.0     115.0  564.0  0.0      2.0    160.0    0.0     

In [3]:
def feature_selection(X, y):
    clf = RandomForestClassifier(n_estimators=100, random_state=42)
    clf.fit(X, y)
    sfm = SelectFromModel(clf, threshold=0.05)
    sfm.fit(X, y)
    X_important = sfm.transform(X)
    feature_importances = pd.DataFrame(clf.feature_importances_, index=X.columns, columns=['importance']).sort_values('importance', ascending=False)
    return X_important, feature_importances

X1_important, feature_importances1 = feature_selection(X1, y1)
X2_important, feature_importances2 = feature_selection(X2, y2)

print("\nData Set One - Feature Importances:")
print(feature_importances1)
print("\nData Set One - Important Features (First 5 Rows):")
print(pd.DataFrame(X1_important).head())

print("\nData Set Two - Feature Importances:")
print(feature_importances2)
print("\nData Set Two - Important Features (First 5 Rows):")
print(pd.DataFrame(X2_important).head())



Data Set One - Feature Importances:
          importance
thalach     0.139389
oldpeak     0.117658
age         0.115094
chol        0.113813
trestbps    0.099277
ca          0.094458
thal        0.082432
cp          0.075628
exang       0.046204
slope       0.042644
restecg     0.029843
sex         0.027268
fbs         0.016293

Data Set One - Important Features (First 5 Rows):
      0    1      2      3      4    5    6    7
0  63.0  1.0  145.0  233.0  150.0  2.3  0.0  6.0
1  67.0  4.0  160.0  286.0  108.0  1.5  3.0  3.0
2  67.0  4.0  120.0  229.0  129.0  2.6  2.0  7.0
3  37.0  3.0  130.0  250.0  187.0  3.5  0.0  3.0
4  41.0  2.0  130.0  204.0  172.0  1.4  0.0  3.0

Data Set Two - Feature Importances:
          importance
oldpeak     0.122941
thalach     0.122419
thal        0.121363
ca          0.109587
cp          0.109037
age         0.093253
chol        0.083779
trestbps    0.077635
slope       0.048688
exang       0.047828
sex         0.033263
restecg     0.020953
fbs         0.

In [4]:
X1_train, X1_test, y1_train, y1_test = train_test_split(X1_important, y1, test_size=0.3, random_state=42)
X2_train, X2_test, y2_train, y2_test = train_test_split(X2_important, y2, test_size=0.3, random_state=42)

print("\nData Set One - Train Features (First 5 Rows):")
print(pd.DataFrame(X1_train).head())
print("\nData Set One - Test Features (First 5 Rows):")
print(pd.DataFrame(X1_test).head())

print("\nData Set Two - Train Features (First 5 Rows):")
print(pd.DataFrame(X2_train).head())
print("\nData Set Two - Test Features (First 5 Rows):")
print(pd.DataFrame(X2_test).head())



Data Set One - Train Features (First 5 Rows):
      0    1      2      3      4    5    6    7
0  61.0  4.0  140.0  207.0  138.0  1.9  1.0  7.0
1  57.0  4.0  150.0  276.0  112.0  0.6  1.0  6.0
2  48.0  2.0  110.0  229.0  168.0  1.0  0.0  7.0
3  67.0  4.0  106.0  223.0  142.0  0.3  2.0  3.0
4  47.0  3.0  108.0  243.0  152.0  0.0  0.0  3.0

Data Set One - Test Features (First 5 Rows):
      0    1      2      3      4    5    6    7
0  45.0  2.0  112.0  160.0  138.0  0.0  0.0  3.0
1  52.0  4.0  112.0  230.0  160.0  0.0  1.0  3.0
2  54.0  3.0  135.0  304.0  170.0  0.0  0.0  3.0
3  70.0  4.0  130.0  322.0  109.0  2.4  3.0  3.0
4  56.0  2.0  120.0  236.0  178.0  0.8  0.0  3.0

Data Set Two - Train Features (First 5 Rows):
      0    1      2      3      4    5    6    7
0  57.0  4.0  110.0  201.0  126.0  1.5  0.0  6.0
1  44.0  2.0  130.0  219.0  188.0  0.0  0.0  3.0
2  54.0  4.0  124.0  266.0  109.0  2.2  1.0  7.0
3  58.0  4.0  125.0  300.0  171.0  0.0  2.0  7.0
4  62.0  4.0  120.0  267.0 

In [5]:
scaler1 = StandardScaler()
X1_train_scaled = scaler1.fit_transform(X1_train)
X1_test_scaled = scaler1.transform(X1_test)

scaler2 = StandardScaler()
X2_train_scaled = scaler2.fit_transform(X2_train)
X2_test_scaled = scaler2.transform(X2_test)

print("\nData Set One - Scaled Train Features (First 5 Rows):")
print(pd.DataFrame(X1_train_scaled).head())
print("\nData Set One - Scaled Test Features (First 5 Rows):")
print(pd.DataFrame(X1_test_scaled).head())

print("\nData Set Two - Scaled Train Features (First 5 Rows):")
print(pd.DataFrame(X2_train_scaled).head())
print("\nData Set Two - Scaled Test Features (First 5 Rows):")
print(pd.DataFrame(X2_test_scaled).head())



Data Set One - Scaled Train Features (First 5 Rows):
          0         1         2         3         4         5         6  \
0  0.728560  0.843465  0.500301 -0.786293 -0.442406  0.787175  0.298941   
1  0.283030  0.843465  1.086392  0.503842 -1.555556 -0.370992  0.298941   
2 -0.719412 -1.235074 -1.257973 -0.374946  0.841998 -0.014633 -0.732405   
3  1.396855  0.843465 -1.492409 -0.487131 -0.271152 -0.638262  1.330286   
4 -0.830795 -0.195804 -1.375191 -0.113179  0.156983 -0.905531 -0.732405   

          7  
0  1.166259  
1  0.650414  
2  1.166259  
3 -0.897123  
4 -0.897123  

Data Set One - Scaled Test Features (First 5 Rows):
          0         1         2         3         4         5         6  \
0 -1.053560 -1.235074 -1.140755 -1.665081 -0.442406 -0.905531 -0.732405   
1 -0.273883  0.843465 -1.140755 -0.356248  0.499491 -0.905531  0.298941   
2 -0.051118 -0.195804  0.207255  1.027375  0.927625 -0.905531 -0.732405   
3  1.731002  0.843465 -0.085790  1.363932 -1.683997  1.232

In [6]:
def train_and_evaluate(X_train, X_test, y_train, y_test):
    clf = RandomForestClassifier(n_estimators=100, random_state=42)
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    print(f"Accuracy: {accuracy}")
    print("Confusion Matrix:")
    print(confusion_matrix(y_test, y_pred))
    print("Classification Report:")
    print(classification_report(y_test, y_pred))

print("\nData Set One Evaluation:")
train_and_evaluate(X1_train_scaled, X1_test_scaled, y1_train, y1_test)

print("\nData Set Two Evaluation:")
train_and_evaluate(X2_train_scaled, X2_test_scaled, y2_train, y2_test)



Data Set One Evaluation:
Accuracy: 0.5888888888888889
Confusion Matrix:
[[47  2  0  0  0]
 [ 9  1  2  5  0]
 [ 3  2  3  2  0]
 [ 2  3  4  2  0]
 [ 2  1  0  0  0]]
Classification Report:
              precision    recall  f1-score   support

           0       0.75      0.96      0.84        49
           1       0.11      0.06      0.08        17
           2       0.33      0.30      0.32        10
           3       0.22      0.18      0.20        11
           4       0.00      0.00      0.00         3

    accuracy                           0.59        90
   macro avg       0.28      0.30      0.29        90
weighted avg       0.49      0.59      0.53        90


Data Set Two Evaluation:
Accuracy: 0.7777777777777778
Confusion Matrix:
[[45  4]
 [14 18]]
Classification Report:
              precision    recall  f1-score   support

           1       0.76      0.92      0.83        49
           2       0.82      0.56      0.67        32

    accuracy                           0.78  

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [7]:
from sklearn.model_selection import GridSearchCV

# Define the parameter grid
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# Initialize the model
clf = RandomForestClassifier(random_state=42)

# Initialize GridSearchCV
grid_search = GridSearchCV(estimator=clf, param_grid=param_grid, cv=5, n_jobs=-1, verbose=2)

# Fit the model
grid_search.fit(X1_train_scaled, y1_train)

# Best parameters
print("Best parameters found: ", grid_search.best_params_)


Fitting 5 folds for each of 108 candidates, totalling 540 fits
Best parameters found:  {'max_depth': None, 'min_samples_leaf': 4, 'min_samples_split': 2, 'n_estimators': 50}


In [8]:
#HyperParameter Tuning (using Grid Search)
from sklearn.model_selection import GridSearchCV

# Define the parameter grid
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# Initialize the model
clf = RandomForestClassifier(random_state=42)

#for first dataset
grid_search1 = GridSearchCV(estimator=clf, param_grid=param_grid, cv=5, n_jobs=-1, verbose=2)
grid_search1.fit(X1_train, y1_train)

# Best parameters for the first dataset
print("Best parameters for the first dataset: ", grid_search1.best_params_)

#for second dataset
grid_search2 = GridSearchCV(estimator=clf, param_grid=param_grid, cv=5, n_jobs=-1, verbose=2)
grid_search2.fit(X2_train, y2_train)

# Best parameters for the second dataset
print("Best parameters for the second dataset: ", grid_search2.best_params_)


#Evaulate models again 
#dataset one 
y1_pred = grid_search1.best_estimator_.predict(X1_test)
print("Accuracy for the first dataset: ", accuracy_score(y1_test, y1_pred))
print("Confusion Matrix:\n", confusion_matrix(y1_test, y1_pred))
print("Classification Report:\n", classification_report(y1_test, y1_pred))
#dataset two
y2_pred = grid_search2.best_estimator_.predict(X2_test)
print("Accuracy for the second dataset: ", accuracy_score(y2_test, y2_pred))
print("Confusion Matrix:\n", confusion_matrix(y2_test, y2_pred))
print("Classification Report:\n", classification_report(y2_test, y2_pred))


Fitting 5 folds for each of 108 candidates, totalling 540 fits
Best parameters for the first dataset:  {'max_depth': None, 'min_samples_leaf': 4, 'min_samples_split': 2, 'n_estimators': 200}
Fitting 5 folds for each of 108 candidates, totalling 540 fits
Best parameters for the second dataset:  {'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 50}
Accuracy for the first dataset:  0.5888888888888889
Confusion Matrix:
 [[49  0  0  0  0]
 [12  0  2  3  0]
 [ 4  3  2  1  0]
 [ 3  1  5  2  0]
 [ 2  0  0  1  0]]
Classification Report:
               precision    recall  f1-score   support

           0       0.70      1.00      0.82        49
           1       0.00      0.00      0.00        17
           2       0.22      0.20      0.21        10
           3       0.29      0.18      0.22        11
           4       0.00      0.00      0.00         3

    accuracy                           0.59        90
   macro avg       0.24      0.28      0.25        9

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [9]:
#FINAL TEST HERE!!!!!!!!!!!!!!

# Import necessary libraries for data manipulation, machine learning, and evaluation
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectFromModel
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.preprocessing import StandardScaler



In [10]:
# Load the first dataset from a CSV file and assign column names
df1 = pd.read_csv('data.csv', header=None)
column_names = ['age', 'sex', 'cp', 'trestbps', 'chol', 'fbs', 'restecg', 'thalach', 'exang', 'oldpeak', 'slope', 'ca', 'thal', 'num']
df1.columns = column_names
print("Data Set One - First 5 Rows:")
print(df1.head())

Data Set One - First 5 Rows:
    age  sex   cp  trestbps   chol  fbs  restecg  thalach  exang  oldpeak  \
0  63.0  1.0  1.0     145.0  233.0  1.0      2.0    150.0    0.0      2.3   
1  67.0  1.0  4.0     160.0  286.0  0.0      2.0    108.0    1.0      1.5   
2  67.0  1.0  4.0     120.0  229.0  0.0      2.0    129.0    1.0      2.6   
3  37.0  1.0  3.0     130.0  250.0  0.0      0.0    187.0    0.0      3.5   
4  41.0  0.0  2.0     130.0  204.0  0.0      2.0    172.0    0.0      1.4   

   slope   ca thal  num  
0    3.0  0.0  6.0    0  
1    2.0  3.0  3.0    2  
2    2.0  2.0  7.0    1  
3    3.0  0.0  3.0    0  
4    1.0  0.0  3.0    0  


In [11]:
# Load the second dataset from a space-separated file and assign column names
df2 = pd.read_csv('heart.dat', header=None, sep=' ')
df2.columns = column_names

In [12]:
print("\nData Set Two - First 5 Rows:")
print(df2.head())



Data Set Two - First 5 Rows:
    age  sex   cp  trestbps   chol  fbs  restecg  thalach  exang  oldpeak  \
0  70.0  1.0  4.0     130.0  322.0  0.0      2.0    109.0    0.0      2.4   
1  67.0  0.0  3.0     115.0  564.0  0.0      2.0    160.0    0.0      1.6   
2  57.0  1.0  2.0     124.0  261.0  0.0      0.0    141.0    0.0      0.3   
3  64.0  1.0  4.0     128.0  263.0  0.0      0.0    105.0    1.0      0.2   
4  74.0  0.0  2.0     120.0  269.0  0.0      2.0    121.0    1.0      0.2   

   slope   ca  thal  num  
0    2.0  3.0   3.0    2  
1    2.0  0.0   7.0    1  
2    1.0  0.0   7.0    2  
3    2.0  1.0   7.0    1  
4    1.0  1.0   3.0    1  


In [13]:
# Function to clean the dataset by replacing missing values with NaN and dropping rows with NaN values
def clean_data(df):
    df.replace('?', np.nan, inplace=True)  # Replace '?' with NaN
    df.dropna(inplace=True)  # Drop rows with NaN values
    return df

In [14]:
# Clean both datasets
df1 = clean_data(df1)
df2 = clean_data(df2)


In [15]:
print("\nData Set One - After Cleaning - First 5 Rows:")
print(df1.head())
print("\nData Set Two - After Cleaning - First 5 Rows:")
print(df2.head())


Data Set One - After Cleaning - First 5 Rows:
    age  sex   cp  trestbps   chol  fbs  restecg  thalach  exang  oldpeak  \
0  63.0  1.0  1.0     145.0  233.0  1.0      2.0    150.0    0.0      2.3   
1  67.0  1.0  4.0     160.0  286.0  0.0      2.0    108.0    1.0      1.5   
2  67.0  1.0  4.0     120.0  229.0  0.0      2.0    129.0    1.0      2.6   
3  37.0  1.0  3.0     130.0  250.0  0.0      0.0    187.0    0.0      3.5   
4  41.0  0.0  2.0     130.0  204.0  0.0      2.0    172.0    0.0      1.4   

   slope   ca thal  num  
0    3.0  0.0  6.0    0  
1    2.0  3.0  3.0    2  
2    2.0  2.0  7.0    1  
3    3.0  0.0  3.0    0  
4    1.0  0.0  3.0    0  

Data Set Two - After Cleaning - First 5 Rows:
    age  sex   cp  trestbps   chol  fbs  restecg  thalach  exang  oldpeak  \
0  70.0  1.0  4.0     130.0  322.0  0.0      2.0    109.0    0.0      2.4   
1  67.0  0.0  3.0     115.0  564.0  0.0      2.0    160.0    0.0      1.6   
2  57.0  1.0  2.0     124.0  261.0  0.0      0.0    141.

In [16]:
# Function to transform data: convert features to numeric types and fill missing values with column means
def transform_data(df):
    X = df.iloc[:, :-1].apply(pd.to_numeric, errors='coerce')  # Convert all columns except the last one to numeric
    y = pd.to_numeric(df.iloc[:, -1], errors='coerce')  # Convert the last column to numeric
    X.fillna(X.mean(), inplace=True)  # Fill NaN values with column means
    return X, y

In [17]:
# Transform both datasets
X1, y1 = transform_data(df1)
X2, y2 = transform_data(df2)

print("\nData Set One - Features (First 5 Rows):")
print(X1.head())
print("\nData Set One - Target (First 5 Values):")
print(y1.head())

print("\nData Set Two - Features (First 5 Rows):")
print(X2.head())
print("\nData Set Two - Target (First 5 Values):")
print(y2.head())


Data Set One - Features (First 5 Rows):
    age  sex   cp  trestbps   chol  fbs  restecg  thalach  exang  oldpeak  \
0  63.0  1.0  1.0     145.0  233.0  1.0      2.0    150.0    0.0      2.3   
1  67.0  1.0  4.0     160.0  286.0  0.0      2.0    108.0    1.0      1.5   
2  67.0  1.0  4.0     120.0  229.0  0.0      2.0    129.0    1.0      2.6   
3  37.0  1.0  3.0     130.0  250.0  0.0      0.0    187.0    0.0      3.5   
4  41.0  0.0  2.0     130.0  204.0  0.0      2.0    172.0    0.0      1.4   

   slope   ca  thal  
0    3.0  0.0   6.0  
1    2.0  3.0   3.0  
2    2.0  2.0   7.0  
3    3.0  0.0   3.0  
4    1.0  0.0   3.0  

Data Set One - Target (First 5 Values):
0    0
1    2
2    1
3    0
4    0
Name: num, dtype: int64

Data Set Two - Features (First 5 Rows):
    age  sex   cp  trestbps   chol  fbs  restecg  thalach  exang  oldpeak  \
0  70.0  1.0  4.0     130.0  322.0  0.0      2.0    109.0    0.0      2.4   
1  67.0  0.0  3.0     115.0  564.0  0.0      2.0    160.0    0.0     

In [18]:
# Function to select important features using a RandomForestClassifier
def feature_selection(X, y):
    clf = RandomForestClassifier(n_estimators=100, random_state=42)  # Initialize RandomForestClassifier
    clf.fit(X, y)  # Fit the classifier on the data
    sfm = SelectFromModel(clf, threshold=0.05)  # Select features with importance greater than 0.05
    sfm.fit(X, y)  # Fit the feature selector on the data
    X_important = sfm.transform(X)  # Transform the data to select important features
    feature_importances = pd.DataFrame(clf.feature_importances_, index=X.columns, columns=['importance']).sort_values('importance', ascending=False)
    return X_important, feature_importances


In [19]:
# Select important features for both datasets
X1_important, feature_importances1 = feature_selection(X1, y1)
X2_important, feature_importances2 = feature_selection(X2, y2)

print("\nData Set One - Feature Importances:")
print(feature_importances1)
print("\nData Set One - Important Features (First 5 Rows):")
print(pd.DataFrame(X1_important).head())

print("\nData Set Two - Feature Importances:")
print(feature_importances2)
print("\nData Set Two - Important Features (First 5 Rows):")
print(pd.DataFrame(X2_important).head())



Data Set One - Feature Importances:
          importance
thalach     0.139389
oldpeak     0.117658
age         0.115094
chol        0.113813
trestbps    0.099277
ca          0.094458
thal        0.082432
cp          0.075628
exang       0.046204
slope       0.042644
restecg     0.029843
sex         0.027268
fbs         0.016293

Data Set One - Important Features (First 5 Rows):
      0    1      2      3      4    5    6    7
0  63.0  1.0  145.0  233.0  150.0  2.3  0.0  6.0
1  67.0  4.0  160.0  286.0  108.0  1.5  3.0  3.0
2  67.0  4.0  120.0  229.0  129.0  2.6  2.0  7.0
3  37.0  3.0  130.0  250.0  187.0  3.5  0.0  3.0
4  41.0  2.0  130.0  204.0  172.0  1.4  0.0  3.0

Data Set Two - Feature Importances:
          importance
oldpeak     0.122941
thalach     0.122419
thal        0.121363
ca          0.109587
cp          0.109037
age         0.093253
chol        0.083779
trestbps    0.077635
slope       0.048688
exang       0.047828
sex         0.033263
restecg     0.020953
fbs         0.

In [20]:
# Split both datasets into training and testing sets (70% training, 30% testing)
X1_train, X1_test, y1_train, y1_test = train_test_split(X1_important, y1, test_size=0.3, random_state=42)
X2_train, X2_test, y2_train, y2_test = train_test_split(X2_important, y2, test_size=0.3, random_state=42)

print("\nData Set One - Train Features (First 5 Rows):")
print(pd.DataFrame(X1_train).head())
print("\nData Set One - Test Features (First 5 Rows):")
print(pd.DataFrame(X1_test).head())

print("\nData Set Two - Train Features (First 5 Rows):")
print(pd.DataFrame(X2_train).head())
print("\nData Set Two - Test Features (First 5 Rows):")
print(pd.DataFrame(X2_test).head())



Data Set One - Train Features (First 5 Rows):
      0    1      2      3      4    5    6    7
0  61.0  4.0  140.0  207.0  138.0  1.9  1.0  7.0
1  57.0  4.0  150.0  276.0  112.0  0.6  1.0  6.0
2  48.0  2.0  110.0  229.0  168.0  1.0  0.0  7.0
3  67.0  4.0  106.0  223.0  142.0  0.3  2.0  3.0
4  47.0  3.0  108.0  243.0  152.0  0.0  0.0  3.0

Data Set One - Test Features (First 5 Rows):
      0    1      2      3      4    5    6    7
0  45.0  2.0  112.0  160.0  138.0  0.0  0.0  3.0
1  52.0  4.0  112.0  230.0  160.0  0.0  1.0  3.0
2  54.0  3.0  135.0  304.0  170.0  0.0  0.0  3.0
3  70.0  4.0  130.0  322.0  109.0  2.4  3.0  3.0
4  56.0  2.0  120.0  236.0  178.0  0.8  0.0  3.0

Data Set Two - Train Features (First 5 Rows):
      0    1      2      3      4    5    6    7
0  57.0  4.0  110.0  201.0  126.0  1.5  0.0  6.0
1  44.0  2.0  130.0  219.0  188.0  0.0  0.0  3.0
2  54.0  4.0  124.0  266.0  109.0  2.2  1.0  7.0
3  58.0  4.0  125.0  300.0  171.0  0.0  2.0  7.0
4  62.0  4.0  120.0  267.0 

In [21]:
# Standardize the features to have mean=0 and variance=1
scaler1 = StandardScaler()
X1_train_scaled = scaler1.fit_transform(X1_train)  # Fit and transform the training data
X1_test_scaled = scaler1.transform(X1_test)  # Transform the test data

scaler2 = StandardScaler()
X2_train_scaled = scaler2.fit_transform(X2_train)  # Fit and transform the training data
X2_test_scaled = scaler2.transform(X2_test)  # Transform the test data


In [22]:
print("\nData Set One - Scaled Train Features (First 5 Rows):")
print(pd.DataFrame(X1_train_scaled).head())
print("\nData Set One - Scaled Test Features (First 5 Rows):")
print(pd.DataFrame(X1_test_scaled).head())

print("\nData Set Two - Scaled Train Features (First 5 Rows):")
print(pd.DataFrame(X2_train_scaled).head())
print("\nData Set Two - Scaled Test Features (First 5 Rows):")
print(pd.DataFrame(X2_test_scaled).head())


Data Set One - Scaled Train Features (First 5 Rows):
          0         1         2         3         4         5         6  \
0  0.728560  0.843465  0.500301 -0.786293 -0.442406  0.787175  0.298941   
1  0.283030  0.843465  1.086392  0.503842 -1.555556 -0.370992  0.298941   
2 -0.719412 -1.235074 -1.257973 -0.374946  0.841998 -0.014633 -0.732405   
3  1.396855  0.843465 -1.492409 -0.487131 -0.271152 -0.638262  1.330286   
4 -0.830795 -0.195804 -1.375191 -0.113179  0.156983 -0.905531 -0.732405   

          7  
0  1.166259  
1  0.650414  
2  1.166259  
3 -0.897123  
4 -0.897123  

Data Set One - Scaled Test Features (First 5 Rows):
          0         1         2         3         4         5         6  \
0 -1.053560 -1.235074 -1.140755 -1.665081 -0.442406 -0.905531 -0.732405   
1 -0.273883  0.843465 -1.140755 -0.356248  0.499491 -0.905531  0.298941   
2 -0.051118 -0.195804  0.207255  1.027375  0.927625 -0.905531 -0.732405   
3  1.731002  0.843465 -0.085790  1.363932 -1.683997  1.232

In [23]:

from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# Function to train and evaluate the RandomForestClassifier
def train_and_evaluate(X_train, X_test, y_train, y_test):
    clf = RandomForestClassifier(n_estimators=100, random_state=42)  # Initialize RandomForestClassifier
    clf.fit(X_train, y_train)  # Train the classifier
    y_pred = clf.predict(X_test)  # Make predictions on the test data
    accuracy = accuracy_score(y_test, y_pred)  # Calculate the accuracy of the model
    print(f"Accuracy: {accuracy}")
    print("Confusion Matrix:")
    print(confusion_matrix(y_test, y_pred))  # Print the confusion matrix
    print("Classification Report:")
    print(classification_report(y_test, y_pred, zero_division=0))  # Print the classification report with zero_division=0

# Train and evaluate the model on both datasets
print("\nData Set One Evaluation:")
train_and_evaluate(X1_train_scaled, X1_test_scaled, y1_train, y1_test)

print("\nData Set Two Evaluation:")
train_and_evaluate(X2_train_scaled, X2_test_scaled, y2_train, y2_test)



Data Set One Evaluation:
Accuracy: 0.5888888888888889
Confusion Matrix:
[[47  2  0  0  0]
 [ 9  1  2  5  0]
 [ 3  2  3  2  0]
 [ 2  3  4  2  0]
 [ 2  1  0  0  0]]
Classification Report:
              precision    recall  f1-score   support

           0       0.75      0.96      0.84        49
           1       0.11      0.06      0.08        17
           2       0.33      0.30      0.32        10
           3       0.22      0.18      0.20        11
           4       0.00      0.00      0.00         3

    accuracy                           0.59        90
   macro avg       0.28      0.30      0.29        90
weighted avg       0.49      0.59      0.53        90


Data Set Two Evaluation:
Accuracy: 0.7777777777777778
Confusion Matrix:
[[45  4]
 [14 18]]
Classification Report:
              precision    recall  f1-score   support

           1       0.76      0.92      0.83        49
           2       0.82      0.56      0.67        32

    accuracy                           0.78  

In [24]:
# Train and evaluate the model on both datasets
print("\nData Set One Evaluation:")
train_and_evaluate(X1_train_scaled, X1_test_scaled, y1_train, y1_test)

print("\nData Set Two Evaluation:")
train_and_evaluate(X2_train_scaled, X2_test_scaled, y2_train, y2_test)



Data Set One Evaluation:
Accuracy: 0.5888888888888889
Confusion Matrix:
[[47  2  0  0  0]
 [ 9  1  2  5  0]
 [ 3  2  3  2  0]
 [ 2  3  4  2  0]
 [ 2  1  0  0  0]]
Classification Report:
              precision    recall  f1-score   support

           0       0.75      0.96      0.84        49
           1       0.11      0.06      0.08        17
           2       0.33      0.30      0.32        10
           3       0.22      0.18      0.20        11
           4       0.00      0.00      0.00         3

    accuracy                           0.59        90
   macro avg       0.28      0.30      0.29        90
weighted avg       0.49      0.59      0.53        90


Data Set Two Evaluation:
Accuracy: 0.7777777777777778
Confusion Matrix:
[[45  4]
 [14 18]]
Classification Report:
              precision    recall  f1-score   support

           1       0.76      0.92      0.83        49
           2       0.82      0.56      0.67        32

    accuracy                           0.78  

In [25]:
# addresses the error above ^^
# Hyperparameter tuning using Grid Search
from sklearn.model_selection import GridSearchCV

In [26]:
# Define the parameter grid for GridSearch
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}


In [27]:
# Initialize the RandomForestClassifier
clf = RandomForestClassifier(random_state=42)

# Perform Grid Search on the first dataset
grid_search1 = GridSearchCV(estimator=clf, param_grid=param_grid, cv=5, n_jobs=-1, verbose=2)
grid_search1.fit(X1_train, y1_train)
print("Best parameters for the first dataset: ", grid_search1.best_params_)


Fitting 5 folds for each of 108 candidates, totalling 540 fits
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=50; total time=   0.0s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=200; total time=   0.2s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=5, n_estimators=200; total time=   0.2s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=10, n_estimators=100; total time=   0.1s
[CV] END max_depth=None, min_samples_leaf=2, min_samples_split=2, n_estimators=50; total time=   0.1s
[CV] END max_depth=None, min_samples_leaf=2, min_samples_split=2, n_estimators=100; total time=   0.1s
[CV] END max_depth=None, min_samples_leaf=2, min_samples_split=2, n_estimators=200; total time=   0.1s
[CV] END max_depth=None, min_samples_leaf=2, min_samples_split=5, n_estimators=100; total time=   0.1s
[CV] END max_depth=None, min_samples_leaf=2, min_samples_split=10, n_estimators=50; total time=   0.1s
[CV] END ma

In [28]:
# Perform Grid Search on the second dataset
grid_search2 = GridSearchCV(estimator=clf, param_grid=param_grid, cv=5, n_jobs=-1, verbose=2)
grid_search2.fit(X2_train, y2_train)
print("Best parameters for the second dataset: ", grid_search2.best_params_)


Fitting 5 folds for each of 108 candidates, totalling 540 fits
Best parameters for the second dataset:  {'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 50}


In [29]:
# Evaluate the models with the best parameters from Grid Search
# First dataset evaluation
y1_pred = grid_search1.best_estimator_.predict(X1_test)
print("Accuracy for the first dataset: ", accuracy_score(y1_test, y1_pred))
print("Confusion Matrix:\n", confusion_matrix(y1_test, y1_pred))
print("Classification Report:\n", classification_report(y1_test, y1_pred))


Accuracy for the first dataset:  0.5888888888888889
Confusion Matrix:
 [[49  0  0  0  0]
 [12  0  2  3  0]
 [ 4  3  2  1  0]
 [ 3  1  5  2  0]
 [ 2  0  0  1  0]]
Classification Report:
               precision    recall  f1-score   support

           0       0.70      1.00      0.82        49
           1       0.00      0.00      0.00        17
           2       0.22      0.20      0.21        10
           3       0.29      0.18      0.22        11
           4       0.00      0.00      0.00         3

    accuracy                           0.59        90
   macro avg       0.24      0.28      0.25        90
weighted avg       0.44      0.59      0.50        90



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [30]:
# Second dataset evaluation
y2_pred = grid_search2.best_estimator_.predict(X2_test)
print("Accuracy for the second dataset: ", accuracy_score(y2_test, y2_pred))
print("Confusion Matrix:\n", confusion_matrix(y2_test, y2_pred))
print("Classification Report:\n", classification_report(y2_test, y2_pred))

Accuracy for the second dataset:  0.7901234567901234
Confusion Matrix:
 [[46  3]
 [14 18]]
Classification Report:
               precision    recall  f1-score   support

           1       0.77      0.94      0.84        49
           2       0.86      0.56      0.68        32

    accuracy                           0.79        81
   macro avg       0.81      0.75      0.76        81
weighted avg       0.80      0.79      0.78        81



In [31]:

from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# Function to train and evaluate the RandomForestClassifier
def train_and_evaluate(X_train, X_test, y_train, y_test):
    clf = RandomForestClassifier(n_estimators=100, random_state=42)  # Initialize RandomForestClassifier
    clf.fit(X_train, y_train)  # Train the classifier
    y_pred = clf.predict(X_test)  # Make predictions on the test data
    accuracy = accuracy_score(y_test, y_pred)  # Calculate the accuracy of the model
    print(f"Accuracy: {accuracy}")
    print("Confusion Matrix:")
    print(confusion_matrix(y_test, y_pred))  # Print the confusion matrix
    print("Classification Report:")
    print(classification_report(y_test, y_pred, zero_division=0))  # Print the classification report with zero_division=0

# Train and evaluate the model on both datasets
print("\nData Set One Evaluation:")
train_and_evaluate(X1_train_scaled, X1_test_scaled, y1_train, y1_test)

print("\nData Set Two Evaluation:")
train_and_evaluate(X2_train_scaled, X2_test_scaled, y2_train, y2_test)


Data Set One Evaluation:
Accuracy: 0.5888888888888889
Confusion Matrix:
[[47  2  0  0  0]
 [ 9  1  2  5  0]
 [ 3  2  3  2  0]
 [ 2  3  4  2  0]
 [ 2  1  0  0  0]]
Classification Report:
              precision    recall  f1-score   support

           0       0.75      0.96      0.84        49
           1       0.11      0.06      0.08        17
           2       0.33      0.30      0.32        10
           3       0.22      0.18      0.20        11
           4       0.00      0.00      0.00         3

    accuracy                           0.59        90
   macro avg       0.28      0.30      0.29        90
weighted avg       0.49      0.59      0.53        90


Data Set Two Evaluation:
Accuracy: 0.7777777777777778
Confusion Matrix:
[[45  4]
 [14 18]]
Classification Report:
              precision    recall  f1-score   support

           1       0.76      0.92      0.83        49
           2       0.82      0.56      0.67        32

    accuracy                           0.78  

In [32]:
# Hyperparameter tuning using Grid Search
from sklearn.model_selection import GridSearchCV

# Define the parameter grid for GridSearch
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

In [33]:
# Initialize the RandomForestClassifier
clf = RandomForestClassifier(random_state=42)


In [34]:
# Perform Grid Search on the first dataset
grid_search1 = GridSearchCV(estimator=clf, param_grid=param_grid, cv=5, n_jobs=-1, verbose=2)
grid_search1.fit(X1_train, y1_train)
print("Best parameters for the first dataset: ", grid_search1.best_params_)


Fitting 5 folds for each of 108 candidates, totalling 540 fits
[CV] END max_depth=None, min_samples_leaf=4, min_samples_split=10, n_estimators=100; total time=   0.1s
[CV] END max_depth=None, min_samples_leaf=4, min_samples_split=10, n_estimators=100; total time=   0.1s
[CV] END max_depth=10, min_samples_leaf=1, min_samples_split=5, n_estimators=50; total time=   0.1s
[CV] END max_depth=10, min_samples_leaf=1, min_samples_split=5, n_estimators=50; total time=   0.0s
[CV] END max_depth=10, min_samples_leaf=1, min_samples_split=5, n_estimators=50; total time=   0.0s
[CV] END max_depth=10, min_samples_leaf=1, min_samples_split=5, n_estimators=100; total time=   0.1s
[CV] END max_depth=10, min_samples_leaf=1, min_samples_split=10, n_estimators=100; total time=   0.1s
[CV] END max_depth=10, min_samples_leaf=1, min_samples_split=10, n_estimators=100; total time=   0.1s
[CV] END max_depth=10, min_samples_leaf=1, min_samples_split=10, n_estimators=100; total time=   0.1s
[CV] END max_depth=10,

In [35]:
# Perform Grid Search on the second dataset
grid_search2 = GridSearchCV(estimator=clf, param_grid=param_grid, cv=5, n_jobs=-1, verbose=2)
grid_search2.fit(X2_train, y2_train)
print("Best parameters for the second dataset: ", grid_search2.best_params_)



Fitting 5 folds for each of 108 candidates, totalling 540 fits
[CV] END max_depth=20, min_samples_leaf=2, min_samples_split=10, n_estimators=50; total time=   0.1s
[CV] END max_depth=20, min_samples_leaf=2, min_samples_split=10, n_estimators=100; total time=   0.1s
[CV] END max_depth=20, min_samples_leaf=2, min_samples_split=10, n_estimators=100; total time=   0.1s
[CV] END max_depth=20, min_samples_leaf=4, min_samples_split=2, n_estimators=50; total time=   0.0s
[CV] END max_depth=20, min_samples_leaf=4, min_samples_split=2, n_estimators=100; total time=   0.1s
[CV] END max_depth=20, min_samples_leaf=4, min_samples_split=2, n_estimators=100; total time=   0.1s
[CV] END max_depth=20, min_samples_leaf=4, min_samples_split=2, n_estimators=100; total time=   0.1s
[CV] END max_depth=20, min_samples_leaf=4, min_samples_split=5, n_estimators=200; total time=   0.2s
[CV] END max_depth=20, min_samples_leaf=4, min_samples_split=5, n_estimators=200; total time=   0.2s
[CV] END max_depth=20, min_

In [36]:
# Evaluate the models with the best parameters from Grid Search
# First dataset evaluation
y1_pred = grid_search1.best_estimator_.predict(X1_test)
print("Accuracy for the first dataset: ", accuracy_score(y1_test, y1_pred))
print("Confusion Matrix:\n", confusion_matrix(y1_test, y1_pred))
print("Classification Report:\n", classification_report(y1_test, y1_pred))


Accuracy for the first dataset:  0.5888888888888889
Confusion Matrix:
 [[49  0  0  0  0]
 [12  0  2  3  0]
 [ 4  3  2  1  0]
 [ 3  1  5  2  0]
 [ 2  0  0  1  0]]
Classification Report:
               precision    recall  f1-score   support

           0       0.70      1.00      0.82        49
           1       0.00      0.00      0.00        17
           2       0.22      0.20      0.21        10
           3       0.29      0.18      0.22        11
           4       0.00      0.00      0.00         3

    accuracy                           0.59        90
   macro avg       0.24      0.28      0.25        90
weighted avg       0.44      0.59      0.50        90



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [37]:
# Second dataset evaluation
y2_pred = grid_search2.best_estimator_.predict(X2_test)
print("Accuracy for the second dataset: ", accuracy_score(y2_test, y2_pred))
print("Confusion Matrix:\n", confusion_matrix(y2_test, y2_pred))
print("Classification Report:\n", classification_report(y2_test, y2_pred))

Accuracy for the second dataset:  0.7901234567901234
Confusion Matrix:
 [[46  3]
 [14 18]]
Classification Report:
               precision    recall  f1-score   support

           1       0.77      0.94      0.84        49
           2       0.86      0.56      0.68        32

    accuracy                           0.79        81
   macro avg       0.81      0.75      0.76        81
weighted avg       0.80      0.79      0.78        81

