In [38]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn import svm
from sklearn.svm import SVC
from sklearn.ensemble import GradientBoostingClassifier
from xgboost import XGBClassifier



###Exercise 1

In [2]:
!kaggle datasets download -d utkarshx27/heart-disease-diagnosis-dataset


Dataset URL: https://www.kaggle.com/datasets/utkarshx27/heart-disease-diagnosis-dataset
License(s): CC0-1.0
Downloading heart-disease-diagnosis-dataset.zip to /content
  0% 0.00/3.29k [00:00<?, ?B/s]
100% 3.29k/3.29k [00:00<00:00, 3.83MB/s]


In [3]:
!unzip heart-disease-diagnosis-dataset.zip

Archive:  heart-disease-diagnosis-dataset.zip
  inflating: dataset_heart.csv       


In [4]:
df = pd.read_csv('dataset_heart.csv')
df.head()

Unnamed: 0,age,sex,chest pain type,resting blood pressure,serum cholestoral,fasting blood sugar,resting electrocardiographic results,max heart rate,exercise induced angina,oldpeak,ST segment,major vessels,thal,heart disease
0,70,1,4,130,322,0,2,109,0,2.4,2,3,3,2
1,67,0,3,115,564,0,2,160,0,1.6,2,0,7,1
2,57,1,2,124,261,0,0,141,0,0.3,1,0,7,2
3,64,1,4,128,263,0,0,105,1,0.2,2,1,7,1
4,74,0,2,120,269,0,2,121,1,0.2,1,1,3,1


In [5]:
x = df[['age', 'chest pain type','sex ', 'resting blood pressure', 'serum cholestoral', 'fasting blood sugar', 'resting electrocardiographic results', 'max heart rate', 'exercise induced angina', 'oldpeak', 'ST segment', 'major vessels', 'thal']]
y = df['heart disease']
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42, stratify=y)

print(x_train)
print(y_train)
print(x_test)
print(y_test)

     age  chest pain type  sex   resting blood pressure  serum cholestoral  \
78    42                3     0                     120                209   
121   54                4     1                     122                286   
27    51                3     0                     120                295   
198   69                1     0                     140                239   
218   54                3     1                     120                258   
..   ...              ...   ...                     ...                ...   
86    62                2     1                     128                208   
109   45                2     0                     112                160   
225   41                2     1                     135                203   
128   52                2     1                     134                201   
130   63                4     0                     108                269   

     fasting blood sugar  resting electrocardiographic results 

###Exercise 2

In [6]:
!kaggle datasets download -d uciml/iris

Dataset URL: https://www.kaggle.com/datasets/uciml/iris
License(s): CC0-1.0
Downloading iris.zip to /content
  0% 0.00/3.60k [00:00<?, ?B/s]
100% 3.60k/3.60k [00:00<00:00, 6.98MB/s]


In [7]:
!unzip iris.zip

Archive:  iris.zip
  inflating: Iris.csv                
  inflating: database.sqlite         


In [15]:
df = pd.read_csv('Iris.csv')
df.head()

Unnamed: 0,Id,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species
0,1,5.1,3.5,1.4,0.2,Iris-setosa
1,2,4.9,3.0,1.4,0.2,Iris-setosa
2,3,4.7,3.2,1.3,0.2,Iris-setosa
3,4,4.6,3.1,1.5,0.2,Iris-setosa
4,5,5.0,3.6,1.4,0.2,Iris-setosa


In [10]:
model = LogisticRegression()


In [16]:
X = df[['SepalLengthCm', 'SepalWidthCm', 'PetalLengthCm', 'PetalWidthCm']]
y = df['Species']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)


In [17]:
model.fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [18]:
# Predict the test set results
y_pred = model.predict(X_test)

# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.4f}")

# Confusion matrix and classification report
conf_matrix = confusion_matrix(y_test, y_pred)
class_report = classification_report(y_test, y_pred)

print("\nConfusion Matrix:\n", conf_matrix)
print("\nClassification Report:\n", class_report)


Accuracy: 0.9667

Confusion Matrix:
 [[10  0  0]
 [ 0  9  1]
 [ 0  0 10]]

Classification Report:
                  precision    recall  f1-score   support

    Iris-setosa       1.00      1.00      1.00        10
Iris-versicolor       1.00      0.90      0.95        10
 Iris-virginica       0.91      1.00      0.95        10

       accuracy                           0.97        30
      macro avg       0.97      0.97      0.97        30
   weighted avg       0.97      0.97      0.97        30



###Exercise 3

In [19]:
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [20]:
c_space = np.logspace(-5, 8, 15)
param_grid = {'C': c_space}

In [22]:
model = LogisticRegression(max_iter=200, random_state=42)

# Initialize GridSearchCV
grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=5)

# Fit the model using GridSearchCV
grid_search.fit(X_train, y_train)

print (f'Tuned Logistic Regression Parameters: {grid_search.best_params_}')
print (f'Best score is {grid_search.best_score_}')

Tuned Logistic Regression Parameters: {'C': 0.4393970560760795}
Best score is 0.9666666666666668


Test accuracy: 0.9333

Confusion Matrix:
 [[10  0  0]
 [ 0  9  1]
 [ 0  1  9]]

Classification Report:
                  precision    recall  f1-score   support

    Iris-setosa       1.00      1.00      1.00        10
Iris-versicolor       0.90      0.90      0.90        10
 Iris-virginica       0.90      0.90      0.90        10

       accuracy                           0.93        30
      macro avg       0.93      0.93      0.93        30
   weighted avg       0.93      0.93      0.93        30



###Exercise 4

In [28]:
X = df[['SepalLengthCm', 'SepalWidthCm', 'PetalLengthCm', 'PetalWidthCm']]
y = df['Species']
X_train, X_test, Y_train, Y_test = train_test_split(X, y, test_size=0.2, random_state=42)

clf = svm.SVC(kernel='linear')
clf.fit(X_train, Y_train)
y_pred = clf.predict(X_test)

print(f'Accuracy: {accuracy_score(Y_test, y_pred)}')

Accuracy: 1.0


###Exercise 5

In [31]:
c_space = np.logspace(-5, 8, 15)
param_grid = {'C': c_space, 'kernel':['linear','rbf','poly'],
              'gamma':[0.001, 0.01, 0.1, 1, 10]}


In [32]:
model = SVC()

# Initialize GridSearchCV
grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=5)

# Fit the model using GridSearchCV
grid_search.fit(X_train, Y_train)

print (f'Tuned Logistic Regression Parameters: {grid_search.best_params_}')
print (f'Best score is {grid_search.best_score_}')

Tuned Logistic Regression Parameters: {'C': 3.727593720314938, 'gamma': 0.001, 'kernel': 'linear'}
Best score is 0.9666666666666668


###Exercise 6

In [40]:
X = df[['SepalLengthCm', 'SepalWidthCm', 'PetalLengthCm', 'PetalWidthCm']]
y = df['Species']

encoder = LabelEncoder()
y_encoded = encoder.fit_transform(y)

X_train, X_test, Y_train, Y_test = train_test_split(X, y_encoded, test_size=0.2, random_state=42)

model = XGBClassifier()
model.fit(X_train, Y_train)

y_pred = model.predict(X_test)

accuracy = accuracy_score(Y_test, y_pred)
print(f"Accuracy: {accuracy:.4f}")

Accuracy: 1.0000


###Exercise 7

In [41]:
param_grid = {
    'n_estimators': [50, 100, 150],        # Number of trees in the ensemble
    'max_depth': [3, 4, 5],                # Maximum depth of a tree
    'learning_rate': [0.01, 0.1, 0.2],     # Step size shrinkage
    'gamma': [0, 0.1, 0.3],                # Minimum loss reduction to create a partition
    'subsample': [0.7, 0.8, 1.0],          # Subsample ratio of the training instances
    'colsample_bytree': [0.7, 0.8, 1.0],   # Subsample ratio of columns when constructing each tree
    'reg_lambda': [0.01, 0.1, 1.0]         # L2 regularization term on weights
}

grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=5, scoring='accuracy')

grid_search.fit(X_train, Y_train)

accuracy = accuracy_score(Y_test, y_pred)
print(f"Best Accuracy: {accuracy:.4f}")

Best Accuracy: 1.0000
