Q1. Design a binary classifier for classifying the following emails as spam (1) or ham (0). Use email.csv for training.
- Print all the parameter values learnt after training
- Show the accuracy on the test set.

In [29]:
import pandas as pd
import numpy as np

df = pd.read_csv('email.csv')
X = df[['x1', 'x2', 'x3', 'x4', 'x5']]
y = df['class']

y = np.where(y == 'spam', 1, 0) # Convert labels to 0 and 1
X = np.c_[np.ones((len(X), 1)), X] # Add a column of ones to X for the intercept term

theta = np.zeros(X.shape[1]) # Initialize parameters

# Define the sigmoid function
def sigmoid(z):
    return 1 / (1 + np.exp(-z))

# Define the cost function
def cost_function(X, y, theta):
    m = len(y)
    h = sigmoid(X @ theta)
    cost = -(1/m) * (y.T @ np.log(h) + (1 - y).T @ np.log(1 - h))
    return cost

# Perform gradient descent
alpha = 0.01
epochs = 1000

for epoch in range(epochs):
    h = sigmoid(X @ theta)
    gradient = X.T @ (h - y) / len(y)
    theta -= alpha * gradient

# Assuming the test emails are in X_test
X_test = np.array([
    [0, 1, 0, 1, 0],
    [1, 1, 1, 0, 1],
    [0, 1, 1, 0, 0],
    [1, 0, 1, 0, 0],
    [0, 1, 0, 0, 1],
    [0, 0, 0, 1, 1],
    [0, 1, 0, 1, 1],
    [0, 0, 0, 0, 1]
])

# Add a column of ones to X_test for the intercept term
X_test = np.c_[np.ones((len(X_test), 1)), X_test]

# Predict labels for the test set
predictions = np.round(sigmoid(X_test @ theta))

# Print the predicted labels for the test set
label_mapping = {1: 'spam', 0: 'ham'}
predicted_labels = [label_mapping[prediction] for prediction in predictions]
print("Predicted Labels for Test Set:", predicted_labels)

true_labels = np.array([1, 1, 1, 1, 0, 0, 0, 0]) # Assuming the true labels for the test set

accuracy = np.mean(predictions == true_labels)
print("Accuracy on Test Set:", accuracy)


Predicted Labels for Test Set: ['ham', 'ham', 'ham', 'ham', 'ham', 'ham', 'ham', 'ham']
Accuracy on Test Set: 0.5


Q2. Use appropriate Scikit Library function to apply logistic regression on the same dataset and compare the results with your implementation.

In [30]:
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

df_train = pd.read_csv('email.csv')
X_train = df_train[['x1', 'x2', 'x3', 'x4', 'x5']]
y_train = df_train['class']

model = LogisticRegression() # Create and train a logistic regression model
model.fit(X_train, y_train)

print("Learned Coefficients:", model.coef_) # Step 3: Print Parameter Values
print("Learned Intercept:", model.intercept_)

# Test emails
X_test = pd.DataFrame({
    'x1': [0, 1, 0, 1, 0, 0, 0, 0],
    'x2': [1, 1, 1, 0, 1, 0, 1, 0],
    'x3': [0, 1, 1, 1, 0, 0, 0, 0],
    'x4': [0, 0, 0, 0, 0, 1, 1, 0],
    'x5': [0, 1, 0, 0, 1, 1, 1, 1]
})

predictions = model.predict(X_test) #Make Predictions on Test Set
print("Predictions for Test Emails:", predictions)

true_labels = [1, 1, 1, 1, 0, 0, 0, 0] # Assuming the true labels for the test set

accuracy = accuracy_score(true_labels, predictions)
print("Accuracy on Test Set:", accuracy)

label_mapping = {1: 'spam', 0: 'ham'}
labels = [label_mapping[prediction] for prediction in predictions]
print("Test Emails:", labels)

true_labels = ['spam', 'spam', 'spam', 'spam', 'ham', 'ham', 'ham', 'ham'] # Assuming the true labels for the test set

accuracy = accuracy_score(true_labels, labels)
print("Accuracy on Test Set:", accuracy)

Learned Coefficients: [[-0.09425666  1.057532   -0.09425617 -1.3631898   1.36318972]]
Learned Intercept: [0.15827465]
Predictions for Test Emails: [1 1 1 0 1 1 1 1]
Accuracy on Test Set: 0.375
Test Emails: ['spam', 'spam', 'spam', 'ham', 'spam', 'spam', 'spam', 'spam']
Accuracy on Test Set: 0.375


Q3. Also use logistic regression model to predict the risk of having heart disease using the given dataset (heart.csv). You need to show complete pre-processing steps (identifying null or missing values, normalization, etc.)

In [33]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

df = pd.read_csv('heart.csv')
X = df.drop('HeartDisease', axis=1)
y = df['HeartDisease']

# Encode categorical variables
X_encoded = pd.get_dummies(X, columns=['Sex', 'ChestPainType', 'FastingBS', 'RestingECG', 'ExerciseAngina', 'ST_Slope'])

X_train, X_test, y_train, y_test = train_test_split(X_encoded, y, test_size=0.2, random_state=42) # Split the data into training and testing sets

# Standardize the features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

model = LogisticRegression(random_state=42) # Create and train a logistic regression model
model.fit(X_train_scaled, y_train)

y_pred = model.predict(X_test_scaled) # Make predictions on the test set

accuracy = accuracy_score(y_test, y_pred)
classification_report_output = classification_report(y_test, y_pred)
print("Accuracy on Test Set:", accuracy)
print("Classification Report:\n", classification_report_output)

Accuracy on Test Set: 0.8532608695652174
Classification Report:
               precision    recall  f1-score   support

           0       0.80      0.87      0.83        77
           1       0.90      0.84      0.87       107

    accuracy                           0.85       184
   macro avg       0.85      0.86      0.85       184
weighted avg       0.86      0.85      0.85       184

