# Practice Activity #2: Applying metrics and cross-validation

[link](https://www.coursera.org/learn/ai-and-machine-learning-algorithms-and-techniques/supplement/Xl0WF/practice-activity-applying-metrics-and-cross-validation)

In [135]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score, KFold
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, r2_score

In [136]:
# Sample dataset: Study hours, previous exam scores, and pass/fail labels
# Load the data from the student_data.csv
data = pd.read_csv('student_data.csv')

df = pd.DataFrame(data)

# Features and target variable
X = df[['StudyHours', 'PrevExamScore']]
y = df['Pass']
print(df.head(5))

   StudyHours  PrevExamScore  Pass
0           5             83     0
1           5             74     0
2           9             72     1
3           5             76     0
4           6             69     0


In [137]:
import random
randonmaxboundery = len(df) - 1
print(f"randonmaxboundery: {randonmaxboundery}")

random_state = random.randint(0, randonmaxboundery)
print(f"random_state: {random_state}")
# Split data into 80% training and 20% testing
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=random_state,shuffle=True)

# Display the shape of the training and testing sets
print(f"Training data: {X_train.shape}, {y_train.shape}")
print(f"Testing data: {X_test.shape}, {y_test.shape}")

randonmaxboundery: 999
random_state: 856
Training data: (800, 2), (800,)
Testing data: (200, 2), (200,)


In [138]:
# Initialize and train the Logistic Regression model
model = LogisticRegression()
model.fit(X_train, y_train)

# Make predictions on the test set
y_pred = model.predict(X_test)

  raw_prediction = X @ weights + intercept
  raw_prediction = X @ weights + intercept
  raw_prediction = X @ weights + intercept


In [139]:
# Calculate metrics
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

print(f'Accuracy: {accuracy}')
print(f'Precision: {precision}')
print(f'Recall: {recall}')
print(f'F1-Score: {f1}')

Accuracy: 0.995
Precision: 1.0
Recall: 0.9921259842519685
F1-Score: 0.9960474308300395


## Introducing cross-validation

_Cross-validation allows you to split the dataset into multiple subsets and reliably calculate model performance_
Cross-validation involves splitting the data into multiple folds, training the model on some folds, and testing it on the remaining folds. The process is repeated for each fold, and the average performance is taken across all folds.

### Performing k-fold cross-validation

You will use k-fold cross-validation, where the dataset is split into k equal parts (folds). Each fold is used as a test set while the remaining folds are used for training:

In [140]:
from sklearn.model_selection import cross_val_score

# Initialize the model
model = LogisticRegression()

# Perform 5-fold cross-validation and calculate accuracy for each fold
cv_scores = cross_val_score(model, X, y, cv=5, scoring='accuracy')

# Display the accuracy for each fold and the mean accuracy
print(f'Cross-validation accuracies: {cv_scores}')
print(f'Mean cross-validation accuracy: {np.mean(cv_scores)}')

Cross-validation accuracies: [0.975 0.995 1.    0.99  0.99 ]
Mean cross-validation accuracy: 0.99


  raw_prediction = X @ weights + intercept
  raw_prediction = X @ weights + intercept
  raw_prediction = X @ weights + intercept
  raw_prediction = X @ weights + intercept
  raw_prediction = X @ weights + intercept
  raw_prediction = X @ weights + intercept
  raw_prediction = X @ weights + intercept
  raw_prediction = X @ weights + intercept
  raw_prediction = X @ weights + intercept
  raw_prediction = X @ weights + intercept
  raw_prediction = X @ weights + intercept
  raw_prediction = X @ weights + intercept
  raw_prediction = X @ weights + intercept
  raw_prediction = X @ weights + intercept
  raw_prediction = X @ weights + intercept


### Cross-validation with multiple metrics

Calculate multiple metrics during cross-validation using the scoring parameter. Use k-fold cross-validation to calculate accuracy, precision, recall, and F1 score:

In [141]:
from sklearn.model_selection import cross_validate

# Define multiple scoring metrics
scoring = ['accuracy', 'precision', 'recall', 'f1']

# Perform cross-validation
cv_results = cross_validate(model, X, y, cv=5, scoring=scoring)

# Print results for each metric
print(f"Cross-validation Accuracy: {cv_results['test_accuracy']} <> mean: {np.mean(cv_results['test_accuracy'])}")
print(f"Cross-validation Accuracy: {cv_results['test_precision']} <> mean: {np.mean(cv_results['test_precision'])}")
print(f"Cross-validation Accuracy: {cv_results['test_recall']} <> mean: {np.mean(cv_results['test_recall'])}")
print(f"Cross-validation Accuracy: {cv_results['test_f1']} <> mean: {np.mean(cv_results['test_f1'])}")

Cross-validation Accuracy: [0.975 0.995 1.    0.99  0.99 ] <> mean: 0.99
Cross-validation Accuracy: [1. 1. 1. 1. 1.] <> mean: 1.0
Cross-validation Accuracy: [0.96062992 0.9921875  1.         0.984375   0.984375  ] <> mean: 0.9843134842519685
Cross-validation Accuracy: [0.97991968 0.99607843 1.         0.99212598 0.99212598] <> mean: 0.992050015718269


  raw_prediction = X @ weights + intercept
  raw_prediction = X @ weights + intercept
  raw_prediction = X @ weights + intercept
  raw_prediction = X @ weights + intercept
  raw_prediction = X @ weights + intercept
  raw_prediction = X @ weights + intercept
  raw_prediction = X @ weights + intercept
  raw_prediction = X @ weights + intercept
  raw_prediction = X @ weights + intercept
  raw_prediction = X @ weights + intercept
  raw_prediction = X @ weights + intercept
  raw_prediction = X @ weights + intercept
  raw_prediction = X @ weights + intercept
  raw_prediction = X @ weights + intercept
  raw_prediction = X @ weights + intercept


### Cross-validation with a regression model

For regression tasks, use metrics such as mean absolute error (MAE), mean squared error (MSE), and R-squared. Apply these metrics with cross-validation for a regression model:

In [142]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# Sample dataset for regression
X_reg = df[['StudyHours']]
y_reg = df['PrevExamScore']

# Initialize a linear regression model
reg_model = LinearRegression()

# Perform 5-fold cross-validation using R-squared as the metric
cv_scores_r2 = cross_val_score(reg_model, X_reg, y_reg, cv=5, scoring='r2')

print(f'Cross-validation R-squared scores: {cv_scores_r2}')
print(f'Mean R-squared score: {np.mean(cv_scores_r2)}')

Cross-validation R-squared scores: [-0.00266315 -0.00136984 -0.00899793 -0.00407431 -0.01346386]
Mean R-squared score: -0.006113819703703482
