In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.metrics import r2_score, precision_score, accuracy_score, f1_score, mean_squared_error, mean_absolute_error, recall_score, confusion_matrix

print("All imports successful")

All imports successful


In [2]:
from sklearn.datasets import load_breast_cancer

# One seed shared by the estimator and the split so the two cannot drift apart.
SEED = 42

data = load_breast_cancer(as_frame=True)
df = data.frame

# NOTE(review): max_iter=4000 is presumably needed for the solver to converge on
# these unscaled features — confirm, or scale the inputs instead.
model = LogisticRegression(random_state=SEED, max_iter=4000)

target = 'target'

# Take the predictor names from the dataset's own metadata instead of a
# hand-typed 30-item list that has to be kept in sync with the frame manually.
features = [str(name) for name in data.feature_names]
assert target not in features and set(features) <= set(df.columns)

X = df[features]
y = df[target]

# Hold out 20% of the rows for evaluation.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=SEED)
assert len(X_train) + len(X_test) == len(X)

model.fit(X_train, y_train)

predictions = model.predict(X_test)

# Kept as a global: a later cell unpacks this matrix with .ravel().
cm = confusion_matrix(y_test, predictions)

In [3]:
# Task 1: Write function that takes y_true, y_pred as inputs
# Returns dictionary with keys: 'accuracy', 'precision', 'recall', 'f1'
# Use sklearn metrics inside the function

# Basic pattern:
# def function_name(param1, param2):
#     # calculations
#     return {'key1': value1, 'key2': value2}

# Task 2: Test your function with sample data
# Print results formatted nicely



def confusion_matrix_results(true, pred):
    """Summarise classification quality as a dictionary of four scores.

    Parameters
    ----------
    true
        Ground-truth labels; forwarded as the first argument of every metric.
    pred
        Predicted labels; forwarded as the second argument of every metric.

    Returns
    -------
    dict
        Keys 'accuracy', 'precision', 'recall' and 'f1' (in that order), each
        holding the matching sklearn score rounded to 3 decimal places. Every
        metric is called with its default settings.
    """
    # Report key -> sklearn scorer; the tuple order fixes the key order of the result.
    scorers = (
        ("accuracy", accuracy_score),
        ("precision", precision_score),
        ("recall", recall_score),
        ("f1", f1_score),
    )
    return {label: round(scorer(true, pred), 3) for label, scorer in scorers}

# Task 2: try the helper on the held-out labels and the model's predictions;
# as the last expression of the cell, the returned dict is shown as its output.
confusion_matrix_results(y_test, predictions)

{'accuracy': 0.956, 'precision': 0.946, 'recall': 0.986, 'f1': 0.966}

In [7]:
# Task: From memory, complete this workflow:
# 1. Import confusion_matrix and all 4 metric functions
# 2. Create confusion matrix from y_test, predictions
# 3. Extract TN, FP, FN, TP using .ravel()
# 4. Calculate all 4 metrics
# 5. Print formatted results

# Flatten the confusion matrix built in an earlier cell into its four counts.
tn, fp, fn, tp = cm.ravel()

# Score the held-out predictions with each metric, in the order they are reported.
accuracy, recall, precision, f1 = (
    scorer(y_test, predictions)
    for scorer in (accuracy_score, recall_score, precision_score, f1_score)
)

# One print call: the "\n" separator plus each leading "\n" gives a blank line
# between consecutive scores.
print(
    f"Accuracy: {accuracy:.3f}",
    f"\nRecall: {recall:.3f}",
    f"\nPrecision: {precision:.3f}",
    f"\nF1 Score: {f1:.3f}",
    sep="\n",
)

# Write imports without looking:
# from sklearn.metrics import ___

# Verbal check:
# - Precision answers what question?
# - Recall answers what question?


Accuracy: 0.956

Recall: 0.986

Precision: 0.946

F1 Score: 0.966


In [10]:
# -- Pattern: Percentage calculation
# SELECT category,
#       COUNT(*) as count,
#       ROUND(COUNT(*) * 100.0 / SUM(COUNT(*)) OVER(), 2) as percentage
# FROM table
# GROUP BY category

# -- Key syntax:
# -- ROUND(value, 2)  -- 2 decimal places
# -- 100.0 not 100    -- forces float division (avoids integer division trap)
# -- SUM(...) OVER()  -- window function for total across all rows

# -- Verbal tasks:
# -- 1. Why 100.0 instead of 100?
# -- 2. What does OVER() with empty parentheses mean?

import sqlite3 as sql

# Per-class row count and share of the total; SUM(COUNT(*)) OVER() is the grand
# total across all groups, and 100.0 keeps the division in floating point.
query = """ Select "target",
    COUNT(*) as count,
    ROUND(COUNT(*) * 100.0 / SUM(COUNT(*)) OVER(), 2) as percentage
    FROM breast_cancer
    GROUP BY "target"
    """

# Throwaway in-memory database, so re-running the cell always starts from an empty schema.
conn = sql.connect(":memory:")
try:
    # Copy the DataFrame into a SQL table so the query above can run against it.
    df.to_sql('breast_cancer', conn, index=False)
    result = pd.read_sql(query, conn)
finally:
    # Close even when loading or querying raises, so a failed run does not leave
    # an open handle behind in the kernel.
    conn.close()

# `result` is an ordinary DataFrame, so it stays usable after the connection is closed.
print(result.head(5))

   target  count  percentage
0       0    212       37.26
1       1    357       62.74
