<a href="https://colab.research.google.com/github/dopey-tim/Bus4-118S/blob/main/ML_Basics.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [8]:
# Source: Approximate values inspired by public housing market data from Zillow Research
# (https://www.zillow.com/research/data/) – simplified for demonstration purposes.

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# Generate more realistic sample data (prices in USD, square footage, locations)
data = {
    'square_footage': [1200, 1500, 2000, 2200, 1800, 2500, 3000, 3500, 4000, 2800],
    'location': ['Downtown', 'Suburb', 'Downtown', 'Rural', 'Suburb',
                 'Downtown', 'Rural', 'Suburb', 'Downtown', 'Rural'],
    'price': [350000, 420000, 600000, 320000, 450000,
              720000, 400000, 500000, 850000, 370000]
}
df = pd.DataFrame(data)

# Features and target
X = df[['square_footage', 'location']]
y = df['price']

# Preprocessing: One-hot encode the location column
preprocessor = ColumnTransformer(
    transformers=[
        ('location', OneHotEncoder(sparse_output=False), ['location'])
    ],
    remainder='passthrough'
)

# Create pipeline with preprocessing and model
model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', LinearRegression())
])

# Split data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train model
model.fit(X_train, y_train)

# Make prediction for a new house: 2000 sq ft in Downtown
new_house = pd.DataFrame({'square_footage': [2000], 'location': ['Downtown']})
predicted_price = model.predict(new_house)[0]

print(f"Predicted price for a 2000 sq ft house in Downtown: ${predicted_price:,.2f}")

# Display model coefficients
feature_names = (
    model.named_steps['preprocessor']
    .named_transformers_['location']
    .get_feature_names_out(['location'])
).tolist() + ['square_footage']

coefficients = model.named_steps['regressor'].coef_
intercept = model.named_steps['regressor'].intercept_

print("\nModel Coefficients:")
for feature, coef in zip(feature_names, coefficients):
    print(f"{feature}: {coef:,.2f}")

print(f"\nIntercept (baseline price): ${intercept:,.2f}")

# Model performance
r2_score = model.score(X_test, y_test)
print(f"\nModel R² Score on test data: {r2_score:.2f}")


Predicted price for a 2000 sq ft house in Downtown: $568,841.40

Model Coefficients:
location_Downtown: 153,216.71
location_Rural: -133,456.25
location_Suburb: -19,760.46
square_footage: 121.75

Intercept (baseline price): $172,130.04

Model R² Score on test data: 0.91


In [9]:
# Source: Values inspired by the public "Telco Customer Churn" dataset on Kaggle
# https://www.kaggle.com/blastchar/telco-customer-churn

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer, make_column_selector
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score

# Sample synthetic customer churn data
data = {
    'age': [25, 42, 51, 30, 60, 36, 48, 29, 55, 33],
    'monthly_usage_hours': [15, 80, 25, 10, 95, 45, 30, 12, 70, 50],
    'purchase_amount': [120, 400, 180, 90, 500, 250, 200, 100, 420, 300],
    'customer_service_calls': [3, 1, 7, 5, 0, 2, 6, 4, 1, 3],
    'region': ['North', 'South', 'West', 'East', 'South',
               'North', 'West', 'East', 'South', 'North'],
    'churn': [1, 0, 1, 1, 0, 0, 1, 1, 0, 0]  # 1 = churned, 0 = not churned
}
df = pd.DataFrame(data)

# Features and target
X = df.drop('churn', axis=1)
y = df['churn']

# Preprocessing: scale numerical + one-hot encode categorical
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), make_column_selector(dtype_include=np.number)),
        ('cat', OneHotEncoder(sparse_output=False), make_column_selector(dtype_include=object))
    ]
)

# Pipeline with logistic regression
model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', LogisticRegression(random_state=42, max_iter=1000))
])

# Split data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train model
model.fit(X_train, y_train)

# Predict churn probability for a new customer
new_customer = pd.DataFrame({
    'age': [40],
    'monthly_usage_hours': [35],
    'purchase_amount': [250],
    'customer_service_calls': [2],
    'region': ['West']
})
churn_probability = model.predict_proba(new_customer)[0][1]
churn_prediction = int(churn_probability > 0.5)

print(f"Churn Probability for new customer: {churn_probability:.2f}")
print(f"Churn Prediction (1 = churn, 0 = no churn): {churn_prediction}")

# Model coefficients
feature_names = (
    model.named_steps['preprocessor']
    .get_feature_names_out()
)
coefficients = model.named_steps['classifier'].coef_[0]
intercept = model.named_steps['classifier'].intercept_[0]

print("\nModel Coefficients:")
for feature, coef in zip(feature_names, coefficients):
    print(f"{feature}: {coef:.2f}")
print(f"Intercept: {intercept:.2f}")

# Model performance
y_pred = model.predict(X_test)
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))

roc_score = roc_auc_score(y_test, model.predict_proba(X_test)[:, 1])
print(f"ROC-AUC Score: {roc_score:.2f}")



Churn Probability for new customer: 0.53
Churn Prediction (1 = churn, 0 = no churn): 1

Model Coefficients:
num__age: -0.03
num__monthly_usage_hours: -0.73
num__purchase_amount: -0.71
num__customer_service_calls: 0.66
cat__region_East: 0.17
cat__region_North: -0.38
cat__region_South: -0.02
cat__region_West: 0.23
Intercept: 0.62

Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00         2

    accuracy                           1.00         2
   macro avg       1.00      1.00      1.00         2
weighted avg       1.00      1.00      1.00         2

Confusion Matrix:
[[2]]
ROC-AUC Score: nan




In [10]:
# Source: Synthetic values inspired by UCI Online Retail Dataset
# (https://archive.ics.uci.edu/ml/datasets/online+retail)
# Used for demonstration of customer segmentation.

import pandas as pd
import numpy as np
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import silhouette_score
import matplotlib.pyplot as plt

# Generate more realistic customer data
data = {
    'annual_spending': [500, 3000, 15000, 700, 12000, 4500, 2000, 8000, 1000, 6000,
                        9500, 2500, 400, 11000, 5200],
    'purchase_frequency': [5, 18, 45, 3, 40, 22, 12, 35, 6, 28,
                           30, 15, 4, 42, 20],
    'age': [25, 34, 52, 28, 48, 36, 41, 29, 47, 33,
            55, 39, 26, 50, 44],
    'region': ['North', 'South', 'West', 'East', 'South',
               'North', 'West', 'East', 'South', 'North',
               'East', 'West', 'South', 'North', 'East']
}
df = pd.DataFrame(data)

# Select and scale numerical features
features = ['annual_spending', 'purchase_frequency', 'age']
X = df[features]
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Determine optimal number of clusters using elbow method
inertia = []
K = range(1, 6)
for k in K:
    kmeans = KMeans(n_clusters=k, random_state=42, n_init=10)
    kmeans.fit(X_scaled)
    inertia.append(kmeans.inertia_)

# Plot elbow curve
plt.figure(figsize=(8, 5))
plt.plot(K, inertia, 'bo-')
plt.xlabel('Number of Clusters (K)')
plt.ylabel('Inertia')
plt.title('Elbow Method for Optimal K')
plt.savefig('elbow_plot.png')
plt.close()

# Apply K-Means with chosen K (assume 3 based on elbow)
optimal_k = 3
kmeans = KMeans(n_clusters=optimal_k, random_state=42, n_init=10)
df['cluster'] = kmeans.fit_predict(X_scaled)

# Evaluate clustering with silhouette score
sil_score = silhouette_score(X_scaled, df['cluster'])
print(f"Silhouette Score: {sil_score:.2f}")

# Analyze clusters
cluster_summary = df.groupby('cluster')[features].mean().round(2)
cluster_counts = df['cluster'].value_counts()

print("\nCluster Characteristics:")
print(cluster_summary)
print("\nCluster Counts:")
print(cluster_counts)

# Example of targeted strategies
for cluster in range(optimal_k):
    print(f"\nCluster {cluster} Strategy:")
    if cluster_summary.loc[cluster, 'annual_spending'] > 8000:
        print("💎 High-spending customers: Offer VIP perks, loyalty rewards, and exclusive promotions.")
    elif cluster_summary.loc[cluster, 'purchase_frequency'] > 20:
        print("📦 Frequent buyers: Provide bulk discounts or subscription plans.")
    else:
        print("📢 Low-engagement customers: Send personalized re-engagement campaigns or starter offers.")

# Save results
df.to_csv('customer_segments.csv', index=False)
cluster_summary.to_csv('cluster_summary.csv')


Silhouette Score: 0.52

Cluster Characteristics:
         annual_spending  purchase_frequency    age
cluster                                            
0                4025.00               19.50  37.88
1               11875.00               39.25  51.25
2                 533.33                4.00  26.33

Cluster Counts:
cluster
0    8
1    4
2    3
Name: count, dtype: int64

Cluster 0 Strategy:
📢 Low-engagement customers: Send personalized re-engagement campaigns or starter offers.

Cluster 1 Strategy:
💎 High-spending customers: Offer VIP perks, loyalty rewards, and exclusive promotions.

Cluster 2 Strategy:
📢 Low-engagement customers: Send personalized re-engagement campaigns or starter offers.
