<a href="https://colab.research.google.com/github/dopey-tim/Bus4-118S/blob/main/ML_Basics.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# Build a reproducible synthetic housing dataset (150 rows).
np.random.seed(42)
n_samples = 150

# Living area in square feet: integers drawn from [800, 4000) (upper bound exclusive).
square_footage = np.random.randint(800, 4000, n_samples)

# One of three location labels per house.
locations = np.random.choice(['Downtown', 'Suburb', 'Rural'], n_samples)

# Base price per square foot for each location.
price_per_sqft = dict(Downtown=350, Suburb=200, Rural=120)

# Price = area * location rate + integer noise drawn from [-20000, 20000).
# The noise is drawn one house at a time, after the two arrays above, so the
# sequence of seeded random draws stays fixed.
prices = []
for area, place in zip(square_footage, locations):
    noise = np.random.randint(-20000, 20000)
    prices.append(area * price_per_sqft[place] + noise)

# Assemble the table, then separate the model inputs from the target column.
df = pd.DataFrame({
    'square_footage': square_footage,
    'location': locations,
    'price': prices,
})
X = df[['square_footage', 'location']]
y = df['price']

# Preprocessing: one-hot encode `location`; `square_footage` passes through unchanged.
# verbose_feature_names_out=False keeps the plain output names ("location_Downtown",
# "square_footage") rather than prefixing them with the transformer name, so the
# coefficient report below can take its labels directly from the fitted transformer.
preprocessor = ColumnTransformer(
    transformers=[
        ('location', OneHotEncoder(sparse_output=False), ['location'])
    ],
    remainder='passthrough',
    verbose_feature_names_out=False
)

# Pipeline with preprocessing + regression
model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', LinearRegression())
])

# Split data: 80% train / 20% test, fixed seed for reproducibility
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Train model (the encoder and the regression are both fitted on the training rows only)
model.fit(X_train, y_train)

# Make prediction for a new house: 2000 sq ft in Downtown
new_house = pd.DataFrame({'square_footage': [2000], 'location': ['Downtown']})
predicted_price = model.predict(new_house)[0]
print(f"Predicted price for a 2000 sq ft house in Downtown: ${predicted_price:,.2f}")

# Display model coefficients.
# The labels come from the fitted ColumnTransformer itself, so they always follow the
# same column order as the matrix handed to the regressor (encoded columns first,
# then the passthrough remainder) instead of relying on a hand-maintained list.
feature_names = model.named_steps['preprocessor'].get_feature_names_out().tolist()

coefficients = model.named_steps['regressor'].coef_
intercept = model.named_steps['regressor'].intercept_

# NOTE: the encoder keeps all three location columns and LinearRegression fits an
# intercept, so the location columns are collinear with it; read the location
# coefficients relative to one another rather than as absolute effects.
print("\nModel Coefficients:")
for feature, coef in zip(feature_names, coefficients):
    print(f"{feature}: {coef:.2f}")
print(f"Intercept: {intercept:,.2f}")

# Model performance: Pipeline.score delegates to the regressor's R² on the held-out rows
r2_score = model.score(X_test, y_test)
print(f"\nModel R² Score on test data: {r2_score:.2f}")



Predicted price for a 2000 sq ft house in Downtown: $739,790.61

Model Coefficients:
location_Downtown: 303119.17
location_Rural: -260346.04
location_Suburb: -42773.12
square_footage: 209.35
Intercept: 17,970.39

Model R² Score on test data: 0.92


In [2]:
# Synthetic churn dataset with 150 entries
# Inspired by Kaggle Telco Customer Churn dataset
# https://www.kaggle.com/blastchar/telco-customer-churn

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score

# Build a reproducible synthetic customer table (150 rows).
np.random.seed(42)
n_samples = 150

# Numeric attributes (upper bounds are exclusive). The draw order is kept fixed so
# the seeded values do not change.
age = np.random.randint(20, 70, n_samples)
monthly_usage_hours = np.random.randint(5, 100, n_samples)
purchase_amount = np.random.randint(50, 1000, n_samples)
customer_service_calls = np.random.randint(0, 10, n_samples)
regions = np.random.choice(['North', 'South', 'East', 'West'], n_samples)

# Churn label rule: a customer is labelled 1 when EITHER
#   * usage is under 20 hours AND there were more than 5 service calls, OR
#   * the purchase amount is under 200;
# every other customer is labelled 0.
low_usage_many_calls = (monthly_usage_hours < 20) & (customer_service_calls > 5)
low_spend = purchase_amount < 200
churn = (low_usage_many_calls | low_spend).astype(int).tolist()

# Assemble the table, then separate the model inputs from the target column.
df = pd.DataFrame({
    'age': age,
    'monthly_usage_hours': monthly_usage_hours,
    'purchase_amount': purchase_amount,
    'customer_service_calls': customer_service_calls,
    'region': regions,
    'churn': churn,
})
X = df.drop(columns='churn')
y = df['churn']

# Column groups: numeric columns are standardised, the categorical one is one-hot encoded.
numeric_features = ['age', 'monthly_usage_hours', 'purchase_amount', 'customer_service_calls']
categorical_features = ['region']

preprocessor = ColumnTransformer(transformers=[
    ('num', StandardScaler(), numeric_features),
    ('cat', OneHotEncoder(sparse_output=False), categorical_features),
])

# Chain preprocessing and logistic regression into a single estimator.
model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', LogisticRegression(random_state=42, max_iter=1000)),
])

# Hold out 20% of the rows for evaluation (fixed seed).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Fit the scaler, the encoder and the classifier on the training rows.
model.fit(X_train, y_train)

# Score one hypothetical customer; column 1 of predict_proba is the churn class.
new_customer = pd.DataFrame([{
    'age': 40,
    'monthly_usage_hours': 25,
    'purchase_amount': 300,
    'customer_service_calls': 4,
    'region': 'West',
}])
churn_probability = model.predict_proba(new_customer)[0, 1]
churn_prediction = 1 if churn_probability > 0.5 else 0

print(f"Churn Probability for new customer: {churn_probability:.2f}")
print(f"Churn Prediction (1 = churn, 0 = no churn): {churn_prediction}")

# Pair each transformed feature name with its fitted coefficient.
fitted_preprocessor = model.named_steps['preprocessor']
fitted_classifier = model.named_steps['classifier']
feature_names = fitted_preprocessor.get_feature_names_out()
coefficients = fitted_classifier.coef_[0]
intercept = fitted_classifier.intercept_[0]

print("\nModel Coefficients:")
for position, feature in enumerate(feature_names):
    print(f"{feature}: {coefficients[position]:.2f}")
print(f"Intercept: {intercept:.2f}")

# Evaluate on the held-out rows: hard labels for the report and confusion matrix,
# churn-class probabilities for ROC-AUC.
y_pred = model.predict(X_test)
test_churn_proba = model.predict_proba(X_test)[:, 1]

print("\nClassification Report:")
print(classification_report(y_test, y_pred))
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))

roc_score = roc_auc_score(y_test, test_churn_proba)
print(f"ROC-AUC Score: {roc_score:.2f}")




Churn Probability for new customer: 0.30
Churn Prediction (1 = churn, 0 = no churn): 0

Model Coefficients:
num__age: -0.27
num__monthly_usage_hours: -0.58
num__purchase_amount: -1.67
num__customer_service_calls: 0.41
cat__region_East: 0.32
cat__region_North: 0.49
cat__region_South: -0.72
cat__region_West: -0.09
Intercept: -2.73

Classification Report:
              precision    recall  f1-score   support

           0       0.85      1.00      0.92        22
           1       1.00      0.50      0.67         8

    accuracy                           0.87        30
   macro avg       0.92      0.75      0.79        30
weighted avg       0.89      0.87      0.85        30

Confusion Matrix:
[[22  0]
 [ 4  4]]
ROC-AUC Score: 0.97


In [3]:
# Synthetic customer segmentation dataset with 150 entries
# Inspired by UCI Online Retail dataset
# https://archive.ics.uci.edu/ml/datasets/online+retail

import pandas as pd
import numpy as np
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import silhouette_score
import matplotlib.pyplot as plt

# Build a reproducible synthetic customer table (150 rows) for clustering.
np.random.seed(42)
n_samples = 150

# Integer attributes (upper bounds are exclusive) plus a region label per customer.
# The draw order is kept fixed so the seeded values do not change.
annual_spending = np.random.randint(500, 20000, n_samples)
purchase_frequency = np.random.randint(1, 50, n_samples)
age = np.random.randint(20, 70, n_samples)
region = np.random.choice(['North', 'South', 'East', 'West'], n_samples)

# One column per attribute, in the order they were generated.
df = pd.DataFrame(dict(
    annual_spending=annual_spending,
    purchase_frequency=purchase_frequency,
    age=age,
    region=region,
))

# K-Means works on distances, so the numeric columns are standardised first.
features = ['annual_spending', 'purchase_frequency', 'age']
X = df[features]
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Elbow method: record the inertia of a fitted K-Means model for K = 1..7.
K = range(1, 8)
inertia = [
    KMeans(n_clusters=k, random_state=42, n_init=10).fit(X_scaled).inertia_
    for k in K
]

# Write the elbow curve to disk; the figure is closed afterwards rather than shown.
fig, ax = plt.subplots(figsize=(8, 5))
ax.plot(K, inertia, 'bo-')
ax.set_xlabel('Number of Clusters (K)')
ax.set_ylabel('Inertia')
ax.set_title('Elbow Method for Optimal K')
fig.savefig('elbow_plot.png')
plt.close(fig)

# Final clustering with the chosen K; the labels are stored as a new column on df.
optimal_k = 3
kmeans = KMeans(n_clusters=optimal_k, random_state=42, n_init=10)
df['cluster'] = kmeans.fit_predict(X_scaled)

# Silhouette score on the scaled features.
sil_score = silhouette_score(X_scaled, df['cluster'])
print(f"Silhouette Score: {sil_score:.2f}")

# Per-cluster feature means (rounded to 2 decimals) and cluster sizes.
cluster_summary = df.groupby('cluster')[features].mean().round(2)
cluster_counts = df['cluster'].value_counts()

print("\nCluster Characteristics:")
print(cluster_summary)
print("\nCluster Counts:")
print(cluster_counts)

# Pick one marketing message per cluster from its mean profile. Spending is
# checked first, so a cluster that is both high-spending and frequent gets the
# high-spending message.
for cluster in range(optimal_k):
    profile = cluster_summary.loc[cluster]
    print(f"\nCluster {cluster} Strategy:")
    if profile['annual_spending'] > 12000:
        strategy = "💎 High-spending customers: Offer VIP programs, loyalty rewards, and exclusive promotions."
    elif profile['purchase_frequency'] > 25:
        strategy = "📦 Frequent buyers: Provide subscription services, bulk discounts, or early access to products."
    else:
        strategy = "📢 Low-engagement customers: Send personalized re-engagement campaigns and starter offers."
    print(strategy)

# Persist the labelled customers and the per-cluster summary.
df.to_csv('customer_segments.csv', index=False)
cluster_summary.to_csv('cluster_summary.csv')



Silhouette Score: 0.26

Cluster Characteristics:
         annual_spending  purchase_frequency    age
cluster                                            
0                4418.74               15.26  52.26
1               13244.22               18.08  34.78
2               11251.97               39.31  51.40

Cluster Counts:
cluster
2    58
1    49
0    43
Name: count, dtype: int64

Cluster 0 Strategy:
📢 Low-engagement customers: Send personalized re-engagement campaigns and starter offers.

Cluster 1 Strategy:
💎 High-spending customers: Offer VIP programs, loyalty rewards, and exclusive promotions.

Cluster 2 Strategy:
📦 Frequent buyers: Provide subscription services, bulk discounts, or early access to products.
