In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Machine Learning Libraries (Aligned with Week 8-10)
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [None]:


# ==========================================
# 1. DATA PREPARATION
# ==========================================
# Loads the song and climate tables, splits multi-genre labels into one row
# per genre, joins the two tables on 'Country', and keeps only the most
# frequent genres as classification targets.
print("--- 1. Loading and Preparing Data ---")

# Climate columns that the later cells use as model features.
climate_features = ['Average_Temp_Celsius', 'Rainfall_mm_per_year']

# Load Data
df_songs = pd.read_excel('Song_Data_With_Genres.xlsx')
df_climate = pd.read_csv('Global_Climate_Data_Formatted.csv', delimiter=';')

# Clean Climate Data (Fix decimals)
# A column that was not parsed as numeric is assumed to use a decimal comma.
# Checking "not numeric" (instead of dtype == 'object') also covers pandas'
# dedicated string dtypes; astype(str) first keeps mixed str/float columns
# from being turned into NaN by the .str accessor.
for col in climate_features:
    if col in df_climate.columns and not pd.api.types.is_numeric_dtype(df_climate[col]):
        df_climate[col] = (
            df_climate[col]
            .astype(str)
            .str.replace(',', '.', regex=False)
            .astype(float)
        )

# Preprocessing: Explode "Pop / R&B" into separate samples
# This is crucial so we don't treat "Pop / R&B" as a unique class
df_songs['Genre_List'] = (
    df_songs['Genre'].astype(str).str.strip().str.split(r'\s*/\s*')
)
df_expanded = df_songs.explode('Genre_List').rename(columns={'Genre_List': 'Target_Genre'})

# Drop rows whose genre was missing (astype(str) would otherwise create a
# literal "nan" class) and empty fragments left by labels such as "Pop /".
has_real_genre = df_expanded['Genre'].notna() & df_expanded['Target_Genre'].ne('')
df_expanded = df_expanded.loc[has_real_genre.to_numpy()]

# Merge
df_ml = pd.merge(df_expanded, df_climate, on='Country', how='inner')

# Rows without both climate features cannot be used by the estimators
# in the following sections (K-Means rejects NaN inputs).
df_ml = df_ml.dropna(subset=climate_features)

# Filter for Top 8 Genres (Class Imbalance Handling - Week 8 Concept)
# We remove rare classes to ensure the model has enough data to learn
top_genres = df_ml['Target_Genre'].value_counts().head(8).index
df_final = df_ml[df_ml['Target_Genre'].isin(top_genres)].copy()

print(f"Data Ready. Classes: {list(top_genres)}")

In [None]:


# ==========================================
# 2. SUPERVISED LEARNING: DECISION TREES 
# Goal: Predict Genre from Temp & Rain
# ==========================================
# Trains a shallow decision tree on the two climate features, prints a
# classification report for the held-out split, and saves two figures:
# the confusion matrix and the fitted tree.
print("\n--- 2. Decision Tree Analysis ---")

# Features (X) & Target (y)
X = df_final[['Average_Temp_Celsius', 'Rainfall_mm_per_year']]
y = df_final['Target_Genre']

# Train-Test Split (Week 8 Requirement)
# We use stratify=y to maintain genre proportions in both sets
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Model Training (Week 9)
# max_depth=3 is chosen to prevent Overfitting (High Variance)
clf = DecisionTreeClassifier(criterion='gini', max_depth=3, random_state=42)
clf.fit(X_train, y_train)

# Predictions
y_pred = clf.predict(X_test)

# Evaluation (Week 8 Metrics)
print("Classification Report:")
print(classification_report(y_test, y_pred, zero_division=0))

# Plain list of string labels, in the same order as clf.classes_.
# plot_tree documents class_names as a list of str; passing the NumPy array
# directly is rejected by some scikit-learn releases.
genre_labels = [str(label) for label in clf.classes_]

# VISUALIZATION 1: Confusion Matrix (The "Week 8" Update)
# This shows exactly which genres are being confused with each other
# labels=clf.classes_ fixes the row/column order so it matches the tick labels.
cm = confusion_matrix(y_test, y_pred, labels=clf.classes_)
plt.figure(figsize=(10, 8))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
            xticklabels=genre_labels, yticklabels=genre_labels)
plt.title('Confusion Matrix: Where does the model make mistakes?')
plt.xlabel('Predicted Genre')
plt.ylabel('Actual Genre')
plt.savefig('ml_confusion_matrix.png')
plt.show()

# VISUALIZATION 2: The Decision Tree (Week 9)
# feature_names must follow the column order of X (temperature, then rainfall).
plt.figure(figsize=(20, 10))
plot_tree(clf, feature_names=['Temp', 'Rain'], class_names=genre_labels,
          filled=True, rounded=True, fontsize=10)
plt.title("Decision Tree Rules")
plt.savefig('ml_decision_tree.png')
plt.show()


In [None]:

# ==========================================
# 3. UNSUPERVISED LEARNING: CLUSTERING 
# Goal: Find natural "Climate-Music Zones"
# ==========================================
# Clusters the distinct (temperature, rainfall) pairs with K-Means, plots an
# elbow curve and the resulting clusters, then reports the most frequent
# genre among the songs that fall into each cluster.
print("\n--- 3. K-Means Clustering Analysis ---")

climate_cols = ['Average_Temp_Celsius', 'Rainfall_mm_per_year']

# Prepare Data
# dropna(): K-Means rejects NaN inputs.
# copy(): the 'Cluster' column is added below; working on an explicit copy
# avoids pandas' SettingWithCopyWarning on the sliced frame.
X_cluster = df_final[climate_cols].dropna().drop_duplicates().copy()
n_points = len(X_cluster)

# Scaling (Crucial for K-Means)
# Since Rain (1000s) is much larger than Temp (20s), we must scale
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_cluster)

# VISUALIZATION 3: The Elbow Method 
# Mathematically justifying the number of clusters (k)
# k cannot exceed the number of distinct points, so the range is capped.
# n_init is set explicitly so results do not depend on the scikit-learn
# version's default (and no FutureWarning is emitted about it).
inertia = []
k_range = range(1, min(9, n_points) + 1)
for k in k_range:
    kmeans = KMeans(n_clusters=k, n_init=10, random_state=42)
    kmeans.fit(X_scaled)
    inertia.append(kmeans.inertia_)

plt.figure(figsize=(8, 5))
plt.plot(k_range, inertia, marker='o', linestyle='--')
plt.title('Elbow Method: Finding Optimal k')
plt.xlabel('Number of Clusters (k)')
plt.ylabel('Inertia (Sum of Squared Distances)')
plt.grid(True)
plt.savefig('ml_elbow_method.png')
plt.show()

# Run Final Clustering with Optimal k=3 (based on likely elbow)
# Capped as well, in case fewer than 3 distinct climate points exist.
final_k = min(3, n_points)
kmeans_final = KMeans(n_clusters=final_k, n_init=10, random_state=42)
X_cluster['Cluster'] = kmeans_final.fit_predict(X_scaled)

# VISUALIZATION 4: Clusters on 2D Plane
plt.figure(figsize=(10, 6))
sns.scatterplot(data=X_cluster, x='Average_Temp_Celsius', y='Rainfall_mm_per_year',
                hue='Cluster', palette='viridis', s=100, style='Cluster')
plt.title('K-Means Clusters: Natural Climate Groups')
plt.savefig('ml_clusters.png')
plt.show()


def _most_common_genre(genres):
    """Return the most frequent value in `genres`, or "None" if it has no non-null values."""
    counts = genres.value_counts()
    return counts.index[0] if len(counts) > 0 else "None"


# Interpret Clusters
print("\nMost Common Genre in Each Cluster:")
# Merge back to find which genres map to these clusters
# (left join: songs whose climate pair was dropped above get no cluster
# and are ignored by the groupby).
df_analysis = pd.merge(df_final, X_cluster, on=climate_cols, how='left')
print(df_analysis.groupby('Cluster')['Target_Genre'].agg(_most_common_genre))

In [None]:
# ==========================================
# 4. CAUSAL INSIGHTS 
# Goal: Understand "Feature Importance" (What drives the prediction?)
# ==========================================
# Plots the fitted decision tree's feature importances and prints which of
# the two climate features contributes more to its splits.
# NOTE(review): tree feature importance measures how much each feature
# contributes to the model's splits; it is not evidence of causation, so the
# "causal" wording in the printed messages should be read loosely.
print("\n--- 4. Causal Insights: Feature Importance ---")

# Extract Feature Importance from the Decision Tree
importances = clf.feature_importances_
# Must be in the same order as the columns the tree was trained on.
features = ['Average_Temp_Celsius', 'Rainfall_mm_per_year']

# Plot
plt.figure(figsize=(8, 5))
sns.barplot(x=features, y=importances, palette='magma')
plt.title('Feature Importance: Which variable drives Genre more?', fontsize=14)
plt.ylabel('Importance Score (0-1)')
# Headroom above 1.0 so the value label of a bar near 1 is not clipped.
plt.ylim(0, 1.1)

# Add text labels
for i, v in enumerate(importances):
    plt.text(i, v + 0.02, f"{v:.2f}", ha='center', fontsize=12, fontweight='bold')

plt.savefig('ml_feature_importance.png')
plt.show()

print("INTERPRETATION:")
temp_importance, rain_importance = importances[0], importances[1]
if np.isclose(temp_importance, rain_importance):
    # Covers exact ties, including the all-zero case of a tree with no
    # splits, where neither feature can be called the stronger one.
    print("Temperature and Rainfall have (nearly) equal importance; neither clearly dominates the prediction.")
elif temp_importance > rain_importance:
    print("Temperature has a higher causal weight (importance) in determining Genre than Rainfall.")
else:
    print("Rainfall appears to be the stronger predictor/driver of Genre preference.")