# Week 9: Advanced Supervised Learning Algorithms 
- Theory: Study advanced classification algorithms (Random Forest, Gradient Boosting, SVM) and ensemble learning.
- Hands-On: Implement a Random Forest or XGBoost classifier.
- Client Project: Implement an advanced classifier for client data (e.g., customer churn prediction).
- Submit: Python script and model evaluation (on Google Classroom).

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report, confusion_matrix
import xgboost as xgb

import warnings
# Silence every warning category for the rest of the session. Note that this
# also hides deprecation notices raised by the libraries used below.
warnings.filterwarnings('ignore')

In [None]:
# Load the Titanic data from a CSV file located in the working directory
df = pd.read_csv("titanic.csv")
# Preview the first rows of the frame
df.head()

In [None]:
# Remove identifier-like / free-text columns that are not used as model inputs
unused_columns = ['Name', 'Ticket', 'Cabin', 'PassengerId']
df = df.drop(columns=unused_columns)

In [None]:
# List the columns that remain in the frame
df.columns

In [None]:
# Distinct values found in the 'Sex' column
df["Sex"].unique()

In [None]:
# Distinct values found in the 'Embarked' column (NaN shows up here if present)
df["Embarked"].unique()

In [None]:
# Number of missing values per column
df.isnull().sum()

In [None]:
# Fill missing values.
# The result is assigned back to the column instead of calling
# fillna(..., inplace=True) on df['col']: the in-place form operates on an
# intermediate object, is deprecated chained assignment in pandas 2.x, and
# leaves df unchanged once Copy-on-Write is active.

# Age: impute with the column median
df['Age'] = df['Age'].fillna(df['Age'].median())

# Embarked: impute with the most frequent value (first entry of mode())
df['Embarked'] = df['Embarked'].fillna(df['Embarked'].mode()[0])

In [None]:
# Convert the text-valued columns to integer codes, fitting a fresh encoder per column
categorical_features = ('Sex', 'Embarked')
for feature in categorical_features:
    encoder = LabelEncoder()
    df[feature] = encoder.fit_transform(df[feature])

df.head()

In [None]:
# Separate the 'Survived' target from the features, then hold out 20% of the
# rows for testing (seeded so the split is reproducible)
y = df['Survived']
X = df.drop(columns='Survived')
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


### Train Random Forest Classifier

In [None]:
# Build a Random Forest with a fixed seed (all other settings left at their
# defaults) and fit it on the training split
rf = RandomForestClassifier(random_state=42)
rf.fit(X_train, y_train)


In [None]:
# Predict class labels for the held-out test features

y_pred_rf = rf.predict(X_test)

### Evaluate Random Forest

In [None]:
# Headline metrics for the Random Forest predictions on the test split
print("RF Accuracy:", accuracy_score(y_test, y_pred_rf))
print("RF Precision:", precision_score(y_test, y_pred_rf))
print("RF Recall:", recall_score(y_test, y_pred_rf))
print("RF F1-Score:", f1_score(y_test, y_pred_rf))
print("\nClassification Report:\n", classification_report(y_test, y_pred_rf))

# Confusion-matrix heatmap. confusion_matrix() places the true classes on the
# rows and the predicted classes on the columns, so the axes are labelled
# accordingly (matching the client-project plot later in the notebook).
sns.heatmap(confusion_matrix(y_test, y_pred_rf), annot=True, fmt='d', cmap="Blues")
plt.xlabel("Predicted")
plt.ylabel("Actual")
plt.title("RF Confusion Matrix")
plt.show()


### Train XGBoost Classifier

In [None]:

# Fit an XGBoost classifier scored with log-loss and seeded for reproducibility.
# The former use_label_encoder=False argument is dropped: the option is
# deprecated and unused by current XGBoost releases, where passing it only
# produces an "unused parameter" warning.
xgb_clf = xgb.XGBClassifier(eval_metric='logloss', random_state=42)
xgb_clf.fit(X_train, y_train)

In [None]:
# Predict class labels for the held-out test features with the XGBoost model

y_pred_xgb = xgb_clf.predict(X_test)

In [None]:
# Report the XGBoost test-set scores, one line per metric
xgb_scores = (
    ("XGB Accuracy:", accuracy_score),
    ("XGB Precision:", precision_score),
    ("XGB Recall:", recall_score),
    ("XGB F1-Score:", f1_score),
)
for label, scorer in xgb_scores:
    print(label, scorer(y_test, y_pred_xgb))

# Per-class breakdown
xgb_report = classification_report(y_test, y_pred_xgb)
print("\nXGB Classification Report:\n", xgb_report)

In [None]:
# Confusion matrix for the XGBoost predictions.
# confusion_matrix() places the true classes on the rows and the predicted
# classes on the columns, so the axes are labelled accordingly (matching the
# client-project plot later in the notebook).

sns.heatmap(confusion_matrix(y_test, y_pred_xgb), annot=True, fmt='d', cmap="Greens")
plt.xlabel("Predicted")
plt.ylabel("Actual")
plt.title("XGB Confusion Matrix")
plt.show()

### Comparison Table

In [None]:
# Side-by-side comparison of both models on the same test split.
# Each metric column holds one score per model, in the order listed under "Model".
model_predictions = {
    "Random Forest": y_pred_rf,
    "XGBoost": y_pred_xgb,
}
metric_functions = {
    "Accuracy": accuracy_score,
    "Precision": precision_score,
    "Recall": recall_score,
    "F1-Score": f1_score,
}

results = {"Model": list(model_predictions)}
for metric_name, metric_fn in metric_functions.items():
    results[metric_name] = [
        metric_fn(y_test, predicted) for predicted in model_predictions.values()
    ]

pd.DataFrame(results)


In [None]:
# Rank the Random Forest's feature importances and chart the largest ones

rf_importances = pd.Series(rf.feature_importances_, index=X.columns)
importances = rf_importances.sort_values(ascending=True)
# tail(10) keeps at most the ten highest-ranked features
top_importances = importances.tail(10)
top_importances.plot.barh(figsize=(8, 6))
plt.title("Top 10 Feature Importances (Random Forest)")
plt.show()


# Client Project: Implement an advanced classifier for client data (e.g., customer churn prediction).

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score, StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, classification_report, confusion_matrix
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb

In [None]:
 
# Load the customer purchase data from a CSV file located in the working directory
df = pd.read_csv("Walmart_customer_purchases.csv")
# Preview the first three rows
df.head(3)


In [None]:
# Print the (rows, columns) shape, then show summary statistics
print(df.shape)
df.describe()

In [None]:

# Number of missing values per column
df.isnull().sum()

In [None]:
# Display one randomly selected row (sample() draws a single row by default)
df.sample()


In [None]:
# Inspect the distinct values present in the target column

df['Repeat_Customer'].unique()

### Basic cleaning of the dataset


In [None]:
# List the available column names
df.columns

In [None]:
# Count how often each target value occurs; dropna=False also reports missing entries
print(df['Repeat_Customer'].value_counts(dropna=False))

In [None]:
# Keep only rows whose target is an explicit 'Yes' or 'No'.
# .copy() makes the filtered frame independent of the original, so the column
# assignment below writes to df itself rather than to a slice (which would
# raise SettingWithCopyWarning).
df = df[df['Repeat_Customer'].isin(['Yes','No'])].copy()
# Encode the target as 1 (repeat customer) / 0 (not a repeat customer)
df['Repeat_Customer'] = df['Repeat_Customer'].map({'Yes':1, 'No':0})

In [None]:
# Impute missing values in every feature column; the identifier, date and
# target columns are left untouched.
# The branch tests "is numeric" rather than dtype == 'object' so that text
# columns stored with a dedicated string or categorical dtype still receive
# mode imputation instead of failing inside median().
skip_cols = {'Customer_ID', 'Purchase_Date', 'Repeat_Customer'}
for col in df.columns:
    if col in skip_cols:
        continue
    if pd.api.types.is_numeric_dtype(df[col]):
        # Numeric feature: fill gaps with the median
        df[col] = df[col].fillna(df[col].median())
    else:
        # Non-numeric feature: fill gaps with the most frequent value
        df[col] = df[col].fillna(df[col].mode()[0])

In [None]:
# Imported in this cell so the client-project section can run on its own: the
# section's import cells bring in StandardScaler but not LabelEncoder.
from sklearn.preprocessing import LabelEncoder

# Replace each categorical column with integer codes, fitting one encoder per column
categorical_cols = ['Gender','City','Category','Product_Name','Payment_Method','Discount_Applied']
for col in categorical_cols:
    df[col] = LabelEncoder().fit_transform(df[col])

## EDA (Exploratory Data Analysis)

In [None]:
# Bar chart of how many rows fall into each target class
target_ax = sns.countplot(data=df, x='Repeat_Customer')
target_ax.set_title("Repeat Customer Distribution")
plt.show()

In [None]:
# Histogram of customer age (20 bins) with a KDE curve overlaid
age_ax = sns.histplot(df['Age'], bins=20, kde=True)
age_ax.set_title("Age Distribution")
plt.show()

In [None]:
# Histogram of purchase amount (20 bins) with a KDE curve overlaid
amount_ax = sns.histplot(df['Purchase_Amount'], bins=20, kde=True)
amount_ax.set_title("Purchase Amount Distribution")
plt.show()

In [None]:
# Assign features and target: X is every column except the identifier, the
# purchase date and the label; y is the label itself
non_feature_cols = ['Customer_ID', 'Purchase_Date', 'Repeat_Customer']
y = df['Repeat_Customer']
X = df.drop(columns=non_feature_cols)


In [None]:
# Split the data 80:20 (seeded for reproducibility).
# stratify=y keeps the proportion of repeat / non-repeat customers the same in
# the training and test partitions, so the reported metrics are not skewed by
# an uneven draw of the target classes.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

## Random Forest Model

In [None]:
# Build a 100-tree Random Forest with a fixed seed and fit it on the training split
rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(X_train, y_train)

In [None]:
# Predict class labels for the held-out test features

y_pred = rf.predict(X_test)

In [None]:
# Report the test-set scores, one line per metric
score_table = (
    ("Accuracy:", accuracy_score),
    ("Precision:", precision_score),
    ("Recall:", recall_score),
    ("F1-Score:", f1_score),
)
for label, scorer in score_table:
    print(label, scorer(y_test, y_pred))

# Per-class breakdown
report_text = classification_report(y_test, y_pred)
print("\nClassification Report:\n", report_text)

In [None]:
# Confusion Matrix
cm = confusion_matrix(y_test, y_pred)
sns.heatmap(cm, annot=True, fmt="d", cmap="Blues")
plt.xlabel("Predicted")
plt.ylabel("Actual")
plt.title("Confusion Matrix")
plt.show()

## Thank You