In [None]:
# =============================
# STEP 0: Install Required Packages
# =============================
!pip install pandas numpy matplotlib seaborn scikit-learn --quiet

# =============================
# STEP 1: Import Libraries
# =============================
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, roc_auc_score, RocCurveDisplay

# Ensure plots appear in the notebook
%matplotlib inline

# =============================
# STEP 2: Create Sample Dataset
# =============================
# Seed NumPy's global RNG so the synthetic data is reproducible.
np.random.seed(42)
n_samples = 1000

# (low, high) bounds of each uniformly distributed numeric feature.
# Order matters: columns are drawn from the global RNG one after another,
# so reordering these entries would change the generated values.
uniform_bounds = {
    'MinTemp': (5, 25),
    'MaxTemp': (15, 35),
    'Rainfall': (0, 20),
    'WindGustSpeed': (20, 100),
    'Humidity3pm': (20, 100),
    'Pressure3pm': (980, 1030),
}
columns = {
    name: np.random.uniform(low, high, n_samples)
    for name, (low, high) in uniform_bounds.items()
}

# Yes/No labels are drawn after all numeric features, independently of them.
for name in ('RainToday', 'RainTomorrow'):
    columns[name] = np.random.choice(['Yes', 'No'], n_samples)

data = pd.DataFrame(columns)

# Show the first rows of the generated data as a rich notebook table.
print("Sample data (first 5 rows):")
preview = data.head()
display(preview)

# =============================
# STEP 3: Handle Missing Values
# =============================
# Numeric columns are filled with their mean; object (text) columns are
# filled with their most frequent value.
# NOTE(review): the fill statistics are computed over the full dataset, i.e.
# before the train/test split done later in this script - confirm that this
# is acceptable for the intended evaluation.
numerical_cols = data.select_dtypes(include=np.number).columns
categorical_cols = data.select_dtypes(include='object').columns

# Collect one replacement value per column, then fill everything in one call.
fill_values = {col: data[col].mean() for col in numerical_cols}
fill_values.update({col: data[col].mode()[0] for col in categorical_cols})
data = data.fillna(value=fill_values)

print("\nMissing values check:")
remaining_nulls = data.isnull().sum()
print(remaining_nulls)

# =============================
# STEP 4: Encode Categorical Features
# =============================
# Target: integer-encode the Yes/No labels. LabelEncoder numbers the classes
# in sorted order, so 'No' becomes 0 and 'Yes' becomes 1.
le = LabelEncoder()
le.fit(data['RainTomorrow'])
data['RainTomorrow'] = le.transform(data['RainTomorrow'])  # 1 = Rain, 0 = No Rain

# Feature: one-hot encode RainToday, dropping the first level so the
# remaining indicator column(s) are not redundant with each other.
data = pd.get_dummies(
    data,
    columns=['RainToday'],
    drop_first=True,
)

# =============================
# STEP 5: Split Data into Features and Target
# =============================
target_col = 'RainTomorrow'
y = data[target_col]
X = data.drop(columns=[target_col])

# Hold out 20% of the rows for testing; stratify on the target so both
# partitions keep the same class proportions.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

print("\nTraining set shape:", X_train.shape)
print("Test set shape:", X_test.shape)

# =============================
# STEP 6: Build Random Forest Classifier Pipeline
# =============================
# Two-stage pipeline: standardize the features, then fit a random forest.
pipeline_rf = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('classifier', RandomForestClassifier(random_state=42)),
])

# Hyper-parameter grid; the "classifier__" prefix routes each setting to the
# pipeline step named 'classifier'.
param_grid_rf = dict(
    classifier__n_estimators=[100, 200],
    classifier__max_depth=[None, 10, 20],
    classifier__min_samples_split=[2, 5],
    classifier__min_samples_leaf=[1, 2],
)

# Exhaustive search over the grid with 5-fold cross-validation, scored by
# accuracy, using all available CPU cores.
grid_search_rf = GridSearchCV(
    estimator=pipeline_rf,
    param_grid=param_grid_rf,
    scoring='accuracy',
    cv=5,
    n_jobs=-1,
)
grid_search_rf.fit(X_train, y_train)

print("\nRandom Forest - Best Parameters:", grid_search_rf.best_params_)
print("Random Forest - Best Cross-Validation Accuracy:", grid_search_rf.best_score_)

# =============================
# STEP 7: Evaluate Random Forest
# =============================
# Predictions come from the best estimator found by the grid search.
y_pred_rf = grid_search_rf.predict(X_test)

rf_test_accuracy = accuracy_score(y_test, y_pred_rf)
print("\nRandom Forest Test Accuracy:", rf_test_accuracy)
print("\nRandom Forest Classification Report:")
rf_report = classification_report(y_test, y_pred_rf)
print(rf_report)

# Confusion Matrix (rows: actual class, columns: predicted class)
cm = confusion_matrix(y_test, y_pred_rf)
fig, ax = plt.subplots(figsize=(5, 4))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', ax=ax)
ax.set_xlabel("Predicted")
ax.set_ylabel("Actual")
ax.set_title("Random Forest Confusion Matrix")
plt.show()

# ROC AUC, computed from the predicted probability of the positive class (1)
rf_probabilities = grid_search_rf.predict_proba(X_test)
roc_auc = roc_auc_score(y_test, rf_probabilities[:, 1])
print("Random Forest ROC AUC:", roc_auc)
RocCurveDisplay.from_estimator(grid_search_rf, X_test, y_test)
plt.show()

# =============================
# STEP 8: Logistic Regression
# =============================
# Baseline linear model, tuned and reported the same way as the random
# forest so the two models can be compared on equal terms.
pipeline_lr = Pipeline([
    ('scaler', StandardScaler()),
    ('classifier', LogisticRegression(max_iter=1000, random_state=42))
])

# Search over the inverse regularization strength C with an L2 penalty.
param_grid_lr = {
    'classifier__C': [0.01, 0.1, 1, 10],
    'classifier__penalty': ['l2']
}

grid_search_lr = GridSearchCV(
    pipeline_lr,
    param_grid_lr,
    cv=5,
    scoring='accuracy',
    n_jobs=-1
)

grid_search_lr.fit(X_train, y_train)

# Report the selected hyper-parameters and cross-validation score, mirroring
# the random forest section.
print("\nLogistic Regression - Best Parameters:", grid_search_lr.best_params_)
print("Logistic Regression - Best Cross-Validation Accuracy:", grid_search_lr.best_score_)

y_pred_lr = grid_search_lr.predict(X_test)

print("\nLogistic Regression Test Accuracy:", accuracy_score(y_test, y_pred_lr))
print("\nLogistic Regression Classification Report:")
print(classification_report(y_test, y_pred_lr))

# ROC AUC from the predicted probability of the positive class (1), using a
# separate variable so the random forest's `roc_auc` is not overwritten.
roc_auc_lr = roc_auc_score(y_test, grid_search_lr.predict_proba(X_test)[:, 1])
print("Logistic Regression ROC AUC:", roc_auc_lr)


Sample data (first 5 rows):


Unnamed: 0,MinTemp,MaxTemp,Rainfall,WindGustSpeed,Humidity3pm,Pressure3pm,RainToday,RainTomorrow
0,12.490802,18.702659,5.234114,73.81624,65.75967,999.681776,Yes,No
1,24.014286,25.838019,4.939576,83.734512,84.434586,1003.671783,Yes,Yes
2,19.639879,32.458917,18.125092,40.037432,80.812874,1022.72737,Yes,Yes
3,16.97317,29.644498,4.990924,69.989928,32.311992,997.000219,Yes,Yes
4,8.120373,31.131223,5.438995,65.739679,31.939958,1023.482484,Yes,No



Missing values check:
MinTemp          0
MaxTemp          0
Rainfall         0
WindGustSpeed    0
Humidity3pm      0
Pressure3pm      0
RainToday        0
RainTomorrow     0
dtype: int64

Training set shape: (800, 7)
Test set shape: (200, 7)
