# Typical Project Workflow and Best Practices
### Pandas, Numpy, Scikit-learn

### 1. Analyze the problem

In [None]:
'''
DECISION TREE 1: IDENTIFY PROBLEM TYPE
├── Target is continuous (price, temperature, count) → REGRESSION
│   ├── Evaluation metric mentioned?
│   │   ├── MAE → Use mean_absolute_error
│   │   ├── MSE/RMSE → Use mean_squared_error
│   │   ├── R² → Use r2_score
│   │   └── Not mentioned → Assume MAE or RMSE
│   └── Default models: RandomForest, GradientBoosting
│
└── Target is categorical (yes/no, classes) → CLASSIFICATION
    ├── Binary (2 classes) → BINARY CLASSIFICATION
    │   ├── Metric: accuracy, precision, recall, F1, AUC
    │   └── Models: LogisticRegression, RandomForest, GradientBoosting
    └── Multi-class (3+ classes) → MULTICLASS CLASSIFICATION
        ├── Metric: accuracy, weighted F1
        └── Models: RandomForest, GradientBoosting
'''

### 2. Load data & Exploratory Data Analysis (EDA)

In [None]:
'''
2.1: Data Loading Strategy
2.2: Quick EDA Checks
├── Missing Values: <5%, 5-30%, >30%
├── Data Types: numerical, categorical, dates
├── Target Analysis: outliers, imbalance
└── Feature Types: low/high cardinality
'''
# 2.1: Load data
# Core libraries used by every cell below.
import numpy as np
import pandas as pd

# Single CSV file
df = pd.read_csv('dataset.csv')
# Same file, but parse the date columns into datetime64 while loading.
# (pd.read_excel cannot read a .csv; use pd.read_excel('dataset.xlsx', ...) only for real Excel workbooks.)
df = pd.read_csv('dataset.csv', parse_dates=['date', 'timestamp'])

# LOAD MULTIPLE FILES (cars_0.csv, cars_1.csv, etc.)
# Built with a comprehension so the single-file `df` above is not overwritten by the loop.
dfs = [pd.read_csv(f'cars_{i}.csv') for i in range(4)]  # cars_0 through cars_3

# ignore_index=True gives the combined frame a fresh 0..n-1 index.
train = pd.concat(dfs, ignore_index=True)
print(f"✓ Combined {len(dfs)} files into {train.shape[0]} rows")

#2.2: Quick EDA
print(train.shape, val.shape, test.shape)

print(train.head(10))
# Random rows often reveal issues the first rows hide; cap the sample size for tiny datasets.
print(train.sample(min(10, len(train)), random_state=42))

print(train.dtypes)
train.info()  # info() prints its own report and returns None, so it must not be wrapped in print()
print(train.isnull().sum())

# duplicates
n_duplicates = train.duplicated().sum()           # rows that repeat an earlier row
duplicates = train[train.duplicated(keep=False)]  # every member of each duplicate group
print(f"Duplicate rows: {n_duplicates}")
print(duplicates.head(10))

# more on missing vals
cols_with_missing = train.columns[train.isnull().any()].tolist()
rows_with_missing = train.isnull().any(axis=1).sum()
print(f"Columns with missing values: {cols_with_missing}")
print(f"Rows with at least one missing value: {rows_with_missing}")

# summary stats
print(train.describe())               # numerical cols only
print(train.describe(include='all'))  # include categorical cols

# stats on target col (regression)
print(train[target_col].describe())  # basic stats

# target col distribution: |skew| > 1 suggests a log transform may help linear models
print(f"Skewness: {train[target_col].skew():.3f}")
print(f"Kurtosis: {train[target_col].kurt():.3f}")

# outlier check with the 1.5 * IQR rule
Q1 = train[target_col].quantile(0.25)
Q3 = train[target_col].quantile(0.75)
IQR = Q3 - Q1
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR
outliers = train[(train[target_col] < lower_bound) | (train[target_col] > upper_bound)]
percent_of_outliers = len(outliers) / len(train) * 100
# Explicit prints: a bare `outliers.head()` in the middle of a cell displays nothing.
print(outliers.head())
print(f"Outliers: {len(outliers)} rows ({percent_of_outliers:.1f}%)")

print(f"Zero values: {(train[target_col] == 0).sum()}")
print(f"Negative values: {(train[target_col] < 0).sum()}")

# stats on target col (classification)
n_classes = train[target_col].nunique()
counts_perclass = train[target_col].value_counts()
print(f"Classes: {n_classes}")
print(counts_perclass)

# Share of the rarest class, in percent (value_counts ignores NaN targets).
minority_class_percent = counts_perclass.min() / counts_perclass.sum() * 100
print(f"Minority class: {minority_class_percent:.1f}%")
if minority_class_percent < 30:
    print("IMBALANCED - Use class_weight='balanced'")
else:
    print("Balanced")

# Breakdown by type
numeric_cols = train.select_dtypes(include=[np.number]).columns.tolist()

cat_cols = train.select_dtypes(include=['object']).columns.tolist()

date_cols = train.select_dtypes(include=['datetime64']).columns.tolist()

# stats on numerical cols
print(train[numeric_cols].describe())

for col in numeric_cols:
    n_unique = train[col].nunique()     # few distinct values -> may really be categorical
    neg_count = (train[col] < 0).sum()  # negative values are often data errors (price, age, ...)

    if n_unique <= 10:
        print(f"{col}: only {n_unique} unique values - may be categorical")
    if neg_count > 0:
        print(f"{col}: {neg_count} negative values")
    if train[col].std() == 0:  # check for constant cols
        print(f"⚠️ {col} is constant (std=0) - Consider dropping")

# stats on categorical cols

# Per column: report the number of distinct values and the frequency table,
# which is enough to spot a natural order and judge cardinality.
for col in cat_cols:
    counts = train[col].value_counts()
    n_unique = len(counts)  # value_counts drops NaN by default, same as nunique()
    print(f"{col}: {n_unique} unique values")
    print(f"Value Counts:{counts}")

# Bucket the columns by cardinality: exactly 2 -> binary, up to 10 -> low, otherwise high.
cardinality = {name: train[name].nunique() for name in cat_cols}
binary_cols = [name for name in cat_cols if cardinality[name] == 2]
low_card_cols = [name for name in cat_cols if cardinality[name] != 2 and cardinality[name] <= 10]
high_card_cols = [name for name in cat_cols if cardinality[name] > 10]

# stats on date cols
dates = train[date_col]
print(f"Date range: {dates.min()} to {dates.max()}")

# Split the datetime column into calendar parts (adds three columns to train).
for part in ('year', 'month', 'day'):
    train[part] = getattr(dates.dt, part)

# check distribution of rows over years and months, in chronological order
for part in ('year', 'month'):
    print(train[part].value_counts().sort_index())

# stats on correlations
corr_matrix = train[numeric_cols].corr()  # WITH EACHOTHER (multicollinearity)
correlations = corr_matrix[target_col].sort_values(ascending=False)  # WITH TARGET
print(correlations)
print(corr_matrix)

### 3. Preprocessing

In [None]:
'''
3.1: Handle missing values
├── <5% missing → simple imputation: mean/median/mode
├── 5-30% missing → KNN imputation, group-wise mean imputation, or drop
└── >30% missing → DROP column

3.2: Categorical Encoding
├── 2 values → Label encoding
├── 3-10 values → Ordinal or One-hot
└── >10 values → Label encoding (trees) or Target encoding

3.3: Scaling Strategy
├── Tree-based → NO scaling needed
├── Linear models → StandardScaler REQUIRED
└── Neural nets → MinMaxScaler or StandardScaler

3.4: Feature Engineering
├── Time available >5 min → Try interactions
└── <5 min → Skip
'''
##########################
# 3.1: MISSING VALS
##########################
# IF <5% MISSING & NUMERICAL COL: Simple median imputation (robust to outliers)
# The statistic is computed on TRAIN ONLY and reused for val/test to avoid leakage.
numeric_cols = train.select_dtypes(include=[np.number]).columns
for col in numeric_cols:
    median = train[col].median()
    for df in (train, val, test):
        # Guard: the target (and possibly other columns) may not exist in every frame.
        if col in df.columns:
            # Assign the result back: `df[col].fillna(..., inplace=True)` works on a
            # temporary object and does not update the frame under pandas Copy-on-Write.
            df[col] = df[col].fillna(median)

# IF <5% MISSING & CATEGORICAL COL: Simple mode imputation (most frequent value)
mode = train['category'].mode()[0]
for df in (train, val, test):
    df['category'] = df['category'].fillna(mode)

# IF 5-30% MISSING & MODE/MEDIAN VARIES MEANINGFULLY ACROSS GROUPS: Group-wise mode/median imputation
def _most_common(values, default='Unknown'):
    """Return the most frequent non-null value of a Series, or `default` if there is none."""
    modes = values.mode()
    return modes.iloc[0] if len(modes) > 0 else default


# categorical example: calculate mode color by brand group -- on TRAIN ONLY!
color_by_brand = train.groupby('brand')['color'].agg(_most_common).to_dict()
for df in (train, val, test):
    # Missing colour -> most common colour of that brand; unknown/missing brand -> 'Unknown'.
    df['color'] = df['color'].fillna(df['brand'].map(color_by_brand)).fillna('Unknown')

# Numerical example: Calculate median price per category -- on TRAIN ONLY!
# Both statistics are taken before any frame is modified, so the fallback does not
# drift once train itself has been imputed (and is not recomputed for every row).
price_by_category = train.groupby('category')['price'].median().to_dict()
overall_median_price = train['price'].median()
for df in (train, val, test):
    # Missing price -> median of its category; unseen category -> overall train median.
    df['price'] = df['price'].fillna(df['category'].map(price_by_category)).fillna(overall_median_price)

# IF 5-30% MISSING & MISSING MEANS "NONE" OR ABSENCE: Fill with zeroes or 'unknown' (e.g. num_extra_bedrooms = NaN means no extra bedrooms)
# Results are assigned back instead of using `df[col].fillna(..., inplace=True)`,
# which acts on a temporary object and is a no-op under pandas Copy-on-Write.
for df in (train, val, test):
    df['num_views'] = df['num_views'].fillna(0)        # numerical case
    df['category'] = df['category'].fillna('Unknown')  # categorical case

# IF 5-30% MISSING & MISSINGNESS IS INFORMATIVE/PREDICTIVE/HAS A PATTERN: Simple median/mode imputation AND missing flag column
income_median = train['income'].median()  # TRAIN ONLY
for df in (train, val, test):
    df['income_missing'] = df['income'].isnull().astype(int)  # build the flag BEFORE filling
    df['income'] = df['income'].fillna(income_median)

# IF >30% MISSING AND NOT CRITICAL: Drop cols
cols_to_drop = ['col1', 'col2', 'col3']
for df in (train, val, test):
    df.drop(columns=cols_to_drop, inplace=True)

# CLIPPING OUTLIERS
# Bounds are the 1st/99th percentiles of TRAIN and are reused for val.
# NOTE(review): test is not clipped here -- confirm whether 'price' exists in test.
lower = train['price'].quantile(0.01)
upper = train['price'].quantile(0.99)

train['price'] = train['price'].clip(lower, upper)
val['price'] = val['price'].clip(lower, upper)

##########################
# 3.2: ENCODING
##########################
# IF BINARY CATEGORICAL COLUMNS: binary Label Encoding
# One shared mapping applied to every split; values outside the mapping become NaN.
sunroof_map = {'No': 0, 'Yes': 1}
for frame in (train, val, test):
    frame['has_sunroof'] = frame['has_sunroof'].map(sunroof_map)

# IF NATURAL ORDER: ordinal encoding
# The integer codes follow the real-world ranking of the categories.
condition_map = {'Poor': 0, 'Fair': 1, 'Good': 2, 'Excellent': 3}
for frame in (train, val, test):
    frame['col'] = frame['col'].map(condition_map)

# IF NO ORDER, LOW CARDINALITY (<10 categories), AND USING TREE MODEL: Label Encoding
from sklearn.preprocessing import LabelEncoder

cat_cols = train.select_dtypes(include=['object']).columns.tolist()
cat_cols = [c for c in cat_cols if c not in ['id', 'car_id', 'customer_id']]  # never encode identifiers

for col in cat_cols:
    le = LabelEncoder()
    train[col] = le.fit_transform(train[col].astype(str))  # fit on TRAIN ONLY

    # le.transform() raises ValueError on a category that never appeared in train.
    # Map through the learned classes instead and send unseen categories to -1.
    code_of = {label: code for code, label in enumerate(le.classes_)}
    for df in (val, test):
        df[col] = df[col].astype(str).map(code_of).fillna(-1).astype(int)

# IF NO ORDER, LOW CARDINALITY (<10 categories), AND USING LINEAR MODEL: One-hot encoding
train = pd.get_dummies(train, columns=['color'], prefix='clr')
val = pd.get_dummies(val, columns=['color'], prefix='clr')
test = pd.get_dummies(test, columns=['color'], prefix='clr')

# get_dummies only creates columns for the categories present in the frame it is given,
# so val/test can end up with fewer (or extra) dummy columns than train.
# Align them to the dummy columns of TRAIN: missing ones are added as 0, extras are dropped.
dummy_cols = [c for c in train.columns if str(c).startswith('clr_')]
val = val.reindex(
    columns=[c for c in val.columns if not str(c).startswith('clr_')] + dummy_cols,
    fill_value=0,
)
test = test.reindex(
    columns=[c for c in test.columns if not str(c).startswith('clr_')] + dummy_cols,
    fill_value=0,
)

# IF NO ORDER, HIGH CARDINALITY (>10 categories): Target encoding
# Replace each category by the mean target of that category, learned on TRAIN ONLY.
target_means = train.groupby('col')['targetcol'].mean().to_dict()
global_mean = train['targetcol'].mean()  # fallback for categories never seen in train

for df in (train, val, test):
    # Without the fallback, unseen categories would map to NaN and break most models.
    df['col_encoded'] = df['col'].map(target_means).fillna(global_mean)

##########################
# 3.3: SCALING
##########################

# Standardizing (Distribution of Normal(0,1)): Feature already normally dist'd. Useful for linear models--Linreg, logreg, svms, NNs, etc
from sklearn.preprocessing import StandardScaler

feature_cols = ['mileage', 'horsepower', 'age']

# Learn mean/std on train, then apply the same transform to every split.
scaler = StandardScaler().fit(train[feature_cols])
for frame in (train, val, test):
    frame[feature_cols] = scaler.transform(frame[feature_cols])

# Log Plus 1 Transformation (--> Normal dist): Feature right-skewed/skewness>1, wide range of vals, many outliers. Useful for linear models.
# log1p(x) = log(1 + x), so zeros are handled; often must STANDARDIZE AFTER.
for frame in (train, val, test):
    frame['price_log'] = np.log1p(frame['price'])

# Min-Max scaling (Normalizing 0 to 1): Features have known bounds, all features are >= 0, many zeros, outliers. Useful for NNs, pixel data.
from sklearn.preprocessing import MinMaxScaler
cols_to_scale = ['mileage', 'horsepower', 'age']

# Learn min/max on train, then apply the same transform to every split.
scaler = MinMaxScaler().fit(train[cols_to_scale])
for frame in (train, val, test):
    frame[cols_to_scale] = scaler.transform(frame[cols_to_scale])

##########################
# 3.4: FEATURE ENGINEERING
##########################
# Every engineered column is created on ALL three sets: any column that exists only in
# train would make `val[feature_cols]` / `test[feature_cols]` fail with a KeyError later.
for df in (train, val, test):
    # sum
    df['total_rooms'] = df['bedrooms'] + df['bathrooms']

    # multiplication
    df['age_mileage'] = df['age'] * df['mileage']

    # division or ratios (add 1 to avoid /0)
    df['views_per_age'] = df['num_views'] / (df['age'] + 1)

    # date cols
    df['year'] = df['date'].dt.year
    df['month'] = df['date'].dt.month
    df['day'] = df['date'].dt.day
    df['dayofweek'] = df['date'].dt.dayofweek                     # Monday=0 ... Sunday=6
    df['is_weekend'] = df['dayofweek'].isin([5, 6]).astype(int)  # Saturday/Sunday

##########################
# 3.5: PREPARE X AND Y FOR MODEL TRAINING
##########################
# Set these two names once; every variant below uses them instead of hard-coded
# 'id' / 'target' strings, so the snippets stay consistent with each other.
target_col = 'price'
id_col = 'car_id'
feature_cols = [c for c in train.columns if c not in [id_col, target_col]]

# IF VAL SET PROVIDED: simple x-y split
X_train = train[feature_cols]
y_train = train[target_col]

X_val = val[feature_cols]
y_val = val[target_col]

X_test = test[feature_cols]
test_ids = test[id_col]  # kept aside to build the submission file

print(f"Features: {len(feature_cols)}")
print(f"X_train: {X_train.shape}")
print(f"X_val: {X_val.shape}")
print(f"X_test: {X_test.shape}")

# IF VAL SET NOT PROVIDED: 80/20 train-test SPLIT
from sklearn.model_selection import train_test_split

X = train.drop(columns=[id_col, target_col])
y = train[target_col]

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)  # create validation set

X_test = test[X.columns]  # same features, same order as the training matrix
test_ids = test[id_col]

# IF CLASSIFICATION PROBLEM: STRATIFIED SPLIT (use INSTEAD of the plain split above)
X_train, X_val, y_train, y_val = train_test_split(
    X, y,
    test_size=0.2,
    random_state=42,
    stratify=y  # Keep class balance
)

### 4. Train multiple models (predict on validation set)

#### 4.0: Model Bank

In [None]:
######################################
# LINEAR REGRESSION FAMILY
########################################
import numpy as np
from sklearn.linear_model import LinearRegression, Ridge, Lasso

# Simple Linear Regression (no regularization)
lr = LinearRegression()

# Ridge (L2 regularization) - most common
ridge = Ridge(alpha=1.0)

# Lasso (L1 regularization) - feature selection
lasso = Lasso(alpha=1.0)

'''Assumptions:
1. linearity (straight-line relationship between x and y) | Residuals vs. fitted values plot 
2. independence (errors unrelated) | depends how data collected
3. homoscedasticity (variance of errors/residuals should be constant across all vals of indep var) | plot(x=fitted vals, y= stdized residuals)
4. Normality (residual errors should be normally distributed centered around 0) | qqplot of residuals, points along qqline
'''
# CHECK IF LINEAR MODEL IS APPROPRIATE

# 1. Check for linearity (correlation with target)
print(train[numeric_cols].corr()['target'].sort_values(ascending=False))

# 2. Check for skewness (should be < 1.0)
for col in numeric_cols:
    skew = train[col].skew()
    if abs(skew) > 1.0:
        print(f"{col}: skewness={skew:.2f} - consider log transform")

# 3. Check multicollinearity
corr_matrix = train[numeric_cols].corr()
# Exclude the diagonal by POSITION. Comparing values with `!= 1.0` is fragile: a
# self-correlation can come out as 0.9999999 (false alarm), and two distinct features
# that are perfectly correlated (exactly 1.0) would be silently ignored.
off_diagonal = ~np.eye(len(corr_matrix), dtype=bool)
high_corr = (corr_matrix.abs() > 0.8) & off_diagonal
if high_corr.any().any():
    print("High multicollinearity detected - use Ridge")

########################################
# LOGISTIC REGRESSION
########################################
# Linear classifier; being a linear model, it expects scaled features (see section 3.3).
from sklearn.linear_model import LogisticRegression

logreg = LogisticRegression(
    max_iter=1000,           # Increase if convergence warning
    random_state=42,         # fixed seed for reproducible results
    class_weight='balanced'  # Use if imbalanced
)
'''Assumptions:
1. binary/multinomial outcome | target is categorical var
2. Linearity with log-odds | Assumes: log(P/(1-P)) is linear in X, Fix by: Add polynomial features, interactions
3. independence (errors unrelated) | depends how data collected
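4. No strong multicollinearity (features should not be highly correlated with each other) | check corr matrix / VIF, fix by: drop or combine features, L2 regularization. NOTE: unlike linear regression, homoscedasticity (constant variance of residuals) is NOT an assumption of logistic regression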
5. Large Sample Size (At least 10 samples per feature per class) | Need n >> p (many more samples than features), fix by: L2 regularization (reduce feats, get more data)
'''
#########################################
# DECISION TREE
########################################
# Single tree: easy to interpret; depth/split limits below keep it from memorizing the data.
from sklearn.tree import DecisionTreeRegressor, DecisionTreeClassifier

dt = DecisionTreeRegressor(
    max_depth=10,          # cap on tree depth
    min_samples_split=20,  # a node needs at least 20 samples to be split further
    random_state=42
)
'''
✅ Assumptions: NONE! 
✅ No linearity assumption, normality assumption, or homoscedasticity assumption
✅ Can handle non-linear patterns
✅ Can handle feature interactions automatically
✅ No scaling needed
'''
#########################################
# ENSEMBLE METHODS
#########################################
# RANDOM FOREST (uses bagging/bootstrap aggregation)
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier

rf = RandomForestRegressor(
    n_estimators=100,  # number of trees in the forest
    max_depth=15,      # cap on the depth of each tree
    random_state=42,
    n_jobs=-1          # use all CPU cores
)
""" How it works:
1. Create N bootstrap samples (random sampling with replacement)
2. Train independent model on each sample
3. Average predictions (parallel)
Models trained independently in parallel, with goal: Reduce variance (overfitting)
"""
# GRADIENT BOOSTING (sequential learning)
from sklearn.ensemble import GradientBoostingRegressor, GradientBoostingClassifier

gb = GradientBoostingRegressor(
    n_estimators=100,   # number of boosting stages (trees added one after another)
    learning_rate=0.1,  # shrinks the contribution of each tree
    max_depth=5,        # cap on the depth of each individual tree
    random_state=42
)
'''
How it works:
1. Train model on data
2. Train next model on errors of previous model
3. Combine predictions with weights (sequential)
Models trained sequentially, with goal: Reduce bias (underfitting)
'''

'''
Overall ensemble method Assumptions: NONE! 
✅ No linearity assumption, normality assumption, or homoscedasticity assumption
✅ Can handle non-linear patterns
✅ Can handle feature interactions automatically
✅ No scaling needed
'''
#########################################
# DISTANCE-BASED MODELS
#########################################
# Both models below work on distances between samples, so features MUST be scaled first.

# K-NEAREST NEIGHBORS
from sklearn.neighbors import KNeighborsRegressor, KNeighborsClassifier

knn = KNeighborsRegressor(
    n_neighbors=5,           # Number of neighbors
    weights='distance',      # Weight by distance
    metric='euclidean'       # Distance metric
)
'''
Assumptions:
1. Similar Points → Similar Outcomes (Nearby points in feature space have similar targets) | Distance in feature space is meaningful
2. Features are Scaled (All features must be on same scale, If violated: Large-scale features dominate) |Fix: ALWAYS use StandardScaler or MinMaxScaler
3. Low Dimensionality (Works with (< 20 features)) | "Curse of dimensionality" - IF VIOLATED, All points equally distant, poor performance. Fix by: Dimensionality reduction (PCA), feature selection
'''
# SUPPORT VECTOR MACHINES
from sklearn.svm import SVR, SVC

# Regression
svr = SVR(
    kernel='rbf',            # 'linear', 'poly', 'rbf'
    C=1.0,                   # Regularization
    gamma='scale'            # Kernel coefficient
)

# Classification
svc = SVC(
    kernel='rbf',
    C=1.0,
    gamma='scale',
    random_state=42
)
# The closing quotes of the note below were missing, which left the string
# unterminated and made the whole cell a SyntaxError.
'''
Assumptions:
1. Data is Scaled | Fix: ALWAYS use StandardScaler
2. Kernel Choice Matters | Linear kernel: Assumes linear separability, RBF kernel: Can handle non-linear patterns, Polynomial: Can handle polynomial relationships
3. Not Too Many Features (Prefers: n_samples > n_features) | fix: feature selection, regularization w/ c param
'''


#### 4.1 Model Selection step-by-step

In [3]:
'''
4.1: Model Selection
SUPERVISED LEARNING
├── REGRESSION (continuous target)
│   ├── Linear Models 
│   ├── Tree-based Models 
│   ├── Distance-based Models
│   ├── Small dataset (<1000): Ridge → RF → GB
│   └── Large dataset (>1000): RF → GB
└── CLASSIFICATION (categorical target)
    ├── Linear Models 
    ├── Tree-based Models 
    ├── Distance-based Models
    ├── Balanced classes: RF → GB → LogReg
    └── Imbalanced classes: RF(class_weight) → SMOTE

4.2: Hyperparameter Tuning
├── >5 min → GridSearchCV
├── 2-5 min → Manual tuning
└── <2 min → Default params
'''
##########################
# 4.1: Regression
##########################
# Linear Regression
from sklearn.linear_model import LinearRegression

# fit() returns the estimator itself, so construction and training can be chained.
model = LinearRegression().fit(X_train, y_train)
predictions = model.predict(X_test)

# RANDOM FOREST
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error

rf_params = {
    'n_estimators': 100,     # more trees = better performance, less variance/overfitting, BUT slower training
    'max_depth': 10,         # higher depth = capture complex patterns, better on training, BUT risk of overfitting, slower training
    'min_samples_split': 5,  # higher = less overfitting, faster training, BUT might underfit/high bias
    'min_samples_leaf': 2,   # higher = smoother predictions, less overfitting, BUT can underfit
    'random_state': 42,
    'n_jobs': -1,            # number CPU cores to use, -1 is ALWAYS use all cores
}
rf = RandomForestRegressor(**rf_params).fit(X_train, y_train)
y_pred = rf.predict(X_val)

mae = mean_absolute_error(y_val, y_pred)
print(f"RandomForest MAE: ${mae:,.2f}")

# try different tree depths -- score each candidate on the validation set
for depth in (5, 10, 15, 20):
    rf = RandomForestRegressor(
        n_estimators=100, max_depth=depth, random_state=42, n_jobs=-1
    ).fit(X_train, y_train)
    mae = mean_absolute_error(y_val, rf.predict(X_val))
    print(f"depth={depth}: MAE=${mae:,.2f}")

# GRADIENT BOOSTING
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_absolute_error

gb = GradientBoostingRegressor(
    n_estimators=100,     # more trees = better training performance, BUT risk overfitting, slower training
    learning_rate=0.1,    # smaller step size for each tree's contribution = better generalization, less overfitting, need more trees to compensate
    max_depth=5,          # deeper tree = learn complex patterns, BUT high overfit risk, slower training
    min_samples_split=5,  # same as RF
    subsample=1.0,        # fraction of rows per tree; lower = adds randomness (like bagging), less overfitting, faster training
    random_state=42
)
gb.fit(X_train, y_train)
y_pred = gb.predict(X_val)
mae = mean_absolute_error(y_val, y_pred)
print(f"GradientBoosting MAE: ${mae:,.2f}")

# try different learning rates
# The loop variable is deliberately NOT called `lr`: that name holds the
# LinearRegression estimator from the model bank and must not be overwritten by a float.
for rate in [0.01, 0.05, 0.1, 0.2]:
    gb = GradientBoostingRegressor(n_estimators=100, learning_rate=rate, random_state=42)
    gb.fit(X_train, y_train)
    mae = mean_absolute_error(y_val, gb.predict(X_val))
    print(f"lr={rate}: MAE=${mae:,.2f}")
    
# RIDGE (needs scaling!)
# L2-regularized linear regression; alpha sets the regularization strength.
# Every model below follows the same pattern: fit on train, predict on val, report MAE.
from sklearn.linear_model import Ridge

ridge = Ridge(alpha=1.0)
ridge.fit(X_train, y_train)
y_pred = ridge.predict(X_val)
mae = mean_absolute_error(y_val, y_pred)
print(f"Ridge MAE: ${mae:,.2f}")

# LASSO
# L1-regularized linear regression; like Ridge it needs scaled features.
from sklearn.linear_model import Lasso

lasso = Lasso(alpha=1.0)
lasso.fit(X_train, y_train)
y_pred = lasso.predict(X_val)
mae = mean_absolute_error(y_val, y_pred)
print(f"Lasso MAE: ${mae:,.2f}")

#XGBOOST
# Third-party package (not part of scikit-learn); the import fails if xgboost is not installed.
from xgboost import XGBRegressor

xgb = XGBRegressor(n_estimators=100, learning_rate=0.1, max_depth=5, random_state=42)
xgb.fit(X_train, y_train)
y_pred = xgb.predict(X_val)
mae = mean_absolute_error(y_val, y_pred)
print(f"XGBoost MAE: ${mae:,.2f}")

# Light GBM
# Third-party package (not part of scikit-learn); the import fails if lightgbm is not installed.
from lightgbm import LGBMRegressor

lgb = LGBMRegressor(n_estimators=100, learning_rate=0.1, max_depth=5, random_state=42)
lgb.fit(X_train, y_train)
y_pred = lgb.predict(X_val)
mae = mean_absolute_error(y_val, y_pred)
print(f"LightGBM MAE: ${mae:,.2f}")

##########################
# 4.2: Classification (metric: accuracy, precision, recall, f1)
##########################
# Imports come first so that accuracy_score exists before the first model is scored.
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Logistic Regression
logreg = LogisticRegression(max_iter=1000,
                            class_weight='balanced', # important if class imbalance
                            random_state=42)
logreg.fit(X_train, y_train)
y_pred = logreg.predict(X_val)
acc = accuracy_score(y_val, y_pred)
print(f"LogisticRegression Accuracy: {acc:.4f}")

# Random Forest Classifier
rf = RandomForestClassifier(
    n_estimators=100,
    class_weight='balanced',  # DIFFERENT: Important for imbalanced data!
    random_state=42,
    n_jobs=-1
)
rf.fit(X_train, y_train)
y_pred = rf.predict(X_val)
acc = accuracy_score(y_val, y_pred) # DIFFERENT: accuracy metric
print(f"RandomForest Accuracy: {acc:.4f}")

# Gradient Boosting Classifier
# Written out in full: scoring without fitting and predicting first would silently
# reuse the previous model's y_pred and report the wrong accuracy.
gb = GradientBoostingClassifier(n_estimators=100, learning_rate=0.1, max_depth=5, random_state=42)
gb.fit(X_train, y_train)
y_pred = gb.predict(X_val)
acc = accuracy_score(y_val, y_pred) # DIFFERENT: accuracy metric
print(f"GradientBoosting Accuracy: {acc:.4f}")

# XGBOOST  CLASSIFIER
from xgboost import XGBClassifier

xgb = XGBClassifier(n_estimators=100, learning_rate=0.1, max_depth=5, random_state=42)
xgb.fit(X_train, y_train)
y_pred = xgb.predict(X_val)
acc = accuracy_score(y_val, y_pred)
print(f"XGBoost Accuracy: {acc:.4f}")

# SVMs (NEED SCALING!)
from sklearn.svm import SVC

svm = SVC(kernel='rbf', C=1.0, random_state=42)
svm.fit(X_train, y_train)
y_pred = svm.predict(X_val)
acc = accuracy_score(y_val, y_pred)
print(f"SVM Accuracy: {acc:.4f}")

'\n4.1: Model Selection\n├── REGRESSION\n│   ├── Small dataset (<1000): Ridge → RF → GB\n│   └── Large dataset (>1000): RF → GB\n└── CLASSIFICATION\n    ├── Balanced classes: RF → GB → LogReg\n    └── Imbalanced classes: RF(class_weight) → SMOTE\n\n4.2: Hyperparameter Tuning\n├── >5 min → GridSearchCV\n├── 2-5 min → Manual tuning\n└── <2 min → Default params\n'

In [None]:
##########################
# REGRESSION: COMPARE 3 MODELS QUICKLY
##########################
import numpy as np
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Define models
models = {
    'RF': RandomForestRegressor(n_estimators=100, random_state=42, n_jobs=-1),
    'GB': GradientBoostingRegressor(n_estimators=100, random_state=42),
    'Ridge': Ridge(alpha=1.0)
}

# Train and evaluate: results[name] = [MAE, RMSE, R2], all measured on the validation set
results = {}
for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_val)
    mae = mean_absolute_error(y_val, y_pred)
    rmse = np.sqrt(mean_squared_error(y_val, y_pred))
    r2 = r2_score(y_val, y_pred)
    results[name] = [mae, rmse, r2]
    # R2 is a unitless score, so it is printed without the currency sign.
    print(f"{name}: MAE=${mae:,.2f}, RMSE=${rmse:,.2f}, R2={r2:.3f}")

# Get best model = lowest validation MAE (index 0 of each results entry).
# The MAE is selected explicitly: formatting the whole list with ':,.2f' raises TypeError.
best_name = min(results, key=lambda key: results[key][0])
best_model = models[best_name]
print(f"\n✅ Best model: {best_name} (MAE=${results[best_name][0]:,.2f})")

##########################
# CLASSIFICATION: COMPARE 3 MODELS QUICKLY
##########################
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

models = {
    'RF': RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1),
    'GB': GradientBoostingClassifier(n_estimators=100, random_state=42),
    'LogReg': LogisticRegression(max_iter=1000, random_state=42)
}

# results[name] = [accuracy, precision, recall, f1], all measured on the validation set
results = {}
print(f"{'Model':<10} {'Accuracy':>10} {'Precision':>10} {'Recall':>10} {'F1':>10}")
for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_val)
    acc = accuracy_score(y_val, y_pred)
    # 'weighted' averages the per-class scores by class frequency (works for multiclass too)
    prec = precision_score(y_val, y_pred, average='weighted')
    rec = recall_score(y_val, y_pred, average='weighted')
    f1 = f1_score(y_val, y_pred, average='weighted')

    results[name] = [acc, prec, rec, f1]
    print(f"{name:<10} {acc:>10.4f} {prec:>10.4f} {rec:>10.4f} {f1:>10.4f}")

# Get best = highest validation accuracy (index 0 of each results entry).
# The accuracy is selected explicitly: formatting the whole list with ':.4f' raises TypeError.
best_name = max(results, key=lambda key: results[key][0])
best_model = models[best_name]
print(f"\n Best model: {best_name} (Accuracy={results[best_name][0]:.4f})")

In [None]:
##########################
# HYPERPARAMETER TUNING
##########################
# Random Forest (regression with Grid Search)
# GridSearchCV tries EVERY combination in param_grid (here 2 * 3 * 2 = 12) with 3-fold CV.
from sklearn.model_selection import GridSearchCV

param_grid = {
    'n_estimators': [50, 100],
    'max_depth': [5, 10, 15],
    'min_samples_split': [2, 5] # dont use this in classification scenarios, otherwise process is same
    # NOTE(review): min_samples_split is also a valid RandomForestClassifier parameter -- confirm the intent of the note above
}

rf = RandomForestRegressor(random_state=42, n_jobs=-1)
# scoring is NEGATIVE MAE because scikit-learn always maximizes the score.
grid = GridSearchCV(rf, param_grid, cv=3, scoring='neg_mean_absolute_error', n_jobs=-1)
grid.fit(X_train, y_train)

print(f"Best params: {grid.best_params_}")
print(f"Best score: ${-grid.best_score_:,.2f}")  # sign flipped back to a positive MAE

# Check the tuned model on the validation set.
best_model = grid.best_estimator_
y_pred = best_model.predict(X_val)
mae = mean_absolute_error(y_val, y_pred)
print(f"Validation MAE: ${mae:,.2f}")

# same for GB
param_grid = {
    'n_estimators': [50, 100],
    'learning_rate': [0.05, 0.1],
    'max_depth': [3, 5, 7]
}

gb = GradientBoostingRegressor(random_state=42)
grid = GridSearchCV(gb, param_grid, cv=3, scoring='neg_mean_absolute_error')
grid.fit(X_train, y_train)

print(f"Best params: {grid.best_params_}")
best_model = grid.best_estimator_  # overwrites the random-forest best_model from above

# RANDOM FOREST REGRESSION WITH RANDOMIZED SEARCH (faster than grid search)
# Samples n_iter combinations from param_dist instead of trying all of them.
from sklearn.model_selection import RandomizedSearchCV

param_dist = {
    'n_estimators': [50, 100, 150, 200],
    'max_depth': [5, 10, 15, 20, None],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

rf = RandomForestRegressor(random_state=42, n_jobs=-1)
random_search = RandomizedSearchCV(
    rf, 
    param_dist, 
    n_iter=10,  # Try 10 random combinations
    cv=3, 
    scoring='neg_mean_absolute_error',
    random_state=42,  # makes the sampled combinations reproducible
    n_jobs=-1
)
random_search.fit(X_train, y_train)

print(f"Best params: {random_search.best_params_}")
best_model = random_search.best_estimator_

# MANUAL TUNING (2d grid here)
# Walk every (n_estimators, max_depth) pair, score it on the validation set,
# and remember the pair with the lowest MAE seen so far.
best_mae = float('inf')
best_params = {}

candidates = [
    (n_trees, tree_depth)
    for n_trees in [50, 100, 150]   # tuning n_estimators
    for tree_depth in [5, 10, 15]   # tuning max_depth
]

for n, depth in candidates:
    rf = RandomForestRegressor(n_estimators=n, max_depth=depth, random_state=42, n_jobs=-1)
    rf.fit(X_train, y_train)
    mae = mean_absolute_error(y_val, rf.predict(X_val))
    print(f"n={n}, depth={depth}: MAE=${mae:,.2f}")

    # Strict '<' keeps the first pair when two pairs tie.
    if mae < best_mae:
        best_mae, best_params = mae, {'n_estimators': n, 'max_depth': depth}

print(f"\n✅ Best params: {best_params}, MAE=${best_mae:,.2f}")

### 5. Choose best model & Re-train on full data (train + val)

In [None]:
'''
5.1: Validation 
├── Val set provided → Use it, retrain on train+val
└── No val set → Create 80/20 split or CV

5.2: Final Training
└── Always combine train + val before final predictions

5.3: Output Validation
├── Format checks
├── Value range checks
└── File format checks
'''
##########################
# 5.1: Cross-Val (if validation set not provided and dataset is tiny)
##########################
from sklearn.model_selection import cross_val_score

rf = RandomForestRegressor(n_estimators=100, random_state=42, n_jobs=-1)
# The scorer returns NEGATIVE MAE (higher is better), so negate it to get plain MAE.
scores = -cross_val_score(rf, X_train, y_train, cv=5, scoring='neg_mean_absolute_error')

print(f"CV MAE scores: {scores}")
print(f"Mean MAE: ${scores.mean():,.2f}")
print(f"Std MAE: ${scores.std():,.2f}")

# multiple models with Cross-Val
models = {
    'RF': RandomForestRegressor(n_estimators=100, random_state=42, n_jobs=-1),
    'GB': GradientBoostingRegressor(n_estimators=100, random_state=42)
}

for name, model in models.items():
    scores = -cross_val_score(model, X_train, y_train, cv=5, scoring='neg_mean_absolute_error')
    print(f"{name}: Mean MAE=${scores.mean():,.2f} (+/- ${scores.std():,.2f})")

##########################
# STRATIFIED K-FOLD CROSS-VAL (for classification)
##########################
from sklearn.model_selection import StratifiedKFold, cross_val_score

# Stratified folds keep the class proportions of y in every fold.
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
rf = RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1)
scores = cross_val_score(rf, X_train, y_train, cv=skf, scoring='accuracy')

print(f"CV Accuracy scores: {scores}")
print(f"Mean Accuracy: {scores.mean():.4f} (+/- {scores.std():.4f})")

# Feature Importance
# Fit a forest, then rank the features by the importance the forest assigns to them.
rf = RandomForestRegressor(n_estimators=100, random_state=42, n_jobs=-1).fit(X_train, y_train)

importances = rf.feature_importances_
feature_names = X_train.columns
importance_df = pd.DataFrame({'feature': feature_names, 'importance': importances})
importance_df = importance_df.sort_values('importance', ascending=False)

print(importance_df.head(10))

##########################
# 5.2: FULL MODEL EVALUATION/METRICS
##########################
# The regression and classification snippets below are ALTERNATIVES: run the one that
# matches the problem type. Both read best_model, X_val and y_val from earlier cells.

# ALL REGRESSION METRICS
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import numpy as np

y_pred = best_model.predict(X_val)

mae = mean_absolute_error(y_val, y_pred)
mse = mean_squared_error(y_val, y_pred)
rmse = np.sqrt(mse)  # back in the units of the target
r2 = r2_score(y_val, y_pred)

# ALL CLASSIFICATION METRICS
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import classification_report, confusion_matrix

y_pred = best_model.predict(X_val)

acc = accuracy_score(y_val, y_pred)
# average='weighted': per-class scores averaged by class frequency
prec = precision_score(y_val, y_pred, average='weighted')
rec = recall_score(y_val, y_pred, average='weighted')
f1 = f1_score(y_val, y_pred, average='weighted')

print(classification_report(y_val, y_pred))  # per-class precision / recall / f1
print(confusion_matrix(y_val, y_pred))       # rows = true class, columns = predicted class

# BINARY CLASSIFICATION METRICS
from sklearn.metrics import roc_auc_score
# ROC AUC is computed from predicted probabilities, not from hard class labels.
y_pred_proba = best_model.predict_proba(X_val)[:, 1]  # Probabilities for class 1
roc_auc = roc_auc_score(y_val, y_pred_proba)
print(f"ROC AUC: {roc_auc:.4f}")

# 5.3: FINAL TRAINING: RE-TRAIN BEST MODEL ON TRAIN + VAL SET
# RE-DO FULL METRICS