# Scratchbook

In [None]:
import pandas as pd
import statsmodels.api as sm
from statsmodels.formula.api import glm

# Example dataset: Predicting claims frequency
data = pd.DataFrame({
    'Age': [25, 40, 35, 50, 30],
    'Driving_Experience': [5, 20, 10, 25, 7],
    'Claims': [2, 0, 3, 1, 4]
})

# Fit a Poisson regression model
model = glm("Claims ~ Age + Driving_Experience", data=data, family=sm.families.Poisson()).fit()
print(model.summary())


Approach 1: Hyperparameter Tuning Directly on the Dev Set

•	In this approach, the dev set is used to evaluate and tune hyperparameters.


In [6]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score

# Sample dataset
from sklearn.datasets import make_classification
X, y = make_classification(n_samples=1000, n_features=20, random_state=42)

# Split into train, dev, and test sets
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.3, random_state=42)
X_dev, X_test, y_dev, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

# Define hyperparameter grid
param_grid = {
    'n_estimators': [50, 100, 150],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5, 10]
}

# Train on train set and tune on dev set
model = RandomForestClassifier(random_state=42)
grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=3, scoring='accuracy')
grid_search.fit(X_train, y_train)

# Evaluate best model on dev set
best_model = grid_search.best_estimator_
y_dev_pred = best_model.predict(X_dev)
dev_accuracy = accuracy_score(y_dev, y_dev_pred)
print(f"Best Parameters: {grid_search.best_params_}")
print(f"Dev Set Accuracy: {dev_accuracy:.4f}")

# Final evaluation on test set
y_test_pred = best_model.predict(X_test)
test_accuracy = accuracy_score(y_test, y_test_pred)
print(f"Test Set Accuracy: {test_accuracy:.4f}")


Best Parameters: {'max_depth': None, 'min_samples_split': 10, 'n_estimators': 50}
Dev Set Accuracy: 0.8667
Test Set Accuracy: 0.8467


Approach 2: K-Fold Cross-Validation on Train Set with Dev Set for Final Validation

•	In this approach, k-fold cross-validation is used for hyperparameter tuning, and the dev set is reserved for independent validation of the current parameter set.


In [7]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, StratifiedKFold, GridSearchCV
from sklearn.metrics import accuracy_score

# Sample dataset
from sklearn.datasets import make_classification
X, y = make_classification(n_samples=1000, n_features=20, random_state=42)

# Split into train, dev, and test sets
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.3, random_state=42)
X_dev, X_test, y_dev, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

# Define hyperparameter grid
param_grid = {
    'n_estimators': [50, 100, 150],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5, 10]
}

# K-Fold Cross-Validation on train set
kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
model = RandomForestClassifier(random_state=42)
grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=kfold, scoring='accuracy')
grid_search.fit(X_train, y_train)

# Evaluate best model on dev set
best_model = grid_search.best_estimator_
y_dev_pred = best_model.predict(X_dev)
dev_accuracy = accuracy_score(y_dev, y_dev_pred)
print(f"Best Parameters: {grid_search.best_params_}")
print(f"Dev Set Accuracy: {dev_accuracy:.4f}")

# Final evaluation on test set
y_test_pred = best_model.predict(X_test)
test_accuracy = accuracy_score(y_test, y_test_pred)
print(f"Test Set Accuracy: {test_accuracy:.4f}")


Best Parameters: {'max_depth': 10, 'min_samples_split': 2, 'n_estimators': 100}
Dev Set Accuracy: 0.8667
Test Set Accuracy: 0.8600
