# In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report

# Data loading and cohort selection.
# Both tables are read from CSV files in the current working directory.
outcomes = pd.read_csv('outcomes.csv')
projects = pd.read_csv('projects.csv')

# Recode the 't'/'f' flag as 1/0 (any other value becomes NaN under .map)
# and derive the modelling target as its complement: 1 means the project
# was NOT fully funded.
funded_flag = outcomes['fully_funded'].map({'t': 1, 'f': 0})
outcomes['fully_funded'] = funded_flag
outcomes['not_fully_funded'] = 1 - funded_flag

projects['date_posted'] = pd.to_datetime(projects['date_posted'])

# Default (inner) join on projectid: only projects that have an outcome row
# are kept, and only the target column is carried over from outcomes.
target_columns = outcomes[['projectid', 'not_fully_funded']]
data = projects.merge(target_columns, on='projectid')

# Keep projects posted on or after 2010-01-01.
data = data.loc[data['date_posted'] >= '2010-01-01']

# NOTE: despite the "chicago" name, this filter selects every row whose
# school_state is 'IL'; no city-level restriction is applied here.
data_chicago = data.loc[data['school_state'].eq('IL')]

# Columns removed from the working frame before feature selection.
columns_to_discard = [
    'secondary_focus_subject',
    'secondary_focus_area',
    'school_metro',
    'school_ncesid',
    'school_latitude',
    'school_longitude',
]
data_chicago_dropped = data_chicago.drop(columns_to_discard, axis='columns')

# Feature selection.
# Predictor columns fed to the model; the same names are split into
# categorical and numeric groups when the preprocessor is defined.
features = [
    'primary_focus_subject',
    'resource_type',
    'school_nlns',
    'school_charter',
    'school_county',
    'school_district',
    'teacher_prefix',
    'poverty_level',
    'grade_level',
    'fulfillment_labor_materials',
    'total_price_excluding_optional_support',
    'total_price_including_optional_support',
]

# Design matrix and binary target (1 = not fully funded).
X = data_chicago_dropped.loc[:, features]
y = data_chicago_dropped.loc[:, 'not_fully_funded']

# Train/Test Split
# Hold out 20% of the rows for the final evaluation. Stratifying on the
# target keeps the class ratio the same in both partitions, which matches
# the StratifiedKFold scheme used for cross-validation during the grid search.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)



# Preprocessing definition.
# Column groups routed to each branch of the ColumnTransformer below.
categorical_features = [
    'primary_focus_subject',
    'resource_type',
    'school_nlns',
    'school_charter',
    'school_county',
    'school_district',
    'teacher_prefix',
    'poverty_level',
    'grade_level',
]
numeric_features = [
    'fulfillment_labor_materials',
    'total_price_excluding_optional_support',
    'total_price_including_optional_support',
]

# Numeric branch: fill missing values with the column median, then
# standardise.
numeric_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

# Categorical branch: fill missing values with the most frequent category,
# then one-hot encode. handle_unknown='ignore' lets categories unseen during
# fit be transformed without raising.
categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

# Numeric columns come first in the transformed output, then the one-hot
# columns; any column not listed in either group is dropped (the default).
column_branches = [
    ('num', numeric_transformer, numeric_features),
    ('cat', categorical_transformer, categorical_features),
]
preprocessor = ColumnTransformer(column_branches)



# End-to-end model: the preprocessing step followed by a seeded random
# forest. The step names are relied on by the 'classifier__...' keys of the
# parameter grid.
forest = RandomForestClassifier(random_state=42)
model_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', forest),
])



# Grid-search configuration.
# Five shuffled, stratified folds with a fixed seed for reproducibility.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Candidate settings for the pipeline's 'classifier' step
# (2 * 2 * 2 * 2 * 1 = 16 combinations).
demo_param_grid = {
    'classifier__n_estimators': [50, 100],
    'classifier__max_depth': [5, 10],
    'classifier__min_samples_split': [2, 5],
    'classifier__min_samples_leaf': [1, 2],
    'classifier__bootstrap': [True],
}

# Metrics recorded for every candidate. 'PR_AUC' (average precision) is the
# one named in refit below, so it decides which candidate is refit.
scoring_metrics = {
    'accuracy': 'accuracy',
    'precision': 'precision',
    'recall': 'recall',
    'f1': 'f1',
    'roc_auc': 'roc_auc',
    'PR_AUC': 'average_precision',
}

grid_search = GridSearchCV(
    estimator=model_pipeline,
    param_grid=demo_param_grid,
    scoring=scoring_metrics,
    refit='PR_AUC',
    cv=cv,
    n_jobs=-1,
    verbose=2,
)

print("\nStarting grid search...")
grid_search.fit(X_train, y_train)

# Report the selected hyper-parameters. The search is configured with
# refit='PR_AUC', so best_score_ is the mean cross-validated average
# precision of the selected candidate.
print("\nBest parameters:")
print(grid_search.best_params_)
print(f"\nBest cross-validation score: {grid_search.best_score_:.4f}")

# best_estimator_ is the pipeline refit on the training data with the
# selected parameters.
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)

print("\nTest Set Evaluation:")
print(classification_report(y_test, y_pred))

# The report above is based on hard class predictions only. Also score the
# held-out set with the metric that drove model selection, so the test result
# is directly comparable to the cross-validation score: with multi-metric
# scoring, GridSearchCV.score applies the scorer named by refit ('PR_AUC').
test_pr_auc = grid_search.score(X_test, y_test)
print(f"Test PR-AUC (average precision): {test_pr_auc:.4f}")