In [2]:
import pandas as pd
import numpy as np
# import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib.pyplot import subplots

from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

In [3]:
# Load the merged dataset
file_path = './Virginia Public Schools Archive/demographics_sol_dataset.csv'
df = pd.read_csv(file_path)

mean_sol_pass_rate = df['SOL Pass Rate'].mean()
print(mean_sol_pass_rate)

0.663268653808416


In [4]:
df.shape

(1877, 22)

In [5]:
df.isnull().sum().sort_values(ascending=False)

cohort_dropout_rate                              1563
percentage_homeless                               222
free_reduced_lunch_percentage                     182
Bachelors_Percent                                 171
Doctoral_Percent                                  171
Masters_Percent                                   171
Total_Per_Pupil_Expenditures                      167
Division_Level_Expenditures_Per_Pupil_State       167
Division_Level_Expenditures_Per_Pupil_Federal     167
School_Level_Expenditures_Per_Pupil_State         167
School_Level_Expenditures_Per_Pupil_Federal       167
Total_Expenditures                                167
chronic_absenteeism_rate                          166
SOL Pass Rate                                     166
percentage_foster_care                             98
percentage_male                                    58
percentage_english_learners                        44
percentage_disadvantaged                           25
percentage_military         

In [6]:
numerical_df = df.select_dtypes(include= np.number)
num_summary = numerical_df.describe().T
numerical_df.head()

Unnamed: 0,percentage_disadvantaged,percentage_english_learners,percentage_foster_care,percentage_male,percentage_homeless,percentage_military,percentage_disabled,free_reduced_lunch_percentage,chronic_absenteeism_rate,cohort_dropout_rate,Bachelors_Percent,Masters_Percent,Doctoral_Percent,SOL Pass Rate,School_Level_Expenditures_Per_Pupil_Federal,School_Level_Expenditures_Per_Pupil_State,Division_Level_Expenditures_Per_Pupil_Federal,Division_Level_Expenditures_Per_Pupil_State,Total_Per_Pupil_Expenditures,Total_Expenditures
0,0.274112,0.137532,0.002538,0.522843,0.0,0.124365,0.120558,0.258,0.1081,,35.0,62.0,2.0,0.765,524.0,7333.0,571.0,3640.0,12068.0,9941898.0
1,0.381919,0.053803,0.0,0.49631,0.0,0.02583,0.169742,0.405,0.2736,,44.0,56.0,0.0,0.6975,198.0,9208.0,745.0,2110.0,12261.0,14493019.0
2,0.542135,0.194842,0.002809,0.502809,0.0,0.007022,0.141854,0.609,0.1985,,31.0,69.0,0.0,0.53,879.0,8338.0,543.0,4412.0,14172.0,6946661.0
3,0.358209,0.052434,0.003731,0.526119,0.0,0.014925,0.05597,0.373,0.2289,,52.0,44.0,0.0,0.72,861.0,8633.0,45.0,2983.0,12522.0,5874488.0
4,0.6,0.0,0.0,0.494624,0.0,0.070968,0.094624,0.911,0.2294,,58.0,32.0,3.0,0.5175,1540.0,6746.0,446.0,2738.0,11470.0,5366257.0


In [7]:
num_summary

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
percentage_disadvantaged,1852.0,0.4789965,0.2409772,0.0,0.3177511,0.4799329,0.6182932,1.0
percentage_english_learners,1833.0,0.1995751,0.3170542,0.0,0.0,0.04929577,0.2116788,1.0
percentage_foster_care,1779.0,0.117112,0.3178648,0.0,0.0,0.001650165,0.00578054,1.0
percentage_male,1819.0,0.512401,0.06093966,0.0,0.4992472,0.5151515,0.5310835,1.0
percentage_homeless,1655.0,0.05191336,0.2155176,0.0,0.0,0.0,0.0,1.0
percentage_military,1857.0,0.1825376,0.3389872,0.0,0.0078125,0.0230608,0.119403,1.0
percentage_disabled,1860.0,0.2439119,0.2864344,0.0,0.1139402,0.1399344,0.1791184,1.0
free_reduced_lunch_percentage,1695.0,0.5092136,0.2767305,0.0,0.294,0.478,0.7535,1.0
chronic_absenteeism_rate,1711.0,0.1967887,0.1046815,0.0,0.1201,0.1803,0.2551,0.9172
cohort_dropout_rate,314.0,0.05200541,0.06097394,0.0,0.0174,0.03925,0.0635,0.5057


In [8]:
categorical_df = df.select_dtypes(include= "object")
cat_summary = categorical_df.describe().T
categorical_df.head()

Unnamed: 0,Division Name,School Name
0,Prince William County,A. Henderson Elementary
1,Culpeper County,A.G. Richardson Elementary
2,Chesterfield County,A.M. Davis Elementary
3,Warren County,A.S. Rhodes Elementary
4,Hampton City,A.W.E. Bassette Elementary


In [9]:
cat_summary

Unnamed: 0,count,unique,top,freq
Division Name,1877,133,Fairfax County,199
School Name,1877,1803,Mountain View Elementary,8


In [10]:
all_data_na = (df.isnull().sum() / len(df)) * 100
all_data_na = all_data_na.drop(all_data_na[all_data_na == 0].index).sort_values(ascending=False)[:100]
missing_data = pd.DataFrame({'Missing Ratio' :all_data_na})
missing_data

Unnamed: 0,Missing Ratio
cohort_dropout_rate,83.271177
percentage_homeless,11.827384
free_reduced_lunch_percentage,9.696324
Bachelors_Percent,9.110282
Masters_Percent,9.110282
Doctoral_Percent,9.110282
Total_Per_Pupil_Expenditures,8.897176
Division_Level_Expenditures_Per_Pupil_State,8.897176
Division_Level_Expenditures_Per_Pupil_Federal,8.897176
School_Level_Expenditures_Per_Pupil_State,8.897176


drop rows with missing target variable & feature with over 80% missing

In [11]:
df = df.drop(columns='cohort_dropout_rate')
df = df.dropna(subset=['SOL Pass Rate'])

In [12]:
all_data_na = (df.isnull().sum() / len(df)) * 100
all_data_na = all_data_na.drop(all_data_na[all_data_na == 0].index).sort_values(ascending=False)[:100]
missing_data = pd.DataFrame({'Missing Ratio' :all_data_na})
missing_data

Unnamed: 0,Missing Ratio
percentage_homeless,12.098188
percentage_foster_care,5.201636
percentage_male,3.156049
percentage_english_learners,2.279369
percentage_disadvantaged,1.285798
percentage_military,1.168907
free_reduced_lunch_percentage,0.935126
percentage_disabled,0.87668
Doctoral_Percent,0.292227
Bachelors_Percent,0.292227


In [13]:
df_temp = df.drop(columns=['Division Name', 'School Name'])

In [14]:
num_transformer = Pipeline([
    ("Imputer", SimpleImputer(strategy= "mean")),
    ("Scaler", StandardScaler())])

cat_transformer = Pipeline([
    ("Imputer", SimpleImputer(strategy= "constant", fill_value= "MISSING")),
    ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output = False))
])

In [15]:
categorical_columns = df_temp.select_dtypes(include='object').columns
numerical_columns = df_temp.select_dtypes(include=np.number).columns

preprocessor = ColumnTransformer(
    [("num", num_transformer, numerical_columns),
     ("cat", cat_transformer, categorical_columns)],
    remainder= "passthrough")

pipeline = Pipeline([("preprocessor", preprocessor)])

In [16]:
X_preprocessed = preprocessor.fit_transform(df_temp)

In [17]:
X = pd.DataFrame(X_preprocessed, columns= preprocessor.get_feature_names_out())
X.head()

Unnamed: 0,num__percentage_disadvantaged,num__percentage_english_learners,num__percentage_foster_care,num__percentage_male,num__percentage_homeless,num__percentage_military,num__percentage_disabled,num__free_reduced_lunch_percentage,num__chronic_absenteeism_rate,num__Bachelors_Percent,num__Masters_Percent,num__Doctoral_Percent,num__SOL Pass Rate,num__School_Level_Expenditures_Per_Pupil_Federal,num__School_Level_Expenditures_Per_Pupil_State,num__Division_Level_Expenditures_Per_Pupil_Federal,num__Division_Level_Expenditures_Per_Pupil_State,num__Total_Per_Pupil_Expenditures,num__Total_Expenditures
0,-0.873119,-0.22109,-0.381386,0.200292,-0.263296,-0.183014,-0.434027,-0.912335,-0.847472,-0.506522,0.579529,0.484431,0.648046,-0.452145,-0.607534,0.014515,-0.107781,-0.644443,0.01989
1,-0.420032,-0.4849,-0.389405,-0.260542,-0.263296,-0.472237,-0.262086,-0.378474,0.733977,0.223246,0.122619,-0.662604,0.21806,-0.918616,-0.115487,0.298013,-1.074189,-0.599907,0.648554
2,0.253316,-0.040519,-0.380531,-0.147663,-0.263296,-0.527442,-0.359579,0.362396,0.016353,-0.830863,1.11259,-0.662604,-0.848944,0.055822,-0.343797,-0.031105,0.379845,-0.158937,-0.393853
3,-0.519679,-0.489213,-0.377616,0.257204,-0.263296,-0.504246,-0.659821,-0.494688,0.306843,0.871928,-0.7912,-0.662604,0.361388,0.030066,-0.266381,-0.842495,-0.522768,-0.53968,-0.541957
4,0.496509,-0.654422,-0.389405,-0.28983,-0.263296,-0.339748,-0.524692,1.459172,0.31162,1.35844,-1.705018,1.057948,-0.928571,1.001642,-0.761577,-0.189147,-0.67752,-0.782433,-0.612161


In [18]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV, cross_val_score, train_test_split, KFold

from scipy.stats import uniform, randint
from sklearn.ensemble import VotingClassifier, BaggingClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
import numpy as np

In [19]:
y = df['SOL Pass Rate']
y

0       0.7650
1       0.6975
2       0.5300
3       0.7200
4       0.5175
         ...  
1872    0.7925
1873    0.8500
1874    0.7180
1875    0.5500
1876    0.7150
Name: SOL Pass Rate, Length: 1711, dtype: float64

In [20]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [21]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression

# Define models
models = {
    f"RandomForest_{i}": RandomForestRegressor(random_state=42)
    for i in range(1, 2)
}
# Add Linear Regression to the models
models["LinearRegression"] = LinearRegression()

# Define hyperparameter grids for RandomForestRegressor
random_param_grids = {
    f"RandomForest_{i}": {
        "n_estimators": [50 * i, 100 * i, 150 * i],
        "max_depth": [10 * i, 20 * i, None],
        "min_samples_split": [2, 5, 10],
        "min_samples_leaf": [1, 2, 4]
    }
    if i % 2 == 0 else {
        "n_estimators": [75 * i, 125 * i, 200 * i],
        "max_depth": [5 * i, 15 * i, None],
        "min_samples_split": [3, 6, 9],
        "min_samples_leaf": [1, 3, 5]
    }
    for i in range(1, 2)
}
# Linear Regression does not require hyperparameter tuning
random_param_grids["LinearRegression"] = None


<h1> Continuous <h1>

In [22]:
import os
import pandas as pd
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.model_selection import KFold, RandomizedSearchCV

# Cross-validation configuration
cv = KFold(n_splits=3, shuffle=True, random_state=42)

# Store model tuning and results
grids = {}
test_errors = {}
results = []

# Loop through models and hyperparameter grids
for model_name, model in models.items():
    if model_name in random_param_grids and random_param_grids[model_name]:  # Check if grid is defined
        print(f'Training and tuning {model_name}...')
        
        # RandomizedSearchCV for hyperparameter tuning
        grids[model_name] = RandomizedSearchCV(
            estimator=model,
            param_distributions=random_param_grids[model_name],
            n_iter=5,
            cv=cv,
            scoring='neg_mean_squared_error',  # Using MSE for scoring
            n_jobs=-1,
            verbose=3
        )
        grids[model_name].fit(X_train, y_train)
        
        # Extract best parameters and scores
        best_params = grids[model_name].best_params_
        best_score = -grids[model_name].best_score_  # Convert back to positive MSE
        print(f'Best parameters for {model_name}: {best_params}')
        print(f'Best CV Mean Squared Error for {model_name}: {best_score:.4f}\n')

        # Evaluate on test set
        best_model = grids[model_name].best_estimator_
        y_pred = best_model.predict(X_test)
        mse = mean_squared_error(y_test, y_pred)
        mae = mean_absolute_error(y_test, y_pred)
        r2 = r2_score(y_test, y_pred)

        test_errors[model_name] = mse
        print(f"Test Metrics for {model_name}:")
        print(f" - Mean Squared Error: {mse:.4f}")
        print(f" - Mean Absolute Error: {mae:.4f}")
        print(f" - R² Score: {r2:.4f}\n")

        # Log results for this model
        results.append({
            "model": model_name,
            "mean_cv_mse": best_score,
            "test_mse": mse,
            "test_mae": mae,
            "test_r2": r2,
            "best_params": best_params
        })
    else:
        print(f'{model_name} does not require hyperparameter tuning. Training directly...')
        
        # Train directly without tuning
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        mse = mean_squared_error(y_test, y_pred)
        mae = mean_absolute_error(y_test, y_pred)
        r2 = r2_score(y_test, y_pred)

        test_errors[model_name] = mse
        print(f"Test Metrics for {model_name}:")
        print(f" - Mean Squared Error: {mse:.4f}")
        print(f" - Mean Absolute Error: {mae:.4f}")
        print(f" - R² Score: {r2:.4f}\n")

        # Log results for this model
        results.append({
            "model": model_name,
            "mean_cv_mse": None,  # No CV score since not tuned
            "test_mse": mse,
            "test_mae": mae,
            "test_r2": r2,
            "best_params": model.get_params()
        })

# Save results to a CSV file
results_df = pd.DataFrame(results)
output_file = "ml_model_results_continuous.csv"

if os.path.exists(output_file):
    # If the file exists, append the new results
    existing_df = pd.read_csv(output_file)
    updated_df = pd.concat([existing_df, results_df], ignore_index=True)
    updated_df.to_csv(output_file, index=False)
    print(f"Results appended to {output_file}")
else:
    # If the file does not exist, create it
    results_df.to_csv(output_file, index=False)
    print(f"Results saved to {output_file}")


Training and tuning RandomForest_1...
Fitting 3 folds for each of 5 candidates, totalling 15 fits


[CV 2/3] END max_depth=5, min_samples_leaf=1, min_samples_split=6, n_estimators=75;, score=-0.000 total time=   0.3s
[CV 3/3] END max_depth=5, min_samples_leaf=1, min_samples_split=6, n_estimators=75;, score=-0.000 total time=   0.3s
[CV 1/3] END max_depth=5, min_samples_leaf=1, min_samples_split=6, n_estimators=75;, score=-0.000 total time=   0.3s
[CV 1/3] END max_depth=None, min_samples_leaf=5, min_samples_split=3, n_estimators=125;, score=-0.000 total time=   0.6s
[CV 2/3] END max_depth=None, min_samples_leaf=5, min_samples_split=3, n_estimators=125;, score=-0.000 total time=   0.6s
[CV 1/3] END max_depth=5, min_samples_leaf=3, min_samples_split=6, n_estimators=200;, score=-0.000 total time=   0.8s
[CV 2/3] END max_depth=5, min_samples_leaf=3, min_samples_split=6, n_estimators=200;, score=-0.000 total time=   0.8s
[CV 3/3] END max_depth=5, min_samples_leaf=3, min_samples_split=6, n_estimators=200;, score=-0.000 total time=   0.8s
[CV 1/3] END max_depth=5, min_samples_leaf=1, min_sam