### Convert Number Columns to Numeric and Feature Engineering

In [11]:
# Count columns that must be numeric; stored values may carry comma separators
cols_to_numeric = [
    'Battery Electric Vehicles (BEVs)',
    'Plug-In Hybrid Electric Vehicles (PHEVs)',
    'Electric Vehicle (EV) Total',
    'Non-Electric Vehicle Total',
    'Total Vehicles'
]
for column in cols_to_numeric:
    # Strip commas from the string form, then parse; anything that still
    # fails to parse becomes NaN (errors='coerce') and is replaced by 0.
    without_commas = df[column].astype(str).str.replace(',', '')
    df[column] = pd.to_numeric(without_commas, errors='coerce').fillna(0)

# Calendar features taken from the datetime accessor of 'Date'
df['Year'] = df['Date'].dt.year
df['Month'] = df['Date'].dt.month

### Quick EDA: Distribution and Trend Plots

In [12]:
# Histogram of the 'Percent Electric Vehicles' column, 40 bins
hist_ax = sns.histplot(df['Percent Electric Vehicles'], bins=40)
hist_ax.set_title('Distribution: Percent Electric Vehicles')
plt.show()

# EV total against date; frames with more than 1000 rows are down-sampled
# with a fixed seed so the line plot stays quick to draw
if len(df) > 1000:
    sample_df = df.sample(n=1000, random_state=42)
else:
    sample_df = df
trend_ax = sns.lineplot(data=sample_df, x='Date', y='Electric Vehicle (EV) Total')
trend_ax.set_title('EV Total Over Time (sample)')
plt.tight_layout()
plt.show()

### Encode Categorical Columns

In [13]:
# Integer-encode the categorical columns in place. Each fitted encoder is kept,
# keyed by column name, so the codes can be mapped back to the original labels
# (inverse_transform) or re-applied to new data (transform) later on.
label_encoders = {}
for col in ['County', 'State', 'Vehicle Primary Use']:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    label_encoders[col] = le

### Prepare Features and Target

In [14]:
target = 'Electric Vehicle (EV) Total'

# Target-leakage guard. Going by the column names, the columns left out below
# determine the target exactly:
#   EV Total = BEVs + PHEVs
#   EV Total = Total Vehicles - Non-Electric Vehicle Total
#   EV Total = Percent Electric Vehicles * Total Vehicles
# Training on them lets the model reconstruct the answer, so the test metrics
# would say nothing about real predictive power.
# NOTE(review): confirm these relationships against the dataset's data dictionary.
features = [
    'Year', 'Month', 'County', 'State', 'Vehicle Primary Use',
    'Non-Electric Vehicle Total'
]

X = df[features]
y = df[target]

### Train-Test Split

In [15]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

### Model Training with RandomForest and Hyperparameter Search (Optimized)

In [16]:
# Small search space so the cross-validated search stays fast
rf = RandomForestRegressor(random_state=42)
param_grid = {
    'n_estimators': [25, 50, 100],
    'max_depth': [5, 10, 20]
}

# RandomizedSearchCV defaults to n_iter=10, but this space only holds
# 3 x 3 = 9 combinations. Left at the default, scikit-learn emits a UserWarning
# and silently falls back to 9 iterations; capping n_iter at the size of the
# space evaluates the same candidates without the warning.
n_candidates = 1
for candidate_values in param_grid.values():
    n_candidates *= len(candidate_values)

search = RandomizedSearchCV(
    rf,
    param_grid,
    n_iter=min(10, n_candidates),
    cv=3,            # 3-fold cross-validation per candidate
    n_jobs=-1,       # use every available core
    random_state=42,
)
search.fit(X_train, y_train)

# Best configuration, refit on the full training split by the search
best_rf = search.best_estimator_

### Model Evaluation & Visualization

In [17]:
# Score the tuned model on the held-out test split
y_pred = best_rf.predict(X_test)

mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
print(f"MAE: {mae:.2f}")
print(f"MSE: {mse:.2f}")
print(f"R2:  {r2:.3f}")

# Actual vs. predicted values for the test rows
scatter_ax = sns.scatterplot(x=y_test, y=y_pred)
scatter_ax.set_xlabel('Actual EV Total')
scatter_ax.set_ylabel('Predicted EV Total')
scatter_ax.set_title('Actual vs Predicted EV Total (Test set)')
plt.show()

### Save the Model (Optional)

In [18]:
# Persist the tuned random forest to disk so it can be reloaded without retraining
joblib.dump(value=best_rf, filename='ev_adoption_rf_model.joblib')

### Conclusion

- Numeric columns were cleaned (commas removed, unparseable values coerced and filled with 0), and `Year`/`Month` features were engineered from `Date` for modeling.
- A randomized hyperparameter search with 3-fold cross-validation over a small grid selects the best RandomForest configuration.
- Metrics are reported and predictions visualized to assess performance.
- Save the model and code to GitHub for submission and reuse. 
