In [1]:
# Import required libraries
from pathlib import Path

import hvplot.pandas
import joblib
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler

In [2]:
# Read the cleaned dataset from CSV into the `crime` DataFrame
file_path = Path("All_data_cleaned.csv")
crime = pd.read_csv(file_path)

# Display the dtype of every column
crime.dtypes

Year                                                            int64
City                                                           object
Population 25 years and over - Less than 9th grade              int64
Percent Population 25 years and over - Less than 9th grade    float64
Population 25 years and over - 9th to12th (No Diploma)          int64
                                                               ...   
Percent 15 to 24                                              float64
Percent Uneducated                                            float64
Percent Higher Education                                      float64
Sum Uneducated                                                  int64
Sum Higher Education                                            int64
Length: 61, dtype: object

In [3]:
# One-hot encode the 'City' column into a separate frame; `crime` itself is not modified.
# NOTE(review): the feature matrix in the later cells is built from `crime`, not from
# `crime_encoded` — confirm whether this frame is still needed.
crime_encoded = pd.get_dummies(crime, columns=['City'])

In [4]:
# Integer-encode the 'City' column with a LabelEncoder
label_encoder = LabelEncoder()

# Fit on the city names and store the resulting integer codes as a new
# 'City_encoded' column on `crime` (this mutates `crime`; the feature-set cell
# drops the column again by name, so it must exist at that point).
crime['City_encoded'] = label_encoder.fit_transform(crime['City'])

crime.head()

Unnamed: 0,Year,City,Population 25 years and over - Less than 9th grade,Percent Population 25 years and over - Less than 9th grade,Population 25 years and over - 9th to12th (No Diploma),Percent Population 25 years and over - 9th to12th (No Diploma),Population 25 years and over - High School Graduate (and equivalent),Percent Population 25 years and over - High School Graduate (and equivalent),"Population 25 years and over - Some college, no degree","Percent Population 25 years and over - Some college, no degree",...,Percent Home Occupied,Percent Renter Occupied,Percent 25+,Percent 14-,Percent 15 to 24,Percent Uneducated,Percent Higher Education,Sum Uneducated,Sum Higher Education,City_encoded
0,2010,Alameda,2752,5.5,2234,4.5,8546,17.0,10009,19.9,...,46.920853,53.079147,65.8,19.8,12.3,10.0,53.1,4986,26639,0
1,2010,Alhambra,6236,10.1,4575,7.4,13358,21.7,12091,19.6,...,38.053381,61.946619,71.1,12.9,13.1,17.5,41.1,10811,25307,1
2,2010,Anaheim,32561,15.5,22329,10.7,50982,24.3,39407,18.8,...,48.38851,51.61149,61.0,21.2,16.7,26.2,30.7,54890,64362,2
3,2010,Antioch,4874,7.7,4633,7.3,17092,26.9,20113,31.7,...,62.767194,37.232806,60.7,22.9,15.5,15.0,26.5,9507,16815,3
4,2010,Apple Valley,639,1.5,3353,7.7,14524,33.4,11690,26.9,...,70.690377,29.309623,60.4,23.4,13.9,9.2,30.5,3992,13303,4


In [5]:
# Feature matrix: all columns of `crime` except the target, "Violent Crimes Sum",
# and the city / year identifier columns. `drop` returns a new frame, so `crime`
# is left untouched.
X = crime.drop(
    columns=[
        "Crime_Rate_per_100k",
        "City",
        "Violent Crimes Sum",
        "City_encoded",
        "Year",
    ]
)

# Target column
y = crime['Crime_Rate_per_100k']
X.dtypes

Population 25 years and over - Less than 9th grade                                int64
Percent Population 25 years and over - Less than 9th grade                      float64
Population 25 years and over - 9th to12th (No Diploma)                            int64
Percent Population 25 years and over - 9th to12th (No Diploma)                  float64
Population 25 years and over - High School Graduate (and equivalent)              int64
Percent Population 25 years and over - High School Graduate (and equivalent)    float64
Population 25 years and over - Some college, no degree                            int64
Percent Population 25 years and over - Some college, no degree                  float64
Population 25 years and over - Associate's degree                                 int64
Percent Population 25 years and over - Associate's degree                       float64
Population 25 years and over - Bachelor's degree                                  int64
Percent Population 25 years and 

In [6]:
# Target as a NumPy column vector of shape (n_rows, 1)
y = crime["Crime_Rate_per_100k"].to_numpy().reshape(-1, 1)
y[:5]

array([[233.84382477],
       [222.35042427],
       [344.24581701],
       [840.91683294],
       [270.94412498]])

In [7]:
# Hold out 10% of the rows for testing; the fixed random_state keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.1,
    random_state=8,
)

In [8]:
# Create the StandardScaler instance (not yet fitted at this point)
scaler = StandardScaler()

In [9]:
# Fit the StandardScaler on the training features only, so the scaling
# parameters are not influenced by the test split
X_scaler = scaler.fit(X_train)

In [10]:
# Scale both the training and the testing features with the scaler fitted on the training split
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [11]:
# Build a random forest regressor (n_estimators=1000, fixed seed) and train it on
# the scaled training features; .ravel() passes the target as a 1-D array.
regressor = RandomForestRegressor(n_estimators=1000, random_state=8).fit(
    X_train_scaled, y_train.ravel()
)

# Predict on the scaled test features
predictions = regressor.predict(X_test_scaled)

# The comparison frame needs 1-D columns: flatten whichever array is not already 1-D
predictions = predictions if predictions.ndim == 1 else predictions.flatten()
y_test = y_test if y_test.ndim == 1 else y_test.flatten()

# Predicted vs. actual values, side by side
results = pd.DataFrame(
    {"Prediction": predictions, "Actual": y_test}
).reset_index(drop=True)
results

Unnamed: 0,Prediction,Actual
0,281.618353,119.793729
1,414.679570,359.563884
2,214.374745,168.934525
3,456.676449,355.505368
4,496.980396,433.151080
...,...,...
112,624.326210,487.957347
113,266.395794,216.123044
114,427.769246,425.953734
115,445.655290,436.229287


In [12]:
# Build a random forest regressor (n_estimators=2000, fixed seed) and train it on
# the scaled training features; .ravel() passes the target as a 1-D array.
regressor = RandomForestRegressor(n_estimators=2000, random_state=8).fit(
    X_train_scaled, y_train.ravel()
)

# Predict on the scaled test features
predictions = regressor.predict(X_test_scaled)

# The comparison frame needs 1-D columns: flatten whichever array is not already 1-D
predictions = predictions if predictions.ndim == 1 else predictions.flatten()
y_test = y_test if y_test.ndim == 1 else y_test.flatten()

# Predicted vs. actual values, side by side
results = pd.DataFrame(
    {"Prediction": predictions, "Actual": y_test}
).reset_index(drop=True)
results

Unnamed: 0,Prediction,Actual
0,273.491590,119.793729
1,417.652472,359.563884
2,216.968315,168.934525
3,452.793314,355.505368
4,489.181813,433.151080
...,...,...
112,628.854665,487.957347
113,268.590653,216.123044
114,429.406034,425.953734
115,443.670870,436.229287


In [13]:
# Regression metrics for the current `predictions` against the held-out targets
mae = mean_absolute_error(y_test, predictions)
mse = mean_squared_error(y_test, predictions)
# RMSE is computed as the square root of the MSE rather than via
# mean_squared_error(..., squared=False): that keyword was deprecated and later
# removed from scikit-learn, so passing it fails on current releases.
rmse = np.sqrt(mse)
r2 = r2_score(y_test, predictions)

print(f"Mean Absolute Error (MAE): {mae}")
print(f"Mean Squared Error (MSE): {mse}")
print(f"Root Mean Squared Error (RMSE): {rmse}")
print(f"R-squared (R²): {r2}")

# Absolute error of each individual prediction
errors = np.abs(predictions - y_test)

# Per-sample absolute percentage error; the mean of this array is the MAPE.
# NOTE(review): assumes no actual value in y_test is 0 (division by zero) — confirm.
mape = 100 * (errors / y_test)

# "Accuracy" is defined here as 100 minus the mean absolute percentage error
accuracy = 100 - np.mean(mape)
print('Accuracy:', round(accuracy, 2), '%.')

Mean Absolute Error (MAE): 83.24590675880093
Mean Squared Error (MSE): 14710.121741018744
Root Mean Squared Error (RMSE): 121.28529070344328
R-squared (R²): 0.7395911732097067
Accuracy: 69.81 %.




In [14]:
# Importance scores from the fitted forest, and the column labels of the feature frame
importances = regressor.feature_importances_
feature_names = X.columns

# Pair every column name with its score and rank the pairs from most to least important
feature_importances = sorted(
    zip(feature_names, importances),
    key=lambda pair: pair[1],
    reverse=True,
)

# Print one "name: score" line per feature
for name, importance in feature_importances:
    print(f"{name}: {importance}")

% All Families: 0.2531743461772152
Percent Population 25 years and over - Graduate or Prefessional Degree: 0.0775644434594427
Percent Higher Education: 0.039417108744415265
20 to 24 years: 0.03714078037772515
House price mean: 0.03065438054767137
Total population: 0.02895183625916207
Population 25 years and over - High School Graduate (and equivalent): 0.028162077857996435
Percent Renter Occupied: 0.028066251815362056
Percent Home Occupied: 0.027646221266885793
Percent Population 25 years and over - Less than 9th grade: 0.02025330491574173
Percent Uneducated: 0.01962590979058442
Sum Uneducated: 0.018969255125520876
Percent Population 25 years and over - Bachelor's degree: 0.018600430432246143
Renter occupied: 0.01758473775678749
15 to 19 years: 0.016771297899105746
Population 25 years and over - Less than 9th grade: 0.014525246587619355
Population 25 years and over - 9th to12th (No Diploma): 0.014510078117605057
Percent Population 25 years and over - High School Graduate (and equivalen

In [15]:
# Build a random forest regressor (n_estimators=500, fixed seed) and train it on
# the scaled training features; .ravel() passes the target as a 1-D array.
regressor = RandomForestRegressor(n_estimators=500, random_state=8).fit(
    X_train_scaled, y_train.ravel()
)

# Predict on the scaled test features
predictions = regressor.predict(X_test_scaled)

# The comparison frame needs 1-D columns: flatten whichever array is not already 1-D
predictions = predictions if predictions.ndim == 1 else predictions.flatten()
y_test = y_test if y_test.ndim == 1 else y_test.flatten()

# Predicted vs. actual values, side by side
results = pd.DataFrame(
    {"Prediction": predictions, "Actual": y_test}
).reset_index(drop=True)
results

Unnamed: 0,Prediction,Actual
0,280.160816,119.793729
1,413.036370,359.563884
2,219.742526,168.934525
3,448.335711,355.505368
4,503.356728,433.151080
...,...,...
112,621.042672,487.957347
113,263.419455,216.123044
114,431.910676,425.953734
115,437.799578,436.229287


In [16]:
# Regression metrics for the current `predictions` against the held-out targets
mae = mean_absolute_error(y_test, predictions)
mse = mean_squared_error(y_test, predictions)
# RMSE is computed as the square root of the MSE rather than via
# mean_squared_error(..., squared=False): that keyword was deprecated and later
# removed from scikit-learn, so passing it fails on current releases.
rmse = np.sqrt(mse)
r2 = r2_score(y_test, predictions)

print(f"Mean Absolute Error (MAE): {mae}")
print(f"Mean Squared Error (MSE): {mse}")
print(f"Root Mean Squared Error (RMSE): {rmse}")
print(f"R-squared (R²): {r2}")

# Absolute error of each individual prediction
errors = np.abs(predictions - y_test)

# Per-sample absolute percentage error; the mean of this array is the MAPE.
# NOTE(review): assumes no actual value in y_test is 0 (division by zero) — confirm.
mape = 100 * (errors / y_test)

# "Accuracy" is defined here as 100 minus the mean absolute percentage error
accuracy = 100 - np.mean(mape)
print('Accuracy:', round(accuracy, 2), '%.')

Mean Absolute Error (MAE): 83.16281321293222
Mean Squared Error (MSE): 14643.032995526888
Root Mean Squared Error (RMSE): 121.00840051635625
R-squared (R²): 0.7407788249376765
Accuracy: 69.71 %.




In [18]:
# Save the trained model to a file (currently disabled; uncomment the two lines below to re-export)
# model_filename = "random_forest_regressor.pkl"
# joblib.dump(regressor, model_filename)

['random_forest_regressor.pkl']