In [1]:
import pandas as pd
import numpy as np 
from pathlib import Path
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

In [2]:
# Read salary data
file_path = Path("All_data_cleaned.csv")
crime = pd.read_csv(file_path)

# Display sample data
crime.head()

Unnamed: 0,Year,City,Percent Population 25 years and over - Less than 9th grade,Percent Population 25 years and over - 9th to12th (No Diploma),Percent Population 25 years and over - High School Graduate (and equivalent),"Percent Population 25 years and over - Some college, no degree",Percent Population 25 years and over - Associate's degree,Percent Population 25 years and over - Bachelor's degree,Percent Population 25 years and over - Graduate or Prefessional Degree,Total population,...,Percent 60 to 64 years,Percent 65 to 74 years,Percent 75 to 84 years,Percent 85 years and over,Violent Crimes Sum,% All Families,House Price Mean,Crime_Rate_per_100k,Percent Home Occupied,Percent Renter Occupied
0,2015,Alameda,3.6,3.6,12.3,18.7,7.3,36.0,18.5,78614.0,...,6.4,8.3,4.6,2.0,148.0,5.2,789464.7,188.261633,44.102898,55.897102
1,2015,Alhambra,8.3,7.6,28.8,14.5,6.0,22.9,11.9,85572.0,...,6.0,9.0,3.8,2.6,168.0,11.8,524186.5,196.325901,41.495811,58.504189
2,2015,Anaheim,14.4,11.4,23.6,21.1,6.4,17.4,5.8,350738.0,...,4.0,5.8,2.9,1.6,1271.0,11.8,479128.8,362.378756,43.471912,56.528088
3,2015,Bakersfield,10.6,10.2,28.8,21.1,7.9,13.9,7.5,373627.0,...,4.5,6.1,2.5,0.9,1810.0,14.3,214306.0,484.440364,57.775194,42.224806
4,2015,Baldwin Park,22.1,12.3,31.1,14.3,6.3,10.8,3.2,77056.0,...,4.8,9.0,2.8,1.2,299.0,14.4,343992.4,388.029485,54.430721,45.569279


In [3]:
# Define features set
X = crime.copy()
X.drop(columns=["Crime_Rate_per_100k", "City"], axis=1, inplace=True)
X.head()

Unnamed: 0,Year,Percent Population 25 years and over - Less than 9th grade,Percent Population 25 years and over - 9th to12th (No Diploma),Percent Population 25 years and over - High School Graduate (and equivalent),"Percent Population 25 years and over - Some college, no degree",Percent Population 25 years and over - Associate's degree,Percent Population 25 years and over - Bachelor's degree,Percent Population 25 years and over - Graduate or Prefessional Degree,Total population,Percent Under 5 years,...,Percent 55 to 59 years,Percent 60 to 64 years,Percent 65 to 74 years,Percent 75 to 84 years,Percent 85 years and over,Violent Crimes Sum,% All Families,House Price Mean,Percent Home Occupied,Percent Renter Occupied
0,2015,3.6,3.6,12.3,18.7,7.3,36.0,18.5,78614.0,5.3,...,7.3,6.4,8.3,4.6,2.0,148.0,5.2,789464.7,44.102898,55.897102
1,2015,8.3,7.6,28.8,14.5,6.0,22.9,11.9,85572.0,4.5,...,8.9,6.0,9.0,3.8,2.6,168.0,11.8,524186.5,41.495811,58.504189
2,2015,14.4,11.4,23.6,21.1,6.4,17.4,5.8,350738.0,7.1,...,5.4,4.0,5.8,2.9,1.6,1271.0,11.8,479128.8,43.471912,56.528088
3,2015,10.6,10.2,28.8,21.1,7.9,13.9,7.5,373627.0,7.9,...,5.4,4.5,6.1,2.5,0.9,1810.0,14.3,214306.0,57.775194,42.224806
4,2015,22.1,12.3,31.1,14.3,6.3,10.8,3.2,77056.0,6.0,...,6.1,4.8,9.0,2.8,1.2,299.0,14.4,343992.4,54.430721,45.569279


In [4]:
y = crime["Crime_Rate_per_100k"].values.reshape(-1, 1)
y[:5]

array([[188.26163279],
       [196.325901  ],
       [362.37875565],
       [484.44036432],
       [388.02948505]])

In [5]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25)

In [6]:
# Create the StandardScaler instance
scaler = StandardScaler()

In [7]:
# Fit the Standard Scaler with the training data
X_scaler = scaler.fit(X_train)

In [8]:
# Scale the training data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [9]:
# Create the random forest classifier instance
regressor = RandomForestRegressor(n_estimators=10000)

In [10]:
# Fit the model and use .ravel()on the "y_train" data. 
regressor = regressor.fit(X_train_scaled, y_train.ravel())

In [11]:
# Making predictions using the testing data
predictions = regressor.predict(X_test_scaled)


In [12]:
# Check if 'predictions' is not 1-dimensional and reshape if necessary
if predictions.ndim != 1:
    predictions = predictions.flatten()

# Check if 'y_test' is not 1-dimensional and reshape if necessary
if y_test.ndim != 1:
    y_test = y_test.flatten()

In [13]:
results = pd.DataFrame({"Prediction": predictions, "Actual": y_test}).reset_index(drop=True)
results

Unnamed: 0,Prediction,Actual
0,473.095091,462.652666
1,693.906652,670.459385
2,376.894305,335.792527
3,426.301032,411.929944
4,132.281923,127.215607
...,...,...
148,435.696588,298.761324
149,315.055889,303.478227
150,690.844199,627.739562
151,700.617515,784.339357


In [14]:
mae = mean_absolute_error(y_test, predictions)
mse = mean_squared_error(y_test, predictions)
rmse = mean_squared_error(y_test, predictions, squared=False)
r2 = r2_score(y_test, predictions)

print(f"Mean Absolute Error (MAE): {mae}")
print(f"Mean Squared Error (MSE): {mse}")
print(f"Root Mean Squared Error (RMSE): {rmse}")
print(f"R-squared (R²): {r2}")

Mean Absolute Error (MAE): 28.184908381902037
Mean Squared Error (MSE): 2653.2286925737853
Root Mean Squared Error (RMSE): 51.50950099325158
R-squared (R²): 0.95427814022974




In [16]:
# Print out the mean absolute error (mae)
print('Mean Absolute Error:', round(mae, 2), 'degrees.')

# Calculate mean absolute percentage error (MAPE)
mape = 100 * (errors / y_test)

# Calculate and display accuracy
accuracy = 100 - np.mean(mape)
print('Accuracy:', round(accuracy, 2), '%.')

Mean Absolute Error: 28.18 degrees.
Accuracy: 92.33 %.
