In [12]:
# Import required libraries
import numpy as np
import pandas as pd
import hvplot.pandas
from pathlib import Path
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
import joblib

In [2]:
# Read the cleaned crime dataset from a CSV in the notebook's working directory
# (one row per city and year, judging by the preview below)
file_path = Path("All_data_cleaned.csv")
crime = pd.read_csv(file_path)

# Display the first five rows as a quick sanity check of the columns
crime.head()

Unnamed: 0,Year,City,Population 25 years and over - Less than 9th grade,Percent Population 25 years and over - Less than 9th grade,Population 25 years and over - 9th to12th (No Diploma),Percent Population 25 years and over - 9th to12th (No Diploma),Population 25 years and over - High School Graduate (and equivalent),Percent Population 25 years and over - High School Graduate (and equivalent),"Population 25 years and over - Some college, no degree","Percent Population 25 years and over - Some college, no degree",...,Crime_Rate_per_100k,Percent Home Occupied,Percent Renter Occupied,Percent 25+,Percent 14-,Percent 15 to 24,Percent Uneducated,Percent Higher Education,Sum Uneducated,Sum Higher Education
0,2010,Alameda,2752,5.5,2234,4.5,8546,17.0,10009,19.9,...,233.843825,46.920853,53.079147,65.8,19.8,12.3,10.0,53.1,4986,26639
1,2010,Alhambra,6236,10.1,4575,7.4,13358,21.7,12091,19.6,...,222.350424,38.053381,61.946619,71.1,12.9,13.1,17.5,41.1,10811,25307
2,2010,Anaheim,32561,15.5,22329,10.7,50982,24.3,39407,18.8,...,344.245817,48.38851,51.61149,61.0,21.2,16.7,26.2,30.7,54890,64362
3,2010,Antioch,4874,7.7,4633,7.3,17092,26.9,20113,31.7,...,840.916833,62.767194,37.232806,60.7,22.9,15.5,15.0,26.5,9507,16815
4,2010,Apple Valley,639,1.5,3353,7.7,14524,33.4,11690,26.9,...,270.944125,70.690377,29.309623,60.4,23.4,13.9,9.2,30.5,3992,13303


In [3]:
# Features: every column except the target itself, the "City" and "Year"
# columns, and "Violent Crimes Sum".
X = crime.drop(
    ["Crime_Rate_per_100k", "City", "Violent Crimes Sum", "Year"],
    axis="columns",
)

# Target: the crime rate per 100k column.
y = crime.loc[:, "Crime_Rate_per_100k"]

In [4]:
# Hold out 10% of the rows for testing; the fixed random_state keeps the
# shuffle, and therefore the split, repeatable between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.1, random_state=8
)

# Shape of the training feature matrix as (rows, columns)
X_train.shape

(1048, 57)

In [5]:
 ## Create a Linear Regression Model

In [6]:
# Instantiate scikit-learn's LinearRegression with its default settings
regressor = LinearRegression()
# Echo the (not yet fitted) estimator as the cell output
regressor

In [7]:
# Fit the model on the training split only; the test split is kept for evaluation
regressor.fit(X_train, y_train)

In [8]:
# Score the fitted model on each split first, then report both numbers, so the
# train/test gap can be read at a glance.
train_score = regressor.score(X_train, y_train)
test_score = regressor.score(X_test, y_test)

print(f"Training Data Score: {train_score}")
print(f"Testing Data Score: {test_score}")

Training Data Score: 0.5609753205187243
Testing Data Score: 0.5203308345273281


In [9]:
# Predict on the held-out rows
predictions = regressor.predict(X_test)

# Side-by-side table of predicted vs. actual values. Passing the actuals as a
# plain array gives the frame a fresh 0..n-1 index directly, instead of
# inheriting y_test's shuffled labels and resetting them afterwards.
results = pd.DataFrame(
    {
        "Prediction": predictions,
        "Actual": y_test.to_numpy(),
    }
)
results

Unnamed: 0,Prediction,Actual
0,272.955941,119.793729
1,447.709719,359.563884
2,399.305723,168.934525
3,292.082221,355.505368
4,553.792494,433.151080
...,...,...
112,662.732189,487.957347
113,344.878618,216.123044
114,442.317083,425.953734
115,348.050040,436.229287


In [10]:

# Evaluate the held-out predictions against the actual test targets.
# `predictions` and `y_test` come from the earlier prediction / split cells.

mae = mean_absolute_error(y_test, predictions)
mse = mean_squared_error(y_test, predictions)
# RMSE is derived from the MSE directly: the `squared=False` keyword of
# mean_squared_error was deprecated in scikit-learn 1.4 and removed in 1.6,
# whereas the square root works on every version.
rmse = np.sqrt(mse)
r2 = r2_score(y_test, predictions)

print(f"Mean Absolute Error (MAE): {mae}")
print(f"Mean Squared Error (MSE): {mse}")
print(f"Root Mean Squared Error (RMSE): {rmse}")
print(f"R-squared (R²): {r2}")

Mean Absolute Error (MAE): 120.43967383920425
Mean Squared Error (MSE): 27095.824310126183
Root Mean Squared Error (RMSE): 164.60809308817772
R-squared (R²): 0.5203308345273281




In [11]:
# Per-sample absolute error between predicted and actual values
errors = abs(predictions - y_test)
# Print out the mean absolute error (`mae`, computed in the metrics cell) in
# the target's own units: the target column is Crime_Rate_per_100k
print('Mean Absolute Error:', round(mae, 2), 'crimes per 100k.')

# Per-sample absolute percentage error; its mean is the MAPE.
# A row whose actual value is 0 has no defined percentage error, so it is
# masked to NaN rather than dividing by zero and turning the average into inf.
mape = 100 * (errors / y_test.where(y_test != 0))

# "Accuracy" here is simply 100 - MAPE; Series.mean() skips the masked NaN rows
accuracy = 100 - mape.mean()
print('Accuracy:', round(accuracy, 2), '%.')

Mean Absolute Error: 120.44 degrees.
Accuracy: 50.71 %.


In [15]:
# Save the trained model to a file with joblib.
# NOTE(review): the two statements below are commented out, yet the recorded
# cell output shows the dump was executed at some point. Uncomment them to
# write the model file again.
# joblib_file = "linear_regression_model.pkl"
# joblib.dump(regressor, joblib_file)

['linear_regression_model.pkl']