In [1]:
# Import required libraries
import numpy as np
import pandas as pd
import hvplot.pandas
from pathlib import Path
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
import joblib
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder
import tensorflow as tf

In [2]:
# Import our input dataset
crime_df = pd.read_csv('All_data_cleaned.csv')
crime_df.head()

# One-hot encode the 'City' column
crime_encoded = pd.get_dummies(crime_df, columns=['City'])

# Initialize the LabelEncoder
label_encoder = LabelEncoder()

# Fit and transform the 'City' column
crime_df['City_encoded'] = label_encoder.fit_transform(crime_df['City'])

crime_df

Unnamed: 0,Year,City,Population 25 years and over - Less than 9th grade,Percent Population 25 years and over - Less than 9th grade,Population 25 years and over - 9th to12th (No Diploma),Percent Population 25 years and over - 9th to12th (No Diploma),Population 25 years and over - High School Graduate (and equivalent),Percent Population 25 years and over - High School Graduate (and equivalent),"Population 25 years and over - Some college, no degree","Percent Population 25 years and over - Some college, no degree",...,Percent Home Occupied,Percent Renter Occupied,Percent 25+,Percent 14-,Percent 15 to 24,Percent Uneducated,Percent Higher Education,Sum Uneducated,Sum Higher Education,City_encoded
0,2010,Alameda,2752,5.50,2234,4.50,8546,17.00,10009,19.90,...,46.920853,53.079147,65.8,19.8,12.3,10.00,53.10,4986,26639,0
1,2010,Alhambra,6236,10.10,4575,7.40,13358,21.70,12091,19.60,...,38.053381,61.946619,71.1,12.9,13.1,17.50,41.10,10811,25307,1
2,2010,Anaheim,32561,15.50,22329,10.70,50982,24.30,39407,18.80,...,48.388510,51.611490,61.0,21.2,16.7,26.20,30.70,54890,64362,2
3,2010,Antioch,4874,7.70,4633,7.30,17092,26.90,20113,31.70,...,62.767194,37.232806,60.7,22.9,15.5,15.00,26.50,9507,16815,3
4,2010,Apple Valley,639,1.50,3353,7.70,14524,33.40,11690,26.90,...,70.690377,29.309623,60.4,23.4,13.9,9.20,30.50,3992,13303,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1160,2019,Vista,10080,15.05,6638,9.91,13245,19.77,16027,23.93,...,45.724859,54.275141,64.4,19.9,14.1,24.96,31.34,16718,20990,129
1161,2019,West Covina,5308,7.09,4530,6.05,20919,27.95,16083,21.49,...,57.975360,42.024640,69.5,16.9,11.9,13.14,37.41,9838,28003,131
1162,2019,Westminster,8307,13.03,6607,10.36,15641,24.53,11273,17.68,...,47.216869,52.783131,68.0,18.2,11.5,23.39,34.41,14914,21943,132
1163,2019,Whittier,3021,5.44,3921,7.06,16232,29.24,12079,21.76,...,60.928433,39.071567,63.3,21.0,13.9,12.50,36.49,6942,20258,133


In [3]:
# Fix the seed for random number generation
seed_value = 8
np.random.seed(seed_value)
tf.random.set_seed(seed_value)

# Remove crime and city target from features data
X = crime_df.copy()
X.drop(columns=["Crime_Rate_per_100k", 
                "City",
                "Violent Crimes Sum",
                "City_encoded",
                "Year"
               ], axis=1, inplace=True)
y = crime_df['Crime_Rate_per_100k']

# Standardize features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Step 3: Split Data
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.1, random_state=8)

In [4]:
 ## Create a Linear Regression Model

In [5]:
# Create the StandardScaler instance
scaler = StandardScaler()

# Fit the Standard Scaler with the training data
X_scaler = scaler.fit(X_train)

# Scale the training data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [6]:
# Create the random forest classifier instance
regressor = regressor = LinearRegression()

# Fit the model and use .ravel()on the "y_train" data. 
regressor = regressor.fit(X_train_scaled, y_train.ravel())
                          
# Making predictions using the testing data
predictions3 = regressor.predict(X_test_scaled)
                          
# Check if 'predictions' is not 1-dimensional and reshape if necessary
if predictions3.ndim != 1:
    predictions3 = predictions3.flatten()

# Check if 'y_test' is not 1-dimensional and reshape if necessary
if y_test.ndim != 1:
    y_test = y_test.flatten()
                          
results = pd.DataFrame({"Prediction": predictions3, "Actual": y_test}).reset_index(drop=True)
results

Unnamed: 0,Prediction,Actual
0,272.465225,119.793729
1,448.308975,359.563884
2,398.730850,168.934525
3,293.965225,355.505368
4,553.480850,433.151080
...,...,...
112,657.715225,487.957347
113,345.058975,216.123044
114,442.621475,425.953734
115,347.590225,436.229287


In [7]:
print(f"Training Data Score: {regressor.score(X_train, y_train)}")
print(f"Testing Data Score: {regressor.score(X_test, y_test)}")

Training Data Score: -2.948362708091002e+19
Testing Data Score: -3.5245636191557763e+19


In [8]:
if predictions3.ndim != 1:
    predictions3 = predictions3.flatten()

# Check if 'y_test' is not 1-dimensional and reshape if necessary
if y_test.ndim != 1:
    x = y_test.flatten()
    
results = pd.DataFrame({"Prediction": predictions3, "Actual": y_test}).reset_index(drop=True)
print(results)

mae = mean_absolute_error(y_test, predictions3)
mse = mean_squared_error(y_test, predictions3)
rmse = mean_squared_error(y_test, predictions3, squared=False)
r2 = r2_score(y_test, predictions3)

print(f"Mean Absolute Error (MAE): {mae}")
print(f"Mean Squared Error (MSE): {mse}")
print(f"Root Mean Squared Error (RMSE): {rmse}")
print(f"R-squared (R²): {r2}")
errors = abs(predictions3 - y_test)
# Calculate mean absolute percentage error (MAPE)
mape = 100 * (errors / y_test)

# Calculate and display accuracy
accuracy = 100 - np.mean(mape)
print('Accuracy:', round(accuracy, 2), '%.')

     Prediction      Actual
0    272.465225  119.793729
1    448.308975  359.563884
2    398.730850  168.934525
3    293.965225  355.505368
4    553.480850  433.151080
..          ...         ...
112  657.715225  487.957347
113  345.058975  216.123044
114  442.621475  425.953734
115  347.590225  436.229287
116  648.433975  388.424770

[117 rows x 2 columns]
Mean Absolute Error (MAE): 120.40417315490821
Mean Squared Error (MSE): 27082.627043538123
Root Mean Squared Error (RMSE): 164.56800127466494
R-squared (R²): 0.5205644617378666
Accuracy: 50.75 %.




In [9]:
joblib.dump(regressor, 'linear_regression_model.pkl')

['linear_regression_model.pkl']