In [1]:
# Import required libraries
import numpy as np
import pandas as pd
import hvplot.pandas
from pathlib import Path
from sklearn.linear_model import LinearRegression

In [2]:
# Read salary data
file_path = Path("All_data_cleaned.csv")
crime = pd.read_csv(file_path)

# Display sample data
crime.head()

Unnamed: 0,Year,City,Population 25 years and over - Less than 9th grade,Percent Population 25 years and over - Less than 9th grade,Population 25 years and over - 9th to12th (No Diploma),Percent Population 25 years and over - 9th to12th (No Diploma),Population 25 years and over - High School Graduate (and equivalent),Percent Population 25 years and over - High School Graduate (and equivalent),"Population 25 years and over - Some college, no degree","Percent Population 25 years and over - Some college, no degree",...,75 to 84 years,Percent 75 to 84 years,85 years and over,Percent 85 years and over,Violent Crimes Sum,% All Families,House Price Mean,Total,Total Occupied,Renter occupied
0,2015,Alameda,2089,3.6,2089,3.6,7242,12.3,11010,18.7,...,3599,4.60%,1541,2.00%,148.0,5.2,789464.7,30710,13544,17166
1,2015,Alhambra,5402,8.3,4923,7.6,18760,28.8,9435,14.5,...,3237,3.80%,2210,2.60%,168.0,11.8,524186.5,29362,12184,17178
2,2015,Anaheim,32506,14.4,25762,11.4,53477,23.6,47633,21.1,...,10167,2.90%,5608,1.60%,1271.0,11.8,479128.8,99991,43468,56523
3,2015,Bakersfield,23326,10.6,22434,10.2,63568,28.8,46592,21.1,...,9168,2.50%,3276,0.90%,1810.0,14.3,214306.0,114383,66085,48298
4,2015,Baldwin Park,11082,22.1,6156,12.3,15636,31.1,7172,14.3,...,2120,2.80%,933,1.20%,299.0,14.4,343992.4,18541,10092,8449


In [3]:
crime.dtypes

Year                                                                              int64
City                                                                             object
Population 25 years and over - Less than 9th grade                                int64
Percent Population 25 years and over - Less than 9th grade                      float64
Population 25 years and over - 9th to12th (No Diploma)                            int64
Percent Population 25 years and over - 9th to12th (No Diploma)                  float64
Population 25 years and over - High School Graduate (and equivalent)              int64
Percent Population 25 years and over - High School Graduate (and equivalent)    float64
Population 25 years and over - Some college, no degree                            int64
Percent Population 25 years and over - Some college, no degree                  float64
Population 25 years and over - Associate's degree                                 int64
Percent Population 25 years and 

In [4]:
columns_to_convert = [
    'Total population', 'Under 5 years', 'Percent Under 5 years', 
    '5 to 9 years', 'Percent 5 to 9 years', '10 to 14 years', 
    'Percent 10 to 14 years', '15 to 19 years', 'Percent 15 to 19 years', 
    '20 to 24 years', 'Percent 20 to 24 years', '25 to 34 years', 
    'Percent 25 to 34 years', '35 to 44 years', 'Percent 35 to 44 years', 
    '45 to 54 years', 'Percent 45 to 54 years', '55 to 59 years', 
    'Percent 55 to 59 years', '60 to 64 years', 'Percent 60 to 64 years', 
    '65 to 74 years', 'Percent 65 to 74 years', '75 to 84 years', 
    'Percent 75 to 84 years', '85 years and over', 'Percent 85 years and over', 'Total', 'Total Occupied', 'Renter occupied'
]

for column in columns_to_convert:
    crime[column] = crime[column].str.replace(',|%', '', regex=True) .astype(float)

In [5]:
crime.dtypes

Year                                                                              int64
City                                                                             object
Population 25 years and over - Less than 9th grade                                int64
Percent Population 25 years and over - Less than 9th grade                      float64
Population 25 years and over - 9th to12th (No Diploma)                            int64
Percent Population 25 years and over - 9th to12th (No Diploma)                  float64
Population 25 years and over - High School Graduate (and equivalent)              int64
Percent Population 25 years and over - High School Graduate (and equivalent)    float64
Population 25 years and over - Some college, no degree                            int64
Percent Population 25 years and over - Some college, no degree                  float64
Population 25 years and over - Associate's degree                                 int64
Percent Population 25 years and 

In [6]:
y = crime["Violent Crimes Sum"]
X = crime.drop(columns=["Violent Crimes Sum", "City"])

In [7]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, 
                                                    y, 
                                                    test_size=0.25)
X_train.shape

(456, 47)

In [8]:
 ## Create a Logistic Regression Model

In [14]:
from sklearn.linear_model import LogisticRegression
classifier = LogisticRegression(solver='lbfgs',
                                max_iter=100000,)
classifier

In [15]:
classifier.fit(X_train, y_train)

In [19]:
print(f"Training Data Score: {classifier.score(X_train, y_train)}")
print(f"Testing Data Score: {classifier.score(X_test, y_test)}")

Training Data Score: 1.0
Testing Data Score: 0.006535947712418301


In [20]:
predictions = classifier.predict(X_test)
results = pd.DataFrame({"Prediction": predictions, "Actual": y_test}).reset_index(drop=True)
results.head(10)

Unnamed: 0,Prediction,Actual
0,245.0,344.0
1,426.0,164.0
2,331.0,272.0
3,563.0,775.0
4,264.0,225.0
5,166.0,234.0
6,433.0,263.0
7,244.0,232.0
8,255.0,450.0
9,292.0,130.0


In [18]:
from sklearn.metrics import accuracy_score
# Display the accuracy score for the test dataset.
accuracy_score(y_test, predictions)

0.006535947712418301