In [81]:
from pathlib import Path
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

In [82]:
csv_path = Path("chicago_crime.csv")
df = pd.read_csv(csv_path)

In [83]:
df.head()

Unnamed: 0,CASE#,DATE OF OCCURRENCE,BLOCK,IUCR,PRIMARY DESCRIPTION,SECONDARY DESCRIPTION,LOCATION DESCRIPTION,ARREST,DOMESTIC,BEAT,WARD,FBI CD,X COORDINATE,Y COORDINATE,LATITUDE,LONGITUDE,LOCATION
0,JF462937,11/03/2022 07:00:00 PM,031XX S ASHLAND AVE,860,THEFT,RETAIL THEFT,SMALL RETAIL STORE,N,N,912,12.0,6,,,,,
1,JF463363,11/03/2022 09:00:00 PM,025XX N ELSTON AVE,860,THEFT,RETAIL THEFT,DEPARTMENT STORE,N,N,1432,1.0,6,,,,,
2,JF394023,09/11/2022 01:00:00 AM,0000X W HUBBARD ST,810,THEFT,OVER $500,BAR OR TAVERN,N,N,1831,42.0,6,,,,,
3,JF461531,11/03/2022 01:00:00 PM,028XX N MERRIMAC AVE,1310,CRIMINAL DAMAGE,TO PROPERTY,STREET,N,N,2511,30.0,14,,,,,
4,JF407237,09/21/2022 02:30:00 PM,006XX N MICHIGAN AVE,860,THEFT,RETAIL THEFT,DEPARTMENT STORE,N,N,1834,42.0,6,,,,,


In [84]:
df[['date_column', 'time_column']] = df['DATE  OF OCCURRENCE'].str.split(' ', 1, expand=True)


  df[['date_column', 'time_column']] = df['DATE  OF OCCURRENCE'].str.split(' ', 1, expand=True)


In [85]:
df.head()

Unnamed: 0,CASE#,DATE OF OCCURRENCE,BLOCK,IUCR,PRIMARY DESCRIPTION,SECONDARY DESCRIPTION,LOCATION DESCRIPTION,ARREST,DOMESTIC,BEAT,WARD,FBI CD,X COORDINATE,Y COORDINATE,LATITUDE,LONGITUDE,LOCATION,date_column,time_column
0,JF462937,11/03/2022 07:00:00 PM,031XX S ASHLAND AVE,860,THEFT,RETAIL THEFT,SMALL RETAIL STORE,N,N,912,12.0,6,,,,,,11/03/2022,07:00:00 PM
1,JF463363,11/03/2022 09:00:00 PM,025XX N ELSTON AVE,860,THEFT,RETAIL THEFT,DEPARTMENT STORE,N,N,1432,1.0,6,,,,,,11/03/2022,09:00:00 PM
2,JF394023,09/11/2022 01:00:00 AM,0000X W HUBBARD ST,810,THEFT,OVER $500,BAR OR TAVERN,N,N,1831,42.0,6,,,,,,09/11/2022,01:00:00 AM
3,JF461531,11/03/2022 01:00:00 PM,028XX N MERRIMAC AVE,1310,CRIMINAL DAMAGE,TO PROPERTY,STREET,N,N,2511,30.0,14,,,,,,11/03/2022,01:00:00 PM
4,JF407237,09/21/2022 02:30:00 PM,006XX N MICHIGAN AVE,860,THEFT,RETAIL THEFT,DEPARTMENT STORE,N,N,1834,42.0,6,,,,,,09/21/2022,02:30:00 PM


In [86]:
contains_value = df.applymap(lambda x: '143A' in str(x)).any()

# List the column(s) where the value is present
columns_with_value = contains_value[contains_value].index.tolist()

print("Columns with '143A':", columns_with_value)

Columns with '143A': [' IUCR']


In [88]:
columns_to_remove = ['CASE#', 'BLOCK', 'DATE  OF OCCURRENCE', 'BEAT', ' IUCR']

# Drop the specified columns
df.drop(columns=columns_to_remove, inplace=True)

In [89]:
print(df.columns)

Index([' PRIMARY DESCRIPTION', ' SECONDARY DESCRIPTION',
       ' LOCATION DESCRIPTION', 'ARREST', 'DOMESTIC', 'WARD', 'FBI CD',
       'X COORDINATE', 'Y COORDINATE', 'LATITUDE', 'LONGITUDE', 'LOCATION',
       'date_column', 'time_column'],
      dtype='object')


In [90]:
unique_entries = df[' PRIMARY DESCRIPTION'].unique()
print(unique_entries)

['THEFT' 'CRIMINAL DAMAGE' 'DECEPTIVE PRACTICE' 'BATTERY' 'NARCOTICS'
 'MOTOR VEHICLE THEFT' 'BURGLARY' 'OTHER OFFENSE' 'PUBLIC PEACE VIOLATION'
 'SEX OFFENSE' 'STALKING' 'ASSAULT' 'CRIMINAL SEXUAL ASSAULT'
 'CRIMINAL TRESPASS' 'WEAPONS VIOLATION' 'ROBBERY' 'HOMICIDE'
 'OFFENSE INVOLVING CHILDREN' 'OBSCENITY' 'INTIMIDATION'
 'LIQUOR LAW VIOLATION' 'KIDNAPPING' 'ARSON' 'OTHER NARCOTIC VIOLATION'
 'INTERFERENCE WITH PUBLIC OFFICER' 'HUMAN TRAFFICKING' 'PROSTITUTION'
 'CONCEALED CARRY LICENSE VIOLATION' 'GAMBLING' 'NON-CRIMINAL'
 'PUBLIC INDECENCY']


In [91]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 256473 entries, 0 to 256472
Data columns (total 14 columns):
 #   Column                  Non-Null Count   Dtype  
---  ------                  --------------   -----  
 0    PRIMARY DESCRIPTION    256473 non-null  object 
 1    SECONDARY DESCRIPTION  256473 non-null  object 
 2    LOCATION DESCRIPTION   255420 non-null  object 
 3   ARREST                  256473 non-null  object 
 4   DOMESTIC                256473 non-null  object 
 5   WARD                    256467 non-null  float64
 6   FBI CD                  256473 non-null  object 
 7   X COORDINATE            252400 non-null  float64
 8   Y COORDINATE            252400 non-null  float64
 9   LATITUDE                252400 non-null  float64
 10  LONGITUDE               252400 non-null  float64
 11  LOCATION                252400 non-null  object 
 12  date_column             256473 non-null  object 
 13  time_column             256473 non-null  object 
dtypes: float64(5), objec

In [92]:
label_encoder = LabelEncoder()
df[' PRIMARY DESCRIPTION'] = label_encoder.fit_transform(df[' PRIMARY DESCRIPTION'])

In [93]:
X = df.drop(' PRIMARY DESCRIPTION', axis=1)
y = df[' PRIMARY DESCRIPTION']

In [94]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [95]:
encoded_df = pd.get_dummies(df, columns=[' PRIMARY DESCRIPTION'])

In [96]:
model = RandomForestClassifier(random_state=42)
model.fit(X_train, y_train)

ValueError: could not convert string to float: 'PURSE-SNATCHING'