In [None]:
# Mount Google Drive into the Colab filesystem at /content/drive so the
# dataset stored there can be read with ordinary file paths.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import pandas as pd

In [None]:
# Load the crime dataset from the mounted Drive into `df`; every later cell
# depends on this frame.
# NOTE(review): the saved output of this cell is a ParserError
# ("Calling read(nbytes) on source failed"), yet later cells use `df` --
# presumably the load succeeded on another run. Confirm it works on a fresh
# kernel (Restart & Run All).
df = pd.read_csv('/content/drive/MyDrive/crime.csv')

ParserError: Error tokenizing data. C error: Calling read(nbytes) on source failed. Try engine='python'.

In [None]:
# Show the column labels of the loaded frame.
df.columns

Index(['ID', 'Case Number', 'Date', 'Block', 'IUCR', 'Primary Type',
       'Description', 'Location Description', 'Arrest', 'Domestic', 'Beat',
       'District', 'Ward', 'Community Area', 'FBI Code', 'X Coordinate',
       'Y Coordinate', 'Year', 'Updated On', 'Latitude', 'Longitude',
       'Location', 'geometry', 'Community Area Name', 'F_Date', 'Time'],
      dtype='object')

In [None]:
# Primary Type values that are flagged by the `violent` column.
# NOTE(review): the list also contains property offences (BURGLARY, THEFT,
# MOTOR VEHICLE THEFT) -- confirm that "violent" is the intended label.
violent_crimes = [
    "HOMICIDE",
    "CRIMINAL SEXUAL ASSAULT",
    "ROBBERY",
    "BATTERY",
    "RITUALISM",
    "ASSAULT",
    "BURGLARY",
    "THEFT",
    "MOTOR VEHICLE THEFT",
    "HUMAN TRAFFICKING",
    "ARSON",
]

# Add a 0/1 `violent` column: 1 when the row's Primary Type is one of the
# entries in `violent_crimes`, 0 otherwise.
df['violent'] = df['Primary Type'].isin(violent_crimes).astype(int)

In [None]:
def is_violent_crime(row):
    """Return True when a crime record counts as a violent crime.

    A record is violent when its 'Primary Type' is HOMICIDE, CRIMINAL SEXUAL
    ASSAULT, ROBBERY or HUMAN TRAFFICKING, or when it is ASSAULT, BATTERY or
    ARSON and its 'Description' contains the word AGGRAVATED (compared
    case-insensitively).

    Parameters
    ----------
    row : mapping-like
        Anything indexable by the keys 'Primary Type' and 'Description',
        e.g. a DataFrame row.

    Returns
    -------
    bool
    """
    always_violent = ('HOMICIDE', 'CRIMINAL SEXUAL ASSAULT', 'ROBBERY', 'HUMAN TRAFFICKING')
    violent_if_aggravated = ('ASSAULT', 'BATTERY', 'ARSON')

    crime_type = row['Primary Type']
    # str() keeps missing descriptions (NaN/None) from raising on .upper().
    details = str(row['Description']).upper()

    if crime_type in always_violent:
        return True
    if crime_type not in violent_if_aggravated:
        return False
    return 'AGGRAVATED' in details

# Build the 0/1 `IsViolent` column with vectorised column operations instead of
# a row-wise df.apply(is_violent_crime, axis=1), which calls a Python function
# once per row and is very slow on a frame of this size.
# The rule is the same one is_violent_crime implements:
#   * HOMICIDE / CRIMINAL SEXUAL ASSAULT / ROBBERY / HUMAN TRAFFICKING -> violent
#   * ASSAULT / BATTERY / ARSON -> violent only when the description contains
#     "AGGRAVATED" (case-insensitive)
always_violent_mask = df['Primary Type'].isin(
    ['HOMICIDE', 'CRIMINAL SEXUAL ASSAULT', 'ROBBERY', 'HUMAN TRAFFICKING']
)
aggravated_mask = (
    df['Primary Type'].isin(['ASSAULT', 'BATTERY', 'ARSON'])
    # astype(str) mirrors str(...) in is_violent_crime so missing descriptions
    # become plain text; regex=False makes this a literal substring match.
    & df['Description'].astype(str).str.upper().str.contains('AGGRAVATED', regex=False)
)
df['IsViolent'] = (always_violent_mask | aggravated_mask).astype(int)

In [None]:
# Parse the F_Date column into datetime values and store them back.
event_dates = pd.to_datetime(df['F_Date'])
df['F_Date'] = event_dates

# Derive the calendar quarter of each date as a new column.
df['Quarter'] = event_dates.dt.quarter

In [None]:
# Community Area names that are flagged by the `high_risk` column.
high_risk_list = [
    'Humboldt Park', 'Austin', 'West Garfield Park', 'East Garfield Park',
    'Near West Side', 'North Lawndale', 'Fuller Park', 'Grand Boulevard',
    'Washington Park', 'Woodlawn', 'South Shore', 'Chatham',
    'South Chicago', 'Roseland', 'West Pullman', 'Riverdale',
    'New City', 'Chicago Lawn', 'West Englewood', 'Englewood',
    'Greater Grand Crossing', 'Auburn Gresham',
]

# Add a 0/1 `high_risk` column: 1 when the row's Community Area Name appears
# in `high_risk_list`, 0 otherwise.
df['high_risk'] = df['Community Area Name'].isin(high_risk_list).astype(int)

In [None]:
# Preview the first rows to check the derived columns
# (violent, IsViolent, Quarter, high_risk).
df.head()

Unnamed: 0,ID,Case Number,Date,Block,IUCR,Primary Type,Description,Location Description,Arrest,Domestic,...,Longitude,Location,geometry,Community Area Name,F_Date,Time,violent,IsViolent,Quarter,high_risk
0,6255892,HP342049,2008-05-17 18:00:00,101XX S LAFAYETTE AVE,031A,ROBBERY,ARMED - HANDGUN,RESIDENCE,False,False,...,-87.624796,"(41.710039855, -87.62479561)",POINT (-87.62479561 41.710039855),Roseland,2008-05-17,18:00:00,1,1,2,1
1,6272641,HP358387,2008-05-27 01:00:00,105XX S PERRY AVE,0320,ROBBERY,STRONG ARM - NO WEAPON,STREET,False,True,...,-87.625785,"(41.703006756, -87.625784664)",POINT (-87.625784664 41.703006756),Roseland,2008-05-27,01:00:00,1,1,2,1
2,6438609,HP496499,2008-08-05 22:37:00,126XX S UNION AVE,031A,ROBBERY,ARMED - HANDGUN,SCHOOL - PUBLIC GROUNDS,False,False,...,-87.639053,"(41.664425476, -87.639053139)",POINT (-87.639053139 41.664425476),West Pullman,2008-08-05,22:37:00,1,1,3,1
3,6680276,HP754070,2008-12-27 20:00:00,058XX N MANTON AVE,0610,BURGLARY,FORCIBLE ENTRY,RESIDENCE - GARAGE,False,False,...,-87.77065,"(41.9873259, -87.770650405)",POINT (-87.770650405 41.9873259),Jefferson Park,2008-12-27,20:00:00,1,0,4,0
4,12536164,JE439378,2015-09-24 00:00:00,031XX W 53RD PL,1753,OFFENSE INVOLVING CHILDREN,SEXUAL ASSAULT OF CHILD BY FAMILY MEMBER,APARTMENT,False,True,...,-87.702253,"(41.796278388, -87.702253422)",POINT (-87.702253422 41.796278388),Gage Park,2015-09-24,00:00:00,0,0,3,0


전체범죄

In [None]:
# List the columns available before selecting model features.
df.columns

Index(['ID', 'Case Number', 'Date', 'Block', 'IUCR', 'Primary Type',
       'Description', 'Location Description', 'Arrest', 'Domestic', 'Beat',
       'District', 'Ward', 'Community Area', 'FBI Code', 'X Coordinate',
       'Y Coordinate', 'Year', 'Updated On', 'Latitude', 'Longitude',
       'Location', 'geometry', 'Community Area Name', 'F_Date', 'Time',
       'violent', 'IsViolent', 'Quarter', 'high_risk'],
      dtype='object')

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

In [None]:
# Column the classifier learns to predict.
target = 'Arrest'

# Input columns for the model; the order here is the column order of X.
features = [
    # offence classification
    'Primary Type',
    'Description',
    'IUCR',
    'FBI Code',
    # administrative areas
    'Beat',
    'District',
    'Ward',
    'Community Area',
    # coordinates
    'Latitude',
    'Longitude',
    # time
    'Quarter',
]

In [None]:
# Label-encode the categorical columns: each is cast to str and replaced by
# integer codes. The fitted encoder for every column is kept in `le_dict`
# (keyed by column name).
# NOTE: the columns are overwritten in place, so re-running this cell would
# re-encode the integer codes themselves rather than the original labels.
label_cols = ['Primary Type', 'Description', 'IUCR', 'FBI Code']
le_dict = {col: LabelEncoder() for col in label_cols}

for col, le in le_dict.items():
    df[col] = le.fit_transform(df[col].astype(str))

In [None]:
# Split into train and test sets: 20% of rows are held out, stratified on the
# target so both splits keep the same class ratio; the seed is fixed.
X, y = df[features], df[target]
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

In [None]:
# Train a random forest (100 trees, fixed seed, n_jobs=-1) on the training split.
model = RandomForestClassifier(
    n_estimators=100,
    n_jobs=-1,
    random_state=42,
)
model.fit(X_train, y_train)

In [None]:
# Predict the target for the held-out test rows.
y_pred = model.predict(X_test)

In [None]:
# Report the accuracy of the predictions on the test split.
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy : {accuracy}")

Accuracy : 0.8709439479017842


In [None]:
# Per-class precision / recall / F1 on the test split.
print(classification_report(y_test,y_pred,zero_division=0))

              precision    recall  f1-score   support

       False       0.89      0.94      0.92   1235014
        True       0.79      0.68      0.73    420321

    accuracy                           0.87   1655335
   macro avg       0.84      0.81      0.82   1655335
weighted avg       0.87      0.87      0.87   1655335



In [None]:
from sklearn.model_selection import cross_val_score

# 5-fold cross-validation of the same estimator. This is run on the full X / y
# (not only the training split), so the model is refit once per fold.
cv_scores = cross_val_score(model, X, y, cv=5)
mean_cv_score = cv_scores.mean()

print("Cross-Validation Scores:", cv_scores)
print("Mean CV Score:", mean_cv_score)

강력범죄

In [None]:
# Keep only the rows whose `violent` flag is 1.
violent_df = df[df['violent']==1]

In [None]:
# Show the columns of the filtered frame.
violent_df.columns

Index(['ID', 'Case Number', 'Date', 'Block', 'IUCR', 'Primary Type',
       'Description', 'Location Description', 'Arrest', 'Domestic', 'Beat',
       'District', 'Ward', 'Community Area', 'FBI Code', 'X Coordinate',
       'Y Coordinate', 'Year', 'Updated On', 'Latitude', 'Longitude',
       'Location', 'geometry', 'Community Area Name', 'F_Date', 'Time',
       'violent', 'IsViolent', 'Quarter', 'high_risk'],
      dtype='object')

위험범죄

In [None]:
# Keep only the rows whose `IsViolent` flag is 1.
# `IsViolent` is stored as 0/1 integers (it is created with .astype(int)), so it
# must be compared to 1 to obtain a boolean row mask. Passing the integer
# Series directly, as in df[df['IsViolent']], makes pandas look up columns
# labelled 0 and 1 and raise KeyError instead of filtering rows.
severe_df = df[df['IsViolent'] == 1]

In [None]:
# Show the columns of the filtered frame.
severe_df.columns

Index(['ID', 'Case Number', 'Date', 'Block', 'IUCR', 'Primary Type',
       'Description', 'Location Description', 'Arrest', 'Domestic', 'Beat',
       'District', 'Ward', 'Community Area', 'FBI Code', 'X Coordinate',
       'Y Coordinate', 'Year', 'Updated On', 'Latitude', 'Longitude',
       'Location', 'geometry', 'Community Area Name', 'F_Date', 'Time',
       'violent', 'IsViolent', 'Quarter'],
      dtype='object')