# [San Francisco Crime Classification | Kaggle](https://www.kaggle.com/c/sf-crime)

### [SF Crime Prediction with scikit-learn 을 따라해 본다. | Kaggle](https://www.kaggle.com/rhoslug/sf-crime-prediction-with-scikit-learn)

### Data fields
* 날짜  - 범죄 사건의 타임 스탬프
* 범주  - 범죄 사건 카테고리 (train.csv에만 해당) 이 변수를 예측하는 게 이 경진대회 과제임
* 설명  - 범죄 사건에 대한 자세한 설명 (train.csv에만 있음)
* DayOfWeek - 요일
* PdDistrict - 경찰서 구의 이름
* 해결 방법 - 범죄 사건이 어떻게 해결 되었는지 (train.csv에서만)
* 주소 - 범죄 사건의 대략적인 주소 
* X - 경도
* Y - 위도


* Dates - timestamp of the crime incident
* Category - category of the crime incident (only in train.csv). This is the target variable you are going to predict.
* Descript - detailed description of the crime incident (only in train.csv)
* DayOfWeek - the day of the week
* PdDistrict - name of the Police Department District
* Resolution - how the crime incident was resolved (only in train.csv)
* Address - the approximate street address of the crime incident 
* X - Longitude 
* Y - Latitude 

In [2]:
from __future__ import print_function, division
import pandas as pd
import numpy as np

In [3]:
df_train = pd.read_csv('data/train.csv', parse_dates=['Dates'])
df_train.shape

(878049, 9)

In [4]:
df_train.head()

Unnamed: 0,Dates,Category,Descript,DayOfWeek,PdDistrict,Resolution,Address,X,Y
0,2015-05-13 23:53:00,WARRANTS,WARRANT ARREST,Wednesday,NORTHERN,"ARREST, BOOKED",OAK ST / LAGUNA ST,-122.425892,37.774599
1,2015-05-13 23:53:00,OTHER OFFENSES,TRAFFIC VIOLATION ARREST,Wednesday,NORTHERN,"ARREST, BOOKED",OAK ST / LAGUNA ST,-122.425892,37.774599
2,2015-05-13 23:33:00,OTHER OFFENSES,TRAFFIC VIOLATION ARREST,Wednesday,NORTHERN,"ARREST, BOOKED",VANNESS AV / GREENWICH ST,-122.424363,37.800414
3,2015-05-13 23:30:00,LARCENY/THEFT,GRAND THEFT FROM LOCKED AUTO,Wednesday,NORTHERN,NONE,1500 Block of LOMBARD ST,-122.426995,37.800873
4,2015-05-13 23:30:00,LARCENY/THEFT,GRAND THEFT FROM LOCKED AUTO,Wednesday,PARK,NONE,100 Block of BRODERICK ST,-122.438738,37.771541


In [5]:
# 'Descript', 'Dates', 'Resolution' 는 제거
df_train.drop(['Descript', 'Dates', 'Resolution'], axis=1, inplace=True)
df_train.shape

(878049, 6)

In [6]:
df_test = pd.read_csv('data/test.csv', parse_dates=['Dates'])
df_test.shape

(884262, 7)

In [7]:
df_test.head()

Unnamed: 0,Id,Dates,DayOfWeek,PdDistrict,Address,X,Y
0,0,2015-05-10 23:59:00,Sunday,BAYVIEW,2000 Block of THOMAS AV,-122.399588,37.735051
1,1,2015-05-10 23:51:00,Sunday,BAYVIEW,3RD ST / REVERE AV,-122.391523,37.732432
2,2,2015-05-10 23:50:00,Sunday,NORTHERN,2000 Block of GOUGH ST,-122.426002,37.792212
3,3,2015-05-10 23:45:00,Sunday,INGLESIDE,4700 Block of MISSION ST,-122.437394,37.721412
4,4,2015-05-10 23:45:00,Sunday,INGLESIDE,4700 Block of MISSION ST,-122.437394,37.721412


In [8]:
df_test.drop(['Dates'], axis=1, inplace=True)

In [9]:
df_test.head()

Unnamed: 0,Id,DayOfWeek,PdDistrict,Address,X,Y
0,0,Sunday,BAYVIEW,2000 Block of THOMAS AV,-122.399588,37.735051
1,1,Sunday,BAYVIEW,3RD ST / REVERE AV,-122.391523,37.732432
2,2,Sunday,NORTHERN,2000 Block of GOUGH ST,-122.426002,37.792212
3,3,Sunday,INGLESIDE,4700 Block of MISSION ST,-122.437394,37.721412
4,4,Sunday,INGLESIDE,4700 Block of MISSION ST,-122.437394,37.721412


In [10]:
# 트레이닝과 검증셋을 선택한다.
inds = np.arange(df_train.shape[0])
inds

array([     0,      1,      2, ..., 878046, 878047, 878048])

In [11]:
np.random.shuffle(inds)
df_train.shape[0]

878049

In [12]:
# 트레인 셋
train_inds = inds[:int(0.2 * df_train.shape[0])]
print(train_inds.shape)
# 검증 셋
val_inds = inds[int(0.2) * df_train.shape[0]:]
print(val_inds.shape)

(175609,)
(878049,)


In [13]:
# 컬럼명을 추출한다.
col_names = np.sort(df_train['Category'].unique())
col_names

array(['ARSON', 'ASSAULT', 'BAD CHECKS', 'BRIBERY', 'BURGLARY',
       'DISORDERLY CONDUCT', 'DRIVING UNDER THE INFLUENCE',
       'DRUG/NARCOTIC', 'DRUNKENNESS', 'EMBEZZLEMENT', 'EXTORTION',
       'FAMILY OFFENSES', 'FORGERY/COUNTERFEITING', 'FRAUD', 'GAMBLING',
       'KIDNAPPING', 'LARCENY/THEFT', 'LIQUOR LAWS', 'LOITERING',
       'MISSING PERSON', 'NON-CRIMINAL', 'OTHER OFFENSES',
       'PORNOGRAPHY/OBSCENE MAT', 'PROSTITUTION', 'RECOVERED VEHICLE',
       'ROBBERY', 'RUNAWAY', 'SECONDARY CODES', 'SEX OFFENSES FORCIBLE',
       'SEX OFFENSES NON FORCIBLE', 'STOLEN PROPERTY', 'SUICIDE',
       'SUSPICIOUS OCC', 'TREA', 'TRESPASS', 'VANDALISM', 'VEHICLE THEFT',
       'WARRANTS', 'WEAPON LAWS'], dtype=object)

In [14]:
# 카테고리를 숫자로 변환해 준다.
df_train['Category'] = pd.Categorical(df_train['Category']).codes
df_train['DayOfWeek'] = pd.Categorical(df_train['DayOfWeek']).codes
df_train['PdDistrict'] = pd.Categorical(df_train['PdDistrict']).codes
df_test['DayOfWeek'] = pd.Categorical(df_test['DayOfWeek']).codes
df_test['PdDistrict'] = pd.Categorical(df_test['PdDistrict']).codes

In [15]:
df_train.head()

Unnamed: 0,Category,DayOfWeek,PdDistrict,Address,X,Y
0,37,6,4,OAK ST / LAGUNA ST,-122.425892,37.774599
1,21,6,4,OAK ST / LAGUNA ST,-122.425892,37.774599
2,21,6,4,VANNESS AV / GREENWICH ST,-122.424363,37.800414
3,16,6,4,1500 Block of LOMBARD ST,-122.426995,37.800873
4,16,6,5,100 Block of BRODERICK ST,-122.438738,37.771541


In [16]:
df_test.head()

Unnamed: 0,Id,DayOfWeek,PdDistrict,Address,X,Y
0,0,3,0,2000 Block of THOMAS AV,-122.399588,37.735051
1,1,3,0,3RD ST / REVERE AV,-122.391523,37.732432
2,2,3,4,2000 Block of GOUGH ST,-122.426002,37.792212
3,3,3,2,4700 Block of MISSION ST,-122.437394,37.721412
4,4,3,2,4700 Block of MISSION ST,-122.437394,37.721412


In [17]:
from sklearn.feature_extraction.text import CountVectorizer
# text 빈도를 추출한다.
cvec = CountVectorizer()
cvec

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)

In [18]:
bows_train = cvec.fit_transform(df_train['Address'].values)

In [19]:
bows_test = cvec.fit_transform(df_test['Address'].values)

In [20]:
# 트레이닝과 검증셋을 나눈다.
df_val = df_train.iloc[val_inds]
df_val.head()

Unnamed: 0,Category,DayOfWeek,PdDistrict,Address,X,Y
68509,20,3,1,700 Block of VALLEJO ST,-122.409213,37.798467
617213,21,4,9,100 Block of HYDE ST,-122.415533,37.782137
299207,21,6,0,CARGO WY / 3RD ST,-122.387178,37.746157
356906,37,5,3,16TH ST / MISSION ST,-122.419672,37.76505
265648,32,0,3,2100 Block of MISSION ST,-122.419424,37.763232


In [21]:
df_val.shape

(878049, 6)

In [22]:
df_train = df_train.iloc[train_inds]
df_train.shape

(175609, 6)

In [23]:
df_train.head()

Unnamed: 0,Category,DayOfWeek,PdDistrict,Address,X,Y
68509,20,3,1,700 Block of VALLEJO ST,-122.409213,37.798467
617213,21,4,9,100 Block of HYDE ST,-122.415533,37.782137
299207,21,6,0,CARGO WY / 3RD ST,-122.387178,37.746157
356906,37,5,3,16TH ST / MISSION ST,-122.419672,37.76505
265648,32,0,3,2100 Block of MISSION ST,-122.419424,37.763232


In [24]:
from patsy import dmatrices, dmatrix
y_train, X_train = dmatrices('Category ~ X + Y + DayOfWeek + PdDistrict', df_train)

In [25]:
y_train.shape

(175609, 1)

In [26]:
# 벡터화 된 주소
X_train = np.hstack((X_train, bows_train[train_inds, :].toarray()))

In [27]:
X_train.shape

(175609, 2146)

In [28]:
y_val, X_val = dmatrices('Category ~ X + Y + DayOfWeek + PdDistrict', df_val)

In [29]:
X_val = np.hstack((X_val, bows_train[val_inds, :].toarray()))
X_test = dmatrix('X + Y + DayOfWeek + PdDistrict', df_test)

In [30]:
X_test = np.hstack((X_test, bows_test.toarray()))

In [33]:
# IncrementalPCA
from sklearn.decomposition import IncrementalPCA
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression

In [34]:
ipca = IncrementalPCA(n_components=4, batch_size=5)
ipca

IncrementalPCA(batch_size=5, copy=True, n_components=4, whiten=False)

In [None]:
# 로컬 메모리 부족으로 실행 실패 T_T
X_train = ipca.fit_transform(X_train)

In [None]:
X_val = ipca.transform(X_val)

In [None]:
X_test = ipca.transform(X_test)

In [None]:
# 로지스틱 회귀를 생성하고 fit 시킨다.
logistic = LogisticRegression()
logistic.fit(X_train, y_train.ravel())

# 정확도를 본다.
print('Mean accuracy (Logistic): {:.4f}.format(logistic.score(X_val, y_val.ravel())))')

In [None]:
# 랜덤 포레스트로 fit 시키고 정확도를 본다.
randforest = RandomForestClassifier()
randforest.fit(X_train, y_train.ravel())

# 정확도를 본다.
print('Mean accuracy (Logistic): {:.4f}.format(logistic.score(X_val, y_val.ravel())))')

In [None]:
# Make predictions

predict_probs = logistic.predict_proba(X_test)

In [None]:
df_pred = pd.DataFrame(data=predict_probs, columns=col_names)
df_pred['Id'] = df_test['Id'].astype(int)
df_pred.to_csv('output.csv', index=False)