In [1]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
import xgboost as xgb

In [2]:
# Read the training and test tables from CSV files in the working directory.
train, test = (pd.read_csv(csv_name) for csv_name in ('train.csv', 'test.csv'))

In [3]:
# Preview the first five rows of the training data.
train.head()

Unnamed: 0,Dates,Category,Descript,DayOfWeek,PdDistrict,Resolution,Address,X,Y
0,2015-05-13 23:53:00,WARRANTS,WARRANT ARREST,Wednesday,NORTHERN,"ARREST, BOOKED",OAK ST / LAGUNA ST,-122.425892,37.774599
1,2015-05-13 23:53:00,OTHER OFFENSES,TRAFFIC VIOLATION ARREST,Wednesday,NORTHERN,"ARREST, BOOKED",OAK ST / LAGUNA ST,-122.425892,37.774599
2,2015-05-13 23:33:00,OTHER OFFENSES,TRAFFIC VIOLATION ARREST,Wednesday,NORTHERN,"ARREST, BOOKED",VANNESS AV / GREENWICH ST,-122.424363,37.800414
3,2015-05-13 23:30:00,LARCENY/THEFT,GRAND THEFT FROM LOCKED AUTO,Wednesday,NORTHERN,NONE,1500 Block of LOMBARD ST,-122.426995,37.800873
4,2015-05-13 23:30:00,LARCENY/THEFT,GRAND THEFT FROM LOCKED AUTO,Wednesday,PARK,NONE,100 Block of BRODERICK ST,-122.438738,37.771541


In [4]:
# Preview the first five rows of the test data.
test.head()

Unnamed: 0,Id,Dates,DayOfWeek,PdDistrict,Address,X,Y
0,0,2015-05-10 23:59:00,Sunday,BAYVIEW,2000 Block of THOMAS AV,-122.399588,37.735051
1,1,2015-05-10 23:51:00,Sunday,BAYVIEW,3RD ST / REVERE AV,-122.391523,37.732432
2,2,2015-05-10 23:50:00,Sunday,NORTHERN,2000 Block of GOUGH ST,-122.426002,37.792212
3,3,2015-05-10 23:45:00,Sunday,INGLESIDE,4700 Block of MISSION ST,-122.437394,37.721412
4,4,2015-05-10 23:45:00,Sunday,INGLESIDE,4700 Block of MISSION ST,-122.437394,37.721412


In [5]:
# Summarize the training frame: column dtypes, non-null counts and memory usage.
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 878049 entries, 0 to 878048
Data columns (total 9 columns):
 #   Column      Non-Null Count   Dtype  
---  ------      --------------   -----  
 0   Dates       878049 non-null  object 
 1   Category    878049 non-null  object 
 2   Descript    878049 non-null  object 
 3   DayOfWeek   878049 non-null  object 
 4   PdDistrict  878049 non-null  object 
 5   Resolution  878049 non-null  object 
 6   Address     878049 non-null  object 
 7   X           878049 non-null  float64
 8   Y           878049 non-null  float64
dtypes: float64(2), object(7)
memory usage: 60.3+ MB


In [6]:
# Summarize the test frame: column dtypes, non-null counts and memory usage.
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 884262 entries, 0 to 884261
Data columns (total 7 columns):
 #   Column      Non-Null Count   Dtype  
---  ------      --------------   -----  
 0   Id          884262 non-null  int64  
 1   Dates       884262 non-null  object 
 2   DayOfWeek   884262 non-null  object 
 3   PdDistrict  884262 non-null  object 
 4   Address     884262 non-null  object 
 5   X           884262 non-null  float64
 6   Y           884262 non-null  float64
dtypes: float64(2), int64(1), object(4)
memory usage: 47.2+ MB


In [7]:
# Preprocessing train data
#
# Steps, in order:
#   1. parse the "Dates" strings into timestamps,
#   2. derive Year / Month / Day / Hour from the parsed timestamps,
#   3. append one boolean indicator column per PdDistrict value,
#   4. drop the raw columns that are not used as features.
# DayOfWeek is not encoded here; it is dropped along with the other raw columns.

train = (
    train
    .assign(Dates=lambda frame: pd.to_datetime(frame["Dates"]))
    .assign(
        Year=lambda frame: frame["Dates"].dt.year,
        Month=lambda frame: frame["Dates"].dt.month,
        Day=lambda frame: frame["Dates"].dt.day,
        Hour=lambda frame: frame["Dates"].dt.hour,
    )
    .join(pd.get_dummies(train["PdDistrict"]))
    .drop(columns=["Dates", "Descript", "DayOfWeek", "Resolution", "PdDistrict", "Address"])
)

train.head()

Unnamed: 0,Category,X,Y,Year,Month,Day,Hour,BAYVIEW,CENTRAL,INGLESIDE,MISSION,NORTHERN,PARK,RICHMOND,SOUTHERN,TARAVAL,TENDERLOIN
0,WARRANTS,-122.425892,37.774599,2015,5,13,23,False,False,False,False,True,False,False,False,False,False
1,OTHER OFFENSES,-122.425892,37.774599,2015,5,13,23,False,False,False,False,True,False,False,False,False,False
2,OTHER OFFENSES,-122.424363,37.800414,2015,5,13,23,False,False,False,False,True,False,False,False,False,False
3,LARCENY/THEFT,-122.426995,37.800873,2015,5,13,23,False,False,False,False,True,False,False,False,False,False
4,LARCENY/THEFT,-122.438738,37.771541,2015,5,13,23,False,False,False,False,False,True,False,False,False,False


In [8]:
# Preprocessing test data
#
# Same feature derivation as for the training frame: calendar parts from
# "Dates" plus one boolean indicator column per PdDistrict value.
# DayOfWeek is not encoded here; it is dropped along with the other raw columns.

# Keep the row identifiers aside before "Id" is removed from the features;
# they are needed again when the submission file is assembled.
test_ids = test["Id"]

test = (
    test
    .assign(Dates=lambda frame: pd.to_datetime(frame["Dates"]))
    .assign(
        Year=lambda frame: frame["Dates"].dt.year,
        Month=lambda frame: frame["Dates"].dt.month,
        Day=lambda frame: frame["Dates"].dt.day,
        Hour=lambda frame: frame["Dates"].dt.hour,
    )
    .join(pd.get_dummies(test["PdDistrict"]))
    .drop(columns=["Dates", "DayOfWeek", "PdDistrict", "Address", "Id"])
)

test.head()

Unnamed: 0,X,Y,Year,Month,Day,Hour,BAYVIEW,CENTRAL,INGLESIDE,MISSION,NORTHERN,PARK,RICHMOND,SOUTHERN,TARAVAL,TENDERLOIN
0,-122.399588,37.735051,2015,5,10,23,True,False,False,False,False,False,False,False,False,False
1,-122.391523,37.732432,2015,5,10,23,True,False,False,False,False,False,False,False,False,False
2,-122.426002,37.792212,2015,5,10,23,False,False,False,False,True,False,False,False,False,False
3,-122.437394,37.721412,2015,5,10,23,False,False,True,False,False,False,False,False,False,False
4,-122.437394,37.721412,2015,5,10,23,False,False,True,False,False,False,False,False,False,False


In [9]:
# Alphabetically ordered list of the distinct crime categories in the training data.
crimes = sorted(train['Category'].unique())
crimes

['ARSON',
 'ASSAULT',
 'BAD CHECKS',
 'BRIBERY',
 'BURGLARY',
 'DISORDERLY CONDUCT',
 'DRIVING UNDER THE INFLUENCE',
 'DRUG/NARCOTIC',
 'DRUNKENNESS',
 'EMBEZZLEMENT',
 'EXTORTION',
 'FAMILY OFFENSES',
 'FORGERY/COUNTERFEITING',
 'FRAUD',
 'GAMBLING',
 'KIDNAPPING',
 'LARCENY/THEFT',
 'LIQUOR LAWS',
 'LOITERING',
 'MISSING PERSON',
 'NON-CRIMINAL',
 'OTHER OFFENSES',
 'PORNOGRAPHY/OBSCENE MAT',
 'PROSTITUTION',
 'RECOVERED VEHICLE',
 'ROBBERY',
 'RUNAWAY',
 'SECONDARY CODES',
 'SEX OFFENSES FORCIBLE',
 'SEX OFFENSES NON FORCIBLE',
 'STOLEN PROPERTY',
 'SUICIDE',
 'SUSPICIOUS OCC',
 'TREA',
 'TRESPASS',
 'VANDALISM',
 'VEHICLE THEFT',
 'WARRANTS',
 'WEAPON LAWS']

In [10]:
# Turn the crime category strings into integer class labels, then remove the
# target column so that `train` contains feature columns only.
encoder = LabelEncoder().fit(train['Category'])
y_train = encoder.transform(train['Category'])
train = train.drop(columns=['Category'])

In [11]:
# Fit a multi-class XGBoost classifier on the engineered features.
# The 'multi:softprob' objective is requested explicitly, with the number of
# classes taken from the list of crime categories.
model = xgb.XGBClassifier(
    num_class=len(crimes),
    objective='multi:softprob',
).fit(train, y_train)
model

In [12]:
# Predicted class probabilities for every test row: one row per test record,
# one column per encoded class label.
y_pred = model.predict_proba(test)

# Name the probability columns from the encoder that produced y_train rather
# than from a separately sorted list, so the column names always follow the
# actual label -> integer mapping the model was trained on. Reusing the test
# index keeps the rows aligned with the saved test Ids when they are joined.
y_pred_df = pd.DataFrame(y_pred, columns=encoder.classes_, index=test.index)

In [13]:
# Display the predicted probability table (one column per crime category).
y_pred_df

Unnamed: 0,ARSON,ASSAULT,BAD CHECKS,BRIBERY,BURGLARY,DISORDERLY CONDUCT,DRIVING UNDER THE INFLUENCE,DRUG/NARCOTIC,DRUNKENNESS,EMBEZZLEMENT,...,SEX OFFENSES NON FORCIBLE,STOLEN PROPERTY,SUICIDE,SUSPICIOUS OCC,TREA,TRESPASS,VANDALISM,VEHICLE THEFT,WARRANTS,WEAPON LAWS
0,0.002383,0.129521,0.000035,0.000992,0.017047,0.000728,0.011667,0.021012,0.001238,0.000486,...,0.000056,0.009938,0.000252,0.040093,1.166209e-06,0.005068,0.083409,0.177138,0.024937,0.023273
1,0.003396,0.114827,0.000020,0.000960,0.011013,0.000984,0.012535,0.051445,0.003316,0.000134,...,0.000476,0.004257,0.001181,0.029403,2.642079e-06,0.005113,0.036505,0.098875,0.060154,0.027591
2,0.003054,0.056395,0.000033,0.000042,0.059396,0.000531,0.000649,0.011233,0.001368,0.000186,...,0.000023,0.004300,0.000053,0.020601,3.175657e-07,0.010897,0.073073,0.115580,0.011298,0.003112
3,0.002000,0.098143,0.000008,0.013146,0.018041,0.002472,0.002873,0.021962,0.004637,0.000201,...,0.000088,0.008688,0.000162,0.036486,2.086823e-06,0.001817,0.057182,0.133497,0.025743,0.017622
4,0.002000,0.098143,0.000008,0.013146,0.018041,0.002472,0.002873,0.021962,0.004637,0.000201,...,0.000088,0.008688,0.000162,0.036486,2.086823e-06,0.001817,0.057182,0.133497,0.025743,0.017622
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
884257,0.001174,0.087948,0.003087,0.000090,0.017891,0.001571,0.011204,0.020146,0.014802,0.014402,...,0.002430,0.005002,0.000249,0.042355,3.946234e-07,0.009123,0.034548,0.026024,0.012058,0.004095
884258,0.000216,0.049221,0.003169,0.000006,0.047458,0.007683,0.001195,0.007957,0.001402,0.016006,...,0.000015,0.012167,0.000120,0.079253,4.388756e-07,0.004047,0.025067,0.017887,0.004532,0.002365
884259,0.000807,0.075621,0.003793,0.000370,0.009047,0.001391,0.000160,0.022397,0.001493,0.012314,...,0.001120,0.003244,0.008138,0.051865,5.876485e-07,0.003092,0.046542,0.031690,0.021499,0.001727
884260,0.004167,0.084608,0.001971,0.000294,0.030340,0.008495,0.000261,0.020658,0.001440,0.007058,...,0.001819,0.003105,0.000872,0.051279,9.269546e-07,0.008270,0.024362,0.043884,0.019409,0.013666


In [14]:
# Place the saved test Ids next to the per-category probabilities and write
# the combined table to submission.csv without the index column.
submission_parts = [test_ids, y_pred_df]
output = pd.concat(submission_parts, axis="columns")
output.to_csv("submission.csv", index=False)