In [2]:
import pandas as pd
import numpy as np
from datetime import datetime
from sklearn.cross_validation import StratifiedKFold
from sklearn.grid_search import GridSearchCV
from sklearn import preprocessing
from sklearn.ensemble import GradientBoostingClassifier

In [3]:
print "Reading in Training data...\n"
train_df=pd.read_csv("//Users/xiaoweichen/Kaggle/SFCrimeClassification/train.csv")

Reading in Training data...



In [4]:
def parse_time(original_date):
    new_date=datetime.strptime(original_date,"%Y-%m-%d %H:%M:%S")
    Hour=new_date.hour
    Month=new_date.month
    Year=new_date.year
    return Hour,Month,Year

In [5]:
def parse_data(df,addr_crime_prob,crimes):
    feature_list=df.columns.tolist()
    # Remove useless columns
    if "Descript" in feature_list:
        feature_list.remove("Descript")
    if "Resolution" in feature_list:
        feature_list.remove("Resolution")
    if "Id" in feature_list:
        feature_list.remove("Id")
    if "PdDistrict" in feature_list:
        feature_list.remove("PdDistrict")
    if "X" in feature_list:
        feature_list.remove("X")
    if "Y" in feature_list:
        feature_list.remove("Y")    
    
    cleanData=df[feature_list]
    cleanData.index=range(len(df))
    
    print "Performing Count featurization for address...\n"
    address_feature=cleanData["Address"].apply(lambda x:addr_crime_prob[x])
    address_feature.columns=[str(crime) + " PROBABILITY" for crime in crimes]
    
    print "Parsing date...\n"
    cleanData["Hour"],cleanData["Month"],cleanData["Year"]=zip(*cleanData["Dates"].apply(parse_time))
    
    print "Creating one-hot variables...\n"
    dummy_dayOfWeek=pd.get_dummies(cleanData["DayOfWeek"])
    cleanData["IsIntersection"]=cleanData["Address"].apply(lambda x: 1 if "/" in x else 0)
    
    # Dropping processed columns
    cleanData=cleanData.drop("Address",axis=1)
    cleanData=cleanData.drop("Dates",axis=1)
    cleanData=cleanData.drop("DayOfWeek",axis=1)
    if "Category" in cleanData.columns.tolist():
        cleanData=cleanData.drop("Category",axis=1)
    
    print "Joining one-hot features...\n"
    features=cleanData.join(dummy_dayOfWeek).join(address_feature)
    
    if "Category" in df.columns:
        response=df["Category"].astype("category")
    else:
        response=None
        
    print "Done!"
    return features,response

In [6]:
addresses=sorted(train_df["Address"].unique())
crimes=train_df["Category"].unique()
addr_counts=train_df.groupby(["Address"]).size()
crime_counts=train_df.groupby("Category").size()
addr_crime_counts=train_df.groupby(["Address","Category"]).size()
addr_crime_prob={}
for addr in addresses:
    addr_crime_prob[addr]=0.0*crime_counts
    for crime in crimes:
        if crime in addr_crime_counts[addr].keys():
            addr_crime_prob[addr][crime]=addr_crime_counts[addr][crime]/float(addr_counts[addr])
    addr_crime_prob[addr]=pd.Series(addr_crime_prob[addr])
    addr_crime_prob[addr].index=range(len(crimes))

In [7]:
features,response=parse_data(train_df,addr_crime_prob,crimes)

Performing Count featurization for address...

Parsing date...

Creating one-hot variables...


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy



Joining one-hot features...

Done!


In [8]:
print  "Fitting a gradient boosting classifer...\n"
gbc=GradientBoostingClassifier(learning_rate=0.2,n_estimators=5)
gbc.fit(features,response)

Fitting a gradient boosting classifer...



GradientBoostingClassifier(init=None, learning_rate=0.2, loss='deviance',
              max_depth=3, max_features=None, max_leaf_nodes=None,
              min_samples_leaf=1, min_samples_split=2,
              min_weight_fraction_leaf=0.0, n_estimators=5,
              random_state=None, subsample=1.0, verbose=0,
              warm_start=False)

In [None]:
print "Reading in testing data set...\n"
test_df=pd.read_csv("//Users/xiaoweichen/Kaggle/SFCrimeClassification/test.csv")
print "Transform testing data set...\n"
addresses=sorted(train_df["Address"].unique())
testing_addresses=sorted(test_df["Address"].unique())
new_addresses=set(testing_addresses+addresses)-set(addresses)
default_crime_prob=crime_counts/len(train_df)
for addr in new_addresses:
    addr_crime_prob[addr]=default_crime_prob
    addr_crime_prob[addr].index=range(len(crimes))
test_feature, _=parse_data(test_df,addr_crime_prob,crimes)

In [None]:
print "Making predictions...\n"
pred=gbc.predict_proba(test_feature)

output=pd.DataFrame(pred)
output.columns=gbc.classes_
output=output.join(test_df["Id"])

In [None]:
print "Exporting predictions to Excel...\n"
output.to_csv("python_gbc_countFeature.csv",index=False)
print "Done!"