In [1]:
import pandas as pd
import numpy as np
from datetime import datetime
from sklearn.cross_validation import StratifiedKFold
from sklearn.grid_search import GridSearchCV
from sklearn import preprocessing
from sklearn.linear_model import LogisticRegression

In [2]:
print "Reading in Training data...\n"
train_df=pd.read_csv("//Users/xiaoweichen/Kaggle/SFCrimeClassification/train.csv")

Reading in Training data...



In [3]:
def parse_time(original_date):
    new_date=datetime.strptime(original_date,"%Y-%m-%d %H:%M:%S")
    Hour=new_date.hour
    Month=new_date.month
    Year=new_date.year
    return Hour,Month,Year

In [4]:
def parse_data(df,addr_crime_prob,crimes):
    feature_list=df.columns.tolist()
    # Remove useless columns
    if "Descript" in feature_list:
        feature_list.remove("Descript")
    if "Resolution" in feature_list:
        feature_list.remove("Resolution")
    if "Id" in feature_list:
        feature_list.remove("Id")
    if "PdDistrict" in feature_list:
        feature_list.remove("PdDistrict")
    if "X" in feature_list:
        feature_list.remove("X")
    if "Y" in feature_list:
        feature_list.remove("Y")    
    
    cleanData=df[feature_list]
    cleanData.index=range(len(df))
    
    print "Performing Count featurization for address...\n"
    address_feature=cleanData["Address"].apply(lambda x:addr_crime_prob[x])
    address_feature.columns=[str(crime) + " PROBABILITY" for crime in crimes]
    
    print "Parsing date...\n"
    cleanData["Hour"],cleanData["Month"],cleanData["Year"]=zip(*cleanData["Dates"].apply(parse_time))
    
    print "Creating one-hot variables...\n"
    dummy_dayOfWeek=pd.get_dummies(cleanData["DayOfWeek"])
    cleanData["IsIntersection"]=cleanData["Address"].apply(lambda x: 1 if "/" in x else 0)
    
    # Dropping processed columns
    cleanData=cleanData.drop("Address",axis=1)
    cleanData=cleanData.drop("Dates",axis=1)
    cleanData=cleanData.drop("DayOfWeek",axis=1)
    if "Category" in cleanData.columns.tolist():
        cleanData=cleanData.drop("Category",axis=1)
    
    print "Joining one-hot features...\n"
    features=cleanData.join(dummy_dayOfWeek).join(address_feature)
    
    if "Category" in df.columns:
        response=df["Category"].astype("category")
    else:
        response=None
        
    print "Done!"
    return features,response

In [5]:
addresses=sorted(train_df["Address"].unique())
crimes=train_df["Category"].unique()
addr_counts=train_df.groupby(["Address"]).size()
crime_counts=train_df.groupby("Category").size()
addr_crime_counts=train_df.groupby(["Address","Category"]).size()
addr_crime_prob={}
for addr in addresses:
    addr_crime_prob[addr]=0.0*crime_counts
    for crime in crimes:
        if crime in addr_crime_counts[addr].keys():
            addr_crime_prob[addr][crime]=addr_crime_counts[addr][crime]/float(addr_counts[addr])
    addr_crime_prob[addr]=pd.Series(addr_crime_prob[addr])
    addr_crime_prob[addr].index=range(len(crimes))

In [6]:
features,response=parse_data(train_df,addr_crime_prob,crimes)

Performing Count featurization for address...

Parsing date...

Creating one-hot variables...


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy



Joining one-hot features...

Done!


In [7]:
# Perform feature standardization
column_names=features.columns.tolist()
scaler = preprocessing.StandardScaler()
scaler.fit(features)
features[column_names]=scaler.transform(features)

In [8]:
print  "Fitting a logistic regression model...\n"
lg=LogisticRegression(solver="sag",multi_class="ovr")
lg.fit(features,response)

Fitting a logistic regression model...



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='multinomial',
          penalty='l2', random_state=None, solver='lbfgs', tol=0.0001,
          verbose=0)

In [8]:
print "Reading in testing data set...\n"
test_df=pd.read_csv("//Users/xiaoweichen/Kaggle/SFCrimeClassification/test.csv")
print "Transform testing data set...\n"
addresses=sorted(train_df["Address"].unique())
testing_addresses=sorted(test_df["Address"].unique())
new_addresses=set(testing_addresses+addresses)-set(addresses)
default_crime_prob=crime_counts/len(train_df)
for addr in new_addresses:
    addr_crime_prob[addr]=default_crime_prob
    addr_crime_prob[addr].index=range(len(crimes))
test_feature, _=parse_data(test_df,addr_crime_prob,crimes)

Reading in testing data set...

Transform testing data set...

Performing Count featurization for address...

Parsing date...

Creating one-hot variables...

Joining one-hot features...

Done!


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [9]:
# Perform feature standardization
column_names=test_feature.columns.tolist()
scaler = preprocessing.StandardScaler()
scaler.fit(test_feature)
test_feature[column_names]=scaler.transform(test_feature)

In [10]:
print "Making predictions...\n"
pred=lg.predict_proba(test_feature)

output=pd.DataFrame(pred)
output.columns=lg.classes_
output=output.join(test_df["Id"])

Making predictions...



NameError: name 'lg' is not defined

In [None]:
print "Exporting predictions to Excel...\n"
output.to_csv("python_lg_countFeature.csv",index=False)
print "Done!"