In [24]:
# We first import all necessary libraries
import pandas as pd
import numpy as np
#from pprint import pprint
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import LabelEncoder, StandardScaler, RobustScaler, MinMaxScaler, MaxAbsScaler, Normalizer
from sklearn.decomposition import PCA

#from sklearn.svm import SVC, LinearSVC
#from sklearn.neighbors import KNeighborsClassifier
#from sklearn.naive_bayes import GaussianNB
#from xgboost.sklearn import XGBClassifier
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis
from sklearn.ensemble import ExtraTreesClassifier, RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier,\
BaggingClassifier

from sklearn.grid_search import GridSearchCV
from sklearn.pipeline import make_pipeline
from sklearn.feature_selection import VarianceThreshold, SelectKBest, SelectFromModel, SelectPercentile, f_classif, chi2
from sklearn.cross_validation import train_test_split, StratifiedKFold, StratifiedShuffleSplit
from sklearn.metrics import log_loss#, f1_score, roc_auc_score, auc, make_scorer

# Make matplotlib show our plots inline
%matplotlib inline
# Set figure size
from pylab import rcParams
rcParams['figure.figsize'] = 12, 8

In [4]:
# Read the raw train and test sets from CSV files in the working directory
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")

# Report (rows, columns) of both sets as a quick sanity check
shape_report = "The dimensions of train and test sets are: {}, {}".format(train.shape, test.shape)
print(shape_report)

The dimensions of train and test sets are: (878049, 9), (884262, 7)


In [5]:
# Preview the head of each set to see which columns they have in common
train_preview = "The first 5 rows of train set: \n{}".format(train.head())
test_preview = "The first 5 rows of test set: \n{}".format(test.head())

print(train_preview)
print("\n")  # blank separator between the two previews
print(test_preview)

The first 5 rows of train set: 
                 Dates        Category                      Descript  \
0  2015-05-13 23:53:00        WARRANTS                WARRANT ARREST   
1  2015-05-13 23:53:00  OTHER OFFENSES      TRAFFIC VIOLATION ARREST   
2  2015-05-13 23:33:00  OTHER OFFENSES      TRAFFIC VIOLATION ARREST   
3  2015-05-13 23:30:00   LARCENY/THEFT  GRAND THEFT FROM LOCKED AUTO   
4  2015-05-13 23:30:00   LARCENY/THEFT  GRAND THEFT FROM LOCKED AUTO   

   DayOfWeek PdDistrict      Resolution                    Address  \
0  Wednesday   NORTHERN  ARREST, BOOKED         OAK ST / LAGUNA ST   
1  Wednesday   NORTHERN  ARREST, BOOKED         OAK ST / LAGUNA ST   
2  Wednesday   NORTHERN  ARREST, BOOKED  VANNESS AV / GREENWICH ST   
3  Wednesday   NORTHERN            NONE   1500 Block of LOMBARD ST   
4  Wednesday       PARK            NONE  100 Block of BRODERICK ST   

            X          Y  
0 -122.425892  37.774599  
1 -122.425892  37.774599  
2 -122.424363  37.800414  
3 -122

In [6]:
# 'Descript' and 'Resolution' are absent from the test preview above, and
# 'Address' is dropped from the test set further down, so none of the three
# is kept as a feature.
train = train.drop(['Descript', 'Resolution', 'Address'], axis=1)

new_shape_report = "The new dimension of the train set: {}".format(train.shape)
print(new_shape_report)

The new dimension of the train set: (878049, 6)


In [7]:
# Keep an independent copy of the label column, then remove it from the
# feature frame so that only predictors remain in `train`
target = train['Category'].copy()
train = train.drop(['Category'], axis=1)

In [8]:
# Keep the row identifiers for the submission file; 'Id' is not a feature and
# 'Address' was dropped from the train set as well
test_id = test['Id'].copy()
test = test.drop(['Id', 'Address'], axis=1)

In [9]:
# Define a function for preprocessing
def preprocess(df):
    """Turn a raw crime dataframe into a numeric, model-ready feature frame.

    The ``Dates`` column is parsed with ``pd.to_datetime`` and replaced by an
    ``hour`` column holding the hour of day. Every remaining categorical
    (string) column is then one-hot encoded with ``pd.get_dummies``.

    The input frame is NOT modified: all work happens on a new frame, so the
    function can safely be called again on the same raw data.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw data containing a ``Dates`` column that ``pd.to_datetime`` can parse.

    Returns
    -------
    pandas.DataFrame
        New frame without ``Dates``, with ``hour`` added and the categorical
        columns expanded into indicator columns.

    Raises
    ------
    KeyError
        If ``df`` has no ``Dates`` column.
    """
    # Extract the hour of day from the timestamp without touching `df`
    hour = pd.to_datetime(df['Dates']).dt.hour
    # drop() and assign() both return new frames, so the caller's data stays intact;
    # 'hour' is appended last, matching the previous column layout
    features = df.drop(['Dates'], axis=1).assign(hour=hour)
    df_new = pd.get_dummies(features)  # convert categorical variables to numerical
    print("Dataframe has been preprocessed")
    return df_new

In [10]:
# Build the numeric feature frames for both sets with the same function
train = preprocess(train)
test = preprocess(test)

# The two sets are one-hot encoded independently, so a category that occurs in
# only one of them would leave the frames with different columns. Align the
# test set to the training layout (missing indicators become 0, columns unseen
# in training are dropped) so the fitted model always receives its features in
# the order it was trained on. This is a no-op when the columns already match.
test = test.reindex(columns=train.columns, fill_value=0)

Dataframe has been preprocessed
Dataframe has been preprocessed


In [11]:
# Map the crime categories to integer class labels. The fitted encoder is kept
# in `le` so predictions can be translated back to category names later.
le = LabelEncoder()
le.fit(target)
y = le.transform(target)

In [12]:
# Hold out 20% of the labelled rows for evaluation. The split is stratified on
# the label and seeded so it is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    train,
    y,
    test_size=0.2,
    random_state=88,
    stratify=y
)

In [46]:
# ---------------------------------------------------------------------------
# Pipeline building blocks
# ---------------------------------------------------------------------------
# Zero-variance feature filter. Instantiated for experiments, but currently
# not part of the pipeline below.
selector_variance = VarianceThreshold()

# Scaler in use. Other scalers kept at hand for experiments:
#   StandardScaler(), MinMaxScaler(), MaxAbsScaler(), Normalizer()
scaler = RobustScaler()

# Model-based feature selector, also currently left out of the pipeline.
# Other selectors kept at hand: SelectKBest(), SelectPercentile()
selector = SelectFromModel(ExtraTreesClassifier(random_state=88))

# Final estimator. Notes recorded next to each candidate tried so far
# (the numbers are the log-loss values noted by the author):
#   GaussianNB()                       13.3707515324
#   LogisticRegression()                2.60101362415
#   LinearDiscriminantAnalysis()        2.61244429398   <- selected below
#   QuadraticDiscriminantAnalysis()     9.30360536618
#   AdaBoostClassifier()                3.58991845559
#   KNeighborsClassifier()             17.6703306353
#   GradientBoostingClassifier()        too long
#   SVC()                               too long
#   SGDClassifier()                     doesn't work
#   XGBClassifier(), RandomForestClassifier(), LinearSVC()   (no note recorded)
estimator = LinearDiscriminantAnalysis()

# ---------------------------------------------------------------------------
# Pipeline: scale, then classify.
# Optional steps that were switched off: selector_variance (before the scaler),
# PCA() and selector (between the scaler and the estimator).
# ---------------------------------------------------------------------------
pipe = make_pipeline(scaler, estimator)

# ---------------------------------------------------------------------------
# Hyper-parameter grid. Keys follow make_pipeline's '<lowercased class>__<param>'
# naming. Only the LDA entries are active.
# ---------------------------------------------------------------------------
params = dict(
    lineardiscriminantanalysis__solver=['svd', 'lsqr'],
    lineardiscriminantanalysis__store_covariance=[False, True],
    lineardiscriminantanalysis__tol=[1e-5, 1e-4, 1e-3]
)
# Inactive grid entries kept for reference when swapping pipeline steps:
#   'lineardiscriminantanalysis__solver': ['svd', 'lsqr', 'eigen']
#   'pca__n_components': [num_pca]
#   'selectkbest__k': [120, 130, 142]
#   'selectkbest__score_func': [f_classif]
#   'selectpercentile__score_func': [f_classif]
#   'selectpercentile__percentile': [10, 20, 30, 50, 60]
#   'selectfrommodel__threshold': ['mean', 'median']
#   'logisticregression__C': [1.5]
#   'logisticregression__max_iter': [150]
#   'logisticregression__multi_class': ['ovr']
#   'logisticregression__n_jobs': [-1]
#   'logisticregression__random_state': [88]
#   'logisticregression__solver': ['lbfgs']
#   'logisticregression__tol': [0.1]
#   'sgdclassifier__loss': ['log']
#   'adaboostclassifier__n_estimators': [4]
#   'adaboostclassifier__learning_rate': [0.02]
#   'adaboostclassifier__random_state': [88]
#   'randomforestclassifier__n_estimators': [10, 20]
#   'randomforestclassifier__max_features': ['auto', 'sqrt']
#   'randomforestclassifier__n_jobs': [-1]
#   'randomforestclassifier__random_state': [88]
#   'gradientboostingclassifier__random_state': [88]
#   'xgbclassifier__nthread': [4]
#   'xgbclassifier__objective': ['binary:logistic']
#   'xgbclassifier__learning_rate': [0.1]
#   'xgbclassifier__reg_lambda': [1]
#   'xgbclassifier__max_depth': [4]
#   'xgbclassifier__min_child_weight': [4]
#   'xgbclassifier__silent': [1]
#   'xgbclassifier__subsample': [1]
#   'xgbclassifier__colsample_bytree': [0.5]
#   'xgbclassifier__scale_pos_weight': [1]
#   'xgbclassifier__n_estimators': [100]
#   'xgbclassifier__seed': [88]

# ---------------------------------------------------------------------------
# Grid search, scored by log loss and run on all available cores.
# No cv iterator is passed, so GridSearchCV falls back to its default scheme.
# A stratified shuffle split was prepared as an alternative but is switched off:
#   sss = StratifiedShuffleSplit(y_train, n_iter=10, test_size=.3, random_state=88)
#   ... GridSearchCV(..., cv=sss, ...)
# ---------------------------------------------------------------------------
clf = GridSearchCV(
    pipe,
    param_grid=params,
    scoring='log_loss',
    n_jobs=-1,
    verbose=1
)

# Fit on the training part, then predict class probabilities for the hold-out part
clf.fit(X_train, y_train)
preds = clf.predict_proba(X_test)

# Hold-out performance of the best pipeline found by the search
score = log_loss(y_test, preds)
print("The log_loss is: {}".format(score))
print("The best estimator is:\n{}".format(clf.best_estimator_))

Fitting 3 folds for each of 12 candidates, totalling 36 fits
The log_loss is: 2.61244429398
The best estimator is:
Pipeline(steps=[('robustscaler', RobustScaler(copy=True, with_centering=True, with_scaling=True)), ('lineardiscriminantanalysis', LinearDiscriminantAnalysis(n_components=None, priors=None, shrinkage=None,
              solver='svd', store_covariance=False, tol=1e-05))])


[Parallel(n_jobs=-1)]: Done  36 out of  36 | elapsed:   43.8s finished


In [72]:
# Get the encoded class labels, in the column order used by predict_proba
classes = clf.best_estimator_.classes_
# Convert back to the original category names
classes = le.inverse_transform(classes)
# Store predictions in a dataframe, one probability column per category
preds_test = clf.predict_proba(test)
pred_df = pd.DataFrame(preds_test, index=test_id.values, columns=classes)
# Combine the ID with the predictions. The Id values are inserted by position
# rather than concatenated: concat aligns on index labels, and `test_id`
# carries row positions as its index while `pred_df` is indexed by Id values,
# so the rows would only line up when every Id happens to equal its row number.
out_df = pred_df.reset_index(drop=True)
out_df.insert(0, test_id.name, test_id.values)
# Name the file after the model that actually produced these predictions and
# its hold-out log loss, instead of a hard-coded label that goes stale as soon
# as a different estimator is selected above.
model_name = clf.best_estimator_.steps[-1][1].__class__.__name__
submission_path = "{}_{}.csv".format(score, model_name)
# Save the output in a CSV file for submission
out_df.to_csv(submission_path, index=False)