In [42]:
import pandas as pd
import numpy as np
from sklearn import metrics
from sklearn import ensemble
import re
from sklearn.model_selection import StratifiedKFold

def cv_performance(clf, X, y, k=5):
    skf = StratifiedKFold(n_splits=k)
    curr_sum = 0
    for train, test in skf.split(X, y):
        clf.fit(X.iloc[train], y.iloc[train])
        y_pred = clf.predict_proba(X.iloc[test])[:,1]
        curr_sum = curr_sum + metrics.roc_auc_score(y.iloc[test], y_pred)
    return(curr_sum/k)


In [7]:
#Import Data
train_dat = pd.read_csv("train.csv", encoding = "ISO-8859-1", low_memory=False)
test_dat = pd.read_csv("test.csv")
train_dat_cl = train_dat.dropna(axis=0, subset=["compliance"])

In [20]:
#Generate Zip Payment Rate Feature
train_dat_cl['zip_payment_rate'] = np.zeros(train_dat_cl.shape[0])
test_dat['zip_payment_rate'] = np.zeros(test_dat.shape[0])
for zip_code in train_dat_cl.zip_code.unique():
    zip_dat = train_dat_cl[train_dat_cl["zip_code"] == zip_code]
    if zip_dat.shape[0] >= 4:
        comp_rate = sum(zip_dat["compliance"] == 1)/zip_dat.shape[0]
        if comp_rate <= 0.065:
            train_dat_cl.loc[train_dat_cl['zip_code'] == zip_code, 'zip_payment_rate'] = 1.
            test_dat.loc[test_dat['zip_code'] == zip_code, 'zip_payment_rate'] = 1.
        elif comp_rate > 0.065 and comp_rate <= 0.09:
            train_dat_cl.loc[train_dat_cl['zip_code'] == zip_code, 'zip_payment_rate'] = 2.
            test_dat.loc[test_dat['zip_code'] == zip_code, 'zip_payment_rate'] = 2.
        else:
            train_dat_cl.loc[train_dat_cl['zip_code'] == zip_code, 'zip_payment_rate'] = 3.
            test_dat.loc[test_dat['zip_code'] == zip_code, 'zip_payment_rate'] = 3.

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  from ipykernel import kernelapp as app
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s


In [10]:
#Generate is the code feature
train_dat_cl["is_that_code"] = np.zeros(train_dat_cl.shape[0])
train_dat_cl.loc[train_dat_cl["violation_code"] == "9-1-36(a)", "is_that_code"] = 1

test_dat["is_that_code"] = np.zeros(test_dat.shape[0])
test_dat.loc[test_dat["violation_code"] == "9-1-36(a)", "is_that_code"] = 1

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  from ipykernel import kernelapp as app
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s


In [None]:
train_dat_cl['ticket_issued_date'] = pd.to_datetime(train_dat_cl['ticket_issued_date'], format='%-m/%-d/%Y %-H:%M', errors='ignore')

In [46]:
####REDO WITH ORDINAL TIME SERIES CROSS VALIDATION -- SELECT RANDOM SUBSET FROM PRE_2010 and POST_2010#####
# jan2010 = pd.to_datetime('1/1/2010 0:00', format='%-m/%-d/%Y %-H:%M', errors='ignore')
# train_dat_pre2010 = train_dat_cl[train_dat_cl['ticket_issued_date'] < jan2010]
# train_dat_post2010 = train_dat_cl[train_dat_cl['ticket_issued_date'] >= jan2010]

train_dat_cl['housePayRate'] = 0
train_dat_cl['violatorPayRate'] = 0
train_dat_cl['housePaid'] = 0
train_dat_cl['houseTotal'] = 0
train_dat_cl['violatorPaid'] = 0
train_dat_cl['violatorTotal'] = 0
violatorPayRate = {}
housePayRate = {}
for i, row in train_dat_cl.iterrows():
    if row['violator_name'] in violatorPayRate:
        train_dat_cl.set_value(i,'violatorPayRate',violatorPayRate[row['violator_name']][0] / violatorPayRate[row['violator_name']][1])
        violatorPayRate[row['violator_name']][0] += row['compliance']
        violatorPayRate[row['violator_name']][1] += 1
    else:
        violatorPayRate[row['violator_name']] = [row['compliance'],1]
    if str(row['violation_street_number']) + str(row['violation_street_name']) in housePayRate:
        train_dat_cl.set_value(i,'housePayRate',housePayRate[str(row['violation_street_number']) + str(row['violation_street_name'])][0] / housePayRate[str(row['violation_street_number']) + str(row['violation_street_name'])][1])
        housePayRate[str(row['violation_street_number']) + str(row['violation_street_name'])][0] += row['compliance']
        housePayRate[str(row['violation_street_number']) + str(row['violation_street_name'])][1] += 1
    else:
        housePayRate[str(row['violation_street_number']) + str(row['violation_street_name'])] = [row['compliance'],1]
for i, row in train_dat_cl.iterrows():
    train_dat_cl.at[i,'violatorPaid'] = violatorPayRate[row['violator_name']][0]
    train_dat_cl.at[i,'housePaid'] = housePayRate[str(row['violation_street_number']) + str(row['violation_street_name'])][0] 
    train_dat_cl.at[i,'violatorTotal'] = violatorPayRate[row['violator_name']][1]
    train_dat_cl.at[i,'houseTotal'] = housePayRate[str(row['violation_street_number']) + str(row['violation_street_name'])][1]
train_dat_cl['housePayRate'] = train_dat_cl['housePaid'] / train_dat_cl['houseTotal']
train_dat_cl['violatorPayRate'] = train_dat_cl['violatorPaid'] / train_dat_cl['violatorTotal']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  from ipykernel import kernelapp as app
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  app.launch_new_instance()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: ht

In [47]:
test_dat['violatorPaid'] = 0
test_dat['violatorTotal'] = 1
test_dat['violatorPayRate'] = 0
for i, row in test_dat.iterrows():
    if row['violator_name'] in violatorPayRate:
        test_dat.at[i,'violatorPaid'] = violatorPayRate[row['violator_name']][0]
        test_dat.at[i,'violatorTotal'] = violatorPayRate[row['violator_name']][1]
test_dat['violatorPayRate'] = test_dat['violatorPaid'] / test_dat['violatorTotal']

In [48]:
#Create feature matrix
feat_matrix = train_dat_cl[['zip_payment_rate', 'is_that_code', 'judgment_amount', 'late_fee', 'violatorPayRate']]
feat_matrix_test = test_dat[['zip_payment_rate', 'is_that_code', 'judgment_amount', 'late_fee', 'violatorPayRate']]
clf = ensemble.RandomForestClassifier()
print(cv_performance(clf, feat_matrix, train_dat_cl["compliance"], k=5))

# clf.fit(feat_matrix, train_dat_cl["compliance"])
# y_pred = clf.predict_proba(feat_matrix_test)
# y_probs = y_pred[:,1]
# print(y_probs)

0.98366864957
