In [2]:
import pandas as pd
import numpy as np
from sklearn import metrics
from sklearn import ensemble
import re
from sklearn.model_selection import StratifiedKFold
pd.options.mode.chained_assignment = None  # default='warn'

def cv_performance(clf, X, y, k=5):
    """Return the mean ROC AUC of ``clf`` over ``k`` stratified folds.

    Parameters
    ----------
    clf : estimator with ``fit`` and ``predict_proba``; it is refit on every
        fold, so it is left fitted on the last training split.
    X : feature table supporting positional row selection via ``.iloc``.
    y : labels supporting positional selection via ``.iloc``.
    k : number of folds handed to ``StratifiedKFold`` (no shuffling requested).

    Returns
    -------
    The sum of the per-fold ROC AUC scores divided by ``k``. Scores are
    computed from column 1 of ``predict_proba``.
    """
    splitter = StratifiedKFold(n_splits=k)
    fold_aucs = []
    for fit_rows, eval_rows in splitter.split(X, y):
        clf.fit(X.iloc[fit_rows], y.iloc[fit_rows])
        positive_scores = clf.predict_proba(X.iloc[eval_rows])[:, 1]
        fold_aucs.append(metrics.roc_auc_score(y.iloc[eval_rows], positive_scores))
    return sum(fold_aucs) / k


In [3]:
#Import Data
# The training file is decoded as ISO-8859-1 and parsed in a single pass
# (low_memory=False); the test file is read with pandas defaults.
train_dat = pd.read_csv("train.csv", encoding = "ISO-8859-1", low_memory=False)
test_dat = pd.read_csv("test.csv")

# Keep only the training rows that carry a compliance label and renumber
# them 0..n-1 so positional and label-based row access coincide.
train_dat_cl = (
    train_dat
    .dropna(axis=0, subset=["compliance"])
    .reset_index(drop=True)
)

In [4]:
#Generate Zip Payment Rate Feature
# Each zip code seen in the labelled training data is bucketed by the share
# of its tickets with compliance == 1:
#   0.0 -> fewer than MIN_ZIP_TICKETS training tickets, missing zip code,
#          or (for test rows) a zip code not seen in training
#   1.0 -> rate <= LOW_RATE_MAX
#   2.0 -> LOW_RATE_MAX < rate <= MID_RATE_MAX
#   3.0 -> rate > MID_RATE_MAX
# The tiers come from training labels only and are then looked up for both
# frames, so the test frame never contributes to the statistics.
MIN_ZIP_TICKETS = 4
LOW_RATE_MAX = 0.065
MID_RATE_MAX = 0.09

# One pass over the training frame: ticket count and compliance rate per zip.
# (groupby drops missing zip codes, which therefore stay in tier 0.)
zip_stats = (
    train_dat_cl["compliance"].eq(1).astype(float)
    .groupby(train_dat_cl["zip_code"], sort=False)
    .agg(["count", "mean"])
)
zip_stats = zip_stats[zip_stats["count"] >= MIN_ZIP_TICKETS]

# np.select takes the first matching condition, so the order encodes the
# original if / elif / else chain.
zip_tier = pd.Series(
    np.select(
        [zip_stats["mean"] <= LOW_RATE_MAX, zip_stats["mean"] <= MID_RATE_MAX],
        [1., 2.],
        default=3.,
    ),
    index=zip_stats.index,
)

for frame in (train_dat_cl, test_dat):
    # Zip codes without a tier map to NaN and fall back to 0.
    frame['zip_payment_rate'] = frame['zip_code'].map(zip_tier).fillna(0.)

In [136]:
# Binary indicator features, built the same way on the training and test frames.
#   is_that_code           : 1.0 when violation_code is "9-1-36(a)", else 0.0
#   disposition_notdefault : 0.0 when disposition is "Responsible by Default", else 1.0
#   graffiti_code          : 0 when violation_code is "9-1-111", else 1
# Missing values never equal the target string, so they land in the "else" case.
for frame in (train_dat_cl, test_dat):
    violation = frame["violation_code"]
    frame["is_that_code"] = (violation == "9-1-36(a)").astype(float)
    frame["disposition_notdefault"] = (
        frame["disposition"] != "Responsible by Default"
    ).astype(float)
    frame["graffiti_code"] = (violation != "9-1-111").astype("int64")


In [6]:
# Parse the issue timestamps (month/day/year hour:minute text) into datetimes
# so they can be compared against date windows later on.
issued_raw = train_dat_cl['ticket_issued_date']
train_dat_cl['ticket_issued_date'] = pd.to_datetime(issued_raw, format='%m/%d/%Y %H:%M')

In [7]:
####REDO WITH ORDINAL TIME SERIES CROSS VALIDATION -- SELECT RANDOM SUBSET FROM PRE_2010 and POST_2010#####
# jan2010 = pd.to_datetime('1/1/2010 0:00', format='%-m/%-d/%Y %-H:%M', errors='ignore')
# train_dat_pre2010 = train_dat_cl[train_dat_cl['ticket_issued_date'] < jan2010]
# train_dat_post2010 = train_dat_cl[train_dat_cl['ticket_issued_date'] >= jan2010]

# Per-violator history features.
# For every training row, record how many EARLIER rows (in frame order) with
# the same violator_name had compliance == 1 ('violatorPaid') and how many did
# not ('violatorSkipped'). The current row's own label is added to the tallies
# only after its features are recorded, so a row never sees its own outcome.
# NOTE(review): this is only a "history" if the frame is in chronological
# order -- confirm against the data.
# NOTE(review): a missing violator_name is used as a dictionary key like any
# other value, so rows with a missing name may share a single tally -- confirm
# that this is intended.
#
# violatorPaid / violatorSkipped keep the final per-name totals; the test-set
# cell reads them afterwards.
violatorPaid = {}
violatorSkipped = {}
prior_paid = []
prior_skipped = []
for name, compliance in zip(train_dat_cl['violator_name'], train_dat_cl['compliance']):
    # Names not seen so far have no entry and therefore a history of 0.
    prior_paid.append(violatorPaid.get(name, 0))
    prior_skipped.append(violatorSkipped.get(name, 0))
    tally = violatorPaid if compliance == 1 else violatorSkipped
    tally[name] = tally.get(name, 0) + 1

# Assign whole columns at once instead of writing cell by cell with
# DataFrame.set_value, which was removed in pandas 1.0.
train_dat_cl['violatorPaid'] = prior_paid
train_dat_cl['violatorSkipped'] = prior_skipped

In [8]:
# Look up each test violator's totals accumulated over the training frame;
# names with no entry in a tally get 0.
test_names = test_dat['violator_name']
test_dat['violatorPaid'] = [violatorPaid.get(name, 0) for name in test_names]
test_dat['violatorSkipped'] = [violatorSkipped.get(name, 0) for name in test_names]

In [153]:
#Create feature matrix
# Three candidate feature sets are scored with the same 5-fold stratified CV.
# They differ only in the violation-code indicator (is_that_code vs
# graffiti_code) and the amount column (discount_amount vs judgment_amount).
candidate_feature_sets = [
    ['zip_payment_rate', 'is_that_code', 'discount_amount', 'late_fee', 'violatorPaid', 'violatorSkipped', 'disposition_notdefault'],
    ['zip_payment_rate', 'graffiti_code', 'discount_amount', 'late_fee', 'violatorPaid', 'violatorSkipped', 'disposition_notdefault'],
    ['zip_payment_rate', 'is_that_code', 'judgment_amount', 'late_fee', 'violatorPaid', 'violatorSkipped', 'disposition_notdefault'],
]

# A single classifier object is reused; cv_performance refits it on each fold.
clf = ensemble.RandomForestClassifier()

for feature_names in candidate_feature_sets:
    # feat_matrix / feat_matrix_test end up holding the last candidate set.
    feat_matrix = train_dat_cl[feature_names]
    feat_matrix_test = test_dat[feature_names]
    print(cv_performance(clf, feat_matrix, train_dat_cl["compliance"], k=5))

0.836137184467
0.834864497234
0.831580339867


In [94]:
def cv_windows(clf, X, y, dates, windows=10):
    """Return the mean ROC AUC over randomly placed train/test date windows.

    For each of ``windows`` iterations a training start day is drawn at
    random (with replacement) from 2004-03-15 .. 2008-12-30. The model is
    trained on rows dated within the following two years and scored on rows
    dated within the one-year span that starts the day after training ends.
    Both row sets are resampled with replacement to their own size before use.

    Parameters
    ----------
    clf : estimator with ``fit`` and ``predict_proba``; refit for every window.
    X : feature table supporting positional row selection via ``.iloc``.
    y : labels supporting positional selection via ``.iloc``.
    dates : datetimes, one per row of ``X`` in the same order, comparable
        element-wise with pandas timestamps.
    windows : number of random windows to evaluate.

    Returns
    -------
    The sum of the per-window ROC AUC scores divided by ``windows``.

    Notes
    -----
    Draws from NumPy's global random state, so results vary between runs
    unless that state is seeded by the caller.
    """
    rng = pd.date_range(start='2004-03-15', end='2008-12-30', freq='D')
    time_idxs = np.arange(len(rng))
    start_train = rng[np.random.choice(time_idxs, windows)]
    end_train = start_train + pd.DateOffset(years=2)
    start_test = end_train + pd.DateOffset(days=1)
    end_test = start_test + pd.DateOffset(years=1)
    curr_sum = 0
    for i in range(windows):
        # Positions (not labels) of the rows inside each window, resampled
        # with replacement to the same size.
        in_train = np.asarray((dates >= start_train[i]) & (dates <= end_train[i]))
        train_idx = np.flatnonzero(in_train)
        train_idx = np.random.choice(train_idx, train_idx.shape[0])
        in_test = np.asarray((dates >= start_test[i]) & (dates <= end_test[i]))
        test_idx = np.flatnonzero(in_test)
        test_idx = np.random.choice(test_idx, test_idx.shape[0])
        # The indices are positional, so use .iloc on both sides; the removed
        # .ix accessor would have treated them as labels on an integer index.
        clf.fit(X.iloc[train_idx], y.iloc[train_idx])
        y_pred = clf.predict_proba(X.iloc[test_idx])[:,1]
        curr_sum = curr_sum + metrics.roc_auc_score(y.iloc[test_idx], y_pred)
    return(curr_sum/windows)

In [156]:
#CROSS VALIDATION
# Score the chosen feature set with the date-window validation scheme.
feature_names = ['zip_payment_rate', 'discount_amount', 'graffiti_code', 'late_fee', 'violatorPaid', 'violatorSkipped', 'disposition_notdefault']
feat_matrix = train_dat_cl[feature_names]
feat_matrix_test = test_dat[feature_names]

clf = ensemble.RandomForestClassifier()
#Create dates vector
# Defined here so the cell runs on a fresh kernel; it was previously
# commented out and only worked through leftover kernel state.
dates = train_dat_cl['ticket_issued_date']

print(cv_windows(clf, feat_matrix, train_dat_cl["compliance"], dates, 100))

0.817355446637


In [166]:
#SUBMISSION
# Refit on the full labelled training set, then score the test rows.
labels = train_dat_cl["compliance"]
clf.fit(feat_matrix, labels)
y_pred = clf.predict_proba(feat_matrix_test)
y_probs = y_pred[:,1]  # column 1 of predict_proba

# Two-column file: ticket_id next to the predicted probability (the second
# column is unnamed, so its CSV header is "0").
ticket_ids = test_dat["ticket_id"]
prob_column = pd.Series(y_probs)
toWrite = pd.concat([ticket_ids, prob_column], axis=1)
toWrite.to_csv(path_or_buf="submission5.csv", index=False)

#SanityCheck
# Number of test rows above, then at or below, a probability of 0.5.
above_half = y_probs > 0.5
print(above_half.sum())
print((~above_half).sum())

1453
59548


In [159]:
# Preview the first rows of the training feature matrix rather than
# displaying the whole frame.
feat_matrix.head(10)

Unnamed: 0,zip_payment_rate,discount_amount,graffiti_code,late_fee,violatorPaid,violatorSkipped,disposition_notdefault
0,3.0,0,1,25.0,0,0,0.0
1,1.0,0,1,75.0,0,0,1.0
2,0.0,0,1,25.0,0,0,0.0
3,2.0,0,1,75.0,0,0,0.0
4,2.0,0,1,10.0,0,0,0.0
5,2.0,0,1,10.0,0,1,0.0
6,1.0,0,1,75.0,0,0,0.0
7,1.0,0,1,75.0,0,0,0.0
8,1.0,0,1,75.0,0,0,0.0
9,1.0,0,1,75.0,0,0,0.0
