In [None]:
#Optimization of the AUC score for P_compl, the predicted probability that a property violation ticket is paid
#We use logistic regression and an MLP classifier to train and predict
#We optimize the AUC score above 0.75 by trying different parameters, e.g. solver 'sag' and solver 'lbfgs'

#The data (not included here) can be obtained from the Detroit open data portal
#links: 
#https://data.detroitmi.gov/property-parcels/building-permits/xw2a-a7tf
#https://data.detroitmi.gov/property-parcels/trades-permits/635b-dsgv
#https://data.detroitmi.gov/government/improve-detroit-submitted-issues/fwz3-w3yn
#https://data.detroitmi.gov/public-safety/dpd-citizen-complaints-2016/kahe-efs3
#https://data.detroitmi.gov/property-parcels/parcel-map/fxkw-udwf


def Model1():
    """Tune a logistic regression on the ticket data and return test probabilities.

    Reads ``train.csv``, ``test.csv``, ``addresses.csv`` and ``latlons.csv``
    from the current working directory, joins latitude/longitude onto each
    ticket via its address, drops the listed outcome and free-text columns,
    min-max scales the remaining features and grid-searches the
    regularisation strength ``C`` with ROC AUC as the selection metric.

    The best cross-validated AUC is printed.

    Returns:
        numpy.ndarray: column 1 of ``predict_proba`` for every row of the
        joined test frame, produced by the grid-search winner refitted on
        all training rows (presumably the probability of ``compliance == 1``
        -- confirm against the label encoding in ``train.csv``).
    """
    import pandas as pd
    import numpy as np
    # sklearn.grid_search was removed from scikit-learn; model_selection is
    # the supported home of GridSearchCV.
    from sklearn.model_selection import GridSearchCV
    from sklearn.linear_model import LogisticRegression
    from sklearn.preprocessing import MinMaxScaler

    scaler = MinMaxScaler()

    df_test = pd.read_csv('test.csv')
    df_train0 = pd.read_csv('train.csv', encoding='Latin-1')
    df_address = pd.read_csv('addresses.csv')
    df_location = pd.read_csv('latlons.csv')

    # Only rows with a known compliance label can be used for training.
    train = df_train0[df_train0['compliance'].notna()].set_index('ticket_id')

    # address -> (lat, lon), then re-key the result by ticket so it can be
    # joined onto the ticket frames.
    df_address = (df_address.set_index('address')
                  .join(df_location.set_index('address'), how='left')
                  .set_index('ticket_id'))

    train_dat = train.join(df_address)
    test_dat = df_test.set_index('ticket_id').join(df_address)

    # Build the mask from the frame being filtered so the boolean index is
    # guaranteed to line up with it.
    train_dat = train_dat[train_dat['hearing_date'].notna()]

    # Columns describing the payment outcome; they are dropped from the
    # training frame only.
    drop_train_dat = ['payment_amount', 'payment_date', 'payment_status', 'balance_due', 'collection_status', 'compliance_detail']
    drop_less_relevant = ['agency_name', 'inspector_name', 'violator_name', 'violation_street_number',
                          'violation_street_name', 'violation_zip_code', 'mailing_address_str_number',
                          'mailing_address_str_name', 'city', 'state', 'zip_code',
                          'non_us_str_code', 'country', 'ticket_issued_date', 'hearing_date', 'violation_code',
                          'violation_description', 'disposition', 'grafitti_status']

    # Non-inplace drops return fresh frames, so later column assignment does
    # not write into a slice of another DataFrame.
    train_dat = train_dat.drop(drop_train_dat + drop_less_relevant, axis=1)
    test_dat = test_dat.drop(drop_less_relevant, axis=1)

    # Forward-fill missing coordinates from the previous row (same effect as
    # the old fillna(method='pad')), assigned back explicitly because chained
    # inplace fillna is deprecated and a no-op under pandas Copy-on-Write.
    # The trailing bfill only touches leading rows that have no predecessor,
    # which would otherwise reach the estimator as NaN.
    geo_cols = ['lat', 'lon']
    train_dat[geo_cols] = train_dat[geo_cols].ffill().bfill()
    test_dat[geo_cols] = test_dat[geo_cols].ffill().bfill()

    X_train = train_dat.drop('compliance', axis=1)
    y_train = train_dat['compliance']

    # Present the test features in exactly the training column order.
    X_test = test_dat[X_train.columns]

    X_train_Scl = scaler.fit_transform(X_train)
    X_test_Scl = scaler.transform(X_test)

    grid = {'C': np.power(10.0, np.arange(-10, 10)), 'solver': ['sag']}

    # NOTE(review): tol=10 is a very loose stopping tolerance and max_iter=40
    # is small; both are kept from the original -- confirm they are intended.
    lr = LogisticRegression(penalty='l2', max_iter=40, tol=10)

    GridROC = GridSearchCV(lr, grid, scoring='roc_auc')
    GridROC.fit(X_train_Scl, y_train)
    print(GridROC.best_score_)

    # Predict with the refitted grid-search winner; previously the search
    # result was discarded and the untuned model's probabilities returned.
    P_compl = GridROC.predict_proba(X_test_Scl)[:, 1]
    return P_compl

# Run the logistic-regression pipeline. The call is the last expression of
# the cell, so presumably the notebook displays the returned probability array.
Model1()


In [None]:
import pandas as pd
import numpy as np

from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import roc_auc_score


# MLP variant of the ticket-compliance model: load and join the CSVs, drop
# outcome/free-text columns, min-max scale the features, fit a baseline MLP
# and grid-search alternative hidden-layer sizes by ROC AUC.
scaler = MinMaxScaler()

df_test = pd.read_csv('test.csv')
df_train0 = pd.read_csv('train.csv', encoding='Latin-1')
df_address = pd.read_csv('addresses.csv')
df_location = pd.read_csv('latlons.csv')

# Only rows with a known compliance label can be used for training.
# set_index is applied non-inplace so the filtered slice is never mutated.
train = df_train0[df_train0['compliance'].notna()].set_index('ticket_id')

# address -> (lat, lon), then re-key the result by ticket so it can be joined
# onto the ticket frames.
df_location = df_location.set_index('address')
df_address = (df_address.set_index('address')
              .join(df_location, how='left')
              .set_index('ticket_id'))

train_dat = train.join(df_address)

df_test = df_test.set_index('ticket_id')

test_dat = df_test.join(df_address)

# Build the mask from the frame being filtered so the boolean index is
# guaranteed to line up with it.
train_dat = train_dat[train_dat['hearing_date'].notna()]

# Columns describing the payment outcome; dropped from the training frame only.
drop_train_dat = ['payment_amount', 'payment_date', 'payment_status', 'balance_due', 'collection_status', 'compliance_detail']
drop_less_relevant = ['agency_name', 'inspector_name', 'violator_name', 'violation_street_number',
                      'violation_street_name', 'violation_zip_code', 'mailing_address_str_number',
                      'mailing_address_str_name', 'city', 'state', 'zip_code',
                      'non_us_str_code', 'country', 'ticket_issued_date', 'hearing_date', 'violation_code',
                      'violation_description', 'disposition', 'grafitti_status']

# Non-inplace drops return fresh frames, so the column assignments below do
# not write into a slice of another DataFrame.
train_dat = train_dat.drop(drop_train_dat + drop_less_relevant, axis=1)
test_dat = test_dat.drop(drop_less_relevant, axis=1)

# Forward-fill missing coordinates from the previous row (same effect as the
# old fillna(method='pad')), assigned back explicitly because chained inplace
# fillna is deprecated and a no-op under pandas Copy-on-Write. The trailing
# bfill only touches leading rows that have no predecessor, which would
# otherwise reach the estimator as NaN.
geo_cols = ['lat', 'lon']
train_dat[geo_cols] = train_dat[geo_cols].ffill().bfill()
test_dat[geo_cols] = test_dat[geo_cols].ffill().bfill()

X_train = train_dat.drop('compliance', axis=1)
y_train = train_dat['compliance']

# Present the test features in exactly the training column order.
X_test = test_dat[X_train.columns]


X_train_Scl = scaler.fit_transform(X_train)
X_test_Scl = scaler.transform(X_test)


# Baseline network: two hidden layers of 10 units, L2 penalty alpha=5.
MLP = MLPClassifier(hidden_layer_sizes=[10, 10], alpha=5, random_state=0,
                    solver='lbfgs')

MLP.fit(X_train_Scl, y_train)
# NOTE(review): P_compl comes from the baseline [10, 10] network above, not
# from the grid-search winner below -- confirm that this is intended.
P_compl = MLP.predict_proba(X_test_Scl)[:, 1]

# Compare two wider first-layer sizes by cross-validated ROC AUC.
grid_values = {'hidden_layer_sizes': [[100, 10], [150, 10]]}
grid_AUC_scores = GridSearchCV(MLP, param_grid=grid_values, scoring='roc_auc')
grid_AUC_scores.fit(X_train_Scl, y_train)

print(grid_AUC_scores.best_score_)

