In [139]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn import preprocessing 
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import roc_auc_score
from sklearn.ensemble import RandomForestClassifier


def blight_model():
    
    df_test = pd.read_csv('test.csv', encoding="Latin-1", low_memory=False)
    np.random.seed(seed=0)
    df_train = pd.read_csv('train.csv',encoding="Latin-1",low_memory=False)
    
    missing_cols = set( df_train.columns ) - set( df_test.columns )
    for c in missing_cols:
        df_test[c] = 0
    
    
    df_test[['admin_fee','state_fee']] = df_test[['admin_fee','state_fee']].astype(int)
    df_test[['late_fee','discount_amount']] = df_test[['late_fee','discount_amount']].astype(int)
    df_test[['clean_up_cost','judgment_amount']] = df_test[['clean_up_cost','judgment_amount']].astype(int)
    df_test[['payment_amount','balance_due']] = df_test[['payment_amount','balance_due']].astype(int)
    df_test[df_test['violation_street_number'].isnull()] = 0
    df_test[df_test['fine_amount'].isnull()] = 0
    df_test[df_test['admin_fee'].isnull()] = 0
    df_test[df_test['state_fee'].isnull()] = 0
    df_test[df_test['late_fee'].isnull()] = 0
    df_test[df_test['discount_amount'].isnull()] = 0
    df_test[df_test['clean_up_cost'].isnull()] = 0
    df_test[df_test['judgment_amount'].isnull()] = 0
    df_test[df_test['payment_amount'].isnull()] = 0
    df_test[df_test['balance_due'].isnull()] = 0
    df_test[df_test['compliance'].isnull()] = 0 
    df_test[df_test['agency_name'].isnull()] = 0 

    
    df_train = df_train[(df_train['compliance'] == 1) | (df_train['compliance'] == 0)]
    df_train = df_train[df_train['ticket_id'] != 0]
    df_train[df_train['mailing_address_str_number'].isnull()] = 0
    df_train[df_train['violation_street_number'].isnull()] = 0
    df_train[df_train['fine_amount'].isnull()] = 0
    df_train[df_train['admin_fee'].isnull()] = 0
    df_train[df_train['state_fee'].isnull()] = 0
    df_train[df_train['late_fee'].isnull()] = 0
    df_train[df_train['discount_amount'].isnull()] = 0
    df_train[df_train['clean_up_cost'].isnull()] = 0
    df_train[df_train['judgment_amount'].isnull()] = 0
    df_train[df_train['payment_amount'].isnull()] = 0
    df_train[df_train['balance_due'].isnull()] = 0
    df_train[df_train['compliance'].isnull()] = 0 
    df_train[df_train['agency_name'].isnull()] = 0 
    df_train[['admin_fee','state_fee']] = df_train[['admin_fee','state_fee']].astype(int)
    df_train[['late_fee','discount_amount']] = df_train[['late_fee','discount_amount']].astype(int)
    df_train[['clean_up_cost','judgment_amount']] = df_train[['clean_up_cost','judgment_amount']].astype(int)
    df_train[['payment_amount','balance_due']] = df_train[['payment_amount','balance_due']].astype(int)
    
    df_train = df_train[df_train['ticket_id'] != 0]
    
    le = preprocessing.LabelEncoder()
    df_train.agency_name = le.fit_transform(df_train.agency_name)
    df_test.agency_name = le.fit_transform(df_test.agency_name)
    
    #df_train.disposition = le.fit_transform(df_train.disposition)
    #df_test.disposition = le.fit_transform(df_test.disposition)
    
    #df_train.country = le.fit_transform(df_train.country)
    #df_test.country = le.fit_transform(df_test.country)
    
    df_train.payment_status= le.fit_transform(df_train.payment_status)
    #df_train.collection_status= le.fit_transform(df_train.collection_status)
    df_train.compliance_detail= le.fit_transform(df_train.compliance_detail)
    df_train.compliance = le.fit_transform(df_train.compliance)
    
    #X = df_train[['agency_name','admin_fee','state_fee','compliance_detail','fine_amount','late_fee',
    #              'discount_amount','clean_up_cost','judgment_amount','payment_status','payment_amount']]
    X = df_train[['compliance_detail','fine_amount','late_fee','discount_amount',
                 'payment_status','payment_amount']]
    y = df_train['compliance']
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
    
    #X_test_df_test = df_test[['state_fee','compliance_detail','fine_amount','late_fee','payment_status','payment_amount']]
    
    #clf = RandomForestClassifier(max_features=8,random_state=0)
    clf = RandomForestClassifier()
    clf.fit(X_train, y_train)

    
    print('Accuracy of RF classifier on training set: {:.2f}'
         .format(clf.score(X_train, y_train)))
    print('Accuracy of RF classifier on test set: {:.2f}'
         .format(clf.score(X_test, y_test)))
    print('Feature importances: {}'.format(clf.feature_importances_))
    
    
    grid_values = {'max_features': [2,5],'random_state' : [0,1]}

    # default metric to optimize over grid parameters: accuracy
    grid_clf_acc = GridSearchCV(clf, param_grid = grid_values,scoring = 'roc_auc')
    grid_clf_acc.fit(X_train, y_train)
    y_decision_fn_scores_auc = grid_clf_acc.predict_proba(X_test) 
    
        
    print('Test set AUC: ', roc_auc_score(y_test, y_decision_fn_scores_auc[:,1]))
    print('Grid best parameter (max. accuracy): ', grid_clf_acc.best_params_)
    print('Grid best score (accuracy): ', grid_clf_acc.best_score_)

    return df_train

In [140]:
d = blight_model()

Accuracy of RF classifier on training set: 1.00
Accuracy of RF classifier on test set: 1.00
Feature importances: [ 0.00087275  0.69220888  0.00450633  0.04670177  0.02311793  0.00652656
  0.08708475  0.13898102]
('Test set AUC: ', 1.0)
('Grid best parameter (max. accuracy): ', {'max_features': 2, 'random_state': 0})
('Grid best score (accuracy): ', 1.0)
