# **Assignment 4 – Machine Learning with Python course from the University of Michigan**

In [5]:
import pandas as pd
import numpy as np
import time

#import packages

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import roc_auc_score

def blight_model():
    """Train a random-forest compliance model and score the test tickets.

    Loads the train/test ticket tables and the address / lat-lon lookup
    tables from ``../input/unimich-ds-assignment4/``, joins the coordinates
    onto each ticket, grid-searches a ``RandomForestClassifier`` using ROC AUC
    as the selection metric, and prints the best parameters, the per-fold
    scores of the best candidate and the elapsed wall-clock time.

    Returns:
        pandas.DataFrame: a single column (``0``) holding the second column
        of ``predict_proba`` (presumably the probability of compliance == 1;
        this assumes the label is 0/1), indexed by ``ticket_id`` of the
        test rows that were scored.
    """
    print("\n*** INITIALIZATING TRAINING MODEL ***\n")

    start_time = time.time()

    # Load the data
    ptrain_df = pd.read_csv("../input/unimich-ds-assignment4/train.csv", encoding='ISO-8859-1')
    ptest_df = pd.read_csv("../input/unimich-ds-assignment4/test.csv", encoding='ISO-8859-1')
    addresses_df = pd.read_csv("../input/unimich-ds-assignment4/addresses.csv")
    latlons_df = pd.read_csv("../input/unimich-ds-assignment4/latlons.csv")

    # Build the ticket_id -> (address, lat, lon) lookup once and reuse it.
    geo_df = pd.merge(addresses_df, latlons_df, on="address")

    train_df = pd.merge(ptrain_df, geo_df, on="ticket_id")
    # Left join: every test ticket must receive a prediction even when its
    # coordinates are unknown (missing lat/lon are zero-filled below).
    test_df = pd.merge(ptest_df, geo_df, on="ticket_id", how="left")

    # Take the ids from the merged frame so the returned index is guaranteed
    # to line up row-for-row with the scored feature matrix.
    test_ticket_ids = test_df["ticket_id"]

    features = ["fine_amount", "discount_amount", "judgment_amount", "lat", "lon"]
    target = "compliance"

    # Clean: rows without a compliance label cannot be used for training;
    # remaining gaps are zero-filled.
    train_df = train_df[features + [target]]
    train_df = train_df[train_df[target].notnull()].fillna(0)

    X_train = train_df[features]
    y_train = train_df[target]
    X_test = test_df[features].fillna(0)

    # Begin the training
    model = RandomForestClassifier(random_state=42)

    # n_estimators and max_depth values we want to try.
    grid = {"n_estimators": [10, 30, 50, 100, 150], "max_depth": [None, 10, 30]}
    grid_models = GridSearchCV(model, param_grid=grid, scoring="roc_auc")
    grid_models.fit(X_train, y_train)

    print("Best Results: ", grid_models.best_params_)

    # Report the fold scores of the *best* candidate (not candidate 0), for
    # however many folds the search actually used.
    best_index = grid_models.best_index_
    for fold in range(grid_models.n_splits_):
        fold_scores = grid_models.cv_results_['split%d_test_score' % fold]
        print('Fold-%d: ' % (fold + 1), fold_scores[best_index])

    y_scores = grid_models.best_estimator_.predict_proba(X_test)

    # calculate time
    print("\nTime used: %s seconds" % (time.time() - start_time))

    return pd.DataFrame(y_scores[:, 1], index=test_ticket_ids)

## **Run the model: we expect a score of 0.7+ to pass the assignment**

In [None]:
# Train the model and return the predicted probabilities for the test
# tickets; as the last expression of the cell, the result is displayed.
blight_model()