In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier

In [1]:
def get_feature_list():
    features = ['ticket_id', 
                #'agency_name',
                #'inspector_name',
                #'violator_name',
                'lat', 
                'lon',
                #'violation_street_number', 
                #'violation_street_name', 
                #'violation_zip_code',
                #'mailing_address_str_number', 
                #'mailing_address_str_name', 
                #'city', 
                #'state', 
                #'zip_code', 
                #'non_us_str_code', 
                #'country',
                #'ticket_issued_date',
                #'hearing_date',
                'violation_code', 
                #'violation_description',
                #'disposition',
                'fine_amount',
                'admin_fee',
                'state_fee',
                'late_fee',
                'discount_amount',
                'clean_up_cost',
                'judgment_amount',
                'grafitti_status']
    return features

In [3]:
def get_df(theSourceData):
    df = pd.read_csv(theSourceData, engine='python')
    addresses = pd.read_csv('readonly/addresses.csv')
    latlons = pd.read_csv('readonly/latlons.csv')

    new_df = df.merge(right=addresses, how='left', on='ticket_id').merge(right=latlons, how='left', on='address')
    new_df['grafitti_status'] = new_df.loc[new_df['grafitti_status'].notnull()]['grafitti_status'] = 1.0
    new_df['grafitti_status'] = new_df.loc[new_df['grafitti_status'].isnull()]['grafitti_status'] = 0.0
    new_df['lat'] = new_df.loc[new_df['lat'].isnull()]['lat'] = 0.0
    new_df['lon'] = new_df.loc[new_df['lon'].isnull()]['lon'] = 0.0
    new_df['fine_amount'] = new_df.loc[new_df['fine_amount'].isnull()]['fine_amount'] = 0.0
    new_df['ticket_issued_date'] = pd.to_datetime(new_df['ticket_issued_date'], format='%Y-%m-%d %H:%M:%S')
    new_df['hearing_date'] = pd.to_datetime(new_df['hearing_date'], format='%Y-%m-%d %H:%M:%S')
    new_df['violation_code'] = new_df.loc[new_df['violation_code'].isnull()]['violation_code'] = 'NaN'
    new_df['violation_code'] = LabelEncoder().fit_transform(new_df['violation_code'])
    new_df['non_us_str_code'] = new_df.loc[new_df['non_us_str_code'].isnull()]['non_us_str_code'] = 'NaN'
    new_df['non_us_str_code'] = LabelEncoder().fit_transform(new_df['non_us_str_code']) 
    
    return new_df

In [4]:
def blight_model():
    # Your code here
    
    train_df = get_df('readonly/train.csv')
    train_df.dropna(axis=0,subset=['compliance'], inplace=True)
    train_df = train_df.astype(dtype={'compliance':'int64'})
    test_df = get_df('readonly/test.csv')
    test_df = get_df('readonly/test.csv')
    X = train_df[get_feature_list()]
    y = train_df['compliance']
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
    dt = DecisionTreeClassifier(max_depth=3).fit(X_train, y_train)
    X_realTest = test_df[get_feature_list()]
    y_proba_tree = dt.predict_proba(X_realTest)[:,1]
    
    result = pd.Series(index=test_df['ticket_id'], data=y_proba_tree, name='compliance')
    
    return result # Your answer here

In [5]:
blight_model()

ticket_id
284932    0.177546
285362    0.030849
285361    0.177546
285338    0.177546
285346    0.177546
            ...   
376496    0.030849
376497    0.030849
376499    0.177546
376500    0.177546
369851    0.302650
Name: compliance, Length: 61001, dtype: float64