# In [None]:
#This assignment was for my Machine Learning class on Coursera. The problem was coming up with probabilities that tickets for
#blight violations in Detroit would be paid. At first I spent a lot of effort coming up with ways to get usable features
#from different kinds of information, but then I found out that using just a few features would be enough.

import pandas as pd
import numpy as np

import string
import re
from sklearn.svm import SVC
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from sklearn.linear_model import SGDClassifier

# Lower-cased dispositions whose training rows are excluded before fitting
# (judgment was "not responsible", is still pending, or the fine was waived).
_EXCLUDED_TRAIN_DISPOSITIONS = (
    'not responsible by dismissal',
    'not responsible by city dismissal',
    'pending judgment',
    'not responsible by determination',
    'set-aside (pending judgment)',
    'responsible (fine waived) by deter',
)

# Ordered (old, new) literal substring rewrites that map test-only disposition
# spellings onto the spellings kept in the training data.
_TEST_DISPOSITION_FIXES = (
    ('responsible - compl/adj by default', 'responsible by default'),
    ('responsible by dismissal', 'responsible by default'),
    ('responsible - compl/adj by determi', 'responsible by determination'),
)


def _lowercase_disposition(frame):
    """Return the frame's ``disposition`` column as lower-cased strings."""
    return frame['disposition'].astype(str).str.lower()


def _logistic_loss_name():
    """Return the logistic-loss identifier accepted by the installed SGDClassifier.

    Newer scikit-learn releases call it ``'log_loss'``; older ones only know
    ``'log'``.  The class-level ``loss_functions`` mapping, when present, is
    used to tell the two apart.
    """
    known = getattr(SGDClassifier, 'loss_functions', None)
    if known is not None and 'log_loss' not in known and 'log' in known:
        return 'log'
    return 'log_loss'


def _training_features(train):
    """Build the feature frame ``X`` and target ``y`` from the raw training data."""
    train = train[['discount_amount', 'compliance', 'disposition']].copy()
    train['disposition'] = _lowercase_disposition(train)
    train = train[~train['disposition'].isin(_EXCLUDED_TRAIN_DISPOSITIONS)]
    # A missing target cannot be fitted; drop any such rows that survive the
    # disposition filter instead of letting the classifier raise.
    train = train.dropna(subset=['compliance'])

    dummies = pd.get_dummies(train[['disposition']], prefix=['disposition'],
                             drop_first=True, dtype=float)
    features = pd.concat([train[['discount_amount']], dummies], axis=1)
    return features, train['compliance']


def _test_features(test, feature_columns):
    """Build test features with exactly the columns used for training."""
    test = test[['discount_amount', 'disposition']].copy()
    disposition = _lowercase_disposition(test)
    for old, new in _TEST_DISPOSITION_FIXES:
        # regex=False: these are literal substrings, independent of the
        # pandas version's default for the ``regex`` argument.
        disposition = disposition.str.replace(old, new, regex=False)
    test['disposition'] = disposition

    dummies = pd.get_dummies(test[['disposition']], prefix=['disposition'],
                             dtype=float)
    features = pd.concat([test[['discount_amount']], dummies], axis=1)
    # Align to the training layout: categories never seen in training are
    # dropped (the row falls back to the baseline category) and training
    # categories absent from the test data become all-zero columns.
    return features.reindex(columns=feature_columns, fill_value=0.0)


def blight_model(train_path='train.csv', test_path='test.csv', random_state=0):
    """Estimate, for every test ticket, the probability that it will be paid.

    A logistic-loss ``SGDClassifier`` is fitted on two kinds of features from
    the training file: ``discount_amount`` and one-hot encoded ``disposition``.
    Test tickets whose disposition contains ``'Fine Waived'`` are assigned a
    probability of 1 directly and are not passed through the classifier.

    Parameters
    ----------
    train_path : str
        CSV with at least ``ticket_id``, ``discount_amount``, ``disposition``
        and ``compliance`` columns.
    test_path : str
        CSV with at least ``ticket_id``, ``discount_amount`` and
        ``disposition`` columns.
    random_state : int or None
        Seed forwarded to ``SGDClassifier`` so repeated runs give the same
        probabilities; pass ``None`` for unseeded behaviour.

    Returns
    -------
    pandas.Series
        Float probabilities indexed by ``ticket_id`` in the row order of the
        test file.
    """
    train = pd.read_csv(train_path, encoding='ISO-8859-1').set_index('ticket_id')
    test = pd.read_csv(test_path, encoding='ISO-8859-1').set_index('ticket_id')

    # astype(str) turns missing dispositions into 'nan' so the mask is
    # strictly boolean and safe to index with.
    waived = (test['disposition'].astype(str)
              .str.contains('Fine Waived', regex=False).values)
    to_score = ~waived

    X, y = _training_features(train)
    scaler = MinMaxScaler()
    X_scaled = scaler.fit_transform(X)

    clf = SGDClassifier(loss=_logistic_loss_name(), random_state=random_state)
    clf.fit(X_scaled, y)

    # Start from 1.0 everywhere (the value used for fine-waived tickets) and
    # overwrite the rows that are actually scored by the model.
    probabilities = np.ones(len(test), dtype=float)
    if to_score.any():
        test_X = _test_features(test[to_score], X.columns)
        # Reuse the scaler fitted on the training data; refitting on the test
        # data would put the two feature sets on different scales.
        test_scaled = scaler.transform(test_X)
        probabilities[to_score] = clf.predict_proba(test_scaled)[:, 1]

    return pd.Series(probabilities, index=test.index)

# Run the pipeline once at module level (reads train.csv and test.csv from the
# working directory); the returned Series is not stored.
blight_model()



#things I tried and didn't use:

#dfTrain.info()
#dfTrain.describe()
#dfTrain.describe(include=['O'])

#Ticket ID to Address, Address to Longitude/Latitude
# TIDAddr = pd.read_csv('addresses.csv')
# AddrLL = pd.read_csv('latlons.csv')

# trainAddrs = dfTrain['ticket_id'].map(pd.Series(TIDAddr['address'], index=TIDAddr['ticket_id']))
# dfTrain['lat'] = trainAddrs.map(pd.Series(AddrLL['lat'].values, index=AddrLL['address']))
# dfTrain['lon'] = trainAddrs.map(pd.Series(AddrLL['lon'].values, index=AddrLL['address']))

#display(dfTrain.head())
#display(dfTest.head())

#tried looking at periods between ticket issued date and hearing date
# dfIH = dfTrain[['ticket_issued_date', 'hearing_date']]
# dfIH = dfIH.applymap(lambda a : np.nan if pd.isna(a) else datetime.strptime(a, '%Y-%m-%d %H:%M:%S'))
# (lambda a : a['hearing_date'] - a['ticket_issued_date'])(dfIH.dropna())
#noticed some values are negative, decided not to use dates

#removing punctuation from street names
# makeT = ''.maketrans("", "", string.punctuation)
# strNames = dfTrain[['v_str_name', 'm_str_name']]
# strNames = strNames.apply(lambda a : a.str.translate(makeT))
#removing N, S, E, W from beginning of street names
# strNames = strNames.applymap(lambda a : re.sub('^n\s|^s\s|^e\s|^w\s', '', str(a), 1))
#adding column for whether name of violator street address matches name of mailing street address
# dfTrain[['v_str_name', 'm_str_name']] = strNames
# dfTrain['match_v_mail'] = dfTrain['v_str_name'].eq(dfTrain['m_str_name']).astype(int)

#adding column for total number of properties cited for each violator
#dfTrain['v_num_props'] = dfTrain['v_name'].map(dfTrain['v_name'].value_counts().to_dict())

#fill missing state data with state missing
# dfTrain['state'].replace('nan', np.NaN).fillna('state missing', inplace=True)
#fill missing latitude and longitude data with median
# dfTrain['lat'].fillna(dfTrain['lat'].median(), inplace=True)
# dfTrain['lon'].fillna(dfTrain['lon'].median(), inplace=True)

#drop two agency_name outliers
# ANNeighbor = dfTrain[dfTrain['agency_name'].str.contains('neighborhood city halls')].index.values
# dfTrain.drop(ANNeighbor, inplace=True)

#train-test split
#train_x, test_x, train_y, test_y = train_test_split(XMM, y, random_state = 0)

# clf = SVC(kernel='linear', probability=True)
# clf.fit(train_x, train_y)
# pr = clf.predict(test_x)
# prProb = clf.predict_proba(test_x)
# prProba = prProb[:,1]
# print(roc_auc_score(test_y, prProba))

# TIDAddr = pd.read_csv('addresses.csv')
# AddrLL = pd.read_csv('latlons.csv')
# testAddrs = dfTest['ticket_id'].map(pd.Series(TIDAddr['address'], index=TIDAddr['ticket_id']))
# dfTest['lat'] = testAddrs.map(pd.Series(AddrLL['lat'].values, index=AddrLL['address']))
# dfTest['lon'] = testAddrs.map(pd.Series(AddrLL['lon'].values, index=AddrLL['address']))

# dfTest['v_num_props'] = dfTest['v_name'].map(dfTest['v_name'].value_counts().to_dict())

# dfTest['state'].replace('nan', np.NaN).fillna('state missing', inplace=True)

# dfTest['lat'].fillna(dfTest['lat'].median(), inplace=True)
# dfTest['lon'].fillna(dfTest['lon'].median(), inplace=True)

#add some columns for missing categorical values
# X, dfTest = X.align(dfTest, join='left', axis=1, fill_value=0)

# from sklearn.kernel_approximation import Nystroem

# fmn = Nystroem()
# XMMTr = fmn.fit_transform(XMM)
# clf = SVC(kernel='linear', probability=True)
# clf.fit(XMMTr, y)