In [5]:
import pandas as pd
import datetime
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
from sklearn.model_selection import GridSearchCV

## Read in the final version of the data that has the weather and census data merged in already and do some further processing

In [6]:
# read in the data

modeldata_df = pd.read_csv("../static/dataignore/modeldata.csv")


In [7]:
# assign to a dataframe

data_final_df=modeldata_df

In [8]:
# look at the available columns

print(data_final_df.columns.values)
data_final_df.head()

['case number' 'sr location' 'county' 'district' 'neighborhood' 'tax id'
 'trash quad' 'recycle quad' 'trash day' 'heavy trash day' 'recycle day'
 'key map' 'management district' 'department' 'division' 'sr type' 'queue'
 'sla' 'status' 'sr create date' 'due date' 'date closed' 'overdue'
 'title' 'latitude' 'longitude' 'channel type' 'date' 'year' 'month'
 'zipcode' 'daystoclose' 'daysdue' 'missedduedate' 'date_field' 'tempMax'
 'tempAvg' 'tempMin' 'precipitation' 'Population' 'Median Age'
 'Household Income' 'Per Capita Income' 'Poverty Rate' 'Total Households'
 'Total Owner Occupied' '% Owner Occupied']


Unnamed: 0,case number,sr location,county,district,neighborhood,tax id,trash quad,recycle quad,trash day,heavy trash day,...,tempMin,precipitation,Population,Median Age,Household Income,Per Capita Income,Poverty Rate,Total Households,Total Owner Occupied,% Owner Occupied
0,101002444726,Intersection 3900 S GESSNER RD&10000 WESTPARK DR,Harris County,F,MID WEST,,,,,,...,64.0,0.3,38931.0,31.9,44957.0,32805.0,22.824998,38751.0,9667.0,24.946453
1,12091836-101002444730,"3303 SAGE, HOUSTON TX 77056",HARRIS,G,GREATER UPTOWN,451400060009.0,,,,,...,64.0,0.3,21732.0,39.1,107003.0,89180.0,5.268728,21641.0,10643.0,49.179798
2,101002444733,Intersection 1400 CAROLINE ST&1300 CLAY ST,Harris County,I,DOWNTOWN,,,,,,...,64.0,0.3,915.0,44.6,250001.0,196722.0,6.666667,915.0,78.0,8.52459
3,12091839-101002444736,"7701 APPLETON, HOUSTON TX 77022",HARRIS,H,NORTHSIDE/NORTHLINE,710210010015.0,NE,NW,MONDAY,3rd Monday,...,64.0,0.3,27364.0,34.1,30164.0,14924.0,29.399942,27186.0,13143.0,48.344736
4,12091840-101002444737,"7701 APPLETON, HOUSTON TX 77022",HARRIS,H,NORTHSIDE/NORTHLINE,420050000055.0,NE,NW,MONDAY,3rd Monday,...,64.0,0.3,27364.0,34.1,30164.0,14924.0,29.399942,27186.0,13143.0,48.344736


In [9]:
################################################################
# create a date value formatted the same as the weather data

splitdate = data_final_df["sr create date"].str.split(" ", n=1, expand=True)

data_final_df["create_date"] = splitdate[0]
data_final_df["create_date"] = pd.to_datetime(data_final_df["create_date"])
data_final_df["create_date"] = data_final_df['create_date'].dt.strftime('%Y-%m-%d')


# I set these as date formats using the to_datetime

data_final_df["create_date_time"] = pd.to_datetime(data_final_df["sr create date"])
data_final_df["date_closed_time"] = pd.to_datetime(data_final_df["date closed"])
data_final_df["due_date_time"] = pd.to_datetime(data_final_df["due date"])


# here  I define weather is was closed by the due date (0) or not (1)
data_final_df["missed_due_date"] = np.where(data_final_df["due_date_time"] > data_final_df["date_closed_time"], 0, 1)

# calculate the difference between the closed date at the create date
data_final_df["time_to_close"] = data_final_df["date_closed_time"] - data_final_df["create_date_time"]

# apply formatting so that is is in seconds. then calculate the number of days.
data_final_df["seconds_to_close"] = data_final_df["time_to_close"].astype('timedelta64[s]')
data_final_df["days_to_close"] = data_final_df["seconds_to_close"]/60/60/24
 
# data311_df[["create_date_time", "date_closed_time", "time_to_close","seconds_to_close", "days_to_close"]]


data_final_df[["create_date_time","due_date_time", "date_closed_time", "missed_due_date", "days_to_close"]].head()



Unnamed: 0,create_date_time,due_date_time,date_closed_time,missed_due_date,days_to_close
0,2017-01-01 00:14:04,2017-01-02 00:14:04,2017-01-01 00:23:57,0,0.006863
1,2017-01-01 00:30:48,2017-01-11 00:30:47,2017-02-02 13:20:06,1,32.534236
2,2017-01-01 04:26:27,2017-01-02 04:26:27,2017-01-01 07:55:51,0,0.145417
3,2017-01-01 05:22:37,2017-01-02 05:22:35,2017-01-04 11:00:04,1,3.23434
4,2017-01-01 05:23:33,2017-01-03 05:23:33,2017-01-04 03:40:04,1,2.928137


In [10]:
# keep just the fields that will be used in analysis

data_final_df=data_final_df[["create_date_time","create_date","days_to_close","missed_due_date","sr type","tempMax","precipitation","Population","Median Age",
                    "Household Income","Poverty Rate","% Owner Occupied"]]


data_final_df["sr type"].value_counts()


# There were some entries where we were not able to get the zipcode, thus we weren't able to map in the 
# weather data and census data, so dropping these in able to do modeling

data_final_df = data_final_df.dropna()
data_final_df.isna().sum()


create_date_time    0
create_date         0
days_to_close       0
missed_due_date     0
sr type             0
tempMax             0
precipitation       0
Population          0
Median Age          0
Household Income    0
Poverty Rate        0
% Owner Occupied    0
dtype: int64

In [11]:
# calculate the weekly average precipitation and temperature

data_final_df['Year-Week'] = data_final_df['create_date_time'].dt.strftime('%Y-%U')

data_final_df[["create_date","Year-Week"]]

# group by the year, week and calculate the avereage

group_weather_df = data_final_df[["Year-Week","tempMax","precipitation"]].groupby('Year-Week').mean()

group_weather_df = group_weather_df.rename(columns={'tempMax': 'tempAvg', 'precipitation': 'precipAvg'})

group_weather_df.head()

Unnamed: 0_level_0,tempAvg,precipAvg
Year-Week,Unnamed: 1_level_1,Unnamed: 2_level_1
2017-01,56.070189,0.274542
2017-02,73.721911,0.021043
2017-03,73.017596,0.824914
2017-04,69.029579,0.004741
2017-05,72.576087,0.0


In [12]:
# merge the weekly average weather back in and drop the daily data 

data_final_df = data_final_df.merge(group_weather_df, how="left", left_on="Year-Week", right_on="Year-Week")

data_final_df = data_final_df.drop(columns=["tempMax","precipitation"])

data_final_df.head()

Unnamed: 0,create_date_time,create_date,days_to_close,missed_due_date,sr type,Population,Median Age,Household Income,Poverty Rate,% Owner Occupied,Year-Week,tempAvg,precipAvg
0,2017-01-01 00:14:04,2017-01-01,0.006863,0,Traffic Signal Maintenance,38931.0,31.9,44957.0,22.824998,24.946453,2017-01,56.070189,0.274542
1,2017-01-01 00:30:48,2017-01-01,32.534236,1,Water Leak,21732.0,39.1,107003.0,5.268728,49.179798,2017-01,56.070189,0.274542
2,2017-01-01 04:26:27,2017-01-01,0.145417,0,Traffic Signal Maintenance,915.0,44.6,250001.0,6.666667,8.52459,2017-01,56.070189,0.274542
3,2017-01-01 05:22:37,2017-01-01,3.23434,1,Sewer Wastewater,27364.0,34.1,30164.0,29.399942,48.344736,2017-01,56.070189,0.274542
4,2017-01-01 05:23:33,2017-01-01,2.928137,1,Water Service,27364.0,34.1,30164.0,29.399942,48.344736,2017-01,56.070189,0.274542


In [13]:
data_final_df.describe()

# data_final_df[data_final_df["days_to_close"]>500].shape

data_final_df.shape

(673898, 13)

In [14]:

# need to create dummy variable for the "sr type" column, which is categorical

data_dummies_df=pd.get_dummies(data_final_df["sr type"])



# add dummy variables in to dataframe with other variables

data_regress_df=pd.concat([data_final_df, data_dummies_df], axis=1)   

# create y variable for the regression analysis.  Ended up not using the regression analysis

y=data_final_df["days_to_close"].values.reshape(-1,1)


# create x variables (drop columns that aren't needed)

X_var=data_regress_df.drop(columns=["sr type","days_to_close", "missed_due_date","create_date","create_date_time","Year-Week"])
X_var.isna().sum()
 
print(X_var.shape)
print(X_var.columns.values)
X_var.head()  



(673898, 22)
['Population' 'Median Age' 'Household Income' 'Poverty Rate'
 '% Owner Occupied' 'tempAvg' 'precipAvg' 'Container Problem' 'Drainage'
 'Missed Garbage Pickup' 'Missed Heavy Trash Pickup'
 'Missed Recycling Pickup' 'Nuisance On Property' 'SWM Escalation'
 'Sewer Wastewater' 'Storm Debris Collection' 'Street Condition'
 'Street Hazard' 'Traffic Signal Maintenance' 'Traffic Signs' 'Water Leak'
 'Water Service']


Unnamed: 0,Population,Median Age,Household Income,Poverty Rate,% Owner Occupied,tempAvg,precipAvg,Container Problem,Drainage,Missed Garbage Pickup,...,Nuisance On Property,SWM Escalation,Sewer Wastewater,Storm Debris Collection,Street Condition,Street Hazard,Traffic Signal Maintenance,Traffic Signs,Water Leak,Water Service
0,38931.0,31.9,44957.0,22.824998,24.946453,56.070189,0.274542,0,0,0,...,0,0,0,0,0,0,1,0,0,0
1,21732.0,39.1,107003.0,5.268728,49.179798,56.070189,0.274542,0,0,0,...,0,0,0,0,0,0,0,0,1,0
2,915.0,44.6,250001.0,6.666667,8.52459,56.070189,0.274542,0,0,0,...,0,0,0,0,0,0,1,0,0,0
3,27364.0,34.1,30164.0,29.399942,48.344736,56.070189,0.274542,0,0,0,...,0,0,1,0,0,0,0,0,0,0
4,27364.0,34.1,30164.0,29.399942,48.344736,56.070189,0.274542,0,0,0,...,0,0,0,0,0,0,0,0,0,1


In [26]:
# define new data for feeding in to models

# Population, Median Age, Household Income, Poverty Rate, % Owner Occupied, tempAvg, precipAvg, Container Problem, Drainage,
# Missed Garbage Pickup, Missed Heavy Trash Pickup, Missed Recycling Pickup, Nuisance On Property, SWM Escalation, 
# Sewer Wasterwater, Storm Debris Collection, Street Condition,
# Street Hazard, Traffic Signal Maintenance, Traffic Sign, Water Leak, Water Service

new_data = [[38931, 31.9, 44957, 22.82, 24.95, 56.07, 0.27, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]

## Build Logistic Regression Model to predict if issue was not closed by due date

In [15]:
# I need to define y as the classfication variable.  The X_var can stay the same

y_class = data_final_df["missed_due_date"]

In [16]:
# define logistic model (note a grid search suggested the C and penalty values used)
# Note that changing the parameters based on the output of the grid search didn't improve the model very much

from sklearn.linear_model import LogisticRegression
classifier = LogisticRegression(C=166, penalty='l1')
classifier

LogisticRegression(C=166, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l1',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [17]:
# define training and test sets

from sklearn.model_selection import train_test_split


X_train, X_test, y_class_train, y_class_test = train_test_split(X_var, y_class, random_state=33)


In [18]:
# define standard scaler

from sklearn.preprocessing import StandardScaler

X_scaler = StandardScaler().fit(X_train)


In [19]:
# scale the x data (y is just 0 or 1, so not scaling)

X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [20]:
# fit the model

classifier.fit(X_train_scaled, y_class_train)



LogisticRegression(C=166, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l1',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [21]:
# look at the scores

print(f"Training Data Score: {classifier.score(X_train_scaled, y_class_train)}")
print(f"Testing Data Score: {classifier.score(X_test_scaled, y_class_test)}")

Training Data Score: 0.7291318361055987
Testing Data Score: 0.7274788544294406


In [22]:
# look at the prediction vs actual for the test data

predictions = classifier.predict(X_test_scaled)
probs = classifier.predict_proba(X_test_scaled)


pd.DataFrame({
    "Prediction": predictions, 
    "Actual": y_class_test, 
    "On Time": [p[0] for p in probs], 
    "Overdue": [p[1] for p in probs]})

Unnamed: 0,Prediction,Actual,On Time,Overdue
41449,0,0,0.870926,0.129074
307201,1,0,0.449865,0.550135
48134,0,0,0.597195,0.402805
390753,0,0,0.626954,0.373046
147945,0,0,0.835804,0.164196
182701,1,1,0.452184,0.547816
335712,0,0,0.932387,0.067613
372443,0,1,0.587364,0.412636
506217,0,0,0.621141,0.378859
160745,0,0,0.574356,0.425644


In [82]:
# predict a new set of data

# Population, Median Age, Household Income, Poverty Rate, % Owner Occupied, tempAvg, precipAvg, Container Problem, Drainage,
# Missed Garbage Pickup, Missed Heavy Trash Pickup, Missed Recycling Pickup, Nuisance On Property, SWM Escalation, 
# Sewer Wasterwater, Storm Debris Collection, Street Condition,
# Street Hazard, Traffic Signal Maintenance, Traffic Sign, Water Leak, Water Service

new_data = [[38931, 31.9, 44957, 25.82, 24.95, 56.07, 0.27, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]

new_data_scaled = X_scaler.transform(new_data)

new_predict = classifier.predict(new_data_scaled)
new_prob = classifier.predict_proba(new_data_scaled)


print("New Prediction: %.3f" % new_predict)

print("Prob of Meeting Deadline : %.3f" % new_prob[0,0])
print("Prob of Not Meeting Deadline : %.3f" % new_prob[0,1])
    

New Prediction: 1.000
Prob of Meeting Deadline : 0.451
Prob of Not Meeting Deadline : 0.549


### Do a grid search to try and find a better model.

In [129]:
logistic = LogisticRegression()

In [130]:
# Create regularization penalty space
penalty = ['l1', 'l2']

# Create regularization hyperparameter space
C = np.logspace(0, 4, 10)

# Create hyperparameter options
hyperparameters = dict(C=C, penalty=penalty)

In [131]:
# Create grid search using 5-fold cross validation
clf = GridSearchCV(logistic, hyperparameters, cv=5, verbose=0)

In [132]:
# Fit grid search
best_model = clf.fit(X_train_scaled, y_class_train)







In [133]:
# View best hyperparameters
print('Best Penalty:', best_model.best_estimator_.get_params()['penalty'])
print('Best C:', best_model.best_estimator_.get_params()['C'])

Best Penalty: l1
Best C: 166.81005372000593


In [77]:
# output model to pickle

import pickle

with open('../static/models/logistic_model.pkl','wb') as f:
    pickle.dump(classifier, f)

## Develop KNN model to predict if issue was not closed by due date

In [23]:
from sklearn.neighbors import KNeighborsClassifier

In [84]:
# Chose k=4 after running the loop below to check other values for k

knn = KNeighborsClassifier(n_neighbors=4)
knn.fit(X_train_scaled, y_class_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=4, p=2,
                     weights='uniform')

In [85]:
print('k=5 Test Acc: %.3f' % knn.score(X_train_scaled, y_class_train))
knn

k=5 Test Acc: 0.843


KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=4, p=2,
                     weights='uniform')

In [87]:
# predict a  new data point

# Population, Median Age, Household Income, Poverty Rate, % Owner Occupied, tempAvg, precipAvg, Container Problem, Drainage,
# Missed Garbage Pickup, Missed Heavy Trash Pickup, Missed Recycling Pickup, Nuisance On Property, SWM Escalation, 
# Sewer Wasterwater, Storm Debris Collection, Street Condition,
# Street Hazard, Traffic Signal Maintenance, Traffic Sign, Water Leak, Water Service


new_data_knn = [[38931, 31.9, 44957, 22.82, 24.95, 56.07, 0.27, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]

# have to scale the new data
new_data_knn_scaled = X_scaler.transform(new_data_knn)

new_predict_knn = knn.predict(new_data_knn_scaled)

print("New Prediction: %.3f" % new_predict_knn)

new_prob_knn = knn.predict_proba(new_data_knn_scaled)

print("Prob of Meeting Deadline : %.3f" % new_prob_knn[0,0])
print("Prob of Not Meeting Deadline : %.3f" % new_prob_knn[0,1])

print(new_data_knn)
print(new_data_knn_scaled)
    

New Prediction: 0.000
Prob of Meeting Deadline : 1.000
Prob of Not Meeting Deadline : 0.000
[[38931, 31.9, 44957, 22.82, 24.95, 56.07, 0.27, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]
[[ 0.39366916 -0.57383356 -0.35805553  0.21805589 -1.95933636 -2.13800639
   0.24057979  2.81738006 -0.21252576 -0.37613373 -0.27086858 -0.30086824
  -0.30896869 -0.20270583 -0.33567591 -0.13876926 -0.1752962  -0.22708341
  -0.2092351  -0.19037391 -0.35690846 -0.2390681 ]]


In [22]:
# This output a file that is >200 MB, so not running it.

# output model using pickle

# import pickle

# with open('../static/models/knn_model.pkl','wb') as f:
#     pickle.dump(knn, f)

### Look at different number of neighbors to see if a better model can be found
#### This took a long time to run, given the large amount of data.  I stopped it after K=11 (after running 2 hours), since it was clearly not improving.  It looks like k=4 is probably about right.

In [None]:
# Loop through different k values to see which has the highest accuracy
# Note: We only use odd numbers because we don't want any ties
train_scores = []
test_scores = []

for k in range(1, 20, 2):
    # create the classifier for each value of K
    knn = KNeighborsClassifier(n_neighbors=k)
    knn.fit(X_train_scaled, y_class_train)
    
    #Score with both test and train data
    train_score = knn.score(X_train_scaled, y_class_train)
    test_score = knn.score(X_test_scaled, y_class_test)
    
    #append the scores to our list of scores
    train_scores.append(train_score)
    test_scores.append(test_score)
    print(f"k: {k}, Train/Test Score: {train_score:.3f}/{test_score:.3f}")
    

k: 1, Train/Test Score: 0.830/0.768
k: 3, Train/Test Score: 0.847/0.798
k: 5, Train/Test Score: 0.845/0.804
k: 7, Train/Test Score: 0.840/0.804
k: 9, Train/Test Score: 0.834/0.804
k: 11, Train/Test Score: 0.829/0.801
