# 0. Imports

In [0]:
# Imports
import pandas as pd

In [0]:
# Load data set
df_0 = pd.read_csv("https://drive.switch.ch/index.php/s/Gs1wqzxkNeCppeC/download")

In [23]:
# Copy to easily reset, without loading the data set again
df = df_0
df.sample(3)

Unnamed: 0,Inspection ID,DBA Name,License #,Facility Type,Risk,Address,Zip,Inspection Date,Inspection Type,Results,Violations,Latitude,Longitude,Year,Month,Weekday,LenViol,TMAX,MeanMaxTemp3Days,ApproxCreationDate,DaysInBusiness
101667,1106687,"CLARK ST. MARKET, INC.",2177457,grocery store,Risk 1 (High),7007 N CLARK ST,60626,2013-07-18,canvass,Fail,3. POTENTIALLY HAZARDOUS FOOD MEETS TEMPERATUR...,42.009253,-87.673874,2013,7,3,2329,35.6,33.9,2012-07-23,360.0
13971,2088565,CARNICERIA AGUASCALIENTES,8475,grocery store,Risk 1 (High),3132 W 26TH ST,60623,2017-09-21,complaint,Pass,32. FOOD AND NON-FOOD CONTACT SURFACES PROPERL...,41.84464,-87.703875,2017,9,3,1347,35.0,27.766667,2002-05-14,5609.0
2732,1633188,AUGUSTA PIZZA & GRILL,2017992,restaurant,Risk 1 (High),4400 W AUGUSTA BLVD,60651,2016-03-15,canvass re-inspection,Pass,33. FOOD AND NON-FOOD CONTACT EQUIPMENT UTENSI...,41.898958,-87.736167,2016,3,1,1060,17.8,12.6,2010-02-10,2225.0


In [24]:
df.shape

(122595, 21)

# 1. Preprocessing

Some features, such as "Inspection ID", "DBA Name", are not too useful when it comes to predicting the outcome of the inspection. Therefore, those columns are dropped.

In [25]:
# Drop columns that are not useful
df = df.drop(["Inspection ID", "DBA Name", "License #", "Address", "Zip", "Inspection Date", "Violations", "ApproxCreationDate"], axis = 1)
df.sample(3)

Unnamed: 0,Facility Type,Risk,Inspection Type,Results,Latitude,Longitude,Year,Month,Weekday,LenViol,TMAX,MeanMaxTemp3Days,DaysInBusiness
4196,restaurant,Risk 1 (High),canvass,Fail,41.87811,-87.627535,2012,8,2,1264,30.0,25.0,3891.0
48874,restaurant,Risk 1 (High),canvass,Pass,41.881977,-87.638876,2016,3,0,602,10.0,8.5,3952.0
12413,grocery store,Risk 1 (High),short form complaint,Fail,41.98219,-87.668652,2014,11,4,1611,-1.6,5.4,4615.0


### 1.1 Standardization

The first step is to normalise the numerical features :

*   Latitude
*   Longitude
*   Year
*   LenViol
*   TMAX
*   MeanMaxTemp3Days
*   DaysInBusiness

Note : Month and Weekday are already encoded from categorical data.


In [36]:
# Work on copy of df
df_prep = df[["Latitude", "Longitude", "Year", "LenViol", "TMAX", "MeanMaxTemp3Days", "DaysInBusiness", "Facility Type", "Risk", "Inspection Type", "Month", "Weekday", "Results"]]
df_prep.sample(1)

Unnamed: 0,Latitude,Longitude,Year,LenViol,TMAX,MeanMaxTemp3Days,DaysInBusiness,Facility Type,Risk,Inspection Type,Month,Weekday,Results
19168,41.911085,-87.630632,2017,1046,16.7,17.433333,309.0,restaurant,Risk 1 (High),canvass,4,3,Pass


In [38]:
# Standardization
from sklearn.preprocessing import StandardScaler
standardize = StandardScaler()
standardize.fit(df_prep.iloc[:, :7])
df_prep.iloc[:, :7] = standardize.transform(df_prep.iloc[:, :7])
df_prep.sample(1)

Unnamed: 0,Latitude,Longitude,Year,LenViol,TMAX,MeanMaxTemp3Days,DaysInBusiness,Facility Type,Risk,Inspection Type,Month,Weekday,Results
6398,-0.412274,-0.808051,-1.568362,-1.005143,-0.265436,0.086754,-0.81433,restaurant,Risk 1 (High),complaint re-inspection,10,3,Pass


### 1.2 Encoding

The following features need to be encoded :

*   Facility Type
*   Risk
*   Inspection Type

In [0]:
# Encoding
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

#One hot encoding
one_hot = OneHotEncoder(categories="auto")
cat_to_onehot = one_hot.fit_transform(df_prep[["Facility Type", "Risk", "Inspection Type"]]).toarray()
cat_to_onehot = pd.DataFrame(cat_to_onehot)

# Generate X
feature_names = ["Latitude", "Longitude", "Year", "LenViol", "TMAX", "MeanMaxTemp3Days", "DaysInBusiness"]
X = df_prep[feature_names]
X = pd.concat((X, cat_to_onehot), axis=1)

# Generate y
lab_encoder = LabelEncoder()
cat_to_label = lab_encoder.fit_transform(df_prep["Results"])
Y = pd.DataFrame(cat_to_label)

In [46]:
X.sample(1)

Unnamed: 0,Latitude,Longitude,Year,LenViol,TMAX,MeanMaxTemp3Days,DaysInBusiness,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39
17949,-1.032018,-1.197285,1.708636,-1.005143,-1.076437,-1.088547,-1.191479,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [48]:
Y.sample(5)

Unnamed: 0,0
60559,0
17834,0
91461,1
41164,1
11978,1


### 1.3 Splitting the data set

In [0]:
# Split train/test sets
from sklearn.model_selection import train_test_split
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=72)

# 2. Prediction

### 2.1 Base rate

In [56]:
# The base rest is defined as : size(most frequent class)/size(dataset)
failCount = df[df["Results"] == "Fail"].shape[0]
passCount = df[df["Results"] == "Pass"].shape[0]

if passCount > failCount :
  print("The most frequent class is " + "'Pass' (" + str(passCount) + ") VS 'Fail' (" + str(failCount)+ ").")
else :
   print("The most frequent class is " + "'Fail' (" + str(failCount) + ") VS 'Pass' (" + str(passCount)+ ").")

The most frequent class is 'Pass' (92447) VS 'Fail' (30148).


In [57]:
# Base rate
br = max(passCount, failCount)/(passCount + failCount)

print("The base rate is : " + str(br))

The base rate is : 0.754084587462784


### 2.2 Logistic Regression

In [51]:
from sklearn.linear_model import LogisticRegressionCV

# Logistic regression with cross-validation
LR = LogisticRegressionCV(solver='lbfgs', cv=5, max_iter=1000)
LR.fit(X_train, Y_train)

  y = column_or_1d(y, warn=True)


LogisticRegressionCV(Cs=10, class_weight=None, cv=5, dual=False,
                     fit_intercept=True, intercept_scaling=1.0, l1_ratios=None,
                     max_iter=1000, multi_class='warn', n_jobs=None,
                     penalty='l2', random_state=None, refit=True, scoring=None,
                     solver='lbfgs', tol=0.0001, verbose=0)

In [53]:
# Train accuracy
LR.score(X_train,Y_train)

0.8455483502589828

In [54]:
# Test accuracy
LR.score(X_test, Y_test)

0.8411028182226029