# 0. Imports

In [0]:
# Imports
import pandas as pd

In [0]:
# Load data set
df_0 = pd.read_csv("https://drive.switch.ch/index.php/s/Gs1wqzxkNeCppeC/download")

In [3]:
# Copy to easily reset, without loading the data set again
df = df_0
df.sample(3)

Unnamed: 0,Inspection ID,DBA Name,License #,Facility Type,Risk,Address,Zip,Inspection Date,Inspection Type,Results,Violations,Latitude,Longitude,Year,Month,Weekday,LenViol,TMAX,MeanMaxTemp3Days,ApproxCreationDate,DaysInBusiness
100768,1402071,AMIGO FOOD MART,1169708,grocery store,Risk 3 (Low),1859 N KIMBALL AVE,60647,2014-03-18,canvass re-inspection,Pass,32. FOOD AND NON-FOOD CONTACT SURFACES PROPERL...,41.915437,-87.711709,2014,3,1,1601,10.6,2.233333,2001-08-22,4591.0
97146,660145,LA CEMITA,1874882,restaurant,Risk 1 (High),3619 W NORTH AVE,60647,2012-01-24,canvass,Pass,32. FOOD AND NON-FOOD CONTACT SURFACES PROPERL...,41.909874,-87.717533,2012,1,1,1891,2.8,3.566667,2007-11-16,1530.0
107430,612283,GREEN OAK FOOD & LIQUOR INC,33090,grocery store,Risk 3 (Low),956 N LARRABEE ST,60610,2011-06-16,complaint,Pass,33. FOOD AND NON-FOOD CONTACT EQUIPMENT UTENSI...,41.900361,-87.643236,2011,6,3,894,27.2,22.966667,2002-05-08,3326.0


In [4]:
df.shape

(122595, 21)

# 1. Preprocessing

Some features, such as "Inspection ID", "DBA Name", are not too useful when it comes to predicting the outcome of the inspection. Therefore, those columns are dropped.

In [5]:
# Drop columns that are not useful
df = df.drop(["Inspection ID", "DBA Name", "License #", "Address", "Zip", "Inspection Date", "Violations", "ApproxCreationDate"], axis = 1)
df.sample(3)

Unnamed: 0,Facility Type,Risk,Inspection Type,Results,Latitude,Longitude,Year,Month,Weekday,LenViol,TMAX,MeanMaxTemp3Days,DaysInBusiness
60236,restaurant,Risk 1 (High),canvass,Pass,41.894776,-87.627477,2017,5,3,500,21.1,19.433333,2318.0
23956,school,Risk 1 (High),canvass re-inspection,Pass,41.929846,-87.646339,2011,6,0,1,21.7,19.266667,3184.0
60582,restaurant,Risk 1 (High),canvass,Fail,41.85213,-87.63185,2016,10,3,2480,11.1,12.066667,3209.0


### 1.1 Standardization

The first step is to normalise the numerical features :

*   Latitude
*   Longitude
*   LenViol
*   TMAX
*   MeanMaxTemp3Days
*   DaysInBusiness


In [6]:
# Work on copy of df
df_prep = df[["Latitude", "Longitude", "LenViol", "TMAX", "MeanMaxTemp3Days", "DaysInBusiness", "Facility Type", "Risk", "Inspection Type", "Year", "Month", "Weekday", "Results"]]
df_prep.sample(1)

Unnamed: 0,Latitude,Longitude,LenViol,TMAX,MeanMaxTemp3Days,DaysInBusiness,Facility Type,Risk,Inspection Type,Year,Month,Weekday,Results
11099,41.997996,-87.668224,414,25.0,18.0,584.0,restaurant,Risk 1 (High),canvass re-inspection,2012,9,1,Pass


In [7]:
# Standardization
from sklearn.preprocessing import StandardScaler
standardize = StandardScaler()
standardize.fit(df_prep.iloc[:, :6])
df_prep.iloc[:, :6] = standardize.transform(df_prep.iloc[:, :6])
df_prep.sample(1)

Unnamed: 0,Latitude,Longitude,LenViol,TMAX,MeanMaxTemp3Days,DaysInBusiness,Facility Type,Risk,Inspection Type,Year,Month,Weekday,Results
45249,0.997268,-0.198688,-0.448733,-1.223108,-1.616538,1.178934,restaurant,Risk 1 (High),canvass re-inspection,2013,11,1,Pass


### 1.2 Encoding

The following features need to be encoded :

*   Facility Type
*   Risk
*   Inspection Type
*   Year
*   Month
*   Weekday


In [0]:
# Encoding
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

#One hot encoding
one_hot = OneHotEncoder(categories="auto")
cat_to_onehot = one_hot.fit_transform(df_prep[["Facility Type", "Risk", "Inspection Type", "Year", "Month", "Weekday"]]).toarray()
cat_to_onehot = pd.DataFrame(cat_to_onehot)

# Generate X
feature_names = ["Latitude", "Longitude", "LenViol", "TMAX", "MeanMaxTemp3Days", "DaysInBusiness"]
X = df_prep[feature_names]
X = pd.concat((X, cat_to_onehot), axis=1)

# Generate y
lab_encoder = LabelEncoder()
cat_to_label = lab_encoder.fit_transform(df_prep["Results"])
Y = pd.DataFrame(cat_to_label)

In [10]:
X.sample(1)

Unnamed: 0,Latitude,Longitude,LenViol,TMAX,MeanMaxTemp3Days,DaysInBusiness,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67
86637,0.798957,-2.138999,-1.005143,-0.265436,-0.295069,-1.182473,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0


In [11]:
Y.sample(5)

Unnamed: 0,0
35236,1
9408,0
4332,1
109750,0
69165,1


### 1.3 Splitting the data set

In [0]:
# Split train/test sets
from sklearn.model_selection import train_test_split
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=72)

# 2. Prediction

### 2.1 Base rate

In [13]:
# The base rest is defined as : size(most frequent class)/size(dataset)
failCount = df[df["Results"] == "Fail"].shape[0]
passCount = df[df["Results"] == "Pass"].shape[0]

if passCount > failCount :
  print("The most frequent class is " + "'Pass' (" + str(passCount) + ") VS 'Fail' (" + str(failCount)+ ").")
else :
   print("The most frequent class is " + "'Fail' (" + str(failCount) + ") VS 'Pass' (" + str(passCount)+ ").")

The most frequent class is 'Pass' (92447) VS 'Fail' (30148).


In [14]:
# Base rate
br = max(passCount, failCount)/(passCount + failCount)

print("The base rate is : " + str(br))

The base rate is : 0.754084587462784


### 2.2 Logistic Regression

In [15]:
from sklearn.linear_model import LogisticRegressionCV

# Logistic regression with cross-validation
LR = LogisticRegressionCV(solver='lbfgs', cv=5, max_iter=1000)
LR.fit(X_train, Y_train)

  y = column_or_1d(y, warn=True)


LogisticRegressionCV(Cs=10, class_weight=None, cv=5, dual=False,
                     fit_intercept=True, intercept_scaling=1.0, l1_ratios=None,
                     max_iter=1000, multi_class='warn', n_jobs=None,
                     penalty='l2', random_state=None, refit=True, scoring=None,
                     solver='lbfgs', tol=0.0001, verbose=0)

In [16]:
# Train accuracy
LR.score(X_train,Y_train)

0.8465169868265426

In [17]:
# Test accuracy
LR.score(X_test, Y_test)

0.8419185121742322