In [86]:
import pandas as pd 
import numpy as np
import re
from pandarallel import pandarallel
import multiprocessing

from sklearn.feature_extraction.text import CountVectorizer, HashingVectorizer, TfidfTransformer, TfidfVectorizer


num_processors = multiprocessing.cpu_count()
print(f'Available CPUs: {num_processors}')

# Initialise pandarallel exactly once. A later bare `pandarallel.initialize()` call
# would run the setup again with default arguments, discarding the worker count and
# progress-bar settings chosen here.
# Six cores are left free, but at least one worker is always requested so the
# notebook also runs on machines with six or fewer CPUs.
pandarallel.initialize(nb_workers=max(1, num_processors - 6), use_memory_fs=False, progress_bar=True)

# Do not truncate long text columns when displaying DataFrames.
pd.set_option('display.max_colwidth', None)

Available CPUs: 16
INFO: Pandarallel will run on 14 workers.
INFO: Pandarallel will use standard multiprocessing data transfer (pipe) to transfer data between the main process and workers.


In [87]:
# Read the inspection records from the CSV file and preview the first five rows.
df = pd.read_csv(filepath_or_buffer='Food_Inspections.csv')

df.head(n=5)

Unnamed: 0,inspection_id,dba_name,aka_name,license_,facility_type,risk,address,city,state,zip,...,results,violations,latitude,longitude,location,:@computed_region_awaf_s7ux,:@computed_region_6mkv_f3dw,:@computed_region_vrxf_vc4k,:@computed_region_bdys_3d7i,:@computed_region_43wa_7qmu
0,2569663,MARKET FRESH FOODS,MARKET FRESH FOODS,52593.0,Grocery Store,Risk 2 (Medium),800 N KEDZIE AVE,CHICAGO,IL,60651.0,...,Pass,"55. PHYSICAL FACILITIES INSTALLED, MAINTAINED & CLEAN - Comments: OBSERVED SOILED MOP HEADS NOT PROPERLY STORED. INSTRUCTED MANAGER TO HANG MOP HEADS TO PREVENT INSECT BREEDING.",41.895615,-87.706705,"{'latitude': '41.89561531354531', 'longitude': '-87.70670532510552'}",41.0,4299.0,24.0,150.0,46.0
1,2569650,"BURRITO ARCELIANO, INC.",BURRITO ARCELIANO,2886706.0,Restaurant,Risk 1 (High),3414 W IRVING PARK RD,CHICAGO,IL,60618.0,...,Pass w/ Conditions,"3. MANAGEMENT, FOOD EMPLOYEE AND CONDITIONAL EMPLOYEE; KNOWLEDGE, RESPONSIBILITIES AND REPORTING - Comments: 2-102.14(O) FOUND NO EMPLOYEE HEALTH POLICY/TRAINING ON SITE. INSTRUCTED TO PROVIDE A SIGNED EMPLOYEE HEALTH POLICY FOR EACH EMPLOYEE. PRIORITY FOUNDATION VIOLATION 7-38-010 | 55. PHYSICAL FACILITIES INSTALLED, MAINTAINED & CLEAN - Comments: 6-201.16 INSTRUCTED TO REPLACE MISSING FLOOR TILE IN PREP/COOKING AREA AND IN THE DINING AREA. | 58. ALLERGEN TRAINING AS REQUIRED - Comments: 2-102.13 FOUND INCOMPLETE APPROVED ALLERGEN TRAINING CERTIFICATE AVAILABLE. INSTRUCTED ANY PERSON WITH CITY OF CHICAGO FOOD SERVICE SANITATION CERTIFICATE TO COMPLY WITH ALLERGEN TRAINING CERTIFICATION.",41.953957,-87.71356,"{'latitude': '41.953957216483', 'longitude': '-87.71355988117966'}",2.0,21538.0,16.0,23.0,12.0
2,2569647,Sauganash Elementary School,Sauganash Elementary School,25211.0,School,Risk 1 (High),6040 N Kilpatrick (4700W) AVE,CHICAGO,IL,60646.0,...,Pass,,,,,,,,,
3,2569640,TAO THONG,TAO THONG,2595274.0,Restaurant,Risk 1 (High),4700 N KIMBALL AVE,CHICAGO,IL,60625.0,...,Out of Business,,41.966671,-87.713458,"{'latitude': '41.96667122670007', 'longitude': '-87.71345814571208'}",2.0,21849.0,14.0,118.0,20.0
4,2569646,COUNCIL FOR JEWISH ELDERLY,COUNCIL FOR JEWISH ELDERLY,23540.0,,Risk 3 (Low),3003 W TOUHY AVE,CHICAGO,IL,60645.0,...,Out of Business,,42.01185,-87.70464,"{'latitude': '42.01185027650496', 'longitude': '-87.70464008544714'}",42.0,22528.0,20.0,32.0,27.0


In [88]:
# Keep only the two columns used below: the free-text violations and the result label.
df = df.loc[:, ['violations', 'results']]

In [89]:
%%time

# Drop inspections without any violation text. The explicit .copy() makes `df` an
# independent frame, so the column assignments in the following cells do not write
# into a slice of the original data (avoids SettingWithCopyWarning).
df = df[df['violations'].notna()].copy()


def extract_description(t):
    """
    Pulls the description headings out of a raw violations string.

    A heading is whatever the pattern below matches: one whitespace character,
    followed by a run of upper-case letters, spaces and non-word characters,
    ending in " -". The leading whitespace character and the trailing " -" are
    trimmed from every match.

    Returns the trimmed headings as a list, in order of appearance
    (an empty list when nothing matches).
    """
    heading_pattern = re.compile(r"\s[A-Z \W]+ -")
    return [hit.group(0)[1:-2] for hit in heading_pattern.finditer(t)]

# Run extract_description over every violations string using the pandarallel workers.
df['descriptions'] = df['violations'].parallel_apply(extract_description)


CPU times: user 587 ms, sys: 1.28 s, total: 1.87 s
Wall time: 3.75 s


In [90]:
## extract comments from the violations column
def _remove_descriptions(violation, descriptions):
    """Return `violation` with every string in `descriptions` removed."""
    for description in descriptions:
        violation = violation.replace(description, '')
    return violation


def extract_comments(df):
    """
    Strips the previously extracted descriptions from the violations column.

    For every row, each string listed in the row's 'descriptions' value is
    removed from the row's 'violations' text (all occurrences).

    Parameters:
        df: DataFrame with a 'violations' column of strings and a
            'descriptions' column holding an iterable of strings per row.

    Returns:
        The same DataFrame object, with its 'violations' column overwritten
        in place.
    """
    # Pair the two columns positionally instead of using iterrows() with
    # label-based df.at[...] writes: this is much faster and stays correct
    # even when the index contains repeated labels.
    df['violations'] = [
        _remove_descriptions(violation, descriptions)
        for violation, descriptions in zip(df['violations'], df['descriptions'])
    ]
    return df

# Replace the 'violations' text with the version that has the descriptions removed.
df = extract_comments(df)

In [91]:
def modify_comments(t):
    """
    Clears numbers, special characters from the violations

    Only runs of upper-case ASCII letters (A-Z) are kept; digits, punctuation
    and lower-case letters are dropped. Single-letter tokens are discarded as
    well.

    Parameters:
        t: the violations text to clean.

    Returns:
        The remaining tokens joined by exactly one space, with no leading or
        trailing whitespace ("" when no token survives).
    """
    # `[A-Z]+` rather than `[A-Z]*`: the starred form also produces an empty match
    # at every non-matching position, which fills the joined text with runs of spaces.
    words = re.findall(r"[A-Z]+", t)
    # Dropping one-letter tokens here, rather than blanking them with a second regex
    # on the joined string, avoids leaving doubled spaces behind.
    return " ".join(word for word in words if len(word) > 1)

# Clean every violations string with modify_comments using the pandarallel workers.
df['violations'] = df['violations'].parallel_apply(modify_comments)

In [92]:
# Preview the first rows after cleaning the violations text.
df.head()

Unnamed: 0,violations,results,descriptions
0,OBSERVED SOILED MOP HEADS NOT PROPERLY STORED INSTRUCTED MANAGER TO HANG MOP HEADS TO PREVENT INSECT BREEDING,Pass,"[PHYSICAL FACILITIES INSTALLED, MAINTAINED & CLEAN]"
1,FOUND NO EMPLOYEE HEALTH POLICY TRAINING ON SITE INSTRUCTED TO PROVIDE SIGNED EMPLOYEE HEALTH POLICY FOR EACH EMPLOYEE PRIORITY FOUNDATION VIOLATION INSTRUCTED TO REPLACE MISSING FLOOR TILE IN PREP COOKING AREA AND IN THE DINING AREA FOUND INCOMPLETE APPROVED ALLERGEN TRAINING CERTIFICATE AVAILABLE INSTRUCTED ANY PERSON WITH CITY OF CHICAGO FOOD SERVICE SANITATION CERTIFICATE TO COMPLY WITH ALLERGEN TRAINING CERTIFICATION,Pass w/ Conditions,"[MANAGEMENT, FOOD EMPLOYEE AND CONDITIONAL EMPLOYEE; KNOWLEDGE, RESPONSIBILITIES AND REPORTING, PHYSICAL FACILITIES INSTALLED, MAINTAINED & CLEAN, ALLERGEN TRAINING AS REQUIRED]"
5,FOUND NO EMPLOYEE HEALTH POLICY TRAINING ON SITE INSTRUCTED TO PROVIDE SIGNED EMPLOYEE HEALTH POLICY FOR EACH EMPLOYEE PRIORITY FOUNDATION VIOLATION INSTRUCTED TO RODENT PROOF BOTTOM OF REAR EXIT DOOR,Pass w/ Conditions,"[MANAGEMENT, FOOD EMPLOYEE AND CONDITIONAL EMPLOYEE; KNOWLEDGE, RESPONSIBILITIES AND REPORTING, INSECTS, RODENTS, & ANIMALS NOT PRESENT]"
6,OBSERVED ICE SCOOP LAYING ON TOP OF TRAY INSTRUCTED MANAGER TO STORE ICE SCOOP HANDLE UP ABOVE ICE TO PREVENT CONTAMINATION OBSERVED EXCESSIVE ACCUMULATED DUST ON THE CONDENSOR IN THE WALK IN COOLER INSTRUCTED TO CLEAN OBSERVED AN EXCESS OF DIRT ENCRUSTED ONTO THE FLOOR OF THE WALK IN COOLER MUST DETAIL CLEAN AND MAINTAIN OBSERVED SOILED MOP HEADS NOT PROPERLY STORED INSTRUCTED MANAGER TO HANG MOP HEADS TO PREVENT INSECT BREEDING,Pass,"[IN-USE UTENSILS: PROPERLY STORED, NON-FOOD/FOOD CONTACT SURFACES CLEAN, NON-FOOD/FOOD CONTACT SURFACES CLEAN, PHYSICAL FACILITIES INSTALLED, MAINTAINED & CLEAN]"
8,INSTRUCTED TO REPLACE BURNT OUT LIGHT BULBS FOR LIGHT FIXTURE IN UTILITY MOP SINK CLOSET AND MAINTAIN ADEQUATE LIGHTING AT ALL TIMES,Pass,[ADEQUATE VENTILATION & LIGHTING; DESIGNATED AREAS USED]


## Create classification model, predicting the outcome of food safety inspection based on the inspectors’ comments

* Leverage the results of your homework from Week-1 and Week-2 to extract free-form text comments from inspectors
* Discard the text from “Health Code” – only keep inspectors’ comments
* Build classification model, predicting the outcome of inspection – your target variable is “Results”
* Explain why you selected a particular text pre-processing technique
* Visualize results of at least two text classifiers and select the most robust one
* You can choose to build a binary classifier (limiting your data to Pass / Fail) or multinomial classifier with all available values in Results

## Preprocessing

In [103]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, HashingVectorizer, TfidfTransformer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn import metrics

In [94]:
# Distinct values of the target column, in order of first appearance.
unique_y = pd.unique(df['results']).tolist()
unique_y

['Pass',
 'Pass w/ Conditions',
 'Fail',
 'No Entry',
 'Out of Business',
 'Not Ready']

In [95]:
# Binary task codes: 0 = Pass, 1 = Fail. Every other outcome is tagged with 2.
binary_y_map = dict(zip(
    ['Pass', 'Pass w/ Conditions', 'Fail', 'No Entry', 'Out of Business', 'Not Ready'],
    [0, 2, 1, 2, 2, 2],
))

# Multinomial task codes: one integer per outcome, numbered in the order listed.
multinomial_y_map = {
    outcome: code
    for code, outcome in enumerate(
        ['Pass', 'Pass w/ Conditions', 'Fail', 'No Entry', 'Out of Business', 'Not Ready']
    )
}


df['binary_y'] = df['results'].map(binary_y_map)
df['multinomial_y'] = df['results'].map(multinomial_y_map)

## Binary Classifier

#### Filtering binary targets

In [155]:
# Keep only the rows whose binary code is not 2, as an independent copy.
sub_df = df.loc[df['binary_y'].ne(2)].copy()

In [156]:
# Features are the cleaned comment text; the target is the inspection result label.
X = sub_df['violations']
y = sub_df['results']

print(f"X Shape: {X.shape}", f"y Shape: {y.shape}", sep="\n")

X Shape: (142293,)
y Shape: (142293,)


In [157]:
# Hold out 10% of the records for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.1,
    random_state=1,
)
print(
    f"Training records, X_train: {X_train.shape} y_train: {y_train.shape}",
    f"Testing records, X_test: {X_test.shape} y_test: {y_test.shape}",
    sep="\n",
)

Training records, X_train: (128063,) y_train: (128063,)
Testing records, X_test: (14230,) y_test: (14230,)


In [159]:
def _feature_weights(model):
    """Return the per-feature weight vector formerly read as `model.coef_[0]`.

    Models that expose `coef_` are read directly. Naive Bayes estimators no
    longer provide `coef_` in recent scikit-learn releases, so fall back to
    `feature_log_prob_` using the definition the old attribute had: row 1 for
    a two-class model, row 0 otherwise.
    """
    if hasattr(model, "coef_"):
        return model.coef_[0]
    log_prob = model.feature_log_prob_
    return log_prob[1] if len(log_prob) == 2 else log_prob[0]


def fit_model(model, count_vectorizer, X_train, X_test, y_train, y_test, get_feature_importance=True):
    """
    Vectorizes the text, fits the classifier and prints an evaluation report.

    Parameters:
        model: classifier providing `fit` and `predict`; when
            `get_feature_importance` is True it must also expose `coef_`
            or `feature_log_prob_`.
        count_vectorizer: text vectorizer providing `fit_transform`,
            `transform` and a feature-name accessor. It is (re)fitted on
            `X_train` by this call.
        X_train, X_test: training / test documents.
        y_train, y_test: training / test labels.
        get_feature_importance: when True, also print the features sorted by
            their weight, first ascending and then descending.

    Returns:
        None. All results are printed. `model` and `count_vectorizer` are
        fitted in place.
    """
    print("Model inputs are: ")
    print(count_vectorizer)
    print(model)
    print("Fitting the Count Vectorizer")

    # fit_transform learns the vocabulary and builds the training matrix in a
    # single pass; a separate fit() beforehand would only repeat the work.
    X_train_dtm = count_vectorizer.fit_transform(X_train)
    X_test_dtm = count_vectorizer.transform(X_test)
    print("CountVectorizer is successfuly fitted for train and test data!")

    model.fit(X_train_dtm, y_train)
    y_pred_class = model.predict(X_test_dtm)
    print(f"Test Accuracy: {metrics.accuracy_score(y_test, y_pred_class) * 100:.1f}%")

    print(classification_report(y_test, y_pred_class))

    if get_feature_importance:
        print("The most important features")
        print("-"*20)
        # get_feature_names() was removed from newer scikit-learn releases in
        # favour of get_feature_names_out(); support both.
        if hasattr(count_vectorizer, "get_feature_names_out"):
            feature_names = count_vectorizer.get_feature_names_out()
        else:
            feature_names = count_vectorizer.get_feature_names()

        coefs_with_fns_df = pd.DataFrame(
            zip(feature_names, _feature_weights(model)),
            columns=['feature', 'coefficient'],
        )

        # NOTE: rows are ordered by the signed weight, not by its magnitude,
        # so this first table starts with the most negative weights.
        print(coefs_with_fns_df.sort_values(by='coefficient', ascending=True))

        print("The least important features")
        print("-"*20)
        # Second table: same data, largest weights first.
        print(coefs_with_fns_df.sort_values(by='coefficient', ascending=False))

In [160]:
# Baseline vectorizer: counts of word uni-, bi- and tri-grams. It is defined here
# because no visible cell creates `count_vec`, so a fresh-kernel run would otherwise
# stop with a NameError; the settings match the vectorizer shown in the recorded output.
count_vec = CountVectorizer(ngram_range=(1, 3))

nb = MultinomialNB()
fit_model(nb, count_vec, X_train, X_test, y_train, y_test, get_feature_importance=True)

Model inputs are: 
CountVectorizer(ngram_range=(1, 3))
MultinomialNB()
Fitting the Count Vectorizer
CountVectorizer is successfuly fitted for train and test data!
Test Accuracy: 92.3%
              precision    recall  f1-score   support

        Fail       0.87      0.89      0.88      4546
        Pass       0.95      0.94      0.94      9684

    accuracy                           0.92     14230
   macro avg       0.91      0.92      0.91     14230
weighted avg       0.92      0.92      0.92     14230

The most important features
--------------------
                        feature  coefficient
1052428    located in container   -16.639919
810112   good housekeeping must   -16.639919
810114    good housekeeping not   -16.639919
1585339     sanitize part sinks   -16.639919
810118          good humor deep   -16.639919
...                         ...          ...
1144941                    must    -4.830228
1878592                      to    -4.699774
1804243                     the    

### Logistic Regression Model

In [162]:
# Logistic regression on the baseline n-gram counts; the iteration cap is raised to 10000.
logreg = LogisticRegression(max_iter=10000)
fit_model(
    model=logreg,
    count_vectorizer=count_vec,
    X_train=X_train,
    X_test=X_test,
    y_train=y_train,
    y_test=y_test,
    get_feature_importance=True,
)

Model inputs are: 
CountVectorizer(ngram_range=(1, 3))
LogisticRegression(max_iter=10000)
Fitting the Count Vectorizer
CountVectorizer is successfuly fitted for train and test data!
Test Accuracy: 96.5%
              precision    recall  f1-score   support

        Fail       0.98      0.91      0.94      4546
        Pass       0.96      0.99      0.97      9684

    accuracy                           0.96     14230
   macro avg       0.97      0.95      0.96     14230
weighted avg       0.97      0.96      0.96     14230

The most important features
--------------------
                         feature  coefficient
1612250                  serious    -5.188269
492940                  critical    -3.524388
1372158                       pm    -3.099630
1412278                 priority    -2.971511
1613546        serious violation    -2.921242
...                          ...          ...
2004522          violation still     1.143805
1612437  serious citation issued     1.169564
1174487

### Improving model performance

In [163]:
# Second vectorizer: same 1- to 3-gram range, with English stop words removed.
count_vec2 = CountVectorizer(
    ngram_range=(1, 3),
    stop_words='english',
)

In [164]:
# Fresh Naive Bayes model evaluated on the stop-word-filtered n-gram counts.
nb = MultinomialNB()
fit_model(
    model=nb,
    count_vectorizer=count_vec2,
    X_train=X_train,
    X_test=X_test,
    y_train=y_train,
    y_test=y_test,
    get_feature_importance=True,
)

Model inputs are: 
CountVectorizer(ngram_range=(1, 3), stop_words='english')
MultinomialNB()
Fitting the Count Vectorizer
CountVectorizer is successfuly fitted for train and test data!
Test Accuracy: 92.0%
              precision    recall  f1-score   support

        Fail       0.87      0.89      0.88      4546
        Pass       0.95      0.94      0.94      9684

    accuracy                           0.92     14230
   macro avg       0.91      0.91      0.91     14230
weighted avg       0.92      0.92      0.92     14230

The most important features
--------------------
                            feature  coefficient
1359440    paper washrooms evidence   -16.267587
1200832              metal beam dry   -16.267587
740724   floor washrooms inspection   -16.267587
1736585     shelves uncovered foods   -16.267587
1736584           shelves uncovered   -16.267587
...                             ...          ...
1426584                        prep    -5.231908
1146164                   

In [165]:
# Refit the existing logistic regression object on the stop-word-filtered n-gram counts.
fit_model(
    model=logreg,
    count_vectorizer=count_vec2,
    X_train=X_train,
    X_test=X_test,
    y_train=y_train,
    y_test=y_test,
    get_feature_importance=True,
)

Model inputs are: 
CountVectorizer(ngram_range=(1, 3), stop_words='english')
LogisticRegression(max_iter=10000)
Fitting the Count Vectorizer
CountVectorizer is successfuly fitted for train and test data!
Test Accuracy: 96.3%
              precision    recall  f1-score   support

        Fail       0.98      0.90      0.94      4546
        Pass       0.96      0.99      0.97      9684

    accuracy                           0.96     14230
   macro avg       0.97      0.95      0.96     14230
weighted avg       0.96      0.96      0.96     14230

The most important features
--------------------
                     feature  coefficient
2080707            violation    -3.456105
2085343   violation observed    -3.316820
1399369                   pm    -3.187397
271181              citation    -2.933967
464299              critical    -2.538992
...                      ...          ...
2087202  violation remaining     1.804260
421831        core violation     1.921852
2087217    violation 

# Preprocessing Technique Selection
**Answers:** Explain why you selected a particular text pre-processing technique.

**I chose CountVectorizer as the vectorization technique over the alternatives. First, CountVectorizer gives more explainable results: it counts the occurrences of each token, so we can explain the exact meaning of every feature. Second, we could have used TF-IDF to introduce word importance across the documents, but the violations are limited in number and we have a sufficient number of data points. Therefore, I will be using CountVectorizer. As for the parameters of CountVectorizer, removing the stop words yielded more explanatory n-gram features.**

# Model Selection

**I will be using Logistic Regression for the binary classification of pass/fail. The model has the highest accuracy compared to Naive Bayes, and the errors for both target classes are almost the same. Therefore, the best modelling approach is to use CountVectorizer with stop words removed and Logistic Regression as the model.**

In [166]:
# Selected configuration: a new logistic regression on the stop-word-filtered n-gram counts.
logreg = LogisticRegression(max_iter=10000)
fit_model(
    model=logreg,
    count_vectorizer=count_vec2,
    X_train=X_train,
    X_test=X_test,
    y_train=y_train,
    y_test=y_test,
    get_feature_importance=True,
)

Model inputs are: 
CountVectorizer(ngram_range=(1, 3), stop_words='english')
LogisticRegression(max_iter=10000)
Fitting the Count Vectorizer
CountVectorizer is successfuly fitted for train and test data!
Test Accuracy: 96.3%
              precision    recall  f1-score   support

        Fail       0.98      0.90      0.94      4546
        Pass       0.96      0.99      0.97      9684

    accuracy                           0.96     14230
   macro avg       0.97      0.95      0.96     14230
weighted avg       0.96      0.96      0.96     14230

The most important features
--------------------
                     feature  coefficient
2080707            violation    -3.456105
2085343   violation observed    -3.316820
1399369                   pm    -3.187397
271181              citation    -2.933967
464299              critical    -2.538992
...                      ...          ...
2087202  violation remaining     1.804260
421831        core violation     1.921852
2087217    violation 