# Calm Code Dirty Cat Lesson

## Introduction

For lesson notes, see the `README.md` file at `daltonturner/calm-code-practice/dirty_cat`. This notebook will focus on the coding portion of the Dirty Cat video series, which presents methods to deal with categorical variables using `scikit-learn`. 

## Analysis

In [1]:
# These packages were installed using a virtual environment specifically set up for this video series
# For additional information on the dependencies for this series, see https://calmcode.io/dirty-cat/introduction.html
import numpy as np
import pandas as pd
from dirty_cat import datasets

employee_salaries = datasets.fetch_employee_salaries()
data = employee_salaries['data']

In [2]:
target_column = 'Current Annual Salary'
ml_df = data[[target_column, 'year_first_hired', 'assignment_category', 'employee_position_title']].dropna()
y = ml_df[target_column].values.ravel()
X = ml_df[['employee_position_title', 'year_first_hired', 'assignment_category']]
ml_df.head()

Unnamed: 0,Current Annual Salary,year_first_hired,assignment_category,employee_position_title
0,69222.18,1986.0,Fulltime-Regular,Office Services Coordinator
1,97392.47,1988.0,Fulltime-Regular,Master Police Officer
2,104717.28,1989.0,Fulltime-Regular,Social Worker IV
3,52734.57,2014.0,Fulltime-Regular,Resident Supervisor II
4,93396.0,2007.0,Fulltime-Regular,Planning Specialist III


In [3]:
# Value counts for each job title show us that certain titles appear much more frequently than others
ml_df['employee_position_title'].value_counts()

Police Officer III                         883
Firefighter/Rescuer III                    694
Bus Operator                               638
Manager III                                243
Correctional Officer III (Corporal)        228
                                          ... 
Medical Doctor IV - Physician                1
Director Department of General Services      1
Director Department of Finance               1
Information Technology Project Manager       1
Secretary to Appellate Judge                 1
Name: employee_position_title, Length: 385, dtype: int64

In [4]:
# CountVectorizer can be used to extract words within text, and build features with them
from sklearn.feature_extraction.text import CountVectorizer

# Fitting the CountVectorizer to the employee_position_title column generates a sparse matrix
# The sparse matrix tells us whether a given cell's contents contain an extracted word
# The sparse matrix is better at using less memory
cv = CountVectorizer().fit(ml_df['employee_position_title'])

In [5]:
# Using the .vocabulary_ method, we can see which words are being used to generate the sparse matrix
# The number next to the word indicates which column the word is associated with, not the number of times the word appears
cv.vocabulary_
len(cv.vocabulary_)

321

In [6]:
# The analyzer and ngram_range CountVectorizer inputs allow you to generate additonal sub-words
# This provides robustness against word mismatches
cv = CountVectorizer(analyzer = 'char', ngram_range = (3,3)).fit(ml_df['employee_position_title'])
len(cv.vocabulary_)

1264

In [7]:
# The shape of the transformation shows us that the number of columns we now have is equal to the length of the
# CountVectorizer's vocabulary
cv.transform(ml_df['employee_position_title']).shape

(9228, 1264)

In [8]:
# The dirty_cat package allows us to generate similarity scores across a matrix, rather than encoding as 0 or 1 
# for word or sub-word matches
import dirty_cat

# The SimilarityEncoder accepts optional inputs for categories and n_prototypes, and behaves similarly to 
# the CountVectorizer in terms of the transformation's shape
mod = dirty_cat.SimilarityEncoder(categories='most_frequent', n_prototypes=200)
mod.fit_transform(data[['employee_position_title']]).shape

(9228, 200)

## Vincent's Pipeline code

In [12]:
# This pipeline imports all required packages, sets up a method dictionary to be used in looping through various
# encoders, and then grid searches logging the 'neg_mean_absolute_error' for each model
from sklearn import set_config

set_config(display="diagram")

from sklearn.pipeline import Pipeline, FeatureUnion
from sklego.preprocessing import ColumnSelector
from sklearn.preprocessing import StandardScaler, OneHotEncoder, FunctionTransformer
from sklearn.model_selection import GridSearchCV
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import Ridge

method = {
    'sim_enc100': dirty_cat.SimilarityEncoder(categories='most_frequent', n_prototypes=100),
    'sim_enc300': dirty_cat.SimilarityEncoder(categories='most_frequent', n_prototypes=300),
    'sim_enc_all': dirty_cat.SimilarityEncoder(),
    'one-hot': OneHotEncoder(handle_unknown='ignore')
}

results = []

for k, encoder in method.items():
    pipe = Pipeline([
        ('split', FeatureUnion([
            ('cat', Pipeline([
                ('grab', ColumnSelector(['employee_position_title'])),
                ('handle', encoder)
            ])),
            ('one-hot', Pipeline([
                ('grab', ColumnSelector('assignment_category')),
                ('handle', OneHotEncoder(handle_unknown='ignore'))
            ])),
            ('floats', Pipeline([
                ('grab', ColumnSelector('year_first_hired')),
                ('scale', StandardScaler())
            ])),
        ])),
        ('mod', Ridge())
    ])

    grid = GridSearchCV(pipe, cv=10, param_grid={}, scoring=['r2', 'neg_mean_absolute_error'], refit='r2', n_jobs=-1)
    res_df = pd.DataFrame(grid.fit(X, y).cv_results_)
    res_df['key'] = k
    results.append(res_df)

In [13]:
# This pipeline imports all required packages, sets up a method dictionary to be used in looping through various
# count vectorizers, and then grid searches logging the 'neg_mean_absolute_error' for each model
# The count vectorizers are not included in the pipeline above because they accept a df column as input, while 
# the encoders accept a dataframe with one colum
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

method = {
    'cv': CountVectorizer(),
    'cv_ngram': CountVectorizer(analyzer='char', ngram_range=(2, 4)),
}

for k, encoder in method.items():
  pipe = Pipeline([
    ('split', FeatureUnion([
      ('cat', Pipeline([
        ('listify', FunctionTransformer(lambda d: [t for t in d['employee_position_title']])),
        ('handle', encoder)
      ])),
      ('one-hot', Pipeline([
        ('grab', ColumnSelector('assignment_category')),
        ('handle', OneHotEncoder(handle_unknown='ignore'))
      ])),

      ('floats', Pipeline([
        ('grab', ColumnSelector('year_first_hired')),
        ('scale', StandardScaler())
      ])),
    ])),
    ('mod', Ridge())
  ])
  grid = GridSearchCV(pipe, cv=10, param_grid={}, scoring=['r2', 'neg_mean_absolute_error'], refit='r2', n_jobs=-1)
  print(k)
  res_df = pd.DataFrame(grid.fit(X, y).cv_results_)
  res_df['key'] = k
  results.append(res_df)

cv
cv_ngram


In [16]:
# The lines below concatenate the results from each pipeline
plt_df = pd.concat(results)[['mean_test_neg_mean_absolute_error', 'mean_test_r2', 'key']]
plt_df.sort_values('mean_test_r2', ascending=False).reset_index()

Unnamed: 0,index,mean_test_neg_mean_absolute_error,mean_test_r2,key
0,0,-6279.078331,0.902077,cv_ngram
1,0,-6319.690323,0.901846,sim_enc_all
2,0,-6829.004776,0.885858,cv
3,0,-6577.095979,0.875012,sim_enc300
4,0,-6393.727621,0.86162,one-hot
5,0,-7779.914298,0.789175,sim_enc100
