In [1]:
#Basic categorical encoding using OneHotEncoding module
#reference https://www.dataquest.io/blog/sci-kit-learn-tutorial/
#also reference Pragmatic Marketing Data Science II lecture/exercises

%matplotlib inline
import matplotlib
import seaborn as sns
sns.set()
matplotlib.rcParams['figure.dpi'] = 144

Load the survey data.

In [2]:
import pandas as pd

df_survey = pd.read_csv('master_csv/ThePrideProjectSurvey.csv')

df_survey.head()

Unnamed: 0,Timestamp,Approximately how many games do you attend each season?,What is your favorite team?,Do you identify as LGBTQ+ ?,Did your favorite team host a Pride Night this season?,Did you attend Pride Night this season?,When planning future trips to the ballpark:,"If your team hosts a Pride Night, what do you think of the promotion and production? [Advertising]","If your team hosts a Pride Night, what do you think of the promotion and production? [Pre-Game Activities]","If your team hosts a Pride Night, what do you think of the promotion and production? [In-Game Entertainment/Features]","If your team hosts a Pride Night, what do you think of the promotion and production? [Promotional Items]",What is your overall impression of MLB Pride Night themes?,What is your favorite themed night or promotion to attend?
0,9/19/2019 22:26,43470,St. Louis Cardinals,No,Yes,"Didn't attend, but not because of Pride Night",I would go out of my way to attend on Pride Night,Just the right amount,Don't know/unaware of this feature,Don't know/unaware of this feature,Just the right amount,They’re great,I haven’t attended any
1,9/19/2019 22:28,43626,Atlanta Braves,No,Yes,"Didn't attend, but not because of Pride Night",Pride Night would not influence my choice of game,Just the right amount,Don't know/unaware of this feature,Don't know/unaware of this feature,Don't know/unaware of this feature,I'm all for inclusion and hope it brings fans ...,"Star Wars Night, but themed nights don't reall..."
2,9/19/2019 22:29,43470,New York Yankees,No,I don't know,"Didn't attend, but not because of Pride Night",Pride Night would not influence my choice of game,Don't know/unaware of this feature,Don't know/unaware of this feature,Don't know/unaware of this feature,Don't know/unaware of this feature,Don’t care,Don’t know
3,9/19/2019 22:29,43470,St. Louis Cardinals,Yes,I don't know,"Didn't attend, but not because of Pride Night",Pride Night would not influence my choice of game,Just the right amount,Just the right amount,Don't know/unaware of this feature,Just the right amount,Very cool,Bring your dog to the park.
4,9/19/2019 22:30,43470,Atlanta Braves,Prefer not to answer,Yes,"Didn't attend, but not because of Pride Night",Pride Night would not influence my choice of game,Just the right amount,Don't know/unaware of this feature,Don't know/unaware of this feature,Don't know/unaware of this feature,I don’t care,I don’t care


In [3]:
print('Number of observations: ', df_survey.shape[0])
print('Number of features: ', df_survey.shape[1])

Number of observations:  475
Number of features:  13


## Create a subset of the data and tidy it up. 

In [5]:
df_survey_1 = df_survey[["What is your favorite team?","Do you identify as LGBTQ+ ?","When planning future trips to the ballpark:"]]

df_survey_1.head()

Unnamed: 0,What is your favorite team?,Do you identify as LGBTQ+ ?,When planning future trips to the ballpark:
0,St. Louis Cardinals,No,I would go out of my way to attend on Pride Night
1,Atlanta Braves,No,Pride Night would not influence my choice of game
2,New York Yankees,No,Pride Night would not influence my choice of game
3,St. Louis Cardinals,Yes,Pride Night would not influence my choice of game
4,Atlanta Braves,Prefer not to answer,Pride Night would not influence my choice of game


In [7]:
#rename the columns
  
df_survey_1.columns = ['favteam', 'lgbtq', 'wouldgo'] 
print(df_survey_1.columns) 

Index(['favteam', 'lgbtq', 'wouldgo'], dtype='object')


In [8]:
#review the unique responses
df_survey_1.columns.unique

array(['I would go out of my way to attend on Pride Night',
       'Pride Night would not influence my choice of game',
       'I would go out of my way to avoid Pride Night',
       'I would only attend if I liked the promotional item', 'Undecided'],
      dtype=object)

## Encode the Categorical Variables

In [14]:
X = df_survey_1.drop('favteam', axis=1)
y = df_survey_1.favteam

from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

transformer_name = 'ohe_on_all_categorical_features'
transformer = OneHotEncoder(sparse=False)
columns_to_encode = ['lgbtq', 'wouldgo']

ohe_final = ColumnTransformer([
    (transformer_name, transformer, columns_to_encode)], 
    remainder='passthrough')

ohe_final.fit(X)

ColumnTransformer(n_jobs=None, remainder='passthrough', sparse_threshold=0.3,
                  transformer_weights=None,
                  transformers=[('ohe_on_all_categorical_features',
                                 OneHotEncoder(categorical_features=None,
                                               categories=None, drop=None,
                                               dtype=<class 'numpy.float64'>,
                                               handle_unknown='error',
                                               n_values=None, sparse=False),
                                 ['lgbtq', 'wouldgo'])],
                  verbose=False)

In [15]:
#review the categories
ohe_final.named_transformers_[transformer_name].categories_

[array(['No', 'Prefer not to answer', 'Yes'], dtype=object),
 array(['I would go out of my way to attend on Pride Night',
        'I would go out of my way to avoid Pride Night',
        'I would only attend if I liked the promotional item',
        'Pride Night would not influence my choice of game', 'Undecided'],
       dtype=object)]

In [16]:
list(enumerate(X.columns))  #get a list of the columns and their index positions

[(0, 'lgbtq'), (1, 'wouldgo')]

In [17]:
n_encode=(idx for idx, col in list(enumerate(X.columns)) if col in columns_to_encode)

In [18]:
print(n_encode)

<generator object <genexpr> at 0x00000258002251B0>


In [19]:
X_transformed = ohe_final.transform(X)
print('Shape of transformed data matrix: ', X_transformed.shape)

Shape of transformed data matrix:  (475, 8)
