In [1]:
import pandas as pd
data = pd.read_csv('thanksgiving.csv',encoding='latin-1')
data.head()

data['Do you celebrate Thanksgiving?'].value_counts()

# Filtering only those who celebrate Thanksgiving

data = data[data['Do you celebrate Thanksgiving?']=='Yes']


In [8]:
data['What is typically the main dish at your Thanksgiving dinner?'].value_counts()

Turkey                    859
Other (please specify)     35
Ham/Pork                   29
Tofurkey                   20
Chicken                    12
Roast beef                 11
I don't know                5
Turducken                   3
Name: What is typically the main dish at your Thanksgiving dinner?, dtype: int64

In [9]:
data[data['What is typically the main dish at your Thanksgiving dinner?']=='Tofurkey']['Do you typically have gravy?']

4      Yes
33     Yes
69      No
72      No
77     Yes
145    Yes
175    Yes
218     No
243    Yes
275     No
393    Yes
399    Yes
571    Yes
594    Yes
628     No
774     No
820     No
837    Yes
860     No
953    Yes
Name: Do you typically have gravy?, dtype: object

# Checking pies eaten

In [2]:
apple_isnull = data['Which type of pie is typically served at your Thanksgiving dinner? Please select all that apply. - Apple'].isnull()
pumpkin_isnull = data['Which type of pie is typically served at your Thanksgiving dinner? Please select all that apply. - Pumpkin'].isnull()
pecan_isnull = data['Which type of pie is typically served at your Thanksgiving dinner? Please select all that apply. - Pecan'].isnull()
ate_pies = apple_isnull & pumpkin_isnull & pecan_isnull
ate_pies.value_counts()

False    876
True     104
dtype: int64

In [11]:
data['Age'].value_counts()

45 - 59    269
60+        258
30 - 44    235
18 - 29    185
Name: Age, dtype: int64

In [12]:
def convert_age(string):
    if pd.isnull(string):
        return None
    str_first = string.split(' ')[0]
    str_clean = str_first.replace("+", "")
    return int(str_clean)

data['int_age'] = data['Age'].apply(convert_age)
data['int_age'].describe()

count    947.000000
mean      40.089757
std       15.352014
min       18.000000
25%       30.000000
50%       45.000000
75%       60.000000
max       60.000000
Name: int_age, dtype: float64

#### 15 yearly Age groups are evenly distributed. 
#### 60+ is completely lost information
#### Taking the first value in each range makes ages younger than true age
#### Describe() may not show enough about the distribution of ages compared to histogram 

In [13]:
def convert_income(string):
    if pd.isnull(string):
        return None
    
    str_first = string.split(' ')[0]
    if str_first == 'Prefer':
            return None

    str_clean = str_first.replace("$", "")
    str_clean = str_clean.replace(",", "")
    return int(str_clean)

data['int_income'] = data['How much total combined money did all members of your HOUSEHOLD earn last year?'].apply(convert_income)
data['int_income'].describe()

count       829.000000
mean      75965.018094
std       59068.636748
min           0.000000
25%       25000.000000
50%       75000.000000
75%      100000.000000
max      200000.000000
Name: int_income, dtype: float64

    Max income is alot higher than mean. Income skews down because first value in each range is taken
    

In [14]:
data_income_less = data[data['int_income']<150000] 
data_income_less['How far will you travel for Thanksgiving?'].value_counts()

Thanksgiving is happening at my home--I won't travel at all                         281
Thanksgiving is local--it will take place in the town I live in                     203
Thanksgiving is out of town but not too far--it's a drive of a few hours or less    150
Thanksgiving is out of town and far away--I have to drive several hours or fly       55
Name: How far will you travel for Thanksgiving?, dtype: int64

In [15]:
data_income_more = data[data['int_income']>150000] 
data_income_more['How far will you travel for Thanksgiving?'].value_counts()

Thanksgiving is happening at my home--I won't travel at all                         49
Thanksgiving is local--it will take place in the town I live in                     25
Thanksgiving is out of town but not too far--it's a drive of a few hours or less    16
Thanksgiving is out of town and far away--I have to drive several hours or fly      12
Name: How far will you travel for Thanksgiving?, dtype: int64

**Many more people have income < 150000. Among them, most people won't travel at all. As the distance gets longer, less people travel that distance. 
This pattern applies similarly to income > 150000. There is little correlation between income and travel distance. **

In [16]:
data.pivot_table(index ='Have you ever tried to meet up with hometown friends on Thanksgiving night?',
                 columns = 'Have you ever attended a "Friendsgiving?"' ,
                 values = 'int_age')

"Have you ever attended a ""Friendsgiving?""",No,Yes
Have you ever tried to meet up with hometown friends on Thanksgiving night?,Unnamed: 1_level_1,Unnamed: 2_level_1
No,42.283702,37.010526
Yes,41.47541,33.976744


More Younger people try to meetup with hometown friends than older people. Same for Attending friendsgiving.
The youngest do both.

In [17]:
data["What is typically the main dish at your Thanksgiving dinner?"].value_counts() 

Turkey                    859
Other (please specify)     35
Ham/Pork                   29
Tofurkey                   20
Chicken                    12
Roast beef                 11
I don't know                5
Turducken                   3
Name: What is typically the main dish at your Thanksgiving dinner?, dtype: int64

# Most common Dessert

In [42]:
import pandas as pd

desert_count = {}
deserts = data.loc[:,'Which of these desserts do you typically have at Thanksgiving dinner? Please select all that apply.   - Apple cobbler':'Which of these desserts do you typically have at Thanksgiving dinner? Please select all that apply.   - Peach cobbler']
for colname in deserts.columns:
    lastname = colname.split(' ')[-1]
    if lastname in desert_count:
        lastname = 'Peach cobbler'
    desert_count[lastname] = pd.notnull(deserts[colname]).value_counts()[True]
    
desert_count 

{'Blondies': 16,
 'Brownies': 128,
 'Cheesecake': 191,
 'Cookies': 204,
 'Fudge': 43,
 'Peach cobbler': 103,
 'cake': 72,
 'cobbler': 110,
 'cream': 266}

In [43]:
pd.notnull(data['Which of these desserts do you typically have at Thanksgiving dinner? Please select all that apply.   - Apple cobbler']).value_counts()[True]

110