In [29]:
# Python Pandas practice

### Dataset and tutorial from DATAQUEST

# To answer a few questions
    # 1. Do people in Suburban areas eat more Tofurkey than people in Rural areas?
    # 2. Is there a correlation between celebrating Thanksgiving and income?
    # 3. What income groups are most likely to have homemade cranberry sauce?

#-----------------------------------------------------------------------------------

# Import pandas library
import pandas as pd

# Check the default directory: pwd

# Load the poll export (a CSV-formatted text file), decoding it as Latin-1.
# NOTE(review): this is an absolute, machine-specific path; consider a path relative
# to the notebook so the analysis can be re-run elsewhere.
data = pd.read_csv("C:\\Users\\tiga\\My python scripts\\thanksgiving_2015_poll_data.txt", encoding="Latin-1")

# Preview the first rows to confirm the load worked
data.head()

Unnamed: 0,RespondentID,Do you celebrate Thanksgiving?,What is typically the main dish at your Thanksgiving dinner?,What is typically the main dish at your Thanksgiving dinner? - Other (please specify),How is the main dish typically cooked?,How is the main dish typically cooked? - Other (please specify),What kind of stuffing/dressing do you typically have?,What kind of stuffing/dressing do you typically have? - Other (please specify),What type of cranberry saucedo you typically have?,What type of cranberry saucedo you typically have? - Other (please specify),...,Have you ever tried to meet up with hometown friends on Thanksgiving night?,"Have you ever attended a ""Friendsgiving?""",Will you shop any Black Friday sales on Thanksgiving Day?,Do you work in retail?,Will you employer make you work on Black Friday?,How would you describe where you live?,Age,What is your gender?,How much total combined money did all members of your HOUSEHOLD earn last year?,US Region
0,4337954960,Yes,Turkey,,Baked,,Bread-based,,,,...,Yes,No,No,No,,Suburban,18 - 29,Male,"$75,000 to $99,999",Middle Atlantic
1,4337951949,Yes,Turkey,,Baked,,Bread-based,,Other (please specify),Homemade cranberry gelatin ring,...,No,No,Yes,No,,Rural,18 - 29,Female,"$50,000 to $74,999",East South Central
2,4337935621,Yes,Turkey,,Roasted,,Rice-based,,Homemade,,...,Yes,Yes,Yes,No,,Suburban,18 - 29,Male,"$0 to $9,999",Mountain
3,4337933040,Yes,Turkey,,Baked,,Bread-based,,Homemade,,...,Yes,No,No,No,,Urban,30 - 44,Male,"$200,000 and up",Pacific
4,4337931983,Yes,Tofurkey,,Baked,,Bread-based,,Canned,,...,Yes,No,No,No,,Urban,30 - 44,Male,"$100,000 to $124,999",Pacific


In [30]:
# Dimensions of the loaded frame as a (rows, columns) tuple
data.shape

(1058, 65)

In [31]:
# List the distinct answers recorded in the 'Do you celebrate Thanksgiving?' column
# (this column is used further down as the grouping key for the income comparison)
data["Do you celebrate Thanksgiving?"].unique()

array(['Yes', 'No'], dtype=object)

In [37]:
# Question 1: Do people in Suburban areas eat more Tofurkey than people in Rural areas?
# Keep only respondents whose main dish is Tofurkey, then tally them by area type.
data.loc[
    data["What is typically the main dish at your Thanksgiving dinner?"] == "Tofurkey",
    "How would you describe where you live?",
].value_counts()

Suburban    9
Urban       8
Rural       3
Name: How would you describe where you live?, dtype: int64

In [None]:
# Question 2: Is there a correlation between celebrating Thanksgiving and income?
# Start by listing the distinct household-income answers. The column is selected by
# name because a positional index (-2) shifts as soon as extra columns are appended.
data["How much total combined money did all members of your HOUSEHOLD earn last year?"].unique()

In [41]:
# Distinct household-income answers, selected by column name rather than by position
# so the result does not change when columns are added to the frame.
data["How much total combined money did all members of your HOUSEHOLD earn last year?"].unique()

array(['$75,000 to $99,999', '$50,000 to $74,999', '$0 to $9,999',
       '$200,000 and up', '$100,000 to $124,999', '$25,000 to $49,999',
       'Prefer not to answer', '$10,000 to $24,999',
       '$150,000 to $174,999', '$175,000 to $199,999',
       '$125,000 to $149,999', nan], dtype=object)

In [45]:
import numpy as np
import math

def clean_income(value):
    """Convert one household-income survey answer into a single number.

    Parameters
    ----------
    value : str, float or None
        A raw answer such as "$50,000 to $74,999", "$200,000 and up",
        "Prefer not to answer", or a missing value (float NaN or None).

    Returns
    -------
    int or float
        200000 for the open-ended "$200,000 and up" bracket, the midpoint of
        the lower and upper bound for a "$X to $Y" bracket, or NaN when the
        answer is missing or the respondent preferred not to answer.

    Raises
    ------
    ValueError
        If a string answer is not one of the formats above (it cannot be
        split into exactly two integer bounds around " to ").
    """
    # Missing answers arrive as float NaN (or None); there is nothing to convert.
    if value is None or (isinstance(value, float) and math.isnan(value)):
        return np.nan
    if value == "Prefer not to answer":
        return np.nan
    # The top bracket has no upper bound, so its lower bound is used as-is.
    if value == "$200,000 and up":
        return 200000
    # "$50,000 to $74,999" -> "50000 to 74999" -> lower bound, upper bound.
    bounds = value.replace(",", "").replace("$", "")
    income_low, income_high = bounds.split(" to ")
    return (int(income_low) + int(income_high)) / 2

# Derive a numeric "income" column from the raw bracket answers. The source column is
# addressed by name: with a positional index (-2), re-running this cell after "income"
# has been appended would read "US Region" instead and fail inside clean_income.
data["income"] = data[
    "How much total combined money did all members of your HOUSEHOLD earn last year?"
].apply(clean_income)

In [49]:
# Count respondents per cleaned income value; dropna=False keeps NaN (no usable
# answer) as its own row instead of silently omitting it.
data["income"].value_counts(dropna=False)

37499.5     180
NaN         169
62499.5     135
87499.5     133
112499.5    111
200000.0     80
17499.5      68
4999.5       66
137499.5     49
162499.5     40
187499.5     27
Name: income, dtype: int64

In [50]:
# Average cleaned income of respondents who do / do not celebrate Thanksgiving.
# The built-in groupby mean is used instead of agg(np.mean), which recent pandas
# releases deprecate; the result is the same and NaN incomes are skipped.
data.groupby("Do you celebrate Thanksgiving?")["income"].mean()

Do you celebrate Thanksgiving?
No     57249.533333
Yes    86486.276840
Name: income, dtype: float64

In [53]:
# Question 3: What income groups are most likely to have homemade cranberry sauce?
# Tally every cranberry-sauce answer within each cleaned income value.
(
    data
    .groupby("income")["What type of cranberry saucedo you typically have?"]
    .value_counts()
)

income    What type of cranberry saucedo you typically have?
4999.5    Canned                                                20
          Homemade                                              19
          None                                                  13
17499.5   Canned                                                39
          Homemade                                              13
          None                                                   7
          Other (please specify)                                 1
37499.5   Canned                                                91
          Homemade                                              46
          None                                                  22
          Other (please specify)                                 7
62499.5   Canned                                                79
          Homemade                                              26
          None                                                  19
 

In [55]:
# Among respondents who answered "Homemade", count how many fall in each income value.
# NOTE(review): these are raw counts, so an income group may rank high simply because
# it has more respondents; compare against the group sizes before drawing conclusions.
data.loc[
    data["What type of cranberry saucedo you typically have?"] == "Homemade",
    "income",
].value_counts()

37499.5     46
87499.5     40
200000.0    32
112499.5    29
62499.5     26
4999.5      19
137499.5    18
162499.5    15
17499.5     13
187499.5     9
Name: income, dtype: int64