# Dataframe Manipulation Warmup

In [1]:
import numpy as np
import pandas as pd

# Seed the legacy global NumPy generator so every draw below is reproducible.
np.random.seed(406)

n = 5000

# The draws consume one shared random stream in sequence, so the order of
# these statements determines the resulting data -- do not reorder them.
animal_draws = np.random.choice(['cat', 'dog', 'frog', 'lemur', 'panda'], n)
vegetable_draws = np.random.choice(['brussel sprouts', 'potato', 'squash'], n)
fruit_draws = np.random.choice(['banana', 'apple', 'blueberries'], n)
glasses_draws = np.random.choice(['yes', 'no'], n)
netflix_draws = np.random.normal(10, 2, n)
tab_draws = np.random.randint(2, 90, n)

# One row per (synthetic) person, one column per sampled attribute.
df = pd.DataFrame({
    'favorite_animal': animal_draws,
    'favorite_vegetable': vegetable_draws,
    'favorite_fruit': fruit_draws,
    'wears_glasses': glasses_draws,
    'netflix_consumption': netflix_draws,
    'open_browser_tabs': tab_draws,
})

- What is the highest amount of netflix consumption? `17.535`
- How many people wear glasses? What percentage of people is this? `2555`, `.511` (51.1%)
- How many people's favorite animal is a dog? `1002`
- What is the most common favorite animal? `lemur`
- What is the average netflix consumption for people that prefer brussel
  sprouts? `10.008`
- What is the most common favorite fruit for people who wear glasses and have
  more than 40 open browser tabs? `blueberries`
- What percentage of people have a netflix consumption lower than 7? `.0716`
- What is the average netflix consumption for people with fewer than 30 open
  browser tabs? `9.91935`
- How many people *don't* wear glasses, have a favorite animal of a panda, have
  a favorite fruit of blueberries, and have more than 60 open browser tabs? What
  is the median netflix consumption for this group? What is the most common
  favorite vegetable for this group? `46`, `10.455`, `potato`
- What is the least popular combination of favorite fruit and vegetable? `apple` and `potato`
- Which combination of favorite animal and wearing glasses has the highest average
  netflix consumption? `panda` and `yes` (people who wear glasses and prefer pandas)
- **Bonus**: for each of the above questions, what kind of visualization would
  be the most effective in conveying your answer?

In [2]:
df.head()

Unnamed: 0,favorite_animal,favorite_vegetable,favorite_fruit,wears_glasses,netflix_consumption,open_browser_tabs
0,lemur,potato,apple,yes,8.313351,44
1,panda,potato,apple,yes,11.801073,10
2,cat,squash,blueberries,yes,10.105141,35
3,lemur,squash,apple,no,11.024605,70
4,dog,brussel sprouts,apple,yes,6.732698,73


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5000 entries, 0 to 4999
Data columns (total 6 columns):
favorite_animal        5000 non-null object
favorite_vegetable     5000 non-null object
favorite_fruit         5000 non-null object
wears_glasses          5000 non-null object
netflix_consumption    5000 non-null float64
open_browser_tabs      5000 non-null int64
dtypes: float64(1), int64(1), object(4)
memory usage: 234.5+ KB


In [5]:
df.describe()

Unnamed: 0,netflix_consumption,open_browser_tabs
count,5000.0,5000.0
mean,9.962152,45.9498
std,2.018416,25.380199
min,2.130308,2.0
25%,8.570916,25.0
50%,9.950686,46.0
75%,11.348473,68.0
max,17.534819,89.0


In [10]:
# What is the highest amount of netflix consumption? 17.535

df.netflix_consumption.max()

17.534818515438925

In [33]:
# How many people wear glasses? What percentage of people is this? 2555, .511
mask = (df['wears_glasses'] == 'yes')
wear_glasses_df = df[mask]
# Each row is one person, so the row count of the filtered frame is the answer.
# (DataFrame has no `value_count` method; calling it raised AttributeError.)
len(wear_glasses_df)

2555

In [119]:
(df.wears_glasses == 'yes').sum()

2555

In [122]:
df.wears_glasses.value_counts(normalize=True)

yes    0.511
no     0.489
Name: wears_glasses, dtype: float64

In [37]:
wear_glasses_percentage = len(wear_glasses_df) / len(df)
wear_glasses_percentage

0.511

In [38]:
# How many people's favorite animal is a dog? 1002
df.head(3)

Unnamed: 0,favorite_animal,favorite_vegetable,favorite_fruit,wears_glasses,netflix_consumption,open_browser_tabs
0,lemur,potato,apple,yes,8.313351,44
1,panda,potato,apple,yes,11.801073,10
2,cat,squash,blueberries,yes,10.105141,35


In [124]:
dog_mask = (df['favorite_animal'] == 'dog')
fav_dog = df[dog_mask]
# The question asks for a count. `fav_dog.info` without parentheses only echoes
# the bound method's repr, so count the rows of the filtered frame instead.
len(fav_dog)

1002

In [43]:
# What is the most common favorite animal? 

df.head(3)

Unnamed: 0,favorite_animal,favorite_vegetable,favorite_fruit,wears_glasses,netflix_consumption,open_browser_tabs
0,lemur,potato,apple,yes,8.313351,44
1,panda,potato,apple,yes,11.801073,10
2,cat,squash,blueberries,yes,10.105141,35


In [48]:
fav_animal_series = df['favorite_animal']
fav_animal_series.value_counts()

lemur    1028
frog     1015
dog      1002
cat       980
panda     975
Name: favorite_animal, dtype: int64

In [49]:
# What is the average netflix consumption for people that prefer brussel sprouts? 10.008

df.head(3)

Unnamed: 0,favorite_animal,favorite_vegetable,favorite_fruit,wears_glasses,netflix_consumption,open_browser_tabs
0,lemur,potato,apple,yes,8.313351,44
1,panda,potato,apple,yes,11.801073,10
2,cat,squash,blueberries,yes,10.105141,35


In [57]:
brus_mask = (df['favorite_vegetable'] == 'brussel sprouts')
brus_lovers = df[brus_mask]
brus_lovers

Unnamed: 0,favorite_animal,favorite_vegetable,favorite_fruit,wears_glasses,netflix_consumption,open_browser_tabs
4,dog,brussel sprouts,apple,yes,6.732698,73
13,dog,brussel sprouts,banana,yes,10.552754,75
22,lemur,brussel sprouts,blueberries,no,10.973445,79
23,frog,brussel sprouts,banana,yes,11.200120,24
26,panda,brussel sprouts,blueberries,yes,9.105439,58
...,...,...,...,...,...,...
4987,lemur,brussel sprouts,blueberries,no,7.634127,35
4989,dog,brussel sprouts,apple,yes,10.858496,58
4990,frog,brussel sprouts,banana,no,11.128940,35
4991,lemur,brussel sprouts,apple,yes,6.372664,68


In [65]:
brus_lovers['netflix_consumption'].mean()

10.00847154798366

In [125]:
# Class version

df[df.favorite_vegetable == 'brussel sprouts'].netflix_consumption.mean()

10.00847154798366

In [69]:
# What is the most common favorite fruit for people who wear glasses and have more than 40 open browser tabs?

# Flag the people who wear glasses AND have more than 40 open tabs.
# NOTE: this adds a column to `df` in place, so every later cell sees a
# seventh column ('40_tab_glasses') on the frame.
df['40_tab_glasses'] = (df.wears_glasses == 'yes') & (df.open_browser_tabs > 40)

In [70]:
df.head(3)

Unnamed: 0,favorite_animal,favorite_vegetable,favorite_fruit,wears_glasses,netflix_consumption,open_browser_tabs,40_tab_glasses
0,lemur,potato,apple,yes,8.313351,44,True
1,panda,potato,apple,yes,11.801073,10,False
2,cat,squash,blueberries,yes,10.105141,35,False


In [73]:
tabs_glass = df[df['40_tab_glasses']]

In [77]:
tabs_glass.favorite_fruit.value_counts()

blueberries    498
apple          477
banana         472
Name: favorite_fruit, dtype: int64

In [126]:
# Class version

df[(df.wears_glasses == 'yes') & (df.open_browser_tabs > 40)].favorite_fruit.value_counts()

blueberries    498
apple          477
banana         472
Name: favorite_fruit, dtype: int64

In [78]:
# What percentage of people have a netflix consumption lower than 7?

df.head(3)

Unnamed: 0,favorite_animal,favorite_vegetable,favorite_fruit,wears_glasses,netflix_consumption,open_browser_tabs,40_tab_glasses
0,lemur,potato,apple,yes,8.313351,44,True
1,panda,potato,apple,yes,11.801073,10,False
2,cat,squash,blueberries,yes,10.105141,35,False


In [82]:
lower_nf_watchers = df[df.netflix_consumption < 7]

In [83]:
len(lower_nf_watchers) / len(df)

0.0716

In [127]:
# Class version

(df.netflix_consumption < 7).mean()

0.0716

In [86]:
# What is the average netflix consumption for people with less than 30 open browser tabs?

less_than_30_tabs = df[df.open_browser_tabs < 30]

In [88]:
less_than_30_tabs.netflix_consumption.mean()

9.91935736918227

In [89]:
# Question: how many people don't wear glasses, favor pandas, favor
# blueberries, and keep more than 60 browser tabs open? What is this group's
# median netflix consumption, and which favorite vegetable is most common?

# Row-wise boolean mask; a person qualifies only when all four conditions hold.
no_glasses_panda_blueberries_60_tabs = (
    df['wears_glasses'].eq('no')
    & df['favorite_animal'].eq('panda')
    & df['favorite_fruit'].eq('blueberries')
    & df['open_browser_tabs'].gt(60)
)

In [93]:
no_glasses_panda_blueb_60 = df[no_glasses_panda_blueberries_60_tabs]

In [97]:
no_glasses_panda_blueb_60.describe()

Unnamed: 0,netflix_consumption,open_browser_tabs
count,46.0,46.0
mean,10.144887,76.782609
std,2.227496,8.801295
min,5.759472,61.0
25%,8.279804,71.5
50%,10.454798,78.0
75%,11.125058,84.0
max,15.847422,89.0


In [99]:
no_glasses_panda_blueb_60.netflix_consumption.median()

10.45479760071613

In [100]:
no_glasses_panda_blueb_60.favorite_vegetable.value_counts()

potato             19
brussel sprouts    14
squash             13
Name: favorite_vegetable, dtype: int64

In [130]:
# Class versions

# Name each condition so the combined filter reads like the question.
no_glasses = df.wears_glasses == 'no'
loves_pandas = df.favorite_animal == 'panda'
too_many_browser_tabs = df.open_browser_tabs > 60
likes_blueberries = df.favorite_fruit == 'blueberries'

# All four conditions must hold. (The cell previously referenced an undefined
# name `loves`, which raises NameError on a fresh kernel.)
mask = no_glasses & loves_pandas & too_many_browser_tabs & likes_blueberries
df[mask].netflix_consumption.median()

10.45479760071613

In [132]:
df[mask].favorite_vegetable.value_counts()

potato             19
brussel sprouts    14
squash             13
Name: favorite_vegetable, dtype: int64

In [133]:
df[mask].shape

(46, 7)

In [134]:
# Narrow the frame one condition at a time; each step filters the result of
# the previous one, so the final frame satisfies all four conditions.
subgroup = df.loc[df['wears_glasses'] == 'no']
subgroup = subgroup.loc[subgroup['favorite_animal'] == 'panda']
subgroup = subgroup.loc[subgroup['open_browser_tabs'] > 60]
subgroup = subgroup.loc[subgroup['favorite_fruit'] == 'blueberries']
subgroup.shape

(46, 7)

In [137]:
(df.query('wears_glasses == "no"')
 .query('favorite_animal == "panda"')
 .query('open_browser_tabs > 60')
 .query('favorite_fruit == "blueberries"')
 .netflix_consumption.median())


10.45479760071613

In [101]:
# What is the least popular combination of favorite fruit and vegetable? apple and potato

df.head()

Unnamed: 0,favorite_animal,favorite_vegetable,favorite_fruit,wears_glasses,netflix_consumption,open_browser_tabs,40_tab_glasses
0,lemur,potato,apple,yes,8.313351,44,True
1,panda,potato,apple,yes,11.801073,10,False
2,cat,squash,blueberries,yes,10.105141,35,False
3,lemur,squash,apple,no,11.024605,70,False
4,dog,brussel sprouts,apple,yes,6.732698,73,True


In [102]:
df.favorite_fruit.value_counts()

blueberries    1698
banana         1670
apple          1632
Name: favorite_fruit, dtype: int64

In [103]:
df.favorite_vegetable.value_counts()

brussel sprouts    1696
squash             1662
potato             1642
Name: favorite_vegetable, dtype: int64

In [104]:
apple_potato = (df['favorite_fruit'] == 'apple') & (df['favorite_vegetable'] == 'potato')

In [105]:
df[apple_potato]

Unnamed: 0,favorite_animal,favorite_vegetable,favorite_fruit,wears_glasses,netflix_consumption,open_browser_tabs,40_tab_glasses
0,lemur,potato,apple,yes,8.313351,44,True
1,panda,potato,apple,yes,11.801073,10,False
12,panda,potato,apple,no,4.803158,82,False
14,lemur,potato,apple,no,6.750781,30,False
36,cat,potato,apple,yes,12.392229,72,True
...,...,...,...,...,...,...,...
4932,frog,potato,apple,no,9.606204,14,False
4969,panda,potato,apple,yes,10.417786,34,False
4976,cat,potato,apple,yes,6.586360,60,True
4983,panda,potato,apple,no,10.524827,23,False


In [139]:
# Class version

df.groupby(['favorite_fruit', 'favorite_vegetable']).size().sort_values()

favorite_fruit  favorite_vegetable
apple           potato                512
banana          squash                524
apple           squash                555
blueberries     brussel sprouts       555
                potato                560
apple           brussel sprouts       565
banana          potato                570
                brussel sprouts       576
blueberries     squash                583
dtype: int64

In [140]:
pd.crosstab(df.favorite_fruit, df.favorite_vegetable)

favorite_vegetable,brussel sprouts,potato,squash
favorite_fruit,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
apple,565,512,555
banana,576,570,524
blueberries,555,560,583


In [113]:
# Which combination of favorite animal and wearing glasses has the highest average netflix consumption?
# people that wear glasses and prefer pandas

panda_glasses = (df['favorite_animal'] == 'panda') & (df['wears_glasses'] == 'yes')
panda_and_glasses = df[panda_glasses]

In [114]:
panda_and_glasses.netflix_consumption.mean()

10.092272771883268

In [115]:
lemur_glasses = (df['favorite_animal'] == 'lemur') & (df['wears_glasses'] == 'yes')
lemur_and_glasses = df[lemur_glasses]
lemur_and_glasses.netflix_consumption.mean()

10.010196323329442

In [116]:
dog_glasses = (df['favorite_animal'] == 'dog') & (df['wears_glasses'] == 'yes')
dog_and_glasses = df[dog_glasses]
dog_and_glasses.netflix_consumption.mean()

10.0873519754841

In [117]:
cat_glasses = (df['favorite_animal'] == 'cat') & (df['wears_glasses'] == 'yes')
cat_and_glasses = df[cat_glasses]
cat_and_glasses.netflix_consumption.mean()

9.884685190731133

In [118]:
frog_glasses = (df['favorite_animal'] == 'frog') & (df['wears_glasses'] == 'yes')
frog_and_glasses = df[frog_glasses]
frog_and_glasses.netflix_consumption.mean()

9.834740212128033

In [142]:
# Class version

df.groupby(['favorite_animal', 'wears_glasses']).netflix_consumption.mean().sort_values()

favorite_animal  wears_glasses
frog             yes               9.834740
cat              no                9.846183
                 yes               9.884685
dog              no                9.933246
panda            no                9.946293
frog             no                9.962311
lemur            yes              10.010196
                 no               10.024557
dog              yes              10.087352
panda            yes              10.092273
Name: netflix_consumption, dtype: float64

In [143]:
#values(numeric), rows(category), columns(category)
df.pivot_table('netflix_consumption', 'favorite_animal', 'wears_glasses')

wears_glasses,no,yes
favorite_animal,Unnamed: 1_level_1,Unnamed: 2_level_1
cat,9.846183,9.884685
dog,9.933246,10.087352
frog,9.962311,9.83474
lemur,10.024557,10.010196
panda,9.946293,10.092273


## Bonus: for each of the above questions, what kind of visualization would be the most effective in conveying your answer?

- What is the highest amount of netflix consumption? 17.535
  - **Visualization:** histogram
- How many people wear glasses? What percentage of people is this? 2555, .511
  - **Visualization:** bar graph
- How many people's favorite animal is a dog? 1002
  - **Visualization:** pie chart
- What is the most common favorite animal? lemur
  - **Visualization:** bar graph
- What is the average netflix consumption for people that prefer brussel sprouts? 10.008
  - **Visualization:** histogram
- What is the most common favorite fruit for people who wear glasses and have more than 40 open browser tabs? blueberries
  - **Visualization:** bar graph
- What percentage of people have a netflix consumption lower than 7? .0716
  - **Visualization:** histogram
- What is the average netflix consumption for people with less than 30 open browser tabs? 9.91935
  - **Visualization:** distplot
- How many people don't wear glasses, have a favorite animal of a panda, have a favorite fruit of blueberries, and have more than 60 open browser tabs? What is the median netflix consumption for this group? What is the most common favorite vegetable for this group? 46, 10.455, potato
  - **Visualization:** infographic
- What is the least popular combination of favorite fruit and vegetable? apple and potato
  - **Visualization:** bar graph
- Which combination of favorite animal and wearing glasses has the highest average netflix consumption? people that wear glasses and prefer pandas
  - **Visualization:** bar graph

Alternatively, one giant infographic could present all of these answers together.