# Dataframe Manipulation Warmup

In [1]:
import numpy as np
import pandas as pd

# Seed NumPy's legacy global generator so the synthetic survey below is
# reproduced exactly on every run.
np.random.seed(406)

# Number of simulated respondents (rows in the frame).
n = 5000

# The keyword arguments are evaluated top to bottom, so the draws consume the
# seeded random stream in the same order in which the columns are listed.
df = pd.DataFrame(dict(
    favorite_animal=np.random.choice(["cat", "dog", "frog", "lemur", "panda"], size=n),
    favorite_vegetable=np.random.choice(["brussel sprouts", "potato", "squash"], size=n),
    favorite_fruit=np.random.choice(["banana", "apple", "blueberries"], size=n),
    wears_glasses=np.random.choice(["yes", "no"], size=n),
    netflix_consumption=np.random.normal(loc=10, scale=2, size=n),
    open_browser_tabs=np.random.randint(low=2, high=90, size=n),
))

- What is the highest amount of netflix consumption? `17.535`
- How many people wear glasses? What percentage of people is this? `2555`, `.511` (51.1%)
- How many people's favorite animal is a dog? `1002`
- What is the most common favorite animal? `lemur`
- What is the average netflix consumption for people that prefer brussel
  sprouts? `10.008`
- What is the most common favorite fruit for people who wear glasses and have
  more than 40 open browser tabs? `blueberries`
- What percentage of people have a netflix consumption lower than 7? `.0716` (7.16%)
- What is the average netflix consumption for people with fewer than 30 open
  browser tabs? `9.91935`
- How many people *don't* wear glasses, have a favorite animal of a panda, have
  a favorite fruit of blueberries, and have more than 60 open browser tabs? What
  is the median netflix consumption for this group? What is the most common
  favorite vegetable for this group? `46`, `10.455`, `potato`
- What is the least popular combination of favorite fruit and vegetable? `apple` and `potato`
- Which combination of favorite animal and wearing glasses has the highest average
  netflix consumption? People who wear glasses (`yes`) and prefer `panda`
- **Bonus**: for each of the above questions, what kind of visualization would
  be the most effective in conveying your answer?

In [2]:
# Print a structural summary of the frame: row count, column names,
# non-null counts, dtypes and memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5000 entries, 0 to 4999
Data columns (total 6 columns):
favorite_animal        5000 non-null object
favorite_vegetable     5000 non-null object
favorite_fruit         5000 non-null object
wears_glasses          5000 non-null object
netflix_consumption    5000 non-null float64
open_browser_tabs      5000 non-null int64
dtypes: float64(1), int64(1), object(4)
memory usage: 234.5+ KB


In [3]:
# Preview the first rows to sanity-check the generated columns.
df.head()

Unnamed: 0,favorite_animal,favorite_vegetable,favorite_fruit,wears_glasses,netflix_consumption,open_browser_tabs
0,lemur,potato,apple,yes,8.313351,44
1,panda,potato,apple,yes,11.801073,10
2,cat,squash,blueberries,yes,10.105141,35
3,lemur,squash,apple,no,11.024605,70
4,dog,brussel sprouts,apple,yes,6.732698,73


What is the highest amount of netflix consumption? 17.535

In [4]:
# Largest value in the netflix_consumption column.
df['netflix_consumption'].max()

17.534818515438925

- How many people wear glasses? What percentage of people is this? `2555`, `.511` (51.1%)

In [5]:
# Number of respondents per wears_glasses answer ('yes' / 'no').
df['wears_glasses'].value_counts()

yes    2555
no     2445
Name: wears_glasses, dtype: int64

In [6]:
# Same breakdown expressed as a fraction of all respondents.
df['wears_glasses'].value_counts(normalize=True)

yes    0.511
no     0.489
Name: wears_glasses, dtype: float64

- How many people's favorite animal is a dog? `1002`

In [7]:
# Select the favorite_animal entries equal to 'dog' and count them.
df.loc[df['favorite_animal'] == 'dog', 'favorite_animal'].count()

1002

- What is the most common favorite animal? `lemur`

In [8]:
# Frequency of each favorite animal, most common first.
df['favorite_animal'].value_counts()

lemur    1028
frog     1015
dog      1002
cat       980
panda     975
Name: favorite_animal, dtype: int64

- What is the average netflix consumption for people that prefer brussel
  sprouts? `10.008`

In [9]:
# Mean netflix consumption restricted to rows whose favorite vegetable is
# 'brussel sprouts'.
df.loc[df['favorite_vegetable'] == 'brussel sprouts', 'netflix_consumption'].mean()

10.00847154798366

- What is the most common favorite fruit for people who wear glasses and have
  more than 40 open browser tabs? `blueberries`

In [10]:
# Rows for people who wear glasses AND have more than 40 open browser tabs.
# The threshold must be 40 to match the question being answered; the previous
# value of 4 kept almost every glasses wearer in the selection.
mask = (df['wears_glasses'] == 'yes') & (df['open_browser_tabs'] > 40)

In [11]:
# Most common favorite fruit among the selected rows.
# .max() on a string column returns the alphabetically last value, not the
# most frequent one, so count occurrences and take the label with the
# highest count instead.
df[mask].favorite_fruit.value_counts().idxmax()

'blueberries'

- What percentage of people have a netflix consumption lower than 7? `.0716` (7.16%)

In [12]:
# Count of respondents whose netflix consumption is strictly below 7
# (summing a boolean Series counts its True entries).
netflix_lower_than_seven = df['netflix_consumption'].lt(7).sum()

In [13]:
# Number of non-null netflix_consumption entries, used as the denominator.
total_people = df['netflix_consumption'].count()

In [14]:
# Share of respondents with netflix consumption below 7, as a proportion (0-1).
netflix_lower_than_seven/total_people

0.0716

- What is the average netflix consumption for people with fewer than 30 open
  browser tabs? `9.91935`

In [15]:
# Boolean selector for respondents with strictly fewer than 30 open tabs.
mask = df['open_browser_tabs'].lt(30)

In [16]:
# Mean netflix consumption over the rows picked out by `mask`.
df.loc[mask, 'netflix_consumption'].mean()

9.91935736918227

- How many people *don't* wear glasses, have a favorite animal of a panda, have
  a favorite fruit of blueberries, and have more than 60 open browser tabs? 

In [17]:
# One boolean Series per criterion; they are combined in the next cell.
no_glasses = df['wears_glasses'].eq('no')
loves_pandas = df['favorite_animal'].eq('panda')
loves_blueberries = df['favorite_fruit'].eq('blueberries')
loves_tabs = df['open_browser_tabs'].gt(60)  # strictly more than 60 tabs

In [18]:
# A row is selected only when all four criteria hold (element-wise AND).
mask = no_glasses & loves_pandas & loves_blueberries & loves_tabs

In [19]:
# Non-null count per column for the selected rows; every column reports the
# same number when the group has no missing values.
df.loc[mask].count()

favorite_animal        46
favorite_vegetable     46
favorite_fruit         46
wears_glasses          46
netflix_consumption    46
open_browser_tabs      46
dtype: int64

What is the median netflix consumption for this group?

In [20]:
# Median netflix consumption within the selected group.
df.loc[mask, 'netflix_consumption'].median()

10.45479760071613

What is the most common favorite vegetable for this group?

In [21]:
# Favorite-vegetable frequencies within the selected group, most common first.
df.loc[mask, 'favorite_vegetable'].value_counts()

potato             19
brussel sprouts    14
squash             13
Name: favorite_vegetable, dtype: int64

- What is the least popular combination of favorite fruit and vegetable?

In [22]:
# Count rows per (fruit, vegetable) pair and return the pair with the
# smallest count.
(
    df
    .groupby(['favorite_fruit', 'favorite_vegetable'])
    .size()
    .idxmin()
)

('apple', 'potato')

- Which combination of favorite animal and wearing glasses has the highest average
  netflix consumption?

In [23]:
# Average netflix consumption per (wears_glasses, favorite_animal) pair,
# ordered from highest to lowest so the top row answers the question.
(
    df
    .groupby(['wears_glasses', 'favorite_animal'])['netflix_consumption']
    .mean()
    .sort_values(ascending=False)
)

wears_glasses  favorite_animal
yes            panda              10.092273
               dog                10.087352
no             lemur              10.024557
yes            lemur              10.010196
no             frog                9.962311
               panda               9.946293
               dog                 9.933246
yes            cat                 9.884685
no             cat                 9.846183
yes            frog                9.834740
Name: netflix_consumption, dtype: float64