In [2]:
#1. Create a df from the LemonadeStand.csv file. Skip any lines you need to skip and a meaningful index column.
import pandas as pd
import numpy as np
df = pd.read_csv('LemonadeStand-1.csv', skiprows=3, index_col='DayNo')
print(df)

Original df:
         HiTemp  SunHrs  WaspCt Custs (AM/PM)
DayNo                                       
Day 1      101     6.0    19.0         42/43
Day 2       99     7.0    15.0         39/55
Day 3       78    14.0    10.0         40/51
Day 4       90    12.0    15.0         39/40
Day 5       83    10.0    13.0         38/45
Day 6       81     NaN    28.0          6/34
Day 7       82     NaN    14.0         25/28
Day 8       75    10.0    24.0         23/38
Day 9       78     9.0    19.0         40/34
Day 10     100    13.0    12.0         45/57
Day 11     117     NaN     NaN           NaN

Cleaned df:
         HiTemp  SunHrs  WaspCt Custs (AM/PM)
DayNo                                       
Day 1      101     6.0    19.0         42/43
Day 2       99     7.0    15.0         39/55
Day 3       78    14.0    10.0         40/51
Day 4       90    12.0    15.0         39/40
Day 5       83    10.0    13.0         38/45
Day 6       81     NaN    28.0          6/34
Day 7       82     NaN    1

In [3]:
#2. Output the values in the HiTemp and WaspCt columns only.
print(df[['HiTemp', 'WaspCt']])

        HiTemp  WaspCt
DayNo                 
Day 1      101    19.0
Day 2       99    15.0
Day 3       78    10.0
Day 4       90    15.0
Day 5       83    13.0
Day 6       81    28.0
Day 7       82    14.0
Day 8       75    24.0
Day 9       78    19.0
Day 10     100    12.0
Day 11     117     NaN


In [4]:
#3. How many sun hours were there on Day 8 only? Output only this value using .loc.
print("Sun hours on day 8:", df.loc['Day 8']['SunHrs'])

Sun hours on day 8: 10.0


In [5]:
#4 Replace any null values in the SunHrs column ONLY with the value 0.
df['SunHrs'].replace(to_replace=np.nan, value=0, inplace=True)
print(df)

        HiTemp  SunHrs  WaspCt Custs (AM/PM)
DayNo                                       
Day 1      101     6.0    19.0         42/43
Day 2       99     7.0    15.0         39/55
Day 3       78    14.0    10.0         40/51
Day 4       90    12.0    15.0         39/40
Day 5       83    10.0    13.0         38/45
Day 6       81     0.0    28.0          6/34
Day 7       82     0.0    14.0         25/28
Day 8       75    10.0    24.0         23/38
Day 9       78     9.0    19.0         40/34
Day 10     100    13.0    12.0         45/57
Day 11     117     0.0     NaN           NaN


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['SunHrs'].replace(to_replace=np.nan, value=0, inplace=True)


In [6]:
#5. Drop any rows where there are still null values.
df.dropna(inplace=True)  # Remove rows with NaN
print(df)

        HiTemp  SunHrs  WaspCt Custs (AM/PM)
DayNo                                       
Day 1      101     6.0    19.0         42/43
Day 2       99     7.0    15.0         39/55
Day 3       78    14.0    10.0         40/51
Day 4       90    12.0    15.0         39/40
Day 5       83    10.0    13.0         38/45
Day 6       81     0.0    28.0          6/34
Day 7       82     0.0    14.0         25/28
Day 8       75    10.0    24.0         23/38
Day 9       78     9.0    19.0         40/34
Day 10     100    13.0    12.0         45/57


In [7]:
#6. Create a new column called Ocast ("overcast"), where if there were 0 sun hours the value is 1, otherwise the value is 0.
df['Ocast'] = np.where(df['SunHrs'] == 0, 1, 0)
# Also create a new column called TempCat, where the value is "high" if the temperature is over 95, "medium" if it's between 80 and 95 (inclusive), and "low" if it's ANY OTHER VALUE.
conditions = [
    df['HiTemp'] > 95, 
    df['HiTemp'] > 80, 
    True
]
values = ['high', 'medium', 'low']
df['TempCat'] = np.select(conditions, values)

print(df)

        HiTemp  SunHrs  WaspCt Custs (AM/PM)  Ocast TempCat
DayNo                                                      
Day 1      101     6.0    19.0         42/43      0    high
Day 2       99     7.0    15.0         39/55      0    high
Day 3       78    14.0    10.0         40/51      0     low
Day 4       90    12.0    15.0         39/40      0  medium
Day 5       83    10.0    13.0         38/45      0  medium
Day 6       81     0.0    28.0          6/34      1  medium
Day 7       82     0.0    14.0         25/28      1  medium
Day 8       75    10.0    24.0         23/38      0     low
Day 9       78     9.0    19.0         40/34      0     low
Day 10     100    13.0    12.0         45/57      0    high


In [8]:
#7. Output the descriptive statistics for the WaspCt column only broken up by temperature category (TempCat).
df[['TempCat', 'WaspCt']].groupby('TempCat').describe()

Unnamed: 0_level_0,WaspCt,WaspCt,WaspCt,WaspCt,WaspCt,WaspCt,WaspCt,WaspCt
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max
TempCat,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
high,3.0,15.333333,3.511885,12.0,13.5,15.0,17.0,19.0
low,3.0,17.666667,7.094599,10.0,14.5,19.0,21.5,24.0
medium,4.0,17.5,7.047458,13.0,13.75,14.5,18.25,28.0


In [38]:
#8. What is the average number of customers in the morning? Show this value broken out by TempCat value.
df[['AMCusts', 'PMCusts']] = df['Custs (AM/PM)'].str.split('/', expand=True)
df['AMCusts'] = df['AMCusts'].astype('int')
print(df[['AMCusts', 'TempCat']].groupby('TempCat').agg('mean'))

           AMCusts
TempCat           
high     42.000000
low      34.333333
medium   27.000000
