In [39]:
import pandas as pd
import numpy as np
from pydataset import data

In [40]:
# Load the 'airquality' dataset through pydataset's data() helper.
airquality = data ('airquality')

In [41]:
# Called with no arguments, data() returns the catalogue of available datasets (id and title).
data()

Unnamed: 0,dataset_id,title
0,AirPassengers,Monthly Airline Passenger Numbers 1949-1960
1,BJsales,Sales Data with Leading Indicator
2,BOD,Biochemical Oxygen Demand
3,Formaldehyde,Determination of Formaldehyde
4,HairEyeColor,Hair and Eye Color of Statistics Students
...,...,...
752,VerbAgg,Verbal Aggression item responses
753,cake,Breakage Angle of Chocolate Cakes
754,cbpp,Contagious bovine pleuropneumonia
755,grouseticks,Data on red grouse ticks from Elston et al. 2001


## Inspect the structure

In [42]:
# Dimensions of the frame as (rows, columns).
airquality.shape

(153, 6)

In [43]:
# Data type stored in each column.
airquality.dtypes

Ozone      float64
Solar.R    float64
Wind       float64
Temp         int64
Month        int64
Day          int64
dtype: object

In [44]:
# Column labels.
airquality.columns

Index(['Ozone', 'Solar.R', 'Wind', 'Temp', 'Month', 'Day'], dtype='object')

In [45]:
# First five rows.
airquality.head()

Unnamed: 0,Ozone,Solar.R,Wind,Temp,Month,Day
1,41.0,190.0,7.4,67,5,1
2,36.0,118.0,8.0,72,5,2
3,12.0,149.0,12.6,74,5,3
4,18.0,313.0,11.5,62,5,4
5,,,14.3,56,5,5


In [46]:
# Last five rows.
airquality.tail()

Unnamed: 0,Ozone,Solar.R,Wind,Temp,Month,Day
149,30.0,193.0,6.9,70,9,26
150,,145.0,13.2,77,9,27
151,14.0,191.0,14.3,75,9,28
152,18.0,131.0,8.0,76,9,29
153,20.0,223.0,11.5,68,9,30


In [82]:
# Row labels; the recorded output shows they run from 1 to 153 rather than starting at 0.
airquality.index

Index([  1,   2,   3,   4,   5,   6,   7,   8,   9,  10,
       ...
       144, 145, 146, 147, 148, 149, 150, 151, 152, 153],
      dtype='int64', length=153)

In [47]:
# Concise summary: index range, non-null count and dtype per column, memory usage.
airquality.info()

<class 'pandas.core.frame.DataFrame'>
Index: 153 entries, 1 to 153
Data columns (total 6 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   Ozone    116 non-null    float64
 1   Solar.R  146 non-null    float64
 2   Wind     153 non-null    float64
 3   Temp     153 non-null    int64  
 4   Month    153 non-null    int64  
 5   Day      153 non-null    int64  
dtypes: float64(3), int64(3)
memory usage: 8.4 KB


# Basic summary stats

In [48]:
# Descriptive statistics per column: count, mean, std, min, quartiles and max.
airquality.describe()

Unnamed: 0,Ozone,Solar.R,Wind,Temp,Month,Day
count,116.0,146.0,153.0,153.0,153.0,153.0
mean,42.12931,185.931507,9.957516,77.882353,6.993464,15.803922
std,32.987885,90.058422,3.523001,9.46527,1.416522,8.86452
min,1.0,7.0,1.7,56.0,5.0,1.0
25%,18.0,115.75,7.4,72.0,6.0,8.0
50%,31.5,205.0,9.7,79.0,7.0,16.0
75%,63.25,258.75,11.5,85.0,8.0,23.0
max,168.0,334.0,20.7,97.0,9.0,31.0


In [49]:
# Largest value in each column.
airquality.max()

Ozone      168.0
Solar.R    334.0
Wind        20.7
Temp        97.0
Month        9.0
Day         31.0
dtype: float64

In [50]:
# Smallest value in each column.
airquality.min()

Ozone       1.0
Solar.R     7.0
Wind        1.7
Temp       56.0
Month       5.0
Day         1.0
dtype: float64

In [51]:
# Number of distinct values in each column.
airquality.nunique()


Ozone       67
Solar.R    117
Wind        31
Temp        40
Month        5
Day         31
dtype: int64

# Missing values

In [52]:
# Element-wise boolean mask: True wherever a value is missing.
airquality.isna()

Unnamed: 0,Ozone,Solar.R,Wind,Temp,Month,Day
1,False,False,False,False,False,False
2,False,False,False,False,False,False
3,False,False,False,False,False,False
4,False,False,False,False,False,False
5,True,True,False,False,False,False
6,False,True,False,False,False,False
7,False,False,False,False,False,False
8,False,False,False,False,False,False
9,False,False,False,False,False,False
10,True,False,False,False,False,False


In [53]:
# Count the missing values per column (each True in the mask adds 1).
airquality.isna().sum()

Ozone      37
Solar.R     7
Wind        0
Temp        0
Month       0
Day         0
dtype: int64

In [74]:
# Work on a copy so that imputing missing values does not modify `airquality`.
# NOTE(review): the recorded output of the next cell already shows row 5 with
# Ozone = 31.5, although the earlier head() output has NaN there and the fillna
# cell has a later execution count. This looks like leftover kernel state;
# confirm with Restart & Run All.
airquality_mv = airquality.copy()

In [75]:
# Preview the first five rows of the working copy as plain text.
print(airquality_mv.head())

   Ozone  Solar.R  Wind  Temp  Month  Day
1   41.0    190.0   7.4    67      5    1
2   36.0    118.0   8.0    72      5    2
3   12.0    149.0  12.6    74      5    3
4   18.0    313.0  11.5    62      5    4
5   31.5      NaN  14.3    56      5    5


In [76]:
# Preview the first ten rows of the working copy. head(10) shows the leading
# rows without allocating a throwaway copy of the whole frame just to display it.
airquality_mv.head(10)

Unnamed: 0,Ozone,Solar.R,Wind,Temp,Month,Day
1,41.0,190.0,7.4,67,5,1
2,36.0,118.0,8.0,72,5,2
3,12.0,149.0,12.6,74,5,3
4,18.0,313.0,11.5,62,5,4
5,31.5,,14.3,56,5,5
6,28.0,,14.9,66,5,6
7,23.0,299.0,8.6,65,5,7
8,19.0,99.0,13.8,59,5,8
9,8.0,19.0,20.1,61,5,9
10,31.5,194.0,8.6,69,5,10


In [81]:
# Impute missing values per column: Ozone with its median, Solar.R with its mean.
# The median is preferred over the mean when data has outliers or is skewed,
# because it better reflects the typical value without being distorted by
# extreme numbers.
fill_values = {
    'Ozone': airquality_mv['Ozone'].median(),
    'Solar.R': airquality_mv['Solar.R'].mean(),
}
# Rebind the result instead of using inplace=True: the assignment makes the
# data flow explicit and keeps the call usable in a method chain.
airquality_mv = airquality_mv.fillna(fill_values)
print(airquality_mv.head())


   Ozone     Solar.R  Wind  Temp  Month  Day
1   41.0  190.000000   7.4    67      5    1
2   36.0  118.000000   8.0    72      5    2
3   12.0  149.000000  12.6    74      5    3
4   18.0  313.000000  11.5    62      5    4
5   31.5  185.931507  14.3    56      5    5


# Selecting columns and rows 

In [87]:
# Keep every row but only the Ozone, Solar.R and Temp columns,
# then print the first ten rows of the selection.
airquality_sel = airquality.loc[:, ['Ozone', 'Solar.R', 'Temp']]
print(airquality_sel.head(n=10))

    Ozone  Solar.R  Temp
1    41.0    190.0    67
2    36.0    118.0    72
3    12.0    149.0    74
4    18.0    313.0    62
5    31.5      NaN    56
6    28.0      NaN    66
7    23.0    299.0    65
8    19.0     99.0    59
9     8.0     19.0    61
10   31.5    194.0    69


In [93]:
# Row filter via a boolean mask: keep the rows whose Month value is 5 or 6.
airquality.loc[airquality['Month'].isin([5, 6])]

Unnamed: 0,Ozone,Solar.R,Wind,Temp,Month,Day
1,41.0,190.0,7.4,67,5,1
2,36.0,118.0,8.0,72,5,2
3,12.0,149.0,12.6,74,5,3
4,18.0,313.0,11.5,62,5,4
5,31.5,,14.3,56,5,5
6,28.0,,14.9,66,5,6
7,23.0,299.0,8.6,65,5,7
8,19.0,99.0,13.8,59,5,8
9,8.0,19.0,20.1,61,5,9
10,31.5,194.0,8.6,69,5,10


In [58]:
# Position-based slice: rows at positions 1, 2 and 3 (the end position is excluded),
# which carry the index labels 2, 3 and 4 in the recorded output.
airquality.iloc[1:4]

Unnamed: 0,Ozone,Solar.R,Wind,Temp,Month,Day
2,36.0,118.0,8.0,72,5,2
3,12.0,149.0,12.6,74,5,3
4,18.0,313.0,11.5,62,5,4


In [59]:
# Column labels available for selection by name.
airquality.columns

Index(['Ozone', 'Solar.R', 'Wind', 'Temp', 'Month', 'Day'], dtype='object')

In [63]:
# Build an independent three-column frame. Selecting with a list of labels
# replaces the hand-written dict of Series, and .copy() keeps df_sel decoupled
# from `airquality`, as the DataFrame constructor did.
df_sel = airquality[['Ozone', 'Solar.R', 'Temp']].copy()
df_sel.head(10)

Unnamed: 0,Ozone,Solar.R,Temp
1,41.0,190.0,67
2,36.0,118.0,72
3,12.0,149.0,74
4,18.0,313.0,62
5,,,56
6,28.0,,66
7,23.0,299.0,65
8,19.0,99.0,59
9,8.0,19.0,61
10,,194.0,69


# Filtering and conditions