# Advanced Pandas Functions
- read data, search, filter, etc

In [78]:
# libraries
import numpy as np
import pandas as pd

In [79]:
# options
# Show every column when a DataFrame is displayed (no column truncation).
pd.options.display.max_columns = None

In [93]:
# Load the mtcars sample dataset through pydataset and preview its first rows.
from pydataset import data
mtcars = data('mtcars')
mtcars.head()

Unnamed: 0,mpg,cyl,disp,hp,drat,wt,qsec,vs,am,gear,carb
Mazda RX4,21.0,6,160.0,110,3.9,2.62,16.46,0,1,4,4
Mazda RX4 Wag,21.0,6,160.0,110,3.9,2.875,17.02,0,1,4,4
Datsun 710,22.8,4,108.0,93,3.85,2.32,18.61,1,1,4,1
Hornet 4 Drive,21.4,6,258.0,110,3.08,3.215,19.44,1,0,3,1
Hornet Sportabout,18.7,8,360.0,175,3.15,3.44,17.02,0,0,3,2


### Advanced
- change dtypes
- encode
- rowname to columns
- filter
- sort
- select dtypes
- pivot
- melt
- wide

### A data frame with 32 observations on 11 (numeric) variables.
- mpg	Miles/(US) gallon
- cyl	Number of cylinders : Discrete/Category
- disp	Displacement (cu.in.)
- hp	Gross horsepower
- drat	Rear axle ratio
- wt	Weight (1000 lbs)
- qsec	1/4 mile time
- vs	Engine (0 = V-shaped, 1 = straight) : Category
- am	Transmission (0 = automatic, 1 = manual) : Category
- gear	Number of forward gears : Category
- carb	Number of carburetors : Category

In [94]:
mtcars.dtypes

mpg     float64
cyl       int64
disp    float64
hp        int64
drat    float64
wt      float64
qsec    float64
vs        int64
am        int64
gear      int64
carb      int64
dtype: object

In [95]:
# change data types
mtcars.head(2)

Unnamed: 0,mpg,cyl,disp,hp,drat,wt,qsec,vs,am,gear,carb
Mazda RX4,21.0,6,160.0,110,3.9,2.62,16.46,0,1,4,4
Mazda RX4 Wag,21.0,6,160.0,110,3.9,2.875,17.02,0,1,4,4


In [100]:
# Convert selected columns to category
# Start from a fresh copy of the dataset and cast the discrete columns in one call.
mtcars = data('mtcars').astype(dict.fromkeys(['cyl', 'vs', 'carb', 'am', 'gear'], 'category'))

In [101]:
mtcars.dtypes

mpg      float64
cyl     category
disp     float64
hp         int64
drat     float64
wt       float64
qsec     float64
vs      category
am      category
gear    category
carb    category
dtype: object

In [102]:
mtcars.head()

Unnamed: 0,mpg,cyl,disp,hp,drat,wt,qsec,vs,am,gear,carb
Mazda RX4,21.0,6,160.0,110,3.9,2.62,16.46,0,1,4,4
Mazda RX4 Wag,21.0,6,160.0,110,3.9,2.875,17.02,0,1,4,4
Datsun 710,22.8,4,108.0,93,3.85,2.32,18.61,1,1,4,1
Hornet 4 Drive,21.4,6,258.0,110,3.08,3.215,19.44,1,0,3,1
Hornet Sportabout,18.7,8,360.0,175,3.15,3.44,17.02,0,0,3,2


In [104]:
# encode - relabel the categories of 'am' (0 -> auto, 1 -> manual)
# rename_categories keeps the category dtype and is safe to re-run: labels that
# are not keys of the mapping are passed through unchanged, whereas running
# .map() a second time would turn the already-renamed labels into NaN.
mtcars['am'] = mtcars['am'].cat.rename_categories({0: 'auto', 1: 'manual'})
mtcars.head()
#mtcars['am'] = mtcars['am'].replace({0: 'auto', 1: 'manual'})

Unnamed: 0,mpg,cyl,disp,hp,drat,wt,qsec,vs,am,gear,carb
Mazda RX4,21.0,6,160.0,110,3.9,2.62,16.46,0,manual,4,4
Mazda RX4 Wag,21.0,6,160.0,110,3.9,2.875,17.02,0,manual,4,4
Datsun 710,22.8,4,108.0,93,3.85,2.32,18.61,1,manual,4,1
Hornet 4 Drive,21.4,6,258.0,110,3.08,3.215,19.44,1,auto,3,1
Hornet Sportabout,18.7,8,360.0,175,3.15,3.44,17.02,0,auto,3,2


In [105]:
# Relabel the categories of 'vs' (0 -> Vshaped, 1 -> Straight).
# rename_categories is used instead of .map() so the cell can be re-run without
# turning the already-renamed labels into NaN.
mtcars['vs'] = mtcars['vs'].cat.rename_categories({0: 'Vshaped', 1: 'Straight'})
mtcars.head()

Unnamed: 0,mpg,cyl,disp,hp,drat,wt,qsec,vs,am,gear,carb
Mazda RX4,21.0,6,160.0,110,3.9,2.62,16.46,Vshaped,manual,4,4
Mazda RX4 Wag,21.0,6,160.0,110,3.9,2.875,17.02,Vshaped,manual,4,4
Datsun 710,22.8,4,108.0,93,3.85,2.32,18.61,Straight,manual,4,1
Hornet 4 Drive,21.4,6,258.0,110,3.08,3.215,19.44,Straight,auto,3,1
Hornet Sportabout,18.7,8,360.0,175,3.15,3.44,17.02,Vshaped,auto,3,2


In [112]:
#mtcars.reset_index()

In [106]:
mtcars.index

Index(['Mazda RX4', 'Mazda RX4 Wag', 'Datsun 710', 'Hornet 4 Drive',
       'Hornet Sportabout', 'Valiant', 'Duster 360', 'Merc 240D', 'Merc 230',
       'Merc 280', 'Merc 280C', 'Merc 450SE', 'Merc 450SL', 'Merc 450SLC',
       'Cadillac Fleetwood', 'Lincoln Continental', 'Chrysler Imperial',
       'Fiat 128', 'Honda Civic', 'Toyota Corolla', 'Toyota Corona',
       'Dodge Challenger', 'AMC Javelin', 'Camaro Z28', 'Pontiac Firebird',
       'Fiat X1-9', 'Porsche 914-2', 'Lotus Europa', 'Ford Pantera L',
       'Ferrari Dino', 'Maserati Bora', 'Volvo 142E'],
      dtype='object')

In [107]:
# carnames
mtcars['car_name'] = mtcars.index

In [108]:
mtcars.head()

Unnamed: 0,mpg,cyl,disp,hp,drat,wt,qsec,vs,am,gear,carb,car_name
Mazda RX4,21.0,6,160.0,110,3.9,2.62,16.46,Vshaped,manual,4,4,Mazda RX4
Mazda RX4 Wag,21.0,6,160.0,110,3.9,2.875,17.02,Vshaped,manual,4,4,Mazda RX4 Wag
Datsun 710,22.8,4,108.0,93,3.85,2.32,18.61,Straight,manual,4,1,Datsun 710
Hornet 4 Drive,21.4,6,258.0,110,3.08,3.215,19.44,Straight,auto,3,1,Hornet 4 Drive
Hornet Sportabout,18.7,8,360.0,175,3.15,3.44,17.02,Vshaped,auto,3,2,Hornet Sportabout


In [109]:
mtcars.dtypes  #float, category, object(text), int

mpg          float64
cyl         category
disp         float64
hp             int64
drat         float64
wt           float64
qsec         float64
vs          category
am          category
gear        category
carb        category
car_name      object
dtype: object

In [118]:
# list cars with mileage more than 25
# Summing the boolean mask counts the matching rows.
print((mtcars['mpg'] > 25).sum())
mtcars.loc[mtcars['mpg'] > 25]

6


Unnamed: 0,mpg,cyl,disp,hp,drat,wt,qsec,vs,am,gear,carb,car_name
Fiat 128,32.4,4,78.7,66,4.08,2.2,19.47,Straight,manual,4,1,Fiat 128
Honda Civic,30.4,4,75.7,52,4.93,1.615,18.52,Straight,manual,4,2,Honda Civic
Toyota Corolla,33.9,4,71.1,65,4.22,1.835,19.9,Straight,manual,4,1,Toyota Corolla
Fiat X1-9,27.3,4,79.0,66,4.08,1.935,18.9,Straight,manual,4,1,Fiat X1-9
Porsche 914-2,26.0,4,120.3,91,4.43,2.14,16.7,Vshaped,manual,5,2,Porsche 914-2
Lotus Europa,30.4,4,95.1,113,3.77,1.513,16.9,Straight,manual,5,2,Lotus Europa


In [119]:
mtcars.query("mpg > 25")

Unnamed: 0,mpg,cyl,disp,hp,drat,wt,qsec,vs,am,gear,carb,car_name
Fiat 128,32.4,4,78.7,66,4.08,2.2,19.47,Straight,manual,4,1,Fiat 128
Honda Civic,30.4,4,75.7,52,4.93,1.615,18.52,Straight,manual,4,2,Honda Civic
Toyota Corolla,33.9,4,71.1,65,4.22,1.835,19.9,Straight,manual,4,1,Toyota Corolla
Fiat X1-9,27.3,4,79.0,66,4.08,1.935,18.9,Straight,manual,4,1,Fiat X1-9
Porsche 914-2,26.0,4,120.3,91,4.43,2.14,16.7,Vshaped,manual,5,2,Porsche 914-2
Lotus Europa,30.4,4,95.1,113,3.77,1.513,16.9,Straight,manual,5,2,Lotus Europa


In [121]:
mtcars.filter(items =['mpg','hp'])
# list the data with columns mpg & hp

Unnamed: 0,mpg,hp
Mazda RX4,21.0,110
Mazda RX4 Wag,21.0,110
Datsun 710,22.8,93
Hornet 4 Drive,21.4,110
Hornet Sportabout,18.7,175
Valiant,18.1,105
Duster 360,14.3,245
Merc 240D,24.4,62
Merc 230,22.8,95
Merc 280,19.2,123


In [123]:
mtcars[mtcars['am'] == 'auto']
# list cars which auto

Unnamed: 0,mpg,cyl,disp,hp,drat,wt,qsec,vs,am,gear,carb,car_name
Hornet 4 Drive,21.4,6,258.0,110,3.08,3.215,19.44,Straight,auto,3,1,Hornet 4 Drive
Hornet Sportabout,18.7,8,360.0,175,3.15,3.44,17.02,Vshaped,auto,3,2,Hornet Sportabout
Valiant,18.1,6,225.0,105,2.76,3.46,20.22,Straight,auto,3,1,Valiant
Duster 360,14.3,8,360.0,245,3.21,3.57,15.84,Vshaped,auto,3,4,Duster 360
Merc 240D,24.4,4,146.7,62,3.69,3.19,20.0,Straight,auto,4,2,Merc 240D
Merc 230,22.8,4,140.8,95,3.92,3.15,22.9,Straight,auto,4,2,Merc 230
Merc 280,19.2,6,167.6,123,3.92,3.44,18.3,Straight,auto,4,4,Merc 280
Merc 280C,17.8,6,167.6,123,3.92,3.44,18.9,Straight,auto,4,4,Merc 280C
Merc 450SE,16.4,8,275.8,180,3.07,4.07,17.4,Vshaped,auto,3,3,Merc 450SE
Merc 450SL,17.3,8,275.8,180,3.07,3.73,17.6,Vshaped,auto,3,3,Merc 450SL


In [126]:
# Use & for "AND" between conditions: am is auto, cyl is not 5, mpg > 22 (all 3 must hold).
# cyl holds integer categories, so it is compared with the integer 5, not the string '5'.
# Each condition needs its own parentheses because & binds tighter than ==, != and >.
# In pandas, | = OR, ~ = NOT.
mtcars[(mtcars['am'] == 'auto') & (mtcars['cyl'] != 5) & (mtcars['mpg'] > 22)]

Unnamed: 0,mpg,cyl,disp,hp,drat,wt,qsec,vs,am,gear,carb,car_name
Merc 240D,24.4,4,146.7,62,3.69,3.19,20.0,Straight,auto,4,2,Merc 240D
Merc 230,22.8,4,140.8,95,3.92,3.15,22.9,Straight,auto,4,2,Merc 230


In [135]:
mtcars[mtcars['mpg'].between(left=23, right=26.1, inclusive='left')] #inclusive{“both”, “neither”, “left”, “right”}
# cars with 23 <= mpg < 26.1: the left bound is included, the right bound is excluded;
# the right bound is 26.1 rather than 26 so that a car with exactly 26.0 mpg is still listed

Unnamed: 0,mpg,cyl,disp,hp,drat,wt,qsec,vs,am,gear,carb,car_name
Merc 240D,24.4,4,146.7,62,3.69,3.19,20.0,Straight,auto,4,2,Merc 240D
Porsche 914-2,26.0,4,120.3,91,4.43,2.14,16.7,Vshaped,manual,5,2,Porsche 914-2


In [136]:
mtcars[mtcars['mpg'].between(23, 30, inclusive='neither')]

Unnamed: 0,mpg,cyl,disp,hp,drat,wt,qsec,vs,am,gear,carb,car_name
Merc 240D,24.4,4,146.7,62,3.69,3.19,20.0,Straight,auto,4,2,Merc 240D
Fiat X1-9,27.3,4,79.0,66,4.08,1.935,18.9,Straight,manual,4,1,Fiat X1-9
Porsche 914-2,26.0,4,120.3,91,4.43,2.14,16.7,Vshaped,manual,5,2,Porsche 914-2


In [137]:
mtcars[mtcars['mpg'].between(23, 30) & (mtcars['am'] == 'manual')]

Unnamed: 0,mpg,cyl,disp,hp,drat,wt,qsec,vs,am,gear,carb,car_name
Fiat X1-9,27.3,4,79.0,66,4.08,1.935,18.9,Straight,manual,4,1,Fiat X1-9
Porsche 914-2,26.0,4,120.3,91,4.43,2.14,16.7,Vshaped,manual,5,2,Porsche 914-2


In [142]:
mtcars[mtcars['gear'].isin([4,5])]
#mtcars['gear'].isin([4,5])
# list cars whose gear is in values 4,5

Unnamed: 0,mpg,cyl,disp,hp,drat,wt,qsec,vs,am,gear,carb,car_name
Mazda RX4,21.0,6,160.0,110,3.9,2.62,16.46,Vshaped,manual,4,4,Mazda RX4
Mazda RX4 Wag,21.0,6,160.0,110,3.9,2.875,17.02,Vshaped,manual,4,4,Mazda RX4 Wag
Datsun 710,22.8,4,108.0,93,3.85,2.32,18.61,Straight,manual,4,1,Datsun 710
Merc 240D,24.4,4,146.7,62,3.69,3.19,20.0,Straight,auto,4,2,Merc 240D
Merc 230,22.8,4,140.8,95,3.92,3.15,22.9,Straight,auto,4,2,Merc 230
Merc 280,19.2,6,167.6,123,3.92,3.44,18.3,Straight,auto,4,4,Merc 280
Merc 280C,17.8,6,167.6,123,3.92,3.44,18.9,Straight,auto,4,4,Merc 280C
Fiat 128,32.4,4,78.7,66,4.08,2.2,19.47,Straight,manual,4,1,Fiat 128
Honda Civic,30.4,4,75.7,52,4.93,1.615,18.52,Straight,manual,4,2,Honda Civic
Toyota Corolla,33.9,4,71.1,65,4.22,1.835,19.9,Straight,manual,4,1,Toyota Corolla


In [143]:
mtcars[ (mtcars['gear'].isin([4, 5])) | (mtcars['am'] =='auto')]
# all auto or gear = 4,5

Unnamed: 0,mpg,cyl,disp,hp,drat,wt,qsec,vs,am,gear,carb,car_name
Mazda RX4,21.0,6,160.0,110,3.9,2.62,16.46,Vshaped,manual,4,4,Mazda RX4
Mazda RX4 Wag,21.0,6,160.0,110,3.9,2.875,17.02,Vshaped,manual,4,4,Mazda RX4 Wag
Datsun 710,22.8,4,108.0,93,3.85,2.32,18.61,Straight,manual,4,1,Datsun 710
Hornet 4 Drive,21.4,6,258.0,110,3.08,3.215,19.44,Straight,auto,3,1,Hornet 4 Drive
Hornet Sportabout,18.7,8,360.0,175,3.15,3.44,17.02,Vshaped,auto,3,2,Hornet Sportabout
Valiant,18.1,6,225.0,105,2.76,3.46,20.22,Straight,auto,3,1,Valiant
Duster 360,14.3,8,360.0,245,3.21,3.57,15.84,Vshaped,auto,3,4,Duster 360
Merc 240D,24.4,4,146.7,62,3.69,3.19,20.0,Straight,auto,4,2,Merc 240D
Merc 230,22.8,4,140.8,95,3.92,3.15,22.9,Straight,auto,4,2,Merc 230
Merc 280,19.2,6,167.6,123,3.92,3.44,18.3,Straight,auto,4,4,Merc 280


In [144]:
mtcars[~mtcars['gear'].isin([4, 5])]
# cars whose gear is NOT 4 or 5: ~ negates the boolean mask returned by isin

Unnamed: 0,mpg,cyl,disp,hp,drat,wt,qsec,vs,am,gear,carb,car_name
Hornet 4 Drive,21.4,6,258.0,110,3.08,3.215,19.44,Straight,auto,3,1,Hornet 4 Drive
Hornet Sportabout,18.7,8,360.0,175,3.15,3.44,17.02,Vshaped,auto,3,2,Hornet Sportabout
Valiant,18.1,6,225.0,105,2.76,3.46,20.22,Straight,auto,3,1,Valiant
Duster 360,14.3,8,360.0,245,3.21,3.57,15.84,Vshaped,auto,3,4,Duster 360
Merc 450SE,16.4,8,275.8,180,3.07,4.07,17.4,Vshaped,auto,3,3,Merc 450SE
Merc 450SL,17.3,8,275.8,180,3.07,3.73,17.6,Vshaped,auto,3,3,Merc 450SL
Merc 450SLC,15.2,8,275.8,180,3.07,3.78,18.0,Vshaped,auto,3,3,Merc 450SLC
Cadillac Fleetwood,10.4,8,472.0,205,2.93,5.25,17.98,Vshaped,auto,3,4,Cadillac Fleetwood
Lincoln Continental,10.4,8,460.0,215,3.0,5.424,17.82,Vshaped,auto,3,4,Lincoln Continental
Chrysler Imperial,14.7,8,440.0,230,3.23,5.345,17.42,Vshaped,auto,3,4,Chrysler Imperial


### select dtypes

In [146]:
mtcars.dtypes

mpg          float64
cyl         category
disp         float64
hp             int64
drat         float64
wt           float64
qsec         float64
vs          category
am          category
gear        category
carb        category
car_name      object
dtype: object

In [149]:
mtcars.select_dtypes(include='number') #.head
# list columns which are numeric

Unnamed: 0,mpg,disp,hp,drat,wt,qsec
Mazda RX4,21.0,160.0,110,3.9,2.62,16.46
Mazda RX4 Wag,21.0,160.0,110,3.9,2.875,17.02
Datsun 710,22.8,108.0,93,3.85,2.32,18.61
Hornet 4 Drive,21.4,258.0,110,3.08,3.215,19.44
Hornet Sportabout,18.7,360.0,175,3.15,3.44,17.02
Valiant,18.1,225.0,105,2.76,3.46,20.22
Duster 360,14.3,360.0,245,3.21,3.57,15.84
Merc 240D,24.4,146.7,62,3.69,3.19,20.0
Merc 230,22.8,140.8,95,3.92,3.15,22.9
Merc 280,19.2,167.6,123,3.92,3.44,18.3


In [150]:
mtcars.select_dtypes(include='category').head()
# list columns which category type

Unnamed: 0,cyl,vs,am,gear,carb
Mazda RX4,6,Vshaped,manual,4,4
Mazda RX4 Wag,6,Vshaped,manual,4,4
Datsun 710,4,Straight,manual,4,1
Hornet 4 Drive,6,Straight,auto,3,1
Hornet Sportabout,8,Vshaped,auto,3,2


In [151]:
mtcars.select_dtypes(include='object').head()
# string, car names

Unnamed: 0,car_name
Mazda RX4,Mazda RX4
Mazda RX4 Wag,Mazda RX4 Wag
Datsun 710,Datsun 710
Hornet 4 Drive,Hornet 4 Drive
Hornet Sportabout,Hornet Sportabout


### nlargest for a column

In [153]:
mtcars.nlargest(5, 'mpg')
# list cars which are among top 5 mpg

Unnamed: 0,mpg,cyl,disp,hp,drat,wt,qsec,vs,am,gear,carb,car_name
Toyota Corolla,33.9,4,71.1,65,4.22,1.835,19.9,Straight,manual,4,1,Toyota Corolla
Fiat 128,32.4,4,78.7,66,4.08,2.2,19.47,Straight,manual,4,1,Fiat 128
Honda Civic,30.4,4,75.7,52,4.93,1.615,18.52,Straight,manual,4,2,Honda Civic
Lotus Europa,30.4,4,95.1,113,3.77,1.513,16.9,Straight,manual,5,2,Lotus Europa
Fiat X1-9,27.3,4,79.0,66,4.08,1.935,18.9,Straight,manual,4,1,Fiat X1-9


In [154]:
mtcars.nsmallest(5, 'wt') 

Unnamed: 0,mpg,cyl,disp,hp,drat,wt,qsec,vs,am,gear,carb,car_name
Lotus Europa,30.4,4,95.1,113,3.77,1.513,16.9,Straight,manual,5,2,Lotus Europa
Honda Civic,30.4,4,75.7,52,4.93,1.615,18.52,Straight,manual,4,2,Honda Civic
Toyota Corolla,33.9,4,71.1,65,4.22,1.835,19.9,Straight,manual,4,1,Toyota Corolla
Fiat X1-9,27.3,4,79.0,66,4.08,1.935,18.9,Straight,manual,4,1,Fiat X1-9
Porsche 914-2,26.0,4,120.3,91,4.43,2.14,16.7,Vshaped,manual,5,2,Porsche 914-2


### sort values

In [156]:
# list mtcars by mpg, in increasing order
# (ascending order and returning a new frame are the defaults of sort_values)
mtcars.sort_values('mpg') #.head()

Unnamed: 0,mpg,cyl,disp,hp,drat,wt,qsec,vs,am,gear,carb,car_name
Lincoln Continental,10.4,8,460.0,215,3.0,5.424,17.82,Vshaped,auto,3,4,Lincoln Continental
Cadillac Fleetwood,10.4,8,472.0,205,2.93,5.25,17.98,Vshaped,auto,3,4,Cadillac Fleetwood
Camaro Z28,13.3,8,350.0,245,3.73,3.84,15.41,Vshaped,auto,3,4,Camaro Z28
Duster 360,14.3,8,360.0,245,3.21,3.57,15.84,Vshaped,auto,3,4,Duster 360
Chrysler Imperial,14.7,8,440.0,230,3.23,5.345,17.42,Vshaped,auto,3,4,Chrysler Imperial
Maserati Bora,15.0,8,301.0,335,3.54,3.57,14.6,Vshaped,manual,5,8,Maserati Bora
Merc 450SLC,15.2,8,275.8,180,3.07,3.78,18.0,Vshaped,auto,3,3,Merc 450SLC
AMC Javelin,15.2,8,304.0,150,3.15,3.435,17.3,Vshaped,auto,3,2,AMC Javelin
Dodge Challenger,15.5,8,318.0,150,2.76,3.52,16.87,Vshaped,auto,3,2,Dodge Challenger
Ford Pantera L,15.8,8,351.0,264,4.22,3.17,14.5,Vshaped,manual,5,4,Ford Pantera L


In [157]:
mtcars.sort_values(['gear','mpg']) #.head()  #first by gear, mpg

Unnamed: 0,mpg,cyl,disp,hp,drat,wt,qsec,vs,am,gear,carb,car_name
Cadillac Fleetwood,10.4,8,472.0,205,2.93,5.25,17.98,Vshaped,auto,3,4,Cadillac Fleetwood
Lincoln Continental,10.4,8,460.0,215,3.0,5.424,17.82,Vshaped,auto,3,4,Lincoln Continental
Camaro Z28,13.3,8,350.0,245,3.73,3.84,15.41,Vshaped,auto,3,4,Camaro Z28
Duster 360,14.3,8,360.0,245,3.21,3.57,15.84,Vshaped,auto,3,4,Duster 360
Chrysler Imperial,14.7,8,440.0,230,3.23,5.345,17.42,Vshaped,auto,3,4,Chrysler Imperial
Merc 450SLC,15.2,8,275.8,180,3.07,3.78,18.0,Vshaped,auto,3,3,Merc 450SLC
AMC Javelin,15.2,8,304.0,150,3.15,3.435,17.3,Vshaped,auto,3,2,AMC Javelin
Dodge Challenger,15.5,8,318.0,150,2.76,3.52,16.87,Vshaped,auto,3,2,Dodge Challenger
Merc 450SE,16.4,8,275.8,180,3.07,4.07,17.4,Vshaped,auto,3,3,Merc 450SE
Merc 450SL,17.3,8,275.8,180,3.07,3.73,17.6,Vshaped,auto,3,3,Merc 450SL


In [158]:
mtcars.sort_values(['gear','mpg'], ascending=[False,True])#.head()  # gear descending, then mpg ascending within each gear

Unnamed: 0,mpg,cyl,disp,hp,drat,wt,qsec,vs,am,gear,carb,car_name
Maserati Bora,15.0,8,301.0,335,3.54,3.57,14.6,Vshaped,manual,5,8,Maserati Bora
Ford Pantera L,15.8,8,351.0,264,4.22,3.17,14.5,Vshaped,manual,5,4,Ford Pantera L
Ferrari Dino,19.7,6,145.0,175,3.62,2.77,15.5,Vshaped,manual,5,6,Ferrari Dino
Porsche 914-2,26.0,4,120.3,91,4.43,2.14,16.7,Vshaped,manual,5,2,Porsche 914-2
Lotus Europa,30.4,4,95.1,113,3.77,1.513,16.9,Straight,manual,5,2,Lotus Europa
Merc 280C,17.8,6,167.6,123,3.92,3.44,18.9,Straight,auto,4,4,Merc 280C
Merc 280,19.2,6,167.6,123,3.92,3.44,18.3,Straight,auto,4,4,Merc 280
Mazda RX4,21.0,6,160.0,110,3.9,2.62,16.46,Vshaped,manual,4,4,Mazda RX4
Mazda RX4 Wag,21.0,6,160.0,110,3.9,2.875,17.02,Vshaped,manual,4,4,Mazda RX4 Wag
Volvo 142E,21.4,4,121.0,109,4.11,2.78,18.6,Straight,manual,4,2,Volvo 142E


# Hold - 16Aug25

In [2]:
import pandas as pd
import numpy as np
from pydataset import data
# Reload the dataset; this replaces the mtcars frame that was modified in the cells above.
mtcars = data('mtcars')
mtcars.shape  # (rows, columns)

(32, 11)

### Pivot Table
- index= → What goes down the rows
- columns= → What goes across the top
- values= → What fills the table
- aggfunc= → How to summarize if there are duplicates

In [None]:
# how many cars have gear=5, am=0

In [3]:
# Mean mpg per gear; 'mean' is the default aggregation of pivot_table, spelled out here.
# e.g. the mean mpg of gear-3 cars is about 16
mtcars.pivot_table(index='gear', values='mpg', aggfunc='mean', observed=False)

Unnamed: 0_level_0,mpg
gear,Unnamed: 1_level_1
3,16.106667
4,24.533333
5,21.38


In [5]:
mtcars.pivot_table(values = 'hp', index='cyl', observed=False)
# mean hp of Cyl-4 cars is 82

Unnamed: 0_level_0,hp
cyl,Unnamed: 1_level_1
4,82.636364
6,122.285714
8,209.214286


In [9]:
pd.pivot_table( data=mtcars, observed=False,
    index='gear',          # Rows
    values='mpg',          # Values to summarize
    aggfunc='median'         # Aggregation function
)
# Median of Gear-3 cars is 15.5

Unnamed: 0_level_0,mpg
gear,Unnamed: 1_level_1
3,15.5
4,22.8
5,19.7


In [13]:
pivot1 = pd.pivot_table(
    data=mtcars, observed=False,
    index='gear',          # Rows
    columns='am',          # Columns
    values='mpg',          # Values to summarize
    aggfunc='mean'         # Aggregation function
)

In [14]:
pivot1
# Mean of Gear-3 & Auto Tx - 16, while for Gear-3 & Manual Tx- No cars

am,0,1
gear,Unnamed: 1_level_1,Unnamed: 2_level_1
3,16.106667,
4,21.05,26.275
5,,21.38


In [20]:
# Frequency tables for am and cyl.
# A cell only echoes its last expression, so display() is used to show both tables
# (display is available without an import inside a Jupyter/IPython kernel).
display(mtcars['am'].value_counts(), mtcars['cyl'].value_counts())

cyl
8    14
4    11
6     7
Name: count, dtype: int64

In [18]:
# mean / max / min / count / sum of mpg for every gear x am combination
# the count columns add up to all 32 cars
pd.pivot_table(
    data=mtcars,
    index='gear',
    columns='am',
    values='mpg',
    aggfunc=['mean', 'max', 'min', 'count', 'sum'],
    observed=False,
).reset_index()

Unnamed: 0_level_0,gear,mean,mean,max,max,min,min,count,count,sum,sum
am,Unnamed: 1_level_1,0,1,0,1,0,1,0,1,0,1
0,3,16.106667,,21.5,,10.4,,15.0,,241.6,
1,4,21.05,26.275,24.4,33.9,17.8,21.0,4.0,8.0,84.2,210.2
2,5,,21.38,,30.4,,15.0,,5.0,,106.9


In [24]:
pd.pivot_table(data= mtcars,  index=['gear','am'],  columns='cyl',  values='mpg', \
                aggfunc=['mean', 'max', 'min' ,'count','sum','std'] ).reset_index()
#19.75 mean mpg - gear3, am0, cyl6

Unnamed: 0_level_0,gear,am,mean,mean,mean,max,max,max,min,min,min,count,count,count,sum,sum,sum,std,std,std
cyl,Unnamed: 1_level_1,Unnamed: 2_level_1,4,6,8,4,6,8,4,6,8,4,6,8,4,6,8,4,6,8
0,3,0,21.5,19.75,15.05,21.5,21.4,19.2,21.5,18.1,10.4,1.0,2.0,12.0,21.5,39.5,180.6,,2.333452,2.774396
1,4,0,23.6,18.5,,24.4,19.2,,22.8,17.8,,2.0,2.0,,47.2,37.0,,1.131371,0.989949,
2,4,1,28.033333,21.0,,33.9,21.0,,21.4,21.0,,6.0,2.0,,168.2,42.0,,5.118854,0.0,
3,5,1,28.2,19.7,15.4,30.4,19.7,15.8,26.0,19.7,15.0,2.0,1.0,2.0,56.4,19.7,30.8,3.11127,,0.565685


In [28]:
pd.pivot_table(data=mtcars, index='cyl', columns='carb', values='hp', aggfunc=['min','max','count'])

Unnamed: 0_level_0,min,min,min,min,min,min,max,max,max,max,max,max,count,count,count,count,count,count
carb,1,2,3,4,6,8,1,2,3,4,6,8,1,2,3,4,6,8
cyl,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2
4,65.0,52.0,,,,,97.0,113.0,,,,,5.0,6.0,,,,
6,105.0,,,110.0,175.0,,110.0,,,123.0,175.0,,2.0,,,4.0,1.0,
8,,150.0,180.0,205.0,,335.0,,175.0,180.0,264.0,,335.0,,4.0,3.0,6.0,,1.0


##  Advanced - 2
- wide, long
- missing values
- airquality dataset is perfect
    - Wide format: Columns for Ozone, Solar.R, Wind, Temp, Month, Day
    - Has missing values in Ozone and Solar.R → great for .isnull(), .fillna(), .dropna() practice
    - Can be melted into long format to have "variable" and "value" columns

In [29]:
from pydataset import data

aq = data('airquality')
aq.head()

Unnamed: 0,Ozone,Solar.R,Wind,Temp,Month,Day
1,41.0,190.0,7.4,67,5,1
2,36.0,118.0,8.0,72,5,2
3,12.0,149.0,12.6,74,5,3
4,18.0,313.0,11.5,62,5,4
5,,,14.3,56,5,5


In [30]:
aq.info()   #153 rows, 6 cols

<class 'pandas.core.frame.DataFrame'>
Index: 153 entries, 1 to 153
Data columns (total 6 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   Ozone    116 non-null    float64
 1   Solar.R  146 non-null    float64
 2   Wind     153 non-null    float64
 3   Temp     153 non-null    int64  
 4   Month    153 non-null    int64  
 5   Day      153 non-null    int64  
dtypes: float64(3), int64(3)
memory usage: 8.4 KB


In [32]:
print(aq.isnull().sum())  # Shows missing values

Ozone      37
Solar.R     7
Wind        0
Temp        0
Month       0
Day         0
dtype: int64


In [35]:
aq.head()  #original data

Unnamed: 0,Ozone,Solar.R,Wind,Temp,Month,Day
1,41.0,190.0,7.4,67,5,1
2,36.0,118.0,8.0,72,5,2
3,12.0,149.0,12.6,74,5,3
4,18.0,313.0,11.5,62,5,4
5,,,14.3,56,5,5


In [33]:
# long: one row per (Month, Day, Measurement) combination
aq_long = aq.melt(
    id_vars=['Month', 'Day'],
    value_vars=['Ozone', 'Solar.R', 'Wind', 'Temp'],
    var_name='Measurement',
    value_name='Value',
)

aq_long.head()

Unnamed: 0,Month,Day,Measurement,Value
0,5,1,Ozone,41.0
1,5,2,Ozone,36.0
2,5,3,Ozone,12.0
3,5,4,Ozone,18.0
4,5,5,Ozone,


In [38]:
153 * 4

612

In [36]:
aq_long  # 612 rows x 4 columns (153 days x 4 measurements)

Unnamed: 0,Month,Day,Measurement,Value
0,5,1,Ozone,41.0
1,5,2,Ozone,36.0
2,5,3,Ozone,12.0
3,5,4,Ozone,18.0
4,5,5,Ozone,
...,...,...,...,...
607,9,26,Temp,70.0
608,9,27,Temp,77.0
609,9,28,Temp,75.0
610,9,29,Temp,76.0


### Melt
- https://pandas.pydata.org/docs/reference/api/pandas.melt.html
- `id_vars=['Month', 'Day']` → columns kept as identifiers
- `value_vars=['Ozone', 'Solar.R', 'Wind', 'Temp']` → columns unpivoted into rows
- `var_name='Measurement'`, `value_name='Value'` → names of the two new columns


In [42]:
# Melt only Ozone and Wind; col_level and ignore_index are left at their defaults.
aq.melt(
    id_vars=['Month', 'Day'],
    value_vars=['Ozone', 'Wind'],
    var_name='Measurement',
    value_name='value',
)

Unnamed: 0,Month,Day,Measurement,value
0,5,1,Ozone,41.0
1,5,2,Ozone,36.0
2,5,3,Ozone,12.0
3,5,4,Ozone,18.0
4,5,5,Ozone,
...,...,...,...,...
301,9,26,Wind,6.9
302,9,27,Wind,13.2
303,9,28,Wind,14.3
304,9,29,Wind,8.0


In [44]:
aq_long.head()

Unnamed: 0,Month,Day,Measurement,Value
0,5,1,Ozone,41.0
1,5,2,Ozone,36.0
2,5,3,Ozone,12.0
3,5,4,Ozone,18.0
4,5,5,Ozone,


In [48]:
# wide: turn each Measurement back into its own column
aq_wide = (
    aq_long
    .pivot_table(index=['Month', 'Day'], columns='Measurement', values='Value')
    .reset_index()
)
aq_wide.head(2)

Measurement,Month,Day,Ozone,Solar.R,Temp,Wind
0,5,1,41.0,190.0,67.0,7.4
1,5,2,36.0,118.0,72.0,8.0


In [47]:
aq.head(2)

Unnamed: 0,Ozone,Solar.R,Wind,Temp,Month,Day
1,41.0,190.0,7.4,67,5,1
2,36.0,118.0,8.0,72,5,2


In [None]:
# missing data
# Drop every row that has at least one missing value
aq_drop = aq.dropna(axis=0, how='any')

# Replace missing Ozone readings with the column mean; aq itself is left untouched
aq_fill = aq.assign(Ozone=aq['Ozone'].fillna(aq['Ozone'].mean()))

In [None]:
aq_fill   #ozone missing values filled

## Practise with Student Data
- https://docs.google.com/spreadsheets/d/1jPk4sZyDn5NSIQ4iQK1P3aGh5ZAS2oaYexRoUyocAw0
- sheet_id = '1ABCDefGhIJklMNopQRstuVWxyz12345678'
- gid1 = 'pData1'
- gid2 = 'sData1'
- url = f"https://docs.google.com/spreadsheets/d/{sheet_id}/export?format=csv&gid={gid}"
- df = pd.read_csv(url)

In [None]:
# Spreadsheet id and the gid of each sheet, used to build the CSV export URLs
sheet_id = '1jPk4sZyDn5NSIQ4iQK1P3aGh5ZAS2oaYexRoUyocAw0'
gid1 = '132995188' # gid of the pData1 sheet
gid2 = '330609257' # gid of the sData1 sheet

In [None]:
url1 = f"https://docs.google.com/spreadsheets/d/{sheet_id}/export?format=csv&gid={gid1}"
print(url1)

In [None]:
url2 = f"https://docs.google.com/spreadsheets/d/{sheet_id}/export?format=csv&gid={gid2}"
print(url2)

In [None]:
#url1 = "https://docs.google.com/spreadsheets/d/1jPk4sZyDn5NSIQ4iQK1P3aGh5ZAS2oaYexRoUyocAw0/export?format=csv&gid=132995188"
pdata = pd.read_csv(url1)

In [None]:
pdata.head()

In [None]:
sdata = pd.read_csv(url2)
sdata.head()

In [None]:
## Join Data
# Inner join on rollno: only roll numbers present in both tables are kept
merged = pdata.merge(sdata, how='inner', on='rollno')
merged.head()

In [None]:
## Melt data -  Wide to Long
subvar = ['maths','english', 'stats','python']
idvar = ['rollno','name','gender','program']
marks_long = merged.melt( id_vars = idvar, value_vars =subvar, var_name='subject', value_name ='marks')

In [None]:
marks_long.head()

In [None]:
# Long to Wide 
marks_wide = ( marks_long.pivot_table( index=["rollno", "name"], columns = "subject", values= "marks", aggfunc="first")
    .reset_index())

In [None]:
marks_wide.head()

In [None]:
## Missing Data
missing_summary = merged.isnull().sum()
missing_summary

In [None]:
merged.head()

In [None]:
merged2 = merged.copy()
merged2["gender"] = merged2["gender"].fillna("Not Specified")
merged2.head()

In [None]:
# Fill each subject's missing marks with that subject's own mean, all subject columns at once
merged2[subvar] = merged2[subvar].fillna(merged2[subvar].mean())
merged2.head()  #see changes in missing values of maths, english, stats, python

### Missing Values
- Remove Rows or Columns

In [None]:
subvar

In [None]:
merged3R = merged.dropna(subset=subvar) #rows remove

In [None]:
merged3R.shape  #less rows now

In [None]:
# Columns wise Missing %
merged.isnull().mean()

In [None]:
threshold = 0.3   #remove columns which have missing values more than 30%
merged3C = merged.loc[:, merged.isnull().mean() <= threshold]

In [None]:
merged3C

### Lambda Function
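- `lambda` defines a small anonymous function inline; used below with `apply` (one value at a time) and `assign` (whole DataFrame)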

In [None]:
merged.head()

In [None]:
merged["maths"].apply( lambda x: x * 1.15 if pd.notnull(x) else x).head()

In [None]:
# assign() calls the lambda with the DataFrame it is invoked on, so the column is
# read from that argument (df) rather than from the global `merged`; this keeps the
# expression correct if it is reused on another frame or inside a method chain.
merged.assign(englishU=lambda df: df["english"] * 1.10).head()

## Dates
- pd.Timestamp(date.today()) → current date
- errors="coerce" → invalid dates become NaT (Not a Time) instead of crashing.
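- Pandas auto-detects most formats (e.g. YYYY-MM-DD), but ambiguous ones such as DD/MM/YYYY need `dayfirst=True`, or you can force a format with `format="%Y-%m-%d"`.
- Subtract DOB from today → a timedelta Series; `.dt.days` gives the difference in days.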
- To calculate age in years, you need to divide the timedelta by a constant (e.g., 365.25 days) or use dateutil.relativedelta for an exact year/month calculation.

In [None]:
import pandas as pd
from datetime import date
from dateutil.relativedelta import relativedelta

In [None]:
merged.head()

In [None]:
merged.dtypes

In [None]:
merged['dob'] = pd.to_datetime(merged['dob'], errors="coerce")
# pd.to_datetime(merged['dob'], format="%Y-%m-%d")
merged.dtypes

In [None]:
merged.head()

In [None]:
today = pd.Timestamp(date.today())
today

In [None]:
# Age in completed years: difference of calendar years, minus one when this
# year's birthday is still ahead. Dividing the day count by 365 ignores leap
# days and overstates the age shortly before a birthday.
# Rows with a missing dob (NaT) get NaN.
dob = merged["dob"]
birthday_pending = (dob.dt.month > today.month) | (
    (dob.dt.month == today.month) & (dob.dt.day > today.day)
)
merged["age"] = today.year - dob.dt.year - birthday_pending.astype(int)
merged.head()

In [None]:
def _format_age(dob, as_of):
    """Return the age at `as_of` as 'Y years, M months', or None when dob is missing."""
    if pd.isnull(dob):
        return None
    # Compute the calendar difference once and read both fields from it.
    delta = relativedelta(as_of, dob)
    return f"{delta.years} years, {delta.months} months"


# date.today() is evaluated once so every row is measured against the same day.
merged["ageYM"] = merged["dob"].apply(_format_age, as_of=date.today())
merged.head()

In [None]:
# students whose bday comes in Sep
# .dt.month is 1-based (January = 1), so September is month 9
merged[merged["dob"].dt.month == 9]

## END HERE
- now practise aggregations and groupby
- time series