# Advanced Pandas Functions
- read data, search, filter, etc

In [3]:
# libraries
import numpy as np
import pandas as pd

In [4]:
# Display options: never truncate the column list when a frame is rendered.
pd.options.display.max_columns = None

In [53]:
# Load the 'mtcars' sample dataset through pydataset and preview its first rows.
# The car names arrive as the row index, not as a regular column.
from pydataset import data
mtcars = data('mtcars')
mtcars.head()

Unnamed: 0,mpg,cyl,disp,hp,drat,wt,qsec,vs,am,gear,carb
Mazda RX4,21.0,6,160.0,110,3.9,2.62,16.46,0,1,4,4
Mazda RX4 Wag,21.0,6,160.0,110,3.9,2.875,17.02,0,1,4,4
Datsun 710,22.8,4,108.0,93,3.85,2.32,18.61,1,1,4,1
Hornet 4 Drive,21.4,6,258.0,110,3.08,3.215,19.44,1,0,3,1
Hornet Sportabout,18.7,8,360.0,175,3.15,3.44,17.02,0,0,3,2


### Advanced
- change dtypes
- encode
- rowname to columns
- filter
- sort
- select dtypes
- pivot
- melt
- wide

In [54]:
# change data types

In [55]:
# Cast the coded, low-cardinality columns to the categorical dtype in one call.
mtcars = mtcars.astype(
    {column: 'category' for column in ('vs', 'carb', 'am', 'gear')}
)

In [56]:
# Check the result of the cast: vs, am, gear and carb should now be 'category'.
mtcars.dtypes

mpg      float64
cyl        int64
disp     float64
hp         int64
drat     float64
wt       float64
qsec     float64
vs      category
am      category
gear    category
carb    category
dtype: object

In [57]:
# encode: relabel the 'am' codes (0 -> 'auto', 1 -> 'manual').
# 'am' is already categorical, so rename its categories rather than mapping the
# values: labels missing from the mapping pass through unchanged, which makes
# re-running this cell a no-op. With .map() a second run would look up
# 'auto'/'manual' in the {0, 1} mapping and replace every value with NaN.
mtcars['am'] = mtcars['am'].cat.rename_categories({0: 'auto', 1: 'manual'})

In [58]:
# Row names to column: copy the index (the car names) into a 'car_name' column.
mtcars = mtcars.assign(car_name=mtcars.index)

In [59]:
# Preview: 'am' now holds the auto/manual labels and 'car_name' mirrors the index.
mtcars.head()

Unnamed: 0,mpg,cyl,disp,hp,drat,wt,qsec,vs,am,gear,carb,car_name
Mazda RX4,21.0,6,160.0,110,3.9,2.62,16.46,0,manual,4,4,Mazda RX4
Mazda RX4 Wag,21.0,6,160.0,110,3.9,2.875,17.02,0,manual,4,4,Mazda RX4 Wag
Datsun 710,22.8,4,108.0,93,3.85,2.32,18.61,1,manual,4,1,Datsun 710
Hornet 4 Drive,21.4,6,258.0,110,3.08,3.215,19.44,1,auto,3,1,Hornet 4 Drive
Hornet Sportabout,18.7,8,360.0,175,3.15,3.44,17.02,0,auto,3,2,Hornet Sportabout


In [60]:
mtcars.dtypes  # four kinds of columns: float, int, category and object (text)

mpg          float64
cyl            int64
disp         float64
hp             int64
drat         float64
wt           float64
qsec         float64
vs          category
am          category
gear        category
carb        category
car_name      object
dtype: object

In [61]:
# Boolean mask: keep only the rows whose mpg is above 25.
mtcars.loc[mtcars['mpg'].gt(25)]

Unnamed: 0,mpg,cyl,disp,hp,drat,wt,qsec,vs,am,gear,carb,car_name
Fiat 128,32.4,4,78.7,66,4.08,2.2,19.47,1,manual,4,1,Fiat 128
Honda Civic,30.4,4,75.7,52,4.93,1.615,18.52,1,manual,4,2,Honda Civic
Toyota Corolla,33.9,4,71.1,65,4.22,1.835,19.9,1,manual,4,1,Toyota Corolla
Fiat X1-9,27.3,4,79.0,66,4.08,1.935,18.9,1,manual,4,1,Fiat X1-9
Porsche 914-2,26.0,4,120.3,91,4.43,2.14,16.7,0,manual,5,2,Porsche 914-2
Lotus Europa,30.4,4,95.1,113,3.77,1.513,16.9,1,manual,5,2,Lotus Europa


In [62]:
# Equality test on a categorical column: keep the rows labelled 'auto'.
mtcars.loc[mtcars['am'].eq('auto')]

Unnamed: 0,mpg,cyl,disp,hp,drat,wt,qsec,vs,am,gear,carb,car_name
Hornet 4 Drive,21.4,6,258.0,110,3.08,3.215,19.44,1,auto,3,1,Hornet 4 Drive
Hornet Sportabout,18.7,8,360.0,175,3.15,3.44,17.02,0,auto,3,2,Hornet Sportabout
Valiant,18.1,6,225.0,105,2.76,3.46,20.22,1,auto,3,1,Valiant
Duster 360,14.3,8,360.0,245,3.21,3.57,15.84,0,auto,3,4,Duster 360
Merc 240D,24.4,4,146.7,62,3.69,3.19,20.0,1,auto,4,2,Merc 240D
Merc 230,22.8,4,140.8,95,3.92,3.15,22.9,1,auto,4,2,Merc 230
Merc 280,19.2,6,167.6,123,3.92,3.44,18.3,1,auto,4,4,Merc 280
Merc 280C,17.8,6,167.6,123,3.92,3.44,18.9,1,auto,4,4,Merc 280C
Merc 450SE,16.4,8,275.8,180,3.07,4.07,17.4,0,auto,3,3,Merc 450SE
Merc 450SL,17.3,8,275.8,180,3.07,3.73,17.6,0,auto,3,3,Merc 450SL


In [70]:
# Combine conditions element-wise: & = AND, | = OR, ~ = NOT.
# Wrap every condition in parentheses, because & binds tighter than ==, != and >.
# 'cyl' is an integer column, so it is compared with the number 5: the string '5'
# never equals any integer, which made that condition True for every row.
mtcars[(mtcars['am'] == 'auto') & (mtcars['cyl'] != 5) & (mtcars['mpg'] > 22)]

Unnamed: 0,mpg,cyl,disp,hp,drat,wt,qsec,vs,am,gear,carb,car_name
Merc 240D,24.4,4,146.7,62,3.69,3.19,20.0,1,auto,4,2,Merc 240D
Merc 230,22.8,4,140.8,95,3.92,3.15,22.9,1,auto,4,2,Merc 230


In [72]:
# Range filter: between() includes both end points by default.
mtcars.loc[mtcars['mpg'].between(left=23, right=30)]

Unnamed: 0,mpg,cyl,disp,hp,drat,wt,qsec,vs,am,gear,carb,car_name
Merc 240D,24.4,4,146.7,62,3.69,3.19,20.0,1,auto,4,2,Merc 240D
Fiat X1-9,27.3,4,79.0,66,4.08,1.935,18.9,1,manual,4,1,Fiat X1-9
Porsche 914-2,26.0,4,120.3,91,4.43,2.14,16.7,0,manual,5,2,Porsche 914-2


In [74]:
# Same range, but with both end points excluded (strictly between 23 and 30).
mtcars.loc[mtcars['mpg'].between(left=23, right=30, inclusive='neither')]

Unnamed: 0,mpg,cyl,disp,hp,drat,wt,qsec,vs,am,gear,carb,car_name
Merc 240D,24.4,4,146.7,62,3.69,3.19,20.0,1,auto,4,2,Merc 240D
Fiat X1-9,27.3,4,79.0,66,4.08,1.935,18.9,1,manual,4,1,Fiat X1-9
Porsche 914-2,26.0,4,120.3,91,4.43,2.14,16.7,0,manual,5,2,Porsche 914-2


In [77]:
# Both conditions must hold: 'manual' cars whose mpg lies in [23, 30].
mtcars.loc[mtcars['am'].eq('manual') & mtcars['mpg'].between(left=23, right=30)]

Unnamed: 0,mpg,cyl,disp,hp,drat,wt,qsec,vs,am,gear,carb,car_name
Fiat X1-9,27.3,4,79.0,66,4.08,1.935,18.9,1,manual,4,1,Fiat X1-9
Porsche 914-2,26.0,4,120.3,91,4.43,2.14,16.7,0,manual,5,2,Porsche 914-2


In [78]:
# Membership test: keep the rows whose gear value is one of the listed values.
mtcars.loc[mtcars['gear'].isin([4, 5])]

Unnamed: 0,mpg,cyl,disp,hp,drat,wt,qsec,vs,am,gear,carb,car_name
Mazda RX4,21.0,6,160.0,110,3.9,2.62,16.46,0,manual,4,4,Mazda RX4
Mazda RX4 Wag,21.0,6,160.0,110,3.9,2.875,17.02,0,manual,4,4,Mazda RX4 Wag
Datsun 710,22.8,4,108.0,93,3.85,2.32,18.61,1,manual,4,1,Datsun 710
Merc 240D,24.4,4,146.7,62,3.69,3.19,20.0,1,auto,4,2,Merc 240D
Merc 230,22.8,4,140.8,95,3.92,3.15,22.9,1,auto,4,2,Merc 230
Merc 280,19.2,6,167.6,123,3.92,3.44,18.3,1,auto,4,4,Merc 280
Merc 280C,17.8,6,167.6,123,3.92,3.44,18.9,1,auto,4,4,Merc 280C
Fiat 128,32.4,4,78.7,66,4.08,2.2,19.47,1,manual,4,1,Fiat 128
Honda Civic,30.4,4,75.7,52,4.93,1.615,18.52,1,manual,4,2,Honda Civic
Toyota Corolla,33.9,4,71.1,65,4.22,1.835,19.9,1,manual,4,1,Toyota Corolla


In [80]:
# Either condition is enough: gear is 4 or 5, OR the car is labelled 'auto'.
mtcars.loc[mtcars['gear'].isin([4, 5]) | mtcars['am'].eq('auto')]

Unnamed: 0,mpg,cyl,disp,hp,drat,wt,qsec,vs,am,gear,carb,car_name
Mazda RX4,21.0,6,160.0,110,3.9,2.62,16.46,0,manual,4,4,Mazda RX4
Mazda RX4 Wag,21.0,6,160.0,110,3.9,2.875,17.02,0,manual,4,4,Mazda RX4 Wag
Datsun 710,22.8,4,108.0,93,3.85,2.32,18.61,1,manual,4,1,Datsun 710
Hornet 4 Drive,21.4,6,258.0,110,3.08,3.215,19.44,1,auto,3,1,Hornet 4 Drive
Hornet Sportabout,18.7,8,360.0,175,3.15,3.44,17.02,0,auto,3,2,Hornet Sportabout
Valiant,18.1,6,225.0,105,2.76,3.46,20.22,1,auto,3,1,Valiant
Duster 360,14.3,8,360.0,245,3.21,3.57,15.84,0,auto,3,4,Duster 360
Merc 240D,24.4,4,146.7,62,3.69,3.19,20.0,1,auto,4,2,Merc 240D
Merc 230,22.8,4,140.8,95,3.92,3.15,22.9,1,auto,4,2,Merc 230
Merc 280,19.2,6,167.6,123,3.92,3.44,18.3,1,auto,4,4,Merc 280


In [81]:
# ~ negates the mask: keep the rows whose gear is NOT 4 or 5.
mtcars.loc[~mtcars['gear'].isin([4, 5])]

Unnamed: 0,mpg,cyl,disp,hp,drat,wt,qsec,vs,am,gear,carb,car_name
Hornet 4 Drive,21.4,6,258.0,110,3.08,3.215,19.44,1,auto,3,1,Hornet 4 Drive
Hornet Sportabout,18.7,8,360.0,175,3.15,3.44,17.02,0,auto,3,2,Hornet Sportabout
Valiant,18.1,6,225.0,105,2.76,3.46,20.22,1,auto,3,1,Valiant
Duster 360,14.3,8,360.0,245,3.21,3.57,15.84,0,auto,3,4,Duster 360
Merc 450SE,16.4,8,275.8,180,3.07,4.07,17.4,0,auto,3,3,Merc 450SE
Merc 450SL,17.3,8,275.8,180,3.07,3.73,17.6,0,auto,3,3,Merc 450SL
Merc 450SLC,15.2,8,275.8,180,3.07,3.78,18.0,0,auto,3,3,Merc 450SLC
Cadillac Fleetwood,10.4,8,472.0,205,2.93,5.25,17.98,0,auto,3,4,Cadillac Fleetwood
Lincoln Continental,10.4,8,460.0,215,3.0,5.424,17.82,0,auto,3,4,Lincoln Continental
Chrysler Imperial,14.7,8,440.0,230,3.23,5.345,17.42,0,auto,3,4,Chrysler Imperial


### select dtypes

In [83]:
# Keep only the numeric (int and float) columns.
mtcars.select_dtypes(include=['number']).head()

Unnamed: 0,mpg,cyl,disp,hp,drat,wt,qsec
Mazda RX4,21.0,6,160.0,110,3.9,2.62,16.46
Mazda RX4 Wag,21.0,6,160.0,110,3.9,2.875,17.02
Datsun 710,22.8,4,108.0,93,3.85,2.32,18.61
Hornet 4 Drive,21.4,6,258.0,110,3.08,3.215,19.44
Hornet Sportabout,18.7,8,360.0,175,3.15,3.44,17.02


In [84]:
# Keep only the categorical columns.
mtcars.select_dtypes(include=['category']).head()

Unnamed: 0,vs,am,gear,carb
Mazda RX4,0,manual,4,4
Mazda RX4 Wag,0,manual,4,4
Datsun 710,1,manual,4,1
Hornet 4 Drive,1,auto,3,1
Hornet Sportabout,0,auto,3,2


In [85]:
# Keep only the object (text) columns.
mtcars.select_dtypes(include=['object']).head()

Unnamed: 0,car_name
Mazda RX4,Mazda RX4
Mazda RX4 Wag,Mazda RX4 Wag
Datsun 710,Datsun 710
Hornet 4 Drive,Hornet 4 Drive
Hornet Sportabout,Hornet Sportabout


### nlargest for a column

In [86]:
# The five rows with the highest mpg, largest first.
mtcars.nlargest(n=5, columns='mpg')

Unnamed: 0,mpg,cyl,disp,hp,drat,wt,qsec,vs,am,gear,carb,car_name
Toyota Corolla,33.9,4,71.1,65,4.22,1.835,19.9,1,manual,4,1,Toyota Corolla
Fiat 128,32.4,4,78.7,66,4.08,2.2,19.47,1,manual,4,1,Fiat 128
Honda Civic,30.4,4,75.7,52,4.93,1.615,18.52,1,manual,4,2,Honda Civic
Lotus Europa,30.4,4,95.1,113,3.77,1.513,16.9,1,manual,5,2,Lotus Europa
Fiat X1-9,27.3,4,79.0,66,4.08,1.935,18.9,1,manual,4,1,Fiat X1-9


### sort values

In [91]:
# Ascending order and a new (non in-place) result are the defaults of sort_values.
mtcars.sort_values('mpg').head()

Unnamed: 0,mpg,cyl,disp,hp,drat,wt,qsec,vs,am,gear,carb,car_name
Lincoln Continental,10.4,8,460.0,215,3.0,5.424,17.82,0,auto,3,4,Lincoln Continental
Cadillac Fleetwood,10.4,8,472.0,205,2.93,5.25,17.98,0,auto,3,4,Cadillac Fleetwood
Camaro Z28,13.3,8,350.0,245,3.73,3.84,15.41,0,auto,3,4,Camaro Z28
Duster 360,14.3,8,360.0,245,3.21,3.57,15.84,0,auto,3,4,Duster 360
Chrysler Imperial,14.7,8,440.0,230,3.23,5.345,17.42,0,auto,3,4,Chrysler Imperial


In [92]:
# Sort by gear first; ties within a gear are ordered by mpg (both ascending).
mtcars.sort_values(by=['gear', 'mpg'], ascending=True).head()

Unnamed: 0,mpg,cyl,disp,hp,drat,wt,qsec,vs,am,gear,carb,car_name
Cadillac Fleetwood,10.4,8,472.0,205,2.93,5.25,17.98,0,auto,3,4,Cadillac Fleetwood
Lincoln Continental,10.4,8,460.0,215,3.0,5.424,17.82,0,auto,3,4,Lincoln Continental
Camaro Z28,13.3,8,350.0,245,3.73,3.84,15.41,0,auto,3,4,Camaro Z28
Duster 360,14.3,8,360.0,245,3.21,3.57,15.84,0,auto,3,4,Duster 360
Chrysler Imperial,14.7,8,440.0,230,3.23,5.345,17.42,0,auto,3,4,Chrysler Imperial


In [93]:
# One direction per key: gear descending, then mpg ascending within each gear.
mtcars.sort_values(by=['gear', 'mpg'], ascending=[False, True]).head()

Unnamed: 0,mpg,cyl,disp,hp,drat,wt,qsec,vs,am,gear,carb,car_name
Maserati Bora,15.0,8,301.0,335,3.54,3.57,14.6,0,manual,5,8,Maserati Bora
Ford Pantera L,15.8,8,351.0,264,4.22,3.17,14.5,0,manual,5,4,Ford Pantera L
Ferrari Dino,19.7,6,145.0,175,3.62,2.77,15.5,0,manual,5,6,Ferrari Dino
Porsche 914-2,26.0,4,120.3,91,4.43,2.14,16.7,0,manual,5,2,Porsche 914-2
Lotus Europa,30.4,4,95.1,113,3.77,1.513,16.9,1,manual,5,2,Lotus Europa


### Pivot Table
- index= → What goes down the rows
- columns= → What goes across the top
- values= → What fills the table
- aggfunc= → How to summarize if there are duplicates

In [105]:
# aggfunc is left out on purpose: pivot_table then uses its default, the mean.
mtcars.pivot_table(
    index='gear',
    values='mpg',
    observed=False,
)

Unnamed: 0_level_0,mpg
gear,Unnamed: 1_level_1
3,16.106667
4,24.533333
5,21.38


In [99]:
# Same table as above, with the aggregation spelled out explicitly.
mtcars.pivot_table(
    index='gear',      # rows
    values='mpg',      # column being summarised
    aggfunc='mean',    # how each group is summarised
    observed=False,
)

Unnamed: 0_level_0,mpg
gear,Unnamed: 1_level_1
3,16.106667
4,24.533333
5,21.38


In [96]:
# Mean mpg for every gear (rows) x am (columns) combination.
pivot = mtcars.pivot_table(
    values='mpg',
    index='gear',
    columns='am',
    aggfunc='mean',
    observed=False,
)

In [97]:
# Display the gear x am table; combinations with no cars show up as NaN.
pivot

am,auto,manual
gear,Unnamed: 1_level_1,Unnamed: 2_level_1
3,16.106667,
4,21.05,26.275
5,,21.38


In [104]:
# Several aggregations at once: the result gets one column block per function.
mtcars.pivot_table(
    values='mpg',
    index='gear',
    columns='am',
    aggfunc=['mean', 'max', 'min', 'count'],
    observed=False,
)

Unnamed: 0_level_0,mean,mean,max,max,min,min,count,count
am,auto,manual,auto,manual,auto,manual,auto,manual
gear,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
3,16.106667,,21.5,,10.4,,15,0
4,21.05,26.275,24.4,33.9,17.8,21.0,4,8
5,,21.38,,30.4,,15.0,0,5


##  Advanced - 2
- wide, long
- missing values
- airquality dataset is perfect
    - Wide format: Columns for Ozone, Solar.R, Wind, Temp, Month, Day
    - Has missing values in Ozone and Solar.R → great for .isnull(), .fillna(), .dropna() practice
    - Can be melted into long format to have "variable" and "value" columns

In [107]:
# `data` was already imported from pydataset near the top of the notebook;
# the import is repeated here so this section can be read on its own.
from pydataset import data

# Load the 'airquality' sample dataset and preview its first rows.
aq = data('airquality')
aq.head()

Unnamed: 0,Ozone,Solar.R,Wind,Temp,Month,Day
1,41.0,190.0,7.4,67,5,1
2,36.0,118.0,8.0,72,5,2
3,12.0,149.0,12.6,74,5,3
4,18.0,313.0,11.5,62,5,4
5,,,14.3,56,5,5


In [109]:
aq.info()   # 153 rows, 6 columns; the non-null counts reveal gaps in Ozone and Solar.R

<class 'pandas.core.frame.DataFrame'>
Index: 153 entries, 1 to 153
Data columns (total 6 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   Ozone    116 non-null    float64
 1   Solar.R  146 non-null    float64
 2   Wind     153 non-null    float64
 3   Temp     153 non-null    int64  
 4   Month    153 non-null    int64  
 5   Day      153 non-null    int64  
dtypes: float64(3), int64(3)
memory usage: 8.4 KB


In [110]:
print(aq.isna().sum())  # number of missing values in each column

Ozone      37
Solar.R     7
Wind        0
Temp        0
Month       0
Day         0
dtype: int64


In [111]:
# long: stack the four measurement columns into (Measurement, Value) pairs,
# keeping Month and Day as the identifying columns
aq_long = pd.melt(
    aq,
    id_vars=['Month', 'Day'],
    value_vars=['Ozone', 'Solar.R', 'Wind', 'Temp'],
    var_name='Measurement',
    value_name='Value',
)

print(aq_long.head())

   Month  Day Measurement  Value
0      5    1       Ozone   41.0
1      5    2       Ozone   36.0
2      5    3       Ozone   12.0
3      5    4       Ozone   18.0
4      5    5       Ozone    NaN


In [113]:
# wide: spread the Measurement labels back out into one column each
aq_wide = pd.pivot_table(
    aq_long,
    values='Value',
    index=['Month', 'Day'],
    columns='Measurement',
).reset_index()

print(aq_wide.head())

Measurement  Month  Day  Ozone  Solar.R  Temp  Wind
0                5    1   41.0    190.0  67.0   7.4
1                5    2   36.0    118.0  72.0   8.0
2                5    3   12.0    149.0  74.0  12.6
3                5    4   18.0    313.0  62.0  11.5
4                5    5    NaN      NaN  56.0  14.3


In [114]:
# missing data
# Option 1: discard every row that has at least one missing value.
aq_drop = aq.dropna(axis=0, how='any')

# Option 2: keep all rows and replace missing Ozone readings with the Ozone mean.
aq_fill = aq.assign(Ozone=aq['Ozone'].fillna(aq['Ozone'].mean()))

In [116]:
# Show only the first rows instead of the whole 153-row frame; the missing
# Ozone values (e.g. rows 5 and 10) now hold the column mean.
aq_fill.head(10)

Unnamed: 0,Ozone,Solar.R,Wind,Temp,Month,Day
1,41.0,190.0,7.4,67,5,1
2,36.0,118.0,8.0,72,5,2
3,12.0,149.0,12.6,74,5,3
4,18.0,313.0,11.5,62,5,4
5,42.12931,,14.3,56,5,5
6,28.0,,14.9,66,5,6
7,23.0,299.0,8.6,65,5,7
8,19.0,99.0,13.8,59,5,8
9,8.0,19.0,20.1,61,5,9
10,42.12931,194.0,8.6,69,5,10


## Practise with Student Data
- https://docs.google.com/spreadsheets/d/1jPk4sZyDn5NSIQ4iQK1P3aGh5ZAS2oaYexRoUyocAw0
- sheet_id = '1ABCDefGhIJklMNopQRstuVWxyz12345678'
- gid1 = 'pData1'
- gid2 = 'sData1'
- url = f"https://docs.google.com/spreadsheets/d/{sheet_id}/export?format=csv&gid={gid}"
- df = pd.read_csv(url)

In [128]:
# Spreadsheet id (the long token in the document URL) and the gid values used
# for the pData1 and sData1 sheets.
sheet_id = '1jPk4sZyDn5NSIQ4iQK1P3aGh5ZAS2oaYexRoUyocAw0'
gid1 = '132995188' # pData1
gid2 = '330609257' # sData1

In [132]:
# CSV-export URL for the pData1 sheet (gid1); printed so it can be checked by eye.
url1 = f"https://docs.google.com/spreadsheets/d/{sheet_id}/export?format=csv&gid={gid1}"
print(url1)

https://docs.google.com/spreadsheets/d/1jPk4sZyDn5NSIQ4iQK1P3aGh5ZAS2oaYexRoUyocAw0/export?format=csv&gid=132995188


In [137]:
# CSV-export URL for the sData1 sheet (gid2); printed so it can be checked by eye.
url2 = f"https://docs.google.com/spreadsheets/d/{sheet_id}/export?format=csv&gid={gid2}"
print(url2)

https://docs.google.com/spreadsheets/d/1jPk4sZyDn5NSIQ4iQK1P3aGh5ZAS2oaYexRoUyocAw0/export?format=csv&gid=330609257


In [138]:
# Download the pData1 sheet straight from its CSV-export URL.
pdata = pd.read_csv(filepath_or_buffer=url1)

In [139]:
# Preview pData1: one row per student, keyed by rollno.
pdata.head()

Unnamed: 0,rollno,name,gender,dob,height,program
0,1,Student_1,Male,2000-08-16,156,BCA
1,2,Student_2,,2000-02-21,170,BBA
2,3,Student_3,Female,2001-07-17,158,BCA
3,4,Student_4,Female,2001-05-16,188,BBA
4,5,Student_5,Male,2001-04-02,167,BCA


In [140]:
# Download the sData1 sheet (marks per subject) and preview it.
sdata = pd.read_csv(filepath_or_buffer=url2)
sdata.head()

Unnamed: 0,rollno,maths,english,stats,python
0,1,,,63.0,40.0
1,2,83.0,47.0,63.0,50.0
2,3,90.0,56.0,47.0,74.0
3,4,74.0,72.0,98.0,
4,5,81.0,78.0,,80.0


In [141]:
## Join Data
# Inner join on rollno: only students present in both sheets are kept.
merged = pdata.merge(sdata, how='inner', on='rollno')
merged.head()

Unnamed: 0,rollno,name,gender,dob,height,program,maths,english,stats,python
0,1,Student_1,Male,2000-08-16,156,BCA,,,63.0,40.0
1,2,Student_2,,2000-02-21,170,BBA,83.0,47.0,63.0,50.0
2,3,Student_3,Female,2001-07-17,158,BCA,90.0,56.0,47.0,74.0
3,4,Student_4,Female,2001-05-16,188,BBA,74.0,72.0,98.0,
4,5,Student_5,Male,2001-04-02,167,BCA,81.0,78.0,,80.0


In [149]:
## Melt data -  Wide to Long
# One output row per (student, subject) pair instead of one column per subject.
subvar = ['maths', 'english', 'stats', 'python']      # columns to stack
idvar = ['rollno', 'name', 'gender', 'program']       # columns repeated on every row
marks_long = pd.melt(
    merged,
    id_vars=idvar,
    value_vars=subvar,
    var_name='subject',
    value_name='marks',
)

In [150]:
# Preview the long table: the subject name and its mark are now two columns.
marks_long.head()

Unnamed: 0,rollno,name,gender,program,subject,marks
0,1,Student_1,Male,BCA,maths,
1,2,Student_2,,BBA,maths,83.0
2,3,Student_3,Female,BCA,maths,90.0
3,4,Student_4,Female,BBA,maths,74.0
4,5,Student_5,Male,BCA,maths,81.0


In [151]:
# Long to Wide
# aggfunc="first" takes the first mark found for each (rollno, name, subject).
marks_wide = pd.pivot_table(
    marks_long,
    values="marks",
    index=["rollno", "name"],
    columns="subject",
    aggfunc="first",
).reset_index()

In [153]:
# Preview the wide table: one column per subject again, in alphabetical order.
marks_wide.head()

subject,rollno,name,english,maths,python,stats
0,1,Student_1,,,40.0,63.0
1,2,Student_2,47.0,83.0,50.0,63.0
2,3,Student_3,56.0,90.0,74.0,47.0
3,4,Student_4,72.0,74.0,,98.0
4,5,Student_5,78.0,81.0,80.0,


In [154]:
## Missing Data
# Count the missing entries in every column of the merged frame.
missing_summary = merged.isna().sum()
missing_summary

rollno     0
name       0
gender     2
dob        0
height     0
program    0
maths      4
english    3
stats      5
python     2
dtype: int64

In [156]:
# Look at the rows again before filling: gender and several marks are missing.
merged.head()

Unnamed: 0,rollno,name,gender,dob,height,program,maths,english,stats,python
0,1,Student_1,Male,2000-08-16,156,BCA,,,63.0,40.0
1,2,Student_2,,2000-02-21,170,BBA,83.0,47.0,63.0,50.0
2,3,Student_3,Female,2001-07-17,158,BCA,90.0,56.0,47.0,74.0
3,4,Student_4,Female,2001-05-16,188,BBA,74.0,72.0,98.0,
4,5,Student_5,Male,2001-04-02,167,BCA,81.0,78.0,,80.0


In [155]:
# Work on a separate frame so `merged` keeps its missing values;
# a missing gender becomes the explicit label "Not Specified".
merged2 = merged.assign(gender=merged["gender"].fillna("Not Specified"))
merged2.head()

Unnamed: 0,rollno,name,gender,dob,height,program,maths,english,stats,python
0,1,Student_1,Male,2000-08-16,156,BCA,,,63.0,40.0
1,2,Student_2,Not Specified,2000-02-21,170,BBA,83.0,47.0,63.0,50.0
2,3,Student_3,Female,2001-07-17,158,BCA,90.0,56.0,47.0,74.0
3,4,Student_4,Female,2001-05-16,188,BBA,74.0,72.0,98.0,
4,5,Student_5,Male,2001-04-02,167,BCA,81.0,78.0,,80.0


In [159]:
# Fill each subject's missing marks with that subject's own mean.
# `subvar` is the list of subject columns defined in the melt cell; the loop
# used here before iterated over `subjects`, a name that is never defined in
# this notebook and therefore raises NameError on a fresh kernel.
# fillna() with a Series of per-column means fills every column with its own value.
merged2[subvar] = merged2[subvar].fillna(merged2[subvar].mean())
merged2.head()  # missing maths, english, stats and python values are now filled

Unnamed: 0,rollno,name,gender,dob,height,program,maths,english,stats,python
0,1,Student_1,Male,2000-08-16,156,BCA,73.884615,65.0,63.0,40.0
1,2,Student_2,Not Specified,2000-02-21,170,BBA,83.0,47.0,63.0,50.0
2,3,Student_3,Female,2001-07-17,158,BCA,90.0,56.0,47.0,74.0
3,4,Student_4,Female,2001-05-16,188,BBA,74.0,72.0,98.0,68.178571
4,5,Student_5,Male,2001-04-02,167,BCA,81.0,78.0,69.44,80.0


### Missing Values
- Remove Rows or Columns

In [160]:
# The subject columns that are checked for missing marks below.
subvar

['maths', 'english', 'stats', 'python']

In [161]:
# Remove rows: drop every student with a missing mark in any subject column.
merged3R = merged.dropna(axis='index', how='any', subset=subvar)

In [162]:
merged3R.shape  # (rows, columns) - fewer rows than `merged` after the drop

(19, 10)

In [165]:
# Share of missing values per column (0.10 means 10% of the rows are missing).
merged.isna().mean()

rollno     0.000000
name       0.000000
gender     0.066667
dob        0.000000
height     0.000000
program    0.000000
maths      0.133333
english    0.100000
stats      0.166667
python     0.066667
dtype: float64

In [163]:
# Remove columns: keep only those whose share of missing values is at most 30%.
threshold = 0.3
merged3C = merged.loc[:, merged.isna().mean().le(threshold)]

In [164]:
# Show only the first rows instead of the whole frame. No column exceeds the
# 30% threshold, so every column is still present.
merged3C.head(10)

Unnamed: 0,rollno,name,gender,dob,height,program,maths,english,stats,python
0,1,Student_1,Male,2000-08-16,156,BCA,,,63.0,40.0
1,2,Student_2,,2000-02-21,170,BBA,83.0,47.0,63.0,50.0
2,3,Student_3,Female,2001-07-17,158,BCA,90.0,56.0,47.0,74.0
3,4,Student_4,Female,2001-05-16,188,BBA,74.0,72.0,98.0,
4,5,Student_5,Male,2001-04-02,167,BCA,81.0,78.0,,80.0
5,6,Student_6,Male,2000-10-12,153,BCA,67.0,46.0,48.0,47.0
6,7,Student_7,Male,2000-07-28,174,BBA,51.0,73.0,72.0,87.0
7,8,Student_8,Female,2000-06-27,163,BCA,94.0,62.0,63.0,76.0
8,9,Student_9,Female,2002-05-14,158,BBA,74.0,83.0,79.0,61.0
9,10,Student_10,Female,2000-03-06,175,BCA,66.0,74.0,40.0,74.0


### Lambda Function
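- a lambda is a small anonymous function; pass it to `Series.apply` to transform each value, or to `DataFrame.assign` to build a new column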

In [195]:
# NOTE(review): the saved output of this cell (In [195]) already shows the
# age / ageYM columns, which are only created in the "Dates" section further
# down (In [180], In [183]). On a fresh top-to-bottom run they will not be here.
merged.head()

Unnamed: 0,rollno,name,gender,dob,height,program,maths,english,stats,python,age,ageYM
0,1,Student_1,Male,2000-08-16,156,BCA,,,63.0,40.0,24,"24 years, 11 months"
1,2,Student_2,,2000-02-21,170,BBA,83.0,47.0,63.0,50.0,25,"25 years, 5 months"
2,3,Student_3,Female,2001-07-17,158,BCA,90.0,56.0,47.0,74.0,24,"24 years, 0 months"
3,4,Student_4,Female,2001-05-16,188,BBA,74.0,72.0,98.0,,24,"24 years, 2 months"
4,5,Student_5,Male,2001-04-02,167,BCA,81.0,78.0,,80.0,24,"24 years, 4 months"


In [194]:
# Apply a lambda to each value: scale every maths mark by 1.15, leaving missing marks untouched.
merged["maths"].map(lambda mark: mark if pd.isnull(mark) else mark * 1.15).head()

0       NaN
1     95.45
2    103.50
3     85.10
4     93.15
Name: maths, dtype: float64

In [193]:
# assign() calls the lambda with the frame being assigned to, so the column is
# computed from that argument (`df`). The lambda used to ignore `df` and read
# the global `merged`, which breaks as soon as assign() is used on any other
# frame or inside a method chain.
merged.assign(englishU=lambda df: df["english"] * 1.10).head()

Unnamed: 0,rollno,name,gender,dob,height,program,maths,english,stats,python,age,ageYM,englishU
0,1,Student_1,Male,2000-08-16,156,BCA,,,63.0,40.0,24,"24 years, 11 months",
1,2,Student_2,,2000-02-21,170,BBA,83.0,47.0,63.0,50.0,25,"25 years, 5 months",51.7
2,3,Student_3,Female,2001-07-17,158,BCA,90.0,56.0,47.0,74.0,24,"24 years, 0 months",61.6
3,4,Student_4,Female,2001-05-16,188,BBA,74.0,72.0,98.0,,24,"24 years, 2 months",79.2
4,5,Student_5,Male,2001-04-02,167,BCA,81.0,78.0,,80.0,24,"24 years, 4 months",85.8


## Dates
- pd.Timestamp(date.today()) → current date
- errors="coerce" → invalid dates become NaT (Not a Time) instead of crashing.
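- Pandas auto-detects most formats (YYYY-MM-DD, DD/MM/YYYY, etc.), but you can force a format: `pd.to_datetime(merged['dob'], format="%Y-%m-%d")`
- Subtract DOB from today → a Timedelta column; `.dt.days` gives the difference in whole days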
- To calculate age in years, you need to divide the timedelta by a constant (e.g., 365.25 days) or use dateutil.relativedelta for an exact year/month calculation.

In [182]:
import pandas as pd
from datetime import date
from dateutil.relativedelta import relativedelta

In [167]:
# Starting point for the date work: 'dob' is displayed as YYYY-MM-DD.
merged.head()

Unnamed: 0,rollno,name,gender,dob,height,program,maths,english,stats,python
0,1,Student_1,Male,2000-08-16,156,BCA,,,63.0,40.0
1,2,Student_2,,2000-02-21,170,BBA,83.0,47.0,63.0,50.0
2,3,Student_3,Female,2001-07-17,158,BCA,90.0,56.0,47.0,74.0
3,4,Student_4,Female,2001-05-16,188,BBA,74.0,72.0,98.0,
4,5,Student_5,Male,2001-04-02,167,BCA,81.0,78.0,,80.0


In [175]:
# 'dob' is still dtype object (plain text) here, so the .dt accessor is not usable yet.
merged.dtypes

rollno       int64
name        object
gender      object
dob         object
height       int64
program     object
maths      float64
english    float64
stats      float64
python     float64
dtype: object

In [176]:
# Parse the text dates into datetime64; errors="coerce" turns any value that
# cannot be parsed into NaT instead of raising.
merged['dob'] = pd.to_datetime(merged['dob'], errors="coerce")
# Alternative with an explicit format: pd.to_datetime(merged['dob'], format="%Y-%m-%d")
merged.dtypes

rollno              int64
name               object
gender             object
dob        datetime64[ns]
height              int64
program            object
maths             float64
english           float64
stats             float64
python            float64
dtype: object

In [179]:
# The values look the same as before, but 'dob' is now datetime64 rather than text.
merged.head()

Unnamed: 0,rollno,name,gender,dob,height,program,maths,english,stats,python
0,1,Student_1,Male,2000-08-16,156,BCA,,,63.0,40.0
1,2,Student_2,,2000-02-21,170,BBA,83.0,47.0,63.0,50.0
2,3,Student_3,Female,2001-07-17,158,BCA,90.0,56.0,47.0,74.0
3,4,Student_4,Female,2001-05-16,188,BBA,74.0,72.0,98.0,
4,5,Student_5,Male,2001-04-02,167,BCA,81.0,78.0,,80.0


In [177]:
# Reference date for the age calculations: today's date as a pandas Timestamp
# (time part 00:00:00), so it can be subtracted from the datetime64 'dob' column.
today = pd.Timestamp(date.today())
today

Timestamp('2025-08-08 00:00:00')

In [180]:
# Age in completed years: the difference in calendar years, minus one when this
# year's birthday has not happened yet. Floor-dividing the day count by 365
# ignores leap days, so it reported one year too many during the last few days
# before a birthday and could disagree with the relativedelta-based ageYM column.
# A missing dob (NaT) still yields a missing age.
birth_month, birth_day = merged["dob"].dt.month, merged["dob"].dt.day
birthday_pending = (birth_month > today.month) | (
    (birth_month == today.month) & (birth_day > today.day)
)
merged["age"] = today.year - merged["dob"].dt.year - birthday_pending.astype(int)
merged.head()

Unnamed: 0,rollno,name,gender,dob,height,program,maths,english,stats,python,age
0,1,Student_1,Male,2000-08-16,156,BCA,,,63.0,40.0,24
1,2,Student_2,,2000-02-21,170,BBA,83.0,47.0,63.0,50.0,25
2,3,Student_3,Female,2001-07-17,158,BCA,90.0,56.0,47.0,74.0,24
3,4,Student_4,Female,2001-05-16,188,BBA,74.0,72.0,98.0,,24
4,5,Student_5,Male,2001-04-02,167,BCA,81.0,78.0,,80.0,24


In [183]:
def _age_years_months(dob, as_of):
    """Format the age at ``as_of`` as "<y> years, <m> months".

    Returns None when ``dob`` is missing (NaT/NaN).
    """
    if pd.isnull(dob):
        return None
    # Build the relativedelta once and read both fields from the same object,
    # instead of constructing it twice per row.
    elapsed = relativedelta(as_of, dob)
    return f"{elapsed.years} years, {elapsed.months} months"


# Measure from the single `today` reference date computed earlier, so this
# column and `age` are based on the same day.
merged["ageYM"] = merged["dob"].apply(lambda dob: _age_years_months(dob, today))
merged.head()

Unnamed: 0,rollno,name,gender,dob,height,program,maths,english,stats,python,age,ageYM
0,1,Student_1,Male,2000-08-16,156,BCA,,,63.0,40.0,24,"24 years, 11 months"
1,2,Student_2,,2000-02-21,170,BBA,83.0,47.0,63.0,50.0,25,"25 years, 5 months"
2,3,Student_3,Female,2001-07-17,158,BCA,90.0,56.0,47.0,74.0,24,"24 years, 0 months"
3,4,Student_4,Female,2001-05-16,188,BBA,74.0,72.0,98.0,,24,"24 years, 2 months"
4,5,Student_5,Male,2001-04-02,167,BCA,81.0,78.0,,80.0,24,"24 years, 4 months"


In [187]:
# students whose birthday falls in February (month number 2)
merged[merged["dob"].dt.month == 2]

Unnamed: 0,rollno,name,gender,dob,height,program,maths,english,stats,python,age,ageYM
1,2,Student_2,,2000-02-21,170,BBA,83.0,47.0,63.0,50.0,25,"25 years, 5 months"
15,16,Student_16,Male,2000-02-24,184,BBA,78.0,88.0,,,25,"25 years, 5 months"
16,17,Student_17,Male,2001-02-11,163,BBA,71.0,43.0,69.0,76.0,24,"24 years, 5 months"


## END HERE
- now practise aggregations and groupby
- time series