## Data Manipulation and Analysis with Pandas

In [2]:
import pandas as pd 

In [3]:
# Load the dataset from the relative path 'data.csv'.
df = pd.read_csv('data.csv')
# Preview the first five rows (head() defaults to n=5).
df.head()

Unnamed: 0,Duration,Pulse,Maxpulse,Calories
0,60,110,130,409.1
1,60,117,145,479.0
2,60,103,135,340.0
3,45,109,175,282.4
4,45,117,148,406.0


In [4]:
# Preview the last five rows (tail() defaults to n=5).
df.tail()

Unnamed: 0,Duration,Pulse,Maxpulse,Calories
164,60,105,140,290.8
165,60,110,145,300.0
166,60,115,145,310.2
167,75,120,150,320.4
168,75,125,150,330.4


In [5]:
# Summary statistics (count, mean, std, min, quartiles, max) per numeric column.
# A lower count for a column than for the others indicates missing values in it.
df.describe()

Unnamed: 0,Duration,Pulse,Maxpulse,Calories
count,169.0,169.0,169.0,164.0
mean,63.846154,107.461538,134.047337,375.790244
std,42.299949,14.510259,16.450434,266.379919
min,15.0,80.0,100.0,50.3
25%,45.0,100.0,124.0,250.925
50%,60.0,105.0,131.0,318.6
75%,60.0,111.0,141.0,387.6
max,300.0,159.0,184.0,1860.4


In [7]:
# Show the dtype pandas inferred for each column.
df.dtypes

Duration      int64
Pulse         int64
Maxpulse      int64
Calories    float64
dtype: object

In [11]:
## Handling missing values
# Flag, per column, whether it contains at least one missing value.
df.isna().any()

Duration    False
Pulse       False
Maxpulse    False
Calories     True
dtype: bool

In [12]:
# Column-wise totals (one value per column).
df.sum(axis=0)

Duration    10790.0
Pulse       18161.0
Maxpulse    22654.0
Calories    61629.6
dtype: float64

In [13]:
# Count the missing values in each column.
df.isna().sum()

Duration    0
Pulse       0
Maxpulse    0
Calories    5
dtype: int64

In [15]:
# Build a copy of the frame with every missing value replaced by 0;
# `df` itself is not modified.
df_filled = df.fillna(value=0)

In [18]:
## Fill the missing Calories entries with the column mean
# The filled values go into a new column named 'Category_fillNA' (derived from
# 'Calories', despite the name); the 'Calories' column itself is not overwritten.
calories_mean = df['Calories'].mean()
df['Category_fillNA'] = df['Calories'].fillna(value=calories_mean)
df


Unnamed: 0,Duration,Pulse,Maxpulse,Calories,Category_fillNA
0,60,110,130,409.1,409.1
1,60,117,145,479.0,479.0
2,60,103,135,340.0,340.0
3,45,109,175,282.4,282.4
4,45,117,148,406.0,406.0
...,...,...,...,...,...
164,60,105,140,290.8,290.8
165,60,110,145,300.0,300.0
166,60,115,145,310.2,310.2
167,75,120,150,320.4,320.4


In [19]:
## Renaming columns
# DataFrame.rename returns a new frame and leaves `df` untouched, so the
# result is kept under its own name instead of being discarded once the cell
# has rendered. `df` still carries the 'Category_fillNA' column afterwards.
df_renamed = df.rename(columns={'Category_fillNA': 'Calories_fillNA'})
df_renamed

Unnamed: 0,Duration,Pulse,Maxpulse,Calories,Calories_fillNA
0,60,110,130,409.1,409.1
1,60,117,145,479.0,479.0
2,60,103,135,340.0,340.0
3,45,109,175,282.4,282.4
4,45,117,148,406.0,406.0
...,...,...,...,...,...
164,60,105,140,290.8,290.8
165,60,110,145,300.0,300.0
166,60,115,145,310.2,310.2
167,75,120,150,320.4,320.4


In [26]:
## Change datatypes
# Copy 'Pulse' into a new 'Value_new' column cast to int64. The df.dtypes
# output earlier in this notebook already lists Pulse as int64, so the values
# are unchanged by the cast.
pulse_int64 = df['Pulse'].astype('int64')
df['Value_new'] = pulse_int64
df.head(5)

Unnamed: 0,Duration,Pulse,Maxpulse,Calories,Category_fillNA,Value_new
0,60,110,130,409.1,409.1,110
1,60,117,145,479.0,479.0,117
2,60,103,135,340.0,340.0,103
3,45,109,175,282.4,282.4,109
4,45,117,148,406.0,406.0,117


In [27]:
## Data aggregation and grouping
# Preview the first five rows before grouping (head() defaults to n=5).
df.head()

Unnamed: 0,Duration,Pulse,Maxpulse,Calories,Category_fillNA,Value_new
0,60,110,130,409.1,409.1,110
1,60,117,145,479.0,479.0,117
2,60,103,135,340.0,340.0,103
3,45,109,175,282.4,282.4,109
4,45,117,148,406.0,406.0,117


In [29]:
# Mean of 'Calories' for each distinct 'Duration' value.
grouped_mean = (
    df.groupby(by='Duration')['Calories']
    .mean()
)
grouped_mean

Duration
15       87.350000
20      151.600000
25      244.200000
30      192.125000
45      273.236364
60      339.675000
75      325.400000
80      643.100000
90      541.800000
120     666.833333
150     939.400000
160     943.700000
180     733.600000
210    1618.200000
270    1729.000000
300    1500.200000
Name: Calories, dtype: float64

In [34]:
# Total 'Calories' for each (Duration, Pulse) pair; the result is indexed by
# both grouping columns.
grouped_sum = (
    df.groupby(by=['Duration', 'Pulse'])['Calories']
    .sum()
)
grouped_sum

Duration  Pulse
15        80         50.5
          124       124.2
20        83         50.3
          95         77.7
          106       110.4
                    ...  
180       101       600.1
210       108      1376.0
          137      1860.4
270       100      1729.0
300       108      1500.2
Name: Calories, Length: 94, dtype: float64

In [36]:
## Multiple aggregate functions
# Compute several statistics of 'Calories' per 'Duration' in one pass; each
# function name becomes a column of the result.
grouped_agg = (
    df.groupby(by='Duration')['Calories']
    .agg(['mean', 'sum', 'min', 'max'])
)
grouped_agg

Unnamed: 0_level_0,mean,sum,min,max
Duration,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
15,87.35,174.7,50.5,124.2
20,151.6,1364.4,50.3,229.4
25,244.2,244.2,244.2,244.2
30,192.125,3074.0,86.2,319.2
45,273.236364,9016.8,100.7,406.0
60,339.675,25815.3,215.2,486.0
75,325.4,650.8,320.4,330.4
80,643.1,643.1,643.1,643.1
90,541.8,4334.4,466.4,700.0
120,666.833333,2000.5,500.0,1000.1


In [37]:
## Merging and Joining DataFrames
# Build two small sample frames that share a 'Key' column: keys B, C and D
# appear in both, A only in df1 and E only in df2.
df1 = pd.DataFrame(
    data={'Key': list('ABCD'), 'Value1': [10, 20, 30, 40]},
)
df2 = pd.DataFrame(
    data={'Key': list('BCDE'), 'Value2': [100, 200, 300, 400]},
)


In [38]:
# Display the first sample frame (columns 'Key' and 'Value1').
df1

Unnamed: 0,Key,Value1
0,A,10
1,B,20
2,C,30
3,D,40


In [None]:
# Display the second sample frame (columns 'Key' and 'Value2').
df2

Unnamed: 0,Key,Value2
0,B,100
1,C,200
2,D,300
3,E,400


In [40]:
## Merge DataFrames on the 'Key' column
# Inner join: keep only the keys present in both frames.
df1.merge(df2, on='Key', how='inner')

Unnamed: 0,Key,Value1,Value2
0,B,20,100
1,C,30,200
2,D,40,300


In [41]:
# Outer join: keep every key from either frame; values missing on one side
# come back as NaN.
df1.merge(df2, on='Key', how='outer')

Unnamed: 0,Key,Value1,Value2
0,A,10.0,
1,B,20.0,100.0
2,C,30.0,200.0
3,D,40.0,300.0
4,E,,400.0


In [42]:
# Left join: keep every key from df1; 'Value2' is NaN where df2 has no match.
df1.merge(df2, on='Key', how='left')

Unnamed: 0,Key,Value1,Value2
0,A,10,
1,B,20,100.0
2,C,30,200.0
3,D,40,300.0


In [43]:
# Right join: keep every key from df2; 'Value1' is NaN where df1 has no match.
df1.merge(df2, on='Key', how='right')

Unnamed: 0,Key,Value1,Value2
0,B,20.0,100
1,C,30.0,200
2,D,40.0,300
3,E,,400
