In [2]:
import pandas as pd

# Reading the CSV file

In [3]:
# Load the raw data; the relative path is resolved against the current working directory.
df = pd.read_csv('fellowship_data.csv')

In [4]:
# Preview the data. df.info() reports 32 rows, so asking for 50 shows the whole frame.
df.head(50)

Unnamed: 0,Duration,Date,Pulse,Maxpulse,Calories
0,60,'2020/12/01',110,130,409.1
1,60,'2020/12/02',117,145,479.0
2,60,'2020/12/03',103,135,340.0
3,45,'2020/12/04',109,175,282.4
4,45,'2020/12/05',117,148,406.0
5,60,'2020/12/06',102,127,300.0
6,60,'2020/12/07',110,136,374.0
7,450,'2020/12/08',104,134,253.3
8,30,'2020/12/09',109,133,195.1
9,60,'2020/12/10',98,124,269.0


In [5]:
# Column overview: non-null counts reveal which columns have missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32 entries, 0 to 31
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Duration  32 non-null     int64  
 1   Date      31 non-null     object 
 2   Pulse     32 non-null     int64  
 3   Maxpulse  32 non-null     int64  
 4   Calories  30 non-null     float64
dtypes: float64(1), int64(3), object(1)
memory usage: 1.4+ KB


In [6]:
# Summary statistics for the numeric columns. In the recorded output Duration's
# max (450) sits far above its 75th percentile (60), which flags a likely outlier.
df.describe()

Unnamed: 0,Duration,Pulse,Maxpulse,Calories
count,32.0,32.0,32.0,30.0
mean,68.4375,103.5,128.5,304.68
std,70.039591,7.832933,12.998759,66.003779
min,30.0,90.0,101.0,195.1
25%,60.0,100.0,120.0,250.7
50%,60.0,102.5,127.5,291.2
75%,60.0,106.5,132.25,343.975
max,450.0,130.0,175.0,479.0


# Checking the datatypes

In [7]:
# Date is reported as object in the recorded output, i.e. it is not yet a datetime column.
df.dtypes

Duration      int64
Date         object
Pulse         int64
Maxpulse      int64
Calories    float64
dtype: object

# Converting date column into datetime

In [8]:
# Cast the Date column to datetime. The unit is spelled out because pandas 2.x
# rejects the unit-less 'datetime64' alias in astype() with a TypeError.
# Bracket access is used so the assignment can never be mistaken for setting
# a plain attribute on the DataFrame object.
df['Date'] = df['Date'].astype('datetime64[ns]')

# Alternative: df['Date'] = pd.to_datetime(df['Date'])

In [10]:
# Confirm that the Date column now has a datetime dtype.
df.dtypes

Duration             int64
Date        datetime64[ns]
Pulse                int64
Maxpulse             int64
Calories           float64
dtype: object

In [12]:
def convert_to_date(date):
    """Parse ``date`` with ``pd.to_datetime`` using the ``%Y/%m/%d`` layout.

    Args:
        date: The value to convert, e.g. the string ``'2020/12/01'``.

    Returns:
        Whatever ``pd.to_datetime`` produces for ``date``.
    """
    parse_options = {'format': '%Y/%m/%d'}
    return pd.to_datetime(date, **parse_options)

# Convert the whole column in one vectorised call: convert_to_date forwards its
# argument to pd.to_datetime, which accepts a Series directly, so a per-element
# .apply() is unnecessary.
df['Date'] = convert_to_date(df['Date'])

df

Unnamed: 0,Duration,Date,Pulse,Maxpulse,Calories
0,60,2020-12-01,110,130,409.1
1,60,2020-12-02,117,145,479.0
2,60,2020-12-03,103,135,340.0
3,45,2020-12-04,109,175,282.4
4,45,2020-12-05,117,148,406.0
5,60,2020-12-06,102,127,300.0
6,60,2020-12-07,110,136,374.0
7,450,2020-12-08,104,134,253.3
8,30,2020-12-09,109,133,195.1
9,60,2020-12-10,98,124,269.0


# Find if there are any duplicates

In [13]:
# Show only the rows that repeat an earlier row in every column.
df.loc[df.duplicated()]

Unnamed: 0,Duration,Date,Pulse,Maxpulse,Calories
12,60,2020-12-12,100,120,250.7


# Drop the duplicates and fix the missing index afterward.


In [14]:
# Remove repeated rows, keeping the first occurrence of each.
df = df.drop_duplicates(keep='first')

In [15]:
# Inspect the de-duplicated frame; row labels are unchanged, so the dropped row's label is now missing.
df

Unnamed: 0,Duration,Date,Pulse,Maxpulse,Calories
0,60,2020-12-01,110,130,409.1
1,60,2020-12-02,117,145,479.0
2,60,2020-12-03,103,135,340.0
3,45,2020-12-04,109,175,282.4
4,45,2020-12-05,117,148,406.0
5,60,2020-12-06,102,127,300.0
6,60,2020-12-07,110,136,374.0
7,450,2020-12-08,104,134,253.3
8,30,2020-12-09,109,133,195.1
9,60,2020-12-10,98,124,269.0


In [16]:
# Renumber the rows from 0; drop=True discards the old labels instead of
# keeping them as an extra column.
df = df.reset_index(drop=True)
df

Unnamed: 0,Duration,Date,Pulse,Maxpulse,Calories
0,60,2020-12-01,110,130,409.1
1,60,2020-12-02,117,145,479.0
2,60,2020-12-03,103,135,340.0
3,45,2020-12-04,109,175,282.4
4,45,2020-12-05,117,148,406.0
5,60,2020-12-06,102,127,300.0
6,60,2020-12-07,110,136,374.0
7,450,2020-12-08,104,134,253.3
8,30,2020-12-09,109,133,195.1
9,60,2020-12-10,98,124,269.0


# There is an anomalous value in the data. Replace it.

In [17]:
# Cap every Duration above 60 at 60 (the recorded data contains a 450).
too_long = df['Duration'] > 60
df.loc[too_long, 'Duration'] = 60

print(df)

    Duration       Date  Pulse  Maxpulse  Calories
0         60 2020-12-01    110       130     409.1
1         60 2020-12-02    117       145     479.0
2         60 2020-12-03    103       135     340.0
3         45 2020-12-04    109       175     282.4
4         45 2020-12-05    117       148     406.0
5         60 2020-12-06    102       127     300.0
6         60 2020-12-07    110       136     374.0
7         60 2020-12-08    104       134     253.3
8         30 2020-12-09    109       133     195.1
9         60 2020-12-10     98       124     269.0
10        60 2020-12-11    103       147     329.3
11        60 2020-12-12    100       120     250.7
12        60 2020-12-13    106       128     345.3
13        60 2020-12-14    104       132     379.3
14        60 2020-12-15     98       123     275.0
15        60 2020-12-16     98       120     215.2
16        60 2020-12-17    100       120     300.0
17        45 2020-12-18     90       112       NaN
18        60 2020-12-19    103 

# Create a column that holds the difference between Maxpulse and Pulse


In [18]:
# Gap between the maximum pulse and the average pulse of each session.
df['Difference'] = df['Maxpulse'].sub(df['Pulse'])

In [19]:
# Check that the new Difference column was added.
df.head()

Unnamed: 0,Duration,Date,Pulse,Maxpulse,Calories,Difference
0,60,2020-12-01,110,130,409.1,20
1,60,2020-12-02,117,145,479.0,28
2,60,2020-12-03,103,135,340.0,32
3,45,2020-12-04,109,175,282.4,66
4,45,2020-12-05,117,148,406.0,31


# Find how many calories they burned per second.

In [20]:
# Calories burned per unit of Duration.
# NOTE(review): the unit of Duration is not stated in this notebook. If it is
# minutes, this column is calories per minute, not per second — confirm.
df['CaloriesPerSecond'] = df['Calories'].div(df['Duration'])
df

Unnamed: 0,Duration,Date,Pulse,Maxpulse,Calories,Difference,CaloriesPerSecond
0,60,2020-12-01,110,130,409.1,20,6.818333
1,60,2020-12-02,117,145,479.0,28,7.983333
2,60,2020-12-03,103,135,340.0,32,5.666667
3,45,2020-12-04,109,175,282.4,66,6.275556
4,45,2020-12-05,117,148,406.0,31,9.022222
5,60,2020-12-06,102,127,300.0,25,5.0
6,60,2020-12-07,110,136,374.0,26,6.233333
7,60,2020-12-08,104,134,253.3,30,4.221667
8,30,2020-12-09,109,133,195.1,24,6.503333
9,60,2020-12-10,98,124,269.0,26,4.483333


# Sort the data frame by calories.


In [21]:
# Order the sessions from fewest to most calories; the original frame is left untouched.
sorted_df = df.sort_values('Calories', ascending=True)
sorted_df

Unnamed: 0,Duration,Date,Pulse,Maxpulse,Calories,Difference,CaloriesPerSecond
8,30,2020-12-09,109,133,195.1,24,6.503333
15,60,2020-12-16,98,120,215.2,22,3.586667
26,60,2020-12-27,92,118,241.0,26,4.016667
30,60,2020-12-31,92,115,243.0,23,4.05
19,45,2020-12-20,97,125,243.0,28,5.4
23,45,2020-12-24,105,132,246.0,27,5.466667
25,60,2020-12-26,100,120,250.0,20,4.166667
11,60,2020-12-12,100,120,250.7,20,4.178333
7,60,2020-12-08,104,134,253.3,30,4.221667
9,60,2020-12-10,98,124,269.0,26,4.483333


# Retrieve subsets of the dataframe where calories are higher than 400, less than
200, and between 200 and 300.


In [22]:
# Sessions that burned strictly more than 400 calories.
above_400 = df['Calories'].gt(400)
subset_df = df.loc[above_400]
subset_df.head()

Unnamed: 0,Duration,Date,Pulse,Maxpulse,Calories,Difference,CaloriesPerSecond
0,60,2020-12-01,110,130,409.1,20,6.818333
1,60,2020-12-02,117,145,479.0,28,7.983333
4,45,2020-12-05,117,148,406.0,31,9.022222


In [23]:
# Sessions that burned strictly fewer than 200 calories.
below_200 = df['Calories'].lt(200)
lessthan_subset = df.loc[below_200]
lessthan_subset

Unnamed: 0,Duration,Date,Pulse,Maxpulse,Calories,Difference,CaloriesPerSecond
8,30,2020-12-09,109,133,195.1,24,6.503333


In [24]:
# Sessions with calories strictly between 200 and 300; both bounds are
# exclusive, so rows at exactly 200 or 300 are left out.
calories = df['Calories']
in_range = calories.gt(200) & calories.lt(300)
between_subset = df.loc[in_range]
between_subset

Unnamed: 0,Duration,Date,Pulse,Maxpulse,Calories,Difference,CaloriesPerSecond
3,45,2020-12-04,109,175,282.4,66,6.275556
7,60,2020-12-08,104,134,253.3,30,4.221667
9,60,2020-12-10,98,124,269.0,26,4.483333
11,60,2020-12-12,100,120,250.7,20,4.178333
14,60,2020-12-15,98,123,275.0,25,4.583333
15,60,2020-12-16,98,120,215.2,22,3.586667
19,45,2020-12-20,97,125,243.0,28,5.4
21,45,NaT,100,119,282.0,19,6.266667
23,45,2020-12-24,105,132,246.0,27,5.466667
25,60,2020-12-26,100,120,250.0,20,4.166667


# Find unique values in Duration


In [25]:
# Distinct Duration values, in order of first appearance.
uniquevalues = pd.unique(df['Duration'])
uniquevalues

array([60, 45, 30], dtype=int64)

# Count how many missing values there are in the dataframe.


In [27]:
# Number of missing entries in each column.
null_mask = df.isnull()
missing = null_mask.sum()
missing

Duration             0
Date                 1
Pulse                0
Maxpulse             0
Calories             2
Difference           0
CaloriesPerSecond    2
dtype: int64