In [134]:
import pandas as pd
import numpy as np
import os

pd.__version__

'2.2.2'

In [135]:
root_dir = os.path.dirname(os.getcwd())
data_dir = os.path.join(root_dir, 'data')
data_set_path = os.path.join(data_dir, 'health_monitor_data.csv')

In [136]:
"""Read File"""
health_monitor_data = pd.read_csv(data_set_path)
health_monitor_data.head()

Unnamed: 0,Duration,Date,Pulse,Maxpulse,Calories
0,60,'2020/12/01',110,130,409.1
1,60,'2020/12/02',117,145,479.0
2,60,'2020/12/03',103,135,340.0
3,45,'2020/12/04',109,175,282.4
4,45,'2020/12/05',117,148,406.0


In [137]:
health_monitor_data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 32 entries, 0 to 31
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Duration  32 non-null     int64  
 1   Date      31 non-null     object 
 2   Pulse     32 non-null     int64  
 3   Maxpulse  32 non-null     int64  
 4   Calories  30 non-null     float64
dtypes: float64(1), int64(3), object(1)
memory usage: 1.5+ KB


In [138]:
"""Handle Missing values
-Approach 1: Remove Rows
-Approach 2: Replace empty value with a value (Imputation) 
"""

'Handle Missing values\n-Approach 1: Remove Rows\n-Approach 2: Replace empty value with a value (Imputation) \n'

In [139]:
"""Approach 1"""
health_monitor_data.isnull().sum()

Duration    0
Date        1
Pulse       0
Maxpulse    0
Calories    2
dtype: int64

In [140]:
"""Identify Missing Values"""
health_monitor_data[health_monitor_data.isnull().any(axis=1)]

Unnamed: 0,Duration,Date,Pulse,Maxpulse,Calories
18,45,'2020/12/18',90,112,
22,45,,100,119,282.0
28,60,'2020/12/28',103,132,


In [141]:
"""Replace"""
health_monitor_data['Calories'].fillna(
    value = health_monitor_data['Calories'].mean(),
    inplace = True
)
health_monitor_data[health_monitor_data.isnull().any(axis=1)]

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  health_monitor_data['Calories'].fillna(


Unnamed: 0,Duration,Date,Pulse,Maxpulse,Calories
22,45,,100,119,282.0


In [142]:
"""Approach 2"""
health_monitor_data.dropna(inplace=True)

In [143]:
health_monitor_data.isnull().sum()

Duration    0
Date        0
Pulse       0
Maxpulse    0
Calories    0
dtype: int64

In [144]:
"""Remove Duplicates"""
health_monitor_data.duplicated()

0     False
1     False
2     False
3     False
4     False
5     False
6     False
7     False
8     False
9     False
10    False
11    False
12     True
13    False
14    False
15    False
16    False
17    False
18    False
19    False
20    False
21    False
23    False
24    False
25    False
26    False
27    False
28    False
29    False
30    False
31    False
dtype: bool

In [145]:
health_monitor_data[health_monitor_data.duplicated()]

Unnamed: 0,Duration,Date,Pulse,Maxpulse,Calories
12,60,'2020/12/12',100,120,250.7


In [146]:
health_monitor_data.drop_duplicates(inplace=True)

In [147]:
health_monitor_data[health_monitor_data.duplicated()]

Unnamed: 0,Duration,Date,Pulse,Maxpulse,Calories


In [148]:
"""Type Conversion"""
print(health_monitor_data['Date'].dtype)

object


In [149]:
health_monitor_data['Date'] = health_monitor_data['Date'].astype('datetime64[ns]')
print(health_monitor_data['Date'].dtype)

datetime64[ns]


In [150]:
"""Filtering Outliers"""

'Filtering Outliers'

In [151]:
Q1 = health_monitor_data['Pulse'].quantile(0.25)
Q2 = health_monitor_data['Pulse'].quantile(0.75)
IQR = Q2 - Q1
lower_bound = Q1 -1.5*IQR
upper_bound = Q2 +1.5*IQR
not_outliers = health_monitor_data['Pulse'].between(lower_bound, upper_bound)
print(lower_bound, upper_bound)
print(not_outliers)

88.75 118.75
0      True
1      True
2      True
3      True
4      True
5      True
6      True
7      True
8      True
9      True
10     True
11     True
13     True
14     True
15     True
16     True
17     True
18     True
19     True
20     True
21     True
23    False
24     True
25     True
26     True
27     True
28     True
29     True
30     True
31     True
Name: Pulse, dtype: bool


In [152]:
# health_monitor_data = health_monitor_data[~not_outliers]
# health_monitor_data.head()

In [153]:
health_monitor_data = health_monitor_data[not_outliers]
health_monitor_data

Unnamed: 0,Duration,Date,Pulse,Maxpulse,Calories
0,60,2020-12-01,110,130,409.1
1,60,2020-12-02,117,145,479.0
2,60,2020-12-03,103,135,340.0
3,45,2020-12-04,109,175,282.4
4,45,2020-12-05,117,148,406.0
5,60,2020-12-06,102,127,300.0
6,60,2020-12-07,110,136,374.0
7,450,2020-12-08,104,134,253.3
8,30,2020-12-09,109,133,195.1
9,60,2020-12-10,98,124,269.0


In [154]:
"""Data Processing -> Numeric Data"""
def standard_scaler(data):
    mean = data.mean()
    std= data.std()
    return (data - mean) / std

health_monitor_data['Pulse_Scaled'] = health_monitor_data['Pulse'].agg(standard_scaler)
health_monitor_data

Unnamed: 0,Duration,Date,Pulse,Maxpulse,Calories,Pulse_Scaled
0,60,2020-12-01,110,130,409.1,1.113308
1,60,2020-12-02,117,145,479.0,2.199855
2,60,2020-12-03,103,135,340.0,0.026762
3,45,2020-12-04,109,175,282.4,0.958088
4,45,2020-12-05,117,148,406.0,2.199855
5,60,2020-12-06,102,127,300.0,-0.128459
6,60,2020-12-07,110,136,374.0,1.113308
7,450,2020-12-08,104,134,253.3,0.181983
8,30,2020-12-09,109,133,195.1,0.958088
9,60,2020-12-10,98,124,269.0,-0.749342


In [155]:
root_dir = os.path.dirname(os.getcwd())
data_dir = os.path.join(root_dir, 'data')
data_set_path = os.path.join(data_dir, 'temperatures.csv')
temperatures = pd.read_csv(data_set_path)
temperatures['country'].value_counts()

country
China                                 2640
India                                 2310
Brazil                                 990
Pakistan                               495
Nigeria                                495
Egypt                                  495
Turkey                                 495
United States                          495
Russia                                 330
Indonesia                              330
Saudi Arabia                           330
Canada                                 330
South Africa                           330
Colombia                               330
Japan                                  330
Australia                              330
Somalia                                165
Mexico                                 165
Côte D'Ivoire                          165
Italy                                  165
Kenya                                  165
France                                 165
Burma                                  165
Phi

In [156]:
"""One Hot Encoder"""
# temperatures = pd.get_dummies(
#     temperatures,
#     columns=['country'],
#     sparse=True
# )
# temperatures

'One Hot Encoder'

In [162]:
"""Label Encoder"""