### Importing necessary libraries

In [1]:
import pandas as pd

### Loading the dataset 'Weight.Log'

In [2]:
weight_df = pd.read_csv("Weight.csv")

### Check structure, missing values, duplicates and data types

In [3]:
weight_df.head()

Unnamed: 0,Customer_ID,Date,Day_of_Week,Date.1,Time,Weight_Kg,Weight_Pounds,Fat,BMI,Is_Manual_Report,Manual_Report,Log_Id
0,1503960000.0,5/2/2016 23:59:59,Monday,5/2/2016,23:59:59,52.6,115.96,22.0,22.65,True,1.0,1462234000000.0
1,1927972000.0,4/13/2016 1:08:52,Wednesday,4/13/2016,1:08:52,133.5,294.32,,47.54,False,2.0,1460510000000.0
2,2873213000.0,4/21/2016 23:59:59,Thursday,4/21/2016,23:59:59,56.7,125.0,,21.45,True,1.0,1461283000000.0
3,2873213000.0,5/12/2016 23:59:59,Thursday,5/12/2016,23:59:59,57.3,126.32,,21.69,True,1.0,1463098000000.0
4,4319704000.0,4/17/2016 23:59:59,Sunday,4/17/2016,23:59:59,72.4,159.61,25.0,27.45,True,1.0,1460938000000.0


In [4]:
weight_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 159 entries, 0 to 158
Data columns (total 12 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   Customer_ID       66 non-null     float64
 1   Date              66 non-null     object 
 2   Day_of_Week       66 non-null     object 
 3   Date.1            66 non-null     object 
 4   Time              66 non-null     object 
 5   Weight_Kg         67 non-null     object 
 6   Weight_Pounds     67 non-null     object 
 7   Fat               4 non-null      object 
 8   BMI               68 non-null     object 
 9   Is_Manual_Report  66 non-null     object 
 10  Manual_Report     66 non-null     float64
 11  Log_Id            66 non-null     float64
dtypes: float64(3), object(9)
memory usage: 15.0+ KB


In [5]:
weight_df.describe()

Unnamed: 0,Customer_ID,Manual_Report,Log_Id
count,66.0,66.0,66.0
mean,7092696000.0,1.393939,1461763000000.0
std,1840903000.0,0.492366,786012600.0
min,1503960000.0,1.0,1460444000000.0
25%,6962181000.0,1.0,1461064000000.0
50%,6962181000.0,1.0,1461771000000.0
75%,8877689000.0,2.0,1462391000000.0
max,8877689000.0,2.0,1463098000000.0


In [6]:
# Inspect columns
weight_df.columns

Index(['Customer_ID', 'Date', 'Day_of_Week', 'Date.1', 'Time', 'Weight_Kg',
       'Weight_Pounds', 'Fat', 'BMI', 'Is_Manual_Report', 'Manual_Report',
       'Log_Id'],
      dtype='object')

In [7]:
# Column cleaning and standardization
weight_df.columns = (
    weight_df.columns
    .str.strip()                   # removes invisible spaces
    .str.lower()                   # makes everything lowercase
    .str.replace(" ", "_")         # replaces spaces with underscores
    .str.replace(".", "_", regex=False)  # replaces literal periods with underscores
)    

# Verifying the changes
weight_df.columns

Index(['customer_id', 'date', 'day_of_week', 'date_1', 'time', 'weight_kg',
       'weight_pounds', 'fat', 'bmi', 'is_manual_report', 'manual_report',
       'log_id'],
      dtype='object')

### Converting 'date' column from object into datetime format

In [8]:
weight_df['date'] = pd.to_datetime(weight_df['date'])

#Verifying the changes
weight_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 159 entries, 0 to 158
Data columns (total 12 columns):
 #   Column            Non-Null Count  Dtype         
---  ------            --------------  -----         
 0   customer_id       66 non-null     float64       
 1   date              66 non-null     datetime64[ns]
 2   day_of_week       66 non-null     object        
 3   date_1            66 non-null     object        
 4   time              66 non-null     object        
 5   weight_kg         67 non-null     object        
 6   weight_pounds     67 non-null     object        
 7   fat               4 non-null      object        
 8   bmi               68 non-null     object        
 9   is_manual_report  66 non-null     object        
 10  manual_report     66 non-null     float64       
 11  log_id            66 non-null     float64       
dtypes: datetime64[ns](1), float64(3), object(8)
memory usage: 15.0+ KB


### Extracting date only column

In [9]:
weight_df['date'] = weight_df['date'].dt.date

In [10]:
# Verifying the changes
weight_df.columns

Index(['customer_id', 'date', 'day_of_week', 'date_1', 'time', 'weight_kg',
       'weight_pounds', 'fat', 'bmi', 'is_manual_report', 'manual_report',
       'log_id'],
      dtype='object')

### Converting Weight and BMI columns to Numeric before rounding 

In [11]:
weight_df['weight_kg'] = pd.to_numeric(weight_df['weight_kg'], errors='coerce')
weight_df['weight_pounds'] = pd.to_numeric(weight_df['weight_pounds'], errors='coerce')
weight_df['bmi'] = pd.to_numeric(weight_df['bmi'], errors='coerce')
weight_df['fat'] = pd.to_numeric(weight_df['fat'], errors='coerce')

# Verifying the changes
weight_df.dtypes

customer_id         float64
date                 object
day_of_week          object
date_1               object
time                 object
weight_kg           float64
weight_pounds       float64
fat                 float64
bmi                 float64
is_manual_report     object
manual_report       float64
log_id              float64
dtype: object

### Dropping redundant columns 

In [12]:
weight_df = weight_df.drop(columns=['date_1', 'time', 'manual_report', 'day_of_week'])    


In [13]:
# Verifying the changes
weight_df.columns

Index(['customer_id', 'date', 'weight_kg', 'weight_pounds', 'fat', 'bmi',
       'is_manual_report', 'log_id'],
      dtype='object')

### Handle missing values

In [14]:
# Since we do not have additional data to fill in missing values in Fat column, we will keep it as it is for now.
# Total numnber of missing values in each column
weight_df.isna().sum()

customer_id          93
date                 93
weight_kg            93
weight_pounds        93
fat                 157
bmi                  93
is_manual_report     93
log_id               93
dtype: int64

### Rounding off Weight and BMI values

In [15]:
weight_df['weight_kg'] = weight_df['weight_kg'].round(1)
weight_df['weight_pounds'] = weight_df['weight_pounds'].round(1)
weight_df['bmi'] = weight_df['bmi'].round(1)


# Verifying the changes
weight_df.head()

Unnamed: 0,customer_id,date,weight_kg,weight_pounds,fat,bmi,is_manual_report,log_id
0,1503960000.0,2016-05-02,52.6,116.0,22.0,22.6,True,1462234000000.0
1,1927972000.0,2016-04-13,133.5,294.3,,47.5,False,1460510000000.0
2,2873213000.0,2016-04-21,56.7,125.0,,21.4,True,1461283000000.0
3,2873213000.0,2016-05-12,57.3,126.3,,21.7,True,1463098000000.0
4,4319704000.0,2016-04-17,72.4,159.6,25.0,27.4,True,1460938000000.0


### Remove Duplicates

In [16]:
weight_df = weight_df.drop_duplicates()

### Final look at the cleaned dataset

In [17]:
weight_df.head()

Unnamed: 0,customer_id,date,weight_kg,weight_pounds,fat,bmi,is_manual_report,log_id
0,1503960000.0,2016-05-02,52.6,116.0,22.0,22.6,True,1462234000000.0
1,1927972000.0,2016-04-13,133.5,294.3,,47.5,False,1460510000000.0
2,2873213000.0,2016-04-21,56.7,125.0,,21.4,True,1461283000000.0
3,2873213000.0,2016-05-12,57.3,126.3,,21.7,True,1463098000000.0
4,4319704000.0,2016-04-17,72.4,159.6,25.0,27.4,True,1460938000000.0


### Saving the cleaned dataset

In [18]:
# Saving the cleaned dataset
weight_df.to_csv("Cleaned_Weight.csv", index=False)