### Importing necessary libraries

In [1]:
import pandas as pd
import numpy as np

### Loading the dataset "Daily.Activity"

The raw activity records are read from `Activity.csv` into a pandas DataFrame.

In [2]:
# Read the raw activity records from the CSV file into a DataFrame
df = pd.read_csv(filepath_or_buffer="Activity.csv")

### Check structure, missing values, duplicates and data types

In [3]:
# Preview the first five rows to get a feel for the columns and values
df.head(n=5)

Unnamed: 0,Customer_ID,Activity_Date,Day_of_Week,Total_Steps,Total_Distance,Tracker_Distance,Very_Active_Distance,Moderately_Active_Distance,Light_Active_Distance,Sedentary_Active_Distance,Very_Active_Minutes,Fairly_Active_Minutes,Lightly_Active_Minutes,Sedentary_Minutes,Calories
0,1503960366,4/12/2016,Tuesday,13162,8.5,8.5,1.88,0.55,6.06,0.0,25,13,328,728,1985
1,1503960366,4/13/2016,Wednesday,10735,6.97,6.97,1.57,0.69,4.71,0.0,21,19,217,776,1797
2,1503960366,4/14/2016,Thursday,10460,6.74,6.74,2.44,0.4,3.91,0.0,30,11,181,1218,1776
3,1503960366,4/15/2016,Friday,9762,6.28,6.28,2.14,1.26,2.83,0.0,29,34,209,726,1745
4,1503960366,4/16/2016,Saturday,12669,8.16,8.16,2.71,0.41,5.04,0.0,36,10,221,773,1863


In [4]:
# Print the frame summary: index range, column names, non-null counts and dtypes
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 940 entries, 0 to 939
Data columns (total 15 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   Customer_ID                 940 non-null    int64  
 1   Activity_Date               940 non-null    object 
 2   Day_of_Week                 940 non-null    object 
 3   Total_Steps                 940 non-null    int64  
 4   Total_Distance              940 non-null    float64
 5   Tracker_Distance            940 non-null    float64
 6   Very_Active_Distance        940 non-null    float64
 7   Moderately_Active_Distance  940 non-null    float64
 8   Light_Active_Distance       940 non-null    float64
 9   Sedentary_Active_Distance   940 non-null    float64
 10  Very_Active_Minutes         940 non-null    int64  
 11  Fairly_Active_Minutes       940 non-null    int64  
 12  Lightly_Active_Minutes      940 non-null    int64  
 13  Sedentary_Minutes           940 non

In [5]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns
df.describe()

Unnamed: 0,Customer_ID,Total_Steps,Total_Distance,Tracker_Distance,Very_Active_Distance,Moderately_Active_Distance,Light_Active_Distance,Sedentary_Active_Distance,Very_Active_Minutes,Fairly_Active_Minutes,Lightly_Active_Minutes,Sedentary_Minutes,Calories
count,940.0,940.0,940.0,940.0,940.0,940.0,940.0,940.0,940.0,940.0,940.0,940.0,940.0
mean,4855407000.0,7637.910638,5.489702,5.475351,1.502681,0.567543,3.340819,0.001606,21.164894,13.564894,192.812766,991.210638,2303.609574
std,2424805000.0,5087.150742,3.924606,3.907276,2.658941,0.88358,2.040655,0.007346,32.844803,19.987404,109.1747,301.267437,718.166862
min,1503960000.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,2320127000.0,3789.75,2.62,2.62,0.0,0.0,1.945,0.0,0.0,0.0,127.0,729.75,1828.5
50%,4445115000.0,7405.5,5.245,5.245,0.21,0.24,3.365,0.0,4.0,6.0,199.0,1057.5,2134.0
75%,6962181000.0,10727.0,7.7125,7.71,2.0525,0.8,4.7825,0.0,32.0,19.0,264.0,1229.5,2793.25
max,8877689000.0,36019.0,28.03,28.03,21.92,6.48,10.71,0.11,210.0,143.0,518.0,1440.0,4900.0


In [6]:
# List the column labels exactly as loaded, before any renaming
df.columns

Index(['Customer_ID', 'Activity_Date', 'Day_of_Week', 'Total_Steps',
       'Total_Distance', 'Tracker_Distance', 'Very_Active_Distance',
       'Moderately_Active_Distance', 'Light_Active_Distance',
       'Sedentary_Active_Distance', 'Very_Active_Minutes',
       'Fairly_Active_Minutes', 'Lightly_Active_Minutes', 'Sedentary_Minutes',
       'Calories'],
      dtype='object')

In [7]:
# Count the missing values in each column
df.isna().sum()

Customer_ID                   0
Activity_Date                 0
Day_of_Week                   0
Total_Steps                   0
Total_Distance                0
Tracker_Distance              0
Very_Active_Distance          0
Moderately_Active_Distance    0
Light_Active_Distance         0
Sedentary_Active_Distance     0
Very_Active_Minutes           0
Fairly_Active_Minutes         0
Lightly_Active_Minutes        0
Sedentary_Minutes             0
Calories                      0
dtype: int64

### Column cleaning and standardization

In [8]:
# Normalize the column labels step by step:
#   1. trim leading/trailing whitespace
#   2. lowercase everything
#   3. turn spaces into underscores
#   4. turn literal periods into underscores (regex=False keeps "." literal)
normalized_columns = df.columns.str.strip()
normalized_columns = normalized_columns.str.lower()
normalized_columns = normalized_columns.str.replace(" ", "_")
normalized_columns = normalized_columns.str.replace(".", "_", regex=False)
df.columns = normalized_columns

# Display the normalized labels to confirm the result
df.columns

Index(['customer_id', 'activity_date', 'day_of_week', 'total_steps',
       'total_distance', 'tracker_distance', 'very_active_distance',
       'moderately_active_distance', 'light_active_distance',
       'sedentary_active_distance', 'very_active_minutes',
       'fairly_active_minutes', 'lightly_active_minutes', 'sedentary_minutes',
       'calories'],
      dtype='object')

In [9]:
# converting 'Activity_Date' to datetime format
df['activity_date'] = pd.to_datetime(df['activity_date'])

# Verifying the changes
df.columns

# confirming the conversion
df.dtypes

customer_id                            int64
activity_date                 datetime64[ns]
day_of_week                           object
total_steps                            int64
total_distance                       float64
tracker_distance                     float64
very_active_distance                 float64
moderately_active_distance           float64
light_active_distance                float64
sedentary_active_distance            float64
very_active_minutes                    int64
fairly_active_minutes                  int64
lightly_active_minutes                 int64
sedentary_minutes                      int64
calories                               int64
dtype: object

In [10]:
# Shorten the 'activity_date' column label to 'date'
df = df.rename({'activity_date': 'date'}, axis='columns')

# Show the column labels to confirm the rename
df.columns

Index(['customer_id', 'date', 'day_of_week', 'total_steps', 'total_distance',
       'tracker_distance', 'very_active_distance',
       'moderately_active_distance', 'light_active_distance',
       'sedentary_active_distance', 'very_active_minutes',
       'fairly_active_minutes', 'lightly_active_minutes', 'sedentary_minutes',
       'calories'],
      dtype='object')

In [11]:
# Count rows that exactly repeat an earlier row across all columns
df.duplicated(keep='first').sum()

np.int64(0)

### Adding Useful Columns

In [12]:
# Derive the calendar month number from each record's date
df['month'] = df['date'].dt.month

# Total active minutes = very active + fairly active + lightly active minutes
df['total_active_minutes'] = (
    df['very_active_minutes']
    .add(df['fairly_active_minutes'])
    .add(df['lightly_active_minutes'])
)


In [13]:
# Steps per unit of distance, rounded to one decimal place (used to spot outliers).
# Zero distances are swapped for NaN first, so those rows yield NaN instead of
# the result of a division by zero.
nonzero_distance = df['total_distance'].replace(0, np.nan)
df['steps_to_distance_ratio'] = df['total_steps'].div(nonzero_distance).round(1)

In [14]:
# Write the cleaned frame to disk; index=False leaves the row index out of the file
cleaned_output_path = "Cleaned_Activity.csv"
df.to_csv(cleaned_output_path, index=False)
