In [1]:
import pandas as pd

In [3]:
# Load the sensor records from the CSV file, then summarise the frame
# (columns, non-null counts, dtypes and memory usage).
df1 = pd.read_csv(filepath_or_buffer='sensors_dataset_corrected.csv')
df1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 46782 entries, 0 to 46781
Data columns (total 4 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   part_id  46782 non-null  int64 
 1   ts_date  46782 non-null  int64 
 2   ts_time  46782 non-null  object
 3   room     46782 non-null  object
dtypes: int64(2), object(2)
memory usage: 1.4+ MB


## **Calculate the percentage of time each person spent in each room**


In [4]:
# Rooms included in the breakdown; rows whose `room` value is anything else
# are ignored.
relevant_rooms = ['Bedroom', 'Bathroom', 'Livingroom', 'Kitchen']
df1_filtered = df1[df1['room'].isin(relevant_rooms)]

# Count the records per (person, room). Every row of df1 counts once, so the
# "time" used below is really a number of records.
# NOTE(review): this equals time spent only if records are evenly spaced in
# time -- confirm against how the dataset was sampled.
# People with no records in any relevant room do not appear in the result.
room_time = (
    df1_filtered.groupby(['part_id', 'room'])
    .size()
    .unstack(fill_value=0)
    # Guarantee one column per relevant room even when a room never occurs in
    # the data, so the output schema does not depend on the input. sorted()
    # keeps the alphabetical column order that groupby/unstack produces.
    .reindex(columns=sorted(relevant_rooms), fill_value=0)
)

# Total number of relevant-room records for each person (always >= 1, because
# only people with at least one such record form a group).
total_time = room_time.sum(axis=1)

# Share of each person's records that fall in each room, as a percentage;
# every row sums to 100.
percentage_time = room_time.div(total_time, axis=0) * 100

# Prefix the room columns with "Percentage_" and turn part_id back into a
# regular column.
percentage_time.columns = [f"Percentage_{room}" for room in percentage_time.columns]
percentage_time = percentage_time.reset_index()

# Print the new dataset
print("New Dataset:")
print(percentage_time)

New Dataset:
     part_id  Percentage_Bathroom  Percentage_Bedroom  Percentage_Kitchen  \
0       1001            14.018692           17.757009           29.906542   
1       1003            27.500000            0.000000           12.500000   
2       1005             0.000000            0.000000            0.000000   
3       1006             8.974359           21.794872           41.025641   
4       1007            10.769231           16.923077           46.153846   
..       ...                  ...                 ...                 ...   
285     3593             6.060606            4.545455           39.393939   
286     3594             0.000000           19.895288           39.790576   
287     3600             2.782819            4.416213           49.062311   
288     3601             0.000000           29.739777           58.364312   
289     3611            10.600707           34.982332           35.689046   

     Percentage_Livingroom  
0                38.317757  
1   

In [5]:
# Count the missing (NaN) entries in every column of the percentage table
missing_per_column = percentage_time.isna().sum()
print(missing_per_column)

part_id                  0
Percentage_Bathroom      0
Percentage_Bedroom       0
Percentage_Kitchen       0
Percentage_Livingroom    0
dtype: int64


In [6]:
# Coerce every column to a numeric type (values that cannot be parsed become
# NaN) and flag a column as numeric only when the coercion yields no NaN.
is_numeric = (
    percentage_time
    .apply(pd.to_numeric, errors='coerce')
    .notna()
    .all()
)

# Boolean Series, indexed by column name: True where the column holds only numeric data
print(is_numeric)


part_id                  True
Percentage_Bathroom      True
Percentage_Bedroom       True
Percentage_Kitchen       True
Percentage_Livingroom    True
dtype: bool


## **Save the percentage dataset**


In [7]:
# Write the per-person room percentages to CSV, leaving out the row index
percentage_time.to_csv(path_or_buf='percentage_in_rooms_dataset.csv', index=False)
