In [1]:
import pandas as pd
# Load the raw sensor export into a DataFrame and preview the first 5 rows.
df = pd.read_csv("data/tempdata-av8.csv")
df.head()

Unnamed: 0,ts_local,value_num,state,unit,name
0,2024-10-24 05:12:36,,11.32,°C,av8_temp
1,2024-10-24 05:13:36,,11.51,°C,av8_temp
2,2024-10-24 05:14:36,,11.55,°C,av8_temp
3,2024-10-24 05:15:37,,11.41,°C,av8_temp
4,2024-10-24 05:16:37,,11.44,°C,av8_temp


In [2]:
# Check the column dtypes and non-null counts
df.info()

<class 'pandas.DataFrame'>
RangeIndex: 405811 entries, 0 to 405810
Data columns (total 5 columns):
 #   Column     Non-Null Count   Dtype  
---  ------     --------------   -----  
 0   ts_local   405811 non-null  str    
 1   value_num  8755 non-null    float64
 2   state      405811 non-null  float64
 3   unit       405811 non-null  str    
 4   name       405811 non-null  str    
dtypes: float64(2), str(3)
memory usage: 15.5 MB


In [3]:
# Count missing values per column to spot columns that are mostly empty
df.isna().sum()

ts_local          0
value_num    397056
state             0
unit              0
name              0
dtype: int64

In [4]:
# Summary statistics of the numeric columns before any cleaning
df.describe()

Unnamed: 0,value_num,state
count,8755.0,405811.0
mean,-1.788963,10.88837
std,2.400782,7.178695
min,-19.48,-19.48
25%,-2.4,5.34
50%,-1.06,9.02
75%,-0.4,16.71
max,-0.01,382.35


In [5]:
# Inspect the first 10 readings stored in "state"
df["state"].iloc[:10]

0    11.32
1    11.51
2    11.55
3    11.41
4    11.44
5    11.42
6    11.43
7    11.46
8    11.42
9    11.51
Name: state, dtype: float64

In [6]:
# Convert "ts_local" from string to datetime (no explicit format is passed,
# so pandas infers it from the data).
# NOTE(review): the resulting dtype is timezone-naive; if these are local
# wall-clock times, DST changes could yield repeated or skipped hours — confirm.
df["ts_local"] = pd.to_datetime(df["ts_local"])

In [7]:
# Verify that "ts_local" now has a datetime dtype
df.info()

<class 'pandas.DataFrame'>
RangeIndex: 405811 entries, 0 to 405810
Data columns (total 5 columns):
 #   Column     Non-Null Count   Dtype         
---  ------     --------------   -----         
 0   ts_local   405811 non-null  datetime64[us]
 1   value_num  8755 non-null    float64       
 2   state      405811 non-null  float64       
 3   unit       405811 non-null  str           
 4   name       405811 non-null  str           
dtypes: datetime64[us](1), float64(2), str(2)
memory usage: 15.5 MB


In [8]:
# "value_num" is populated for only 8,755 of the 405,811 rows. Before
# discarding it, confirm that those values merely duplicate "state", so that
# dropping the column cannot lose information.
has_value_num = df["value_num"].notna()
assert (
    df.loc[has_value_num, "value_num"] == df.loc[has_value_num, "state"]
).all(), "value_num differs from state on some rows; it is not redundant"

# Drop the redundant, mostly-null column and verify the remaining schema
df = df.drop(columns=["value_num"])
df.info()

<class 'pandas.DataFrame'>
RangeIndex: 405811 entries, 0 to 405810
Data columns (total 4 columns):
 #   Column    Non-Null Count   Dtype         
---  ------    --------------   -----         
 0   ts_local  405811 non-null  datetime64[us]
 1   state     405811 non-null  float64       
 2   unit      405811 non-null  str           
 3   name      405811 non-null  str           
dtypes: datetime64[us](1), float64(1), str(2)
memory usage: 12.4 MB


In [9]:
# Confirm the dataset is homogeneous: one unit (°C) and one sensor (av8_temp)
tuple(df[col].unique() for col in ("unit", "name"))

(<StringArray>
 ['°C']
 Length: 1, dtype: str,
 <StringArray>
 ['av8_temp']
 Length: 1, dtype: str)

In [10]:
# Summary statistics after dropping the redundant "value_num" column
df.describe()

Unnamed: 0,ts_local,state
count,405811,405811.0
mean,2025-03-30 20:57:57.126684,10.88837
min,2024-10-24 05:12:36,-19.48
25%,2025-01-11 09:26:03,5.34
50%,2025-04-02 22:26:28,9.02
75%,2025-06-17 00:01:55,16.71
max,2025-08-31 05:01:22,382.35
std,,7.178695


In [11]:
# Order the rows chronologically and promote the timestamp to the index,
# giving the increasing datetime index that resampling and aggregation expect
df = (
    df
    .sort_values("ts_local")
    .set_index("ts_local")
)
df.head()

Unnamed: 0_level_0,state,unit,name
ts_local,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2024-10-24 05:12:36,11.32,°C,av8_temp
2024-10-24 05:13:36,11.51,°C,av8_temp
2024-10-24 05:14:36,11.55,°C,av8_temp
2024-10-24 05:15:37,11.41,°C,av8_temp
2024-10-24 05:16:37,11.44,°C,av8_temp


In [12]:
# Give the reading column, the sensor column and the index self-explanatory names
df = (
    df
    .rename(columns={"state": "temperature", "name": "sensor_name"})
    .rename_axis(index="date_time")
)

df.head()

Unnamed: 0_level_0,temperature,unit,sensor_name
date_time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2024-10-24 05:12:36,11.32,°C,av8_temp
2024-10-24 05:13:36,11.51,°C,av8_temp
2024-10-24 05:14:36,11.55,°C,av8_temp
2024-10-24 05:15:37,11.41,°C,av8_temp
2024-10-24 05:16:37,11.44,°C,av8_temp


In [13]:
# List the extreme outliers: readings above 100
df.loc[df["temperature"].gt(100)]

Unnamed: 0_level_0,temperature,unit,sensor_name
date_time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2025-01-11 10:10:33,382.35,°C,av8_temp
2025-01-11 15:22:54,382.35,°C,av8_temp


In [15]:
# Plausible temperature range in °C. Both bounds are exclusive: a reading is
# kept only if TEMP_MIN_C < temperature < TEMP_MAX_C.
TEMP_MIN_C = -40
TEMP_MAX_C = 50

# Remove extreme outliers. `between` is False for NaN, so missing readings
# (if any) are dropped as well, exactly as with explicit `>` / `<` comparisons.
in_plausible_range = df["temperature"].between(
    TEMP_MIN_C, TEMP_MAX_C, inclusive="neither"
)
df = df[in_plausible_range]

df.describe()

Unnamed: 0,temperature
count,405809.0
mean,10.886539
std,7.13119
min,-19.48
25%,5.34
50%,9.02
75%,16.71
max,37.32


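After restricting the dataset to a realistic temperature interval (strictly between -40 °C and 50 °C), the two spurious 382.35 °C readings are removed and the maximum temperature (37.32 °C) is now within expected bounds.
With no missing values remaining and the implausible readings removed, the dataset is considered clean and ready for time-based analysis.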