# Basic Data Analysis

In [42]:
# === Libraries ===
import numpy as np  # For numerical operations
import pandas as pd  # For data manipulation and analysis
import matplotlib.pyplot as plt  # For plotting and visualizations
import seaborn as sns  # For enhanced data visualizations
import plotly.express as px  # For interactive data visualizations
from sklearn.model_selection import KFold, train_test_split  # For splitting data and cross-validation
from sklearn.linear_model import LinearRegression  # For linear regression modeling
from sklearn.metrics import mean_squared_error, r2_score  # For model evaluation metrics
from scipy.optimize import curve_fit  # For curve fitting functions

In [43]:
# Load the aggregated measurements; the first CSV column becomes the row index.
df = pd.read_csv(
    '../aggregated_measurements_data.csv',
    index_col=0,
)

In [44]:
# Quick overview of the dataset: which columns it has and how many rows.
print("Column names:", df.columns, sep="\n")

print("\nTotal row count:", df.shape[0])

Column names:
Index(['device_id', 'co2', 'humidity', 'pm25', 'pressure', 'temperature',
       'rssi', 'snr', 'SF', 'frequency', 'f_count', 'p_count', 'toa',
       'distance', 'c_walls', 'w_walls', 'exp_pl', 'n_power', 'esp'],
      dtype='object')

Total row count: 346804


In [45]:
# List every distinct device identifier in ascending order.
print("Sorted distinct 'device_id' values:", sorted(df['device_id'].unique()), sep="\n")

Sorted distinct 'device_id' values:
['ED0', 'ED1', 'ED2', 'ED3', 'ED4', 'ED5']


## Presumed constants

It can probably be assumed that the following values are constant for each device (i.e. they do not change between measurements taken by the same device):
- `distance`
- `c_walls` - brick walls
- `w_walls` - wood walls

In [46]:
# For every device, collapse its 'distance' readings to the single value they
# all share, or to the marker 'VARIES' when more than one distinct value occurs.
device_by_distance = pd.DataFrame([
    {
        'device_id': device_id,
        'distance': distinct[0] if len(distinct) == 1 else 'VARIES',
    }
    for device_id, distinct in (
        (key, readings.unique())
        for key, readings in df.groupby('device_id')['distance']
    )
])

# Optional inspection:
# print(device_by_distance.sort_values(by='distance', ascending=True))

In [47]:
# For every device, collapse its 'c_walls' readings to the single value they
# all share, or to the marker 'VARIES' when more than one distinct value occurs.
device_by_brick_walls = pd.DataFrame([
    {
        'device_id': device_id,
        'brick_walls': distinct[0] if len(distinct) == 1 else 'VARIES',
    }
    for device_id, distinct in (
        (key, readings.unique())
        for key, readings in df.groupby('device_id')['c_walls']
    )
])

# Optional inspection:
# print(device_by_brick_walls.sort_values(by='brick_walls', ascending=True))

In [48]:
# For every device, collapse its 'w_walls' readings to the single value they
# all share, or to the marker 'VARIES' when more than one distinct value occurs.
device_by_wood_walls = pd.DataFrame([
    {
        'device_id': device_id,
        'wood_walls': distinct[0] if len(distinct) == 1 else 'VARIES',
    }
    for device_id, distinct in (
        (key, readings.unique())
        for key, readings in df.groupby('device_id')['w_walls']
    )
])

# Optional inspection:
# print(device_by_wood_walls.sort_values(by='wood_walls', ascending=True))

In [51]:
# Join the three per-device tables on 'device_id' into a single overview.
devices_grouped = (
    device_by_distance
    .merge(device_by_brick_walls, on='device_id', how='inner')
    .merge(device_by_wood_walls, on='device_id', how='inner')
)

print(devices_grouped)

# Sanity checks: no cell may hold the 'VARIES' marker or a missing value.
assert not devices_grouped.eq('VARIES').any().any(), "Error: 'VARIES' found in data column"
assert devices_grouped.notna().all().all(), "Error: NaN values found in any column"

  device_id  distance  brick_walls  wood_walls
0       ED0        10            0           0
1       ED1         8            1           0
2       ED2        25            0           2
3       ED3        18            1           2
4       ED4        39            0           5
5       ED5        43            2           2
