In [15]:
import pandas as pd
import numpy as np
from pathlib import Path

# Input locations: the per-reading sensor CSV and the per-trip metadata CSV.
SENSOR_CSV = r"C:\Users\Dell\Downloads\Iot_part4.csv"
META_CSV   = r"C:\Users\Dell\Downloads\12k_trip_part_4.csv"

# Output folder; created (including parents) when it does not exist yet.
OUT_DIR = Path(r"C:\Users\Dell\Downloads")
OUT_DIR.mkdir(parents=True, exist_ok=True)

# The merged table is written under the same stem in two formats.
OUT_MERGED_CSV = OUT_DIR.joinpath("merged_timeseries_per_reading.csv")
OUT_MERGED_PARQUET = OUT_DIR.joinpath("merged_timeseries_per_reading.parquet")

# Echo the resolved configuration so the run documents its own inputs/outputs.
for _label, _location in (
    ("Sensor file:", SENSOR_CSV),
    ("Metadata file:", META_CSV),
    ("Output (CSV):", OUT_MERGED_CSV),
    ("Output (Parquet):", OUT_MERGED_PARQUET),
):
    print(_label, _location)


Sensor file: C:\Users\Dell\Downloads\Iot_part4.csv
Metadata file: C:\Users\Dell\Downloads\12k_trip_part_4.csv
Output (CSV): C:\Users\Dell\Downloads\merged_timeseries_per_reading.csv
Output (Parquet): C:\Users\Dell\Downloads\merged_timeseries_per_reading.parquet


# Load sensor-level và loại bỏ 2 cột không cần thiết

In [16]:
# Load the sensor-level readings; parse_dates asks pandas to parse 'timestamp' while reading.
sensor = pd.read_csv(SENSOR_CSV, parse_dates=['timestamp'], low_memory=False)

print("Cột sensor ban đầu:", sensor.columns.tolist())

# Fail fast: trip_id is the join key for every later step, so verify it
# right after the read instead of after the column/timestamp processing.
if 'trip_id' not in sensor.columns:
    raise RuntimeError("Không tìm thấy cột 'trip_id' trong file sensor-level. Kiểm tra file input.")

# Drop speed_knots and failure_scenario when present. All present columns are
# removed in one non-inplace call, so re-running the cell is idempotent and the
# frame is rebuilt only once.
cols_to_drop = [col for col in ['speed_knots', 'failure_scenario'] if col in sensor.columns]
if cols_to_drop:
    sensor = sensor.drop(columns=cols_to_drop)
for col in cols_to_drop:
    print(f"Đã xóa cột {col} khỏi sensor-level")

# Normalise timestamps to tz-naive datetimes. errors='coerce' turns values that
# cannot be parsed into NaT instead of raising.
sensor['timestamp'] = pd.to_datetime(sensor['timestamp'], errors='coerce').dt.tz_localize(None)

# Surface rows whose timestamp is NaT (missing in the file or coerced above);
# without this the coercion is silent. Nothing is printed for clean data.
n_bad_timestamps = int(sensor['timestamp'].isna().sum())
if n_bad_timestamps:
    print(f"Cảnh báo: {n_bad_timestamps} bản ghi có timestamp thiếu hoặc không hợp lệ (NaT)")

print("Số bản ghi sensor:", len(sensor))
print("Ví dụ 5 dòng:")
display(sensor.head(5))

Cột sensor ban đầu: ['timestamp', 'temp', 'humid', 'co2', 'light', 'class', 'trip_id', 'fruit_cate', 'latitude', 'longitude', 'speed_knots', 'Route', 'temperature_C', 'humidity_%', 'dew_point_C', 'pressure_hPa', 'wind_speed_kmh', 'precipitation_mm', 'failure_scenario']
Đã xóa cột speed_knots khỏi sensor-level
Đã xóa cột failure_scenario khỏi sensor-level
Số bản ghi sensor: 8591130
Ví dụ 5 dòng:


Unnamed: 0,timestamp,temp,humid,co2,light,class,trip_id,fruit_cate,latitude,longitude,Route,temperature_C,humidity_%,dew_point_C,pressure_hPa,wind_speed_kmh,precipitation_mm
0,2024-03-28 06:04:53,22.888812,90.159451,320.601052,12.837124,Good,TRIP_00126,Tomato,16.056896,108.219371,VNDAD-KRPUS,30.2,68.0,23.7,1009.7,16.1,0.0
1,2024-03-28 06:14:53,22.985532,89.737415,321.008643,10.28586,Good,TRIP_00126,Tomato,16.056896,108.219371,VNDAD-KRPUS,30.2,68.0,23.7,1009.7,16.1,0.0
2,2024-03-28 06:24:53,22.825901,90.475185,321.356256,14.449335,Good,TRIP_00126,Tomato,16.056896,108.219371,VNDAD-KRPUS,30.2,68.0,23.7,1009.7,16.1,0.0
3,2024-03-28 06:34:53,23.087722,89.628797,321.642294,9.889969,Good,TRIP_00126,Tomato,16.056896,108.219371,VNDAD-KRPUS,30.2,68.0,23.7,1009.7,16.1,0.0
4,2024-03-28 06:44:53,23.110701,90.29886,322.181717,19.678464,Good,TRIP_00126,Tomato,16.056896,108.219371,VNDAD-KRPUS,30.2,68.0,23.7,1009.7,16.1,0.0


# Load trip metadata và chọn 5 cột (khóa trip_id + 4 cột metadata)

In [17]:
# Read the per-trip metadata table.
meta = pd.read_csv(META_CSV, low_memory=False)

# Show which columns the file actually provides.
print("Cột metadata:", meta.columns.tolist())

# Columns carried forward: the trip_id join key plus four metadata fields.
cols_keep = ['trip_id', 'trip_id_raw', 'expected_delay_min', 'weight', 'unit_quantity']

# Stop with an explicit error if any required column is absent.
missing_cols = [name for name in cols_keep if name not in meta.columns]
if len(missing_cols) > 0:
    raise ValueError(f"Thiếu các cột bắt buộc trong metadata: {missing_cols}")

# Independent copy restricted to the required columns, in the order listed above.
meta_sel = meta.loc[:, cols_keep].copy()

# Both identifier columns are stored as whitespace-trimmed strings.
for id_col in ('trip_id', 'trip_id_raw'):
    meta_sel[id_col] = meta_sel[id_col].astype(str).str.strip()

print("Số trip metadata:", len(meta_sel))
display(meta_sel.head(6))

Cột metadata: ['trip_id', 'carrier', 'start_time', 'end_time', 'duration_hours', 'origin', 'destination', 'product_id', 'unit_quantity', 'weight', 'service_level', 'priority', 'plant_code', 'expected_delay_min', 'target_temp_c', 'distance_km', 'trip_id_raw', 'fruit_cate', 'Route', 'tpt_days', 'start_month', 'start_year', 'year_month']
Số trip metadata: 3000


Unnamed: 0,trip_id,trip_id_raw,expected_delay_min,weight,unit_quantity
0,TRIP_69303,1447191734.7,8,1.690831,739
1,TRIP_73181,1447145225.7,7,18.351178,1464
2,TRIP_82568,1447306656.7,5,34.767863,635
3,TRIP_21965,1447237193.7,0,28.794509,690
4,TRIP_18974,1447403037.7,1,16.56,470
5,TRIP_115275,1447413740.7,6,30.72,440


# Kiểm tra sự khớp giữa trip_id trong sensor và metadata

In [18]:
# Put the sensor-side key in the same form as the metadata key: trimmed strings.
sensor['trip_id'] = sensor['trip_id'].astype(str).str.strip()

# Distinct trip ids on each side of the join.
sensor_trip_set = set(sensor['trip_id'].unique())
meta_trip_set   = set(meta_sel['trip_id'].unique())

# Up to ten (sorted) example ids that appear on one side only.
in_sensor_not_meta = sorted(sensor_trip_set.difference(meta_trip_set))[:10]
in_meta_not_sensor = sorted(meta_trip_set.difference(sensor_trip_set))[:10]

# Number of trip ids present in both tables.
matched = len(sensor_trip_set.intersection(meta_trip_set))

print("Số trip unique trong sensor:", len(sensor_trip_set))
print("Số trip unique trong metadata:", len(meta_trip_set))
print("Ví dụ trip trong sensor nhưng KHÔNG trong metadata (max 10):", in_sensor_not_meta)
print("Ví dụ trip trong metadata nhưng KHÔNG trong sensor (max 10):", in_meta_not_sensor)

# Share of sensor trips that have a metadata row.
print(f"Số trip khớp giữa 2 bảng: {matched} / {len(sensor_trip_set)} (sensor_unique)")


Số trip unique trong sensor: 3000
Số trip unique trong metadata: 3000
Ví dụ trip trong sensor nhưng KHÔNG trong metadata (max 10): []
Ví dụ trip trong metadata nhưng KHÔNG trong sensor (max 10): []
Số trip khớp giữa 2 bảng: 3000 / 3000 (sensor_unique)


# Thực hiện merge (left join) và sắp xếp theo trip + timestamp

In [19]:
# Left-join the trip metadata onto every sensor reading, then order the rows by
# trip and time. validate='m:1' makes pandas raise if trip_id is not unique on
# the metadata side (many sensor rows -> one metadata row).
merged = (
    sensor
    .merge(meta_sel, on='trip_id', how='left', validate='m:1')
    .sort_values(['trip_id', 'timestamp'])
    .reset_index(drop=True)
)

# Count rows whose expected_delay_min is NaN after the join.
n_missing_meta = merged['expected_delay_min'].isna().sum()

print("Kết quả merge shape:", merged.shape)
print("Số bản ghi có metadata missing (na in expected_delay_min):", n_missing_meta)
display(merged.head(8))

Kết quả merge shape: (8591130, 21)
Số bản ghi có metadata missing (na in expected_delay_min): 0


Unnamed: 0,timestamp,temp,humid,co2,light,class,trip_id,fruit_cate,latitude,longitude,...,temperature_C,humidity_%,dew_point_C,pressure_hPa,wind_speed_kmh,precipitation_mm,trip_id_raw,expected_delay_min,weight,unit_quantity
0,2024-03-28 06:04:53,22.888812,90.159451,320.601052,12.837124,Good,TRIP_00126,Tomato,16.056896,108.219371,...,30.2,68.0,23.7,1009.7,16.1,0.0,1447202432.7,1,308.36,1961
1,2024-03-28 06:14:53,22.985532,89.737415,321.008643,10.28586,Good,TRIP_00126,Tomato,16.056896,108.219371,...,30.2,68.0,23.7,1009.7,16.1,0.0,1447202432.7,1,308.36,1961
2,2024-03-28 06:24:53,22.825901,90.475185,321.356256,14.449335,Good,TRIP_00126,Tomato,16.056896,108.219371,...,30.2,68.0,23.7,1009.7,16.1,0.0,1447202432.7,1,308.36,1961
3,2024-03-28 06:34:53,23.087722,89.628797,321.642294,9.889969,Good,TRIP_00126,Tomato,16.056896,108.219371,...,30.2,68.0,23.7,1009.7,16.1,0.0,1447202432.7,1,308.36,1961
4,2024-03-28 06:44:53,23.110701,90.29886,322.181717,19.678464,Good,TRIP_00126,Tomato,16.056896,108.219371,...,30.2,68.0,23.7,1009.7,16.1,0.0,1447202432.7,1,308.36,1961
5,2024-03-28 06:54:53,23.228662,89.742282,322.776461,10.199238,Good,TRIP_00126,Tomato,16.056896,108.219371,...,30.2,68.0,23.7,1009.7,16.1,0.0,1447202432.7,1,308.36,1961
6,2024-03-28 07:04:53,23.287922,89.68222,323.260285,8.935343,Good,TRIP_00126,Tomato,15.927031,108.369283,...,30.7,65.0,23.4,1009.3,17.9,0.0,1447202432.7,1,308.36,1961
7,2024-03-28 07:14:53,23.247341,89.997236,323.79922,11.311649,Good,TRIP_00126,Tomato,15.927031,108.369283,...,30.7,65.0,23.4,1009.3,17.9,0.0,1447202432.7,1,308.36,1961


# Lưu kết quả (CSV & Parquet). Tùy chọn partition theo trip_id

In [20]:
# Write the merged table as CSV (no index column, 'utf-8-sig' encoding).
merged.to_csv(OUT_MERGED_CSV, encoding='utf-8-sig', index=False)
print("Đã lưu CSV:", OUT_MERGED_CSV)

# Parquet is best-effort: any failure is reported and the notebook continues,
# since the CSV written above is still available.
try:
    merged.to_parquet(OUT_MERGED_PARQUET, index=False)
    print("Đã lưu Parquet:", OUT_MERGED_PARQUET)
except Exception as parquet_error:
    print("Không lưu Parquet (thiếu pyarrow?):", parquet_error)
    print("Bạn vẫn có CSV để dùng.")

Đã lưu CSV: C:\Users\Dell\Downloads\merged_timeseries_per_reading.csv
Đã lưu Parquet: C:\Users\Dell\Downloads\merged_timeseries_per_reading.parquet


- (Tùy chọn) Tạo index MultiIndex (trip_id, timestamp) cho thao tác time-series thuận tiện

In [10]:
# NOTE(review): needs `merged` from the merge cell; run this cell after it.
# Index the merged table by (trip_id, timestamp) and sort the index.
index_levels = ['trip_id', 'timestamp']
ts_indexed = merged.set_index(index_levels).sort_index()

# Quick lookup example: take the first trip id that occurs in `merged` and pull
# all of its rows (they could then be resampled, e.g. to 15 minutes, if needed;
# no resampling is done here).
example_trip = merged['trip_id'].iloc[0]
print("Ví dụ lấy trip:", example_trip)

# Selecting on the first index level leaves the rows keyed by timestamp.
sample_trip = ts_indexed.loc[example_trip]
print("Sample trip index shape:", sample_trip.shape)
display(sample_trip.head(8))

Ví dụ lấy trip: TRIP_00002
Sample trip index shape: (985, 19)


Unnamed: 0_level_0,temp,humid,co2,light,class,fruit_cate,latitude,longitude,Route,temperature_C,humidity_%,dew_point_C,pressure_hPa,wind_speed_kmh,precipitation_mm,trip_id_raw,expected_delay_min,weight,unit_quantity
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
2024-01-21 19:02:58,21.944977,85.257717,310.470178,14.473861,Good,Pineapple,20.844783,106.76401,VNHPH-CNSHA,16.4,87.0,14.2,1017.2,24.4,0.0,1447158014.7,1,87.94,3188
2024-01-21 19:12:58,21.930127,84.937926,310.754231,11.989037,Good,Pineapple,20.844783,106.76401,VNHPH-CNSHA,16.4,87.0,14.2,1017.2,24.4,0.0,1447158014.7,1,87.94,3188
2024-01-21 19:22:58,22.082836,84.535408,311.071613,14.079832,Good,Pineapple,20.844783,106.76401,VNHPH-CNSHA,16.4,87.0,14.2,1017.2,24.4,0.0,1447158014.7,1,87.94,3188
2024-01-21 19:32:58,22.067485,84.786104,311.393199,13.468183,Good,Pineapple,20.844783,106.76401,VNHPH-CNSHA,16.4,87.0,14.2,1017.2,24.4,0.0,1447158014.7,1,87.94,3188
2024-01-21 19:42:58,22.109032,84.616012,311.723067,14.142465,Good,Pineapple,20.844783,106.76401,VNHPH-CNSHA,16.4,87.0,14.2,1017.2,24.4,0.0,1447158014.7,1,87.94,3188
2024-01-21 19:52:58,22.221654,84.947627,312.247677,13.474877,Good,Pineapple,20.844783,106.76401,VNHPH-CNSHA,16.4,87.0,14.2,1017.2,24.4,0.0,1447158014.7,1,87.94,3188
2024-01-21 20:02:58,21.926738,85.559712,312.470901,14.779193,Good,Pineapple,20.910585,106.850366,VNHPH-CNSHA,15.6875,85.875,13.3,1016.275,23.25,0.0125,1447158014.7,1,87.94,3188
2024-01-21 20:12:58,22.263376,85.186559,312.688357,13.613527,Good,Pineapple,20.910585,106.850366,VNHPH-CNSHA,15.6875,85.875,13.3,1016.275,23.25,0.0125,1447158014.7,1,87.94,3188
