### Prediction des valeurs manquantes

In [1]:
import pandas as pd

# -----------------------------
# Load datasets
# -----------------------------
head_df = pd.read_csv('./new_datasets/head_dataset.csv', sep=',', decimal='.')
# head_df['Date'] = pd.to_datetime(head_df['Date'], dayfirst=True)

pms_df = pd.read_csv('./new_datasets/Cleaned_PMS_SV_station.csv', sep=',', decimal='.')
# pms_df['Date'] = pd.to_datetime(pms_df['Date'], dayfirst=True)

# -----------------------------
# Mapping PMS/Vanne stations to their pressure columns
# -----------------------------
stations = {
    'PMS1': '11-PIT-001',
    'VANNE': 'Presure_SV_Average',
    'PMS2': '13-PIT-001',
    'PMS3': '14-PIT-001',
    'PMS4': '15-PIT-001'
}

# -----------------------------
# Create dataset per station
# -----------------------------
for station, pressure_col in stations.items():
    df = pd.DataFrame()
    df['Date'] = pms_df['Date']  # keep Date from PMS dataset
    df[f'Pressure_{station}'] = pms_df[pressure_col]
    df[f'Flow_{station}'] = head_df['Flow_HS']  # same flow as head station
    df[f'Density_{station}'] = head_df['Density_HS_Average']  # same density as head station for now

    df.to_csv("./new_datasets/" + f'{station}_dataset.csv', index=False, sep=',', decimal='.')
    print(f"✅ {station} dataset created: {df.shape[0]} rows")

print("🎯 All PMS/Vanne datasets created successfully!")


✅ PMS1 dataset created: 8353 rows
✅ VANNE dataset created: 8353 rows
✅ PMS2 dataset created: 8353 rows
✅ PMS3 dataset created: 8353 rows
✅ PMS4 dataset created: 8353 rows
🎯 All PMS/Vanne datasets created successfully!


In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor

# Load Head station data
head_df = pd.read_csv('./new_datasets/head_dataset.csv')

# List of PMS/VANNE stations and their pressure columns
stations = {
    'PMS1': '11-PIT-001',
    'PMS2': '13-PIT-001',
    'PMS3': '14-PIT-001',
    'PMS4': '15-PIT-001',
    'VANNE': 'Presure_SV_Average'
}

for station, pressure_col in stations.items():
    # Load PMS/VANNE station data
    pms_df = pd.read_csv(f'./new_datasets/{station}_dataset.csv')
    # Merge with Head station on Date
    merged = pd.merge(pms_df, head_df, on='Date', suffixes=(f'_{station}', '_Head'))
    # Features: Pressure at PMS/VANNE + Flow and Density at Head
    X = merged[[f'Pressure_{station}', 'Flow_HS', 'Density_HS_Average']]
    # Targets: Flow and Density at PMS/VANNE
    y_flow = merged[f'Flow_{station}']
    y_density = merged[f'Density_{station}']
    # Train/test split
    X_train_f, X_test_f, y_train_f, y_test_f = train_test_split(X, y_flow, test_size=0.2, random_state=42)
    X_train_d, X_test_d, y_train_d, y_test_d = train_test_split(X, y_density, test_size=0.2, random_state=42)
    # Train models
    model_flow = RandomForestRegressor(n_estimators=100, random_state=42)
    model_flow.fit(X_train_f, y_train_f)
    model_density = RandomForestRegressor(n_estimators=100, random_state=42)
    model_density.fit(X_train_d, y_train_d)
    # Predict for all rows
    merged[f'Flow_{station}'] = model_flow.predict(X)
    merged[f'Density_{station}'] = model_density.predict(X)
    # Save only Date, Pressure, Flow_pred, Density_pred for the station
    output = merged[['Date', f'Pressure_{station}', f'Flow_{station}', f'Density_{station}']]
    output.to_csv(f'./new_dataset/updated_{station}_pred.csv', index=False)
    print(f'✅ {station} predictions saved (sans Head station).')
print('🎯 All PMS/VANNE stations processed with Head station data.')


✅ PMS1 predictions saved (sans Head station).
✅ PMS2 predictions saved (sans Head station).
✅ PMS3 predictions saved (sans Head station).
✅ PMS4 predictions saved (sans Head station).
✅ VANNE predictions saved (sans Head station).
🎯 All PMS/VANNE stations processed with Head station data.


In [3]:
import pandas as pd  
from functools import reduce

# ------------------------------------------------------------------------------------------------------------------
# Merge all the 7 updated datasets into one cleaned dataframe
# ------------------------------------------------------------------------------------------------------------------
stations = ['Head', 'PMS1', 'Vanne', 'PMS2', 'PMS3', 'PMS4', 'Terminal']

dfs = {}

# Load updated datasets
for station in stations:
    if station in ['Head', 'Terminal']:
        dfs[station] = pd.read_csv(f'./new_datasets/{station}_dataset.csv', sep=',', decimal='.')
    else:
        dfs[station] = pd.read_csv(f'./new_dataset/updated_{station}_pred.csv', sep=',', decimal='.')
# Merge all datasets on 'Date'
dfs_list = [dfs[station] for station in stations]
merged_df = reduce(lambda left, right: pd.merge(left, right, on='Date', how='outer'), dfs_list)

# Save merged dataset
merged_df.to_csv('./new_dataset/Merged_All_Stations_updated.csv', index=False, sep=',', decimal='.')
print(f"✅ Merged dataset created: {merged_df.shape[0]} rows, {merged_df.shape[1]} columns")


✅ Merged dataset created: 8353 rows, 22 columns
