# Raw results

In [None]:
from core.s3.s3_utils import S3Service
from core.s3.settings import S3Settings
from core.spark_utils import create_spark_session
from pyspark.sql.functions import col

# Build the S3 client and the Spark session from the project settings.
settings = S3Settings()

bucket = S3Service(settings)

spark_session = create_spark_session(
        settings.S3_KEY,
        settings.S3_SECRET
)


# Read one parquet part of the Spark-produced raw results for a single VIN.
# NOTE(review): the original comment said "read with mergeSchema enabled"; no such
# option is visible in this call — presumably handled inside the helper, confirm.
df_spark = bucket.read_parquet_df_spark(spark_session, "raw_results/spark_tesla-fleet-telemetry.parquet/vin=5YJSA7E52RF541858/part-00089-25860fa2-18e0-4817-94cb-8fd8ad12f12f.c000.snappy.parquet")
df_spark_vin = df_spark.toPandas()

# Keep rows whose date lies between 2025-04-24 and 2025-05-01 (bounds included).
date_mask = (
    (df_spark_vin['date'] >= '2025-04-24') & 
    (df_spark_vin['date'] <= '2025-05-01')
)

# Apply the filter
df_filtered = df_spark_vin[date_mask]

# Read the raw results produced by the non-Spark pipeline (all VINs).
df_no_spark = bucket.read_parquet_df("raw_results/tesla-fleet-telemetry.parquet")


# Same date range as above, restricted to the VIN read through Spark.
date_mask = (
    (df_no_spark['date'] >= '2025-04-24') & 
    (df_no_spark['date'] <= '2025-05-01') &
    (df_no_spark.vin == '5YJSA7E52RF541858')
)
# Apply the filter
df_filtered_no_spark = df_no_spark[date_mask]
df_filtered_no_spark

In [None]:
# Show the 100 most recent rows (by date) of the Spark-read frame with their charge index.
df_filtered[['date', 'in_charge_idx']].sort_values('date', ascending=True).tail(100)

In [None]:
# Compare the charge-index range and the shape of both filtered frames.
# Spark-read frame:
print(df_filtered.in_charge_idx.min())
print(df_filtered.in_charge_idx.max())
print(df_filtered.shape)

# Non-Spark frame:
print(df_filtered_no_spark.in_charge_idx.min())
print(df_filtered_no_spark.in_charge_idx.max())
print(df_filtered_no_spark.shape)

In [None]:
# Sum of soc_diff over the filtered non-Spark frame.
# NOTE: this value is neither printed nor assigned, so a notebook only displays
# the result of the last expression below.
df_filtered_no_spark[['date', 'soc_diff']]['soc_diff'].sum()
# Distribution of the level_1 values.
df_filtered_no_spark['level_1'].value_counts()

In [None]:
# Compare, for each level_N column, the sum of its strictly positive values
# in `left` and in `right`.
# NOTE(review): `left` and `right` are assigned in later cells, and those cells
# do not select level_1/level_2/level_3 — this cell relies on earlier notebook
# state; confirm which frames were intended.
print(left[left.level_1 >0].level_1.sum())
print(right[right.level_1 >0].level_1.sum())

print(left[left.level_2 >0].level_2.sum())
print(right[right.level_2 >0].level_2.sum())

print(left[left.level_3 >0].level_3.sum())
print(right[right.level_3 >0].level_3.sum())

In [None]:
# Compare the sum of the strictly positive charging_power values in `left` and `right`.
# NOTE(review): `left` and `right` are assigned in a later cell — this cell
# relies on earlier notebook state.
print(left[left.charging_power >0].charging_power.sum())
print(right[right.charging_power >0].charging_power.sum())

# Same comparison for level_2 and level_3 (also printed by the previous cell).
# NOTE(review): the cell that builds `left`/`right` with charging_power does not
# select level_2/level_3 — confirm these lines are still wanted.
print(left[left.level_2 >0].level_2.sum())
print(right[right.level_2 >0].level_2.sum())

print(left[left.level_3 >0].level_3.sum())
print(right[right.level_3 >0].level_3.sum())

In [None]:
# Charging-power columns of the Spark (`left`) and non-Spark (`right`) raw results.
left = df_filtered[['date', 'ac_charging_power','dc_charging_power','charging_power']]
right = df_filtered_no_spark[['date', 'ac_charging_power','dc_charging_power','charging_power']]

# Outer join on date so rows present on only one side remain visible;
# overlapping columns get the _spark / _no_spark suffixes.
left.merge(right, on=['date'], how='outer', suffixes=('_spark', '_no_spark'))

# Processed tss

In [None]:
# Read one parquet part of the Spark-processed time series for a single VIN.
# NOTE(review): the original comment said "read with mergeSchema enabled"; no such
# option is visible in this call — presumably handled inside the helper, confirm.
ptss_spark = bucket.read_parquet_df_spark(spark_session, "processed_ts/tesla-fleet-telemetry/time_series/processed_tss_spark.parquet/vin=5YJSA7E52RF541858/part-00000-0554e60c-8879-4176-b4b5-6568ff87865f.c000.snappy.parquet")
# Switch to the pandas-on-Spark API so the pandas-style filtering below works.
ptss_spark = ptss_spark.to_pandas_on_spark()

# Keep rows whose date lies between 2025-04-24 and 2025-05-08 (bounds included).
date_mask = (
    (ptss_spark['date'] >= '2025-04-24') & 
    (ptss_spark['date'] <= '2025-05-08')
)

# Apply the filter
ptss_filtered = ptss_spark[date_mask]


"""# Lire avec mergeSchema activé
ptss_no_spark = bucket.read_parquet_df("processed_ts/tesla-fleet-telemetry/time_series/processed_tss.parquet", filters=[("vin", "==", "5YJSA7E52RF541858")])

date_mask = (
    (ptss_no_spark['date'] >= '2025-4-24') & 
    (ptss_no_spark['date'] <= '2025-05-08') &
    (ptss_no_spark.vin == '5YJSA7E52RF541858')
)
# Appliquer le filtre
ptss_filtered_no_spark = ptss_no_spark[date_mask]
ptss_filtered_no_spark"""


In [None]:
# Compute soc_diff on the rows that have a soc value, then attach it back to the full frame.
# NOTE(review): in this file `ptss_filtered_no_spark` is only assigned inside a
# triple-quoted (disabled) block — this cell relies on earlier notebook state.
ptss_filtered_no_spark_socdiff = ptss_filtered_no_spark.dropna(subset=['soc']).copy()
# Per-VIN difference between consecutive non-null soc values.
ptss_filtered_no_spark_socdiff['soc_diff'] = ptss_filtered_no_spark_socdiff.groupby('vin', observed=True)['soc'].diff() 
# Left join on (soc, date, vin): rows without soc keep a missing soc_diff.
ptss_filtered_no_spark = ptss_filtered_no_spark.merge(ptss_filtered_no_spark_socdiff[["soc", "date", "vin", 'soc_diff']], 
                        on=["soc", "date", "vin"], how="left")

In [None]:
# Materialise the charge-related columns of the filtered pandas-on-Spark frame as
# a plain pandas DataFrame for display. `to_pandas()` is the spelling used for
# this same frame in the other cells of this notebook.
ptss_filtered[['date', 'in_charge_idx', 'soc_diff', 'in_charge', 'soc']].to_pandas()

In [None]:
import plotly.express as px
import plotly.graph_objects as go

# Columns needed to plot soc over time for the non-Spark frame.
df_plot = ptss_filtered_no_spark[['date', 'in_charge_idx', 'soc_diff', 'in_charge', 'soc']]

# Alternate two colours (black / gray here) between consecutive charge indices.
# The list is built one pair longer than needed so colors[i] is always valid.
unique_charges = sorted(df_plot['in_charge_idx'].unique())
colors = ['black', 'gray'] * (len(unique_charges) // 2 + 1)

# Create the interactive figure
fig = go.Figure()

# One scatter trace per charge index.
for i, charge_idx in enumerate(unique_charges):
    subset = df_plot[df_plot['in_charge_idx'] == charge_idx]
    fig.add_trace(go.Scatter(
        x=subset['date'],
        y=subset['soc'],
        mode='markers',
        name=f'Charge {charge_idx}',
        marker=dict(
            color=colors[i],
            size=8,
            opacity=0.7
        ),
        hovertemplate='<b>Date:</b> %{x}<br>' +
                     '<b>SOC:</b> %{y}<br>' +
                     '<b>Charge:</b> ' + str(charge_idx) + '<br>' +
                     '<extra></extra>'
    ))

# Customise the figure
# NOTE(review): the title mentions red/green but this cell uses black/gray.
fig.update_layout(
    title='Soc par date - Rouge/Vert alternés par session de charge (Zoomable)',
    xaxis_title='Date',
    yaxis_title='SOC',
    width=1200,
    height=600,
    hovermode='closest'
)

# Enable zoom and selection
fig.update_layout(
    dragmode='zoom',  # zoom is the default drag mode
    selectdirection='any'
)

fig.show()

In [None]:
import plotly.express as px
import plotly.graph_objects as go

# Columns needed to plot soc over time, converted from pandas-on-Spark to pandas.
df_plot = ptss_filtered[['date', 'in_charge_idx', 'soc_diff', 'in_charge', 'soc']].to_pandas()

# Alternate red / green between consecutive charge indices.
unique_charges = sorted(df_plot['in_charge_idx'].unique())


# The list is built one pair longer than needed so colors[i] is always valid.
colors = ['red', 'green'] * (len(unique_charges) // 2 + 1)

# Create the interactive figure
fig = go.Figure()

# One scatter trace per charge index.
for i, charge_idx in enumerate(unique_charges):
    subset = df_plot[df_plot['in_charge_idx'] == charge_idx]
    fig.add_trace(go.Scatter(
        x=subset['date'],
        y=subset['soc'],
        mode='markers',
        name=f'Charge {charge_idx}',
        marker=dict(
            color=colors[i],
            size=8,
            opacity=0.7
        ),
        hovertemplate='<b>Date:</b> %{x}<br>' +
                     '<b>SOC:</b> %{y}<br>' +
                     '<b>Charge:</b> ' + str(charge_idx) + '<br>' +
                     '<extra></extra>'
    ))

# Customise the figure
fig.update_layout(
    title='Soc par date - Rouge/Vert alternés par session de charge (Zoomable)',
    xaxis_title='Date',
    yaxis_title='SOC',
    width=1200,
    height=600,
    hovermode='closest'
)

# Enable zoom and selection
fig.update_layout(
    dragmode='zoom',  # zoom is the default drag mode
    selectdirection='any'
)

fig.show()

In [None]:
# List the columns available in the filtered processed time series.
ptss_filtered.columns

In [None]:
from pickletools import float8
import pandas as pd
import numpy as np



def label_phases_with_pause(df, threshold=0.4, max_pause_minutes=10):
    """Label every row "charging" or "discharging" from the trend of ``soc_diff``.

    Rows are sorted by ``date`` and scanned once. Consecutive non-null
    ``soc_diff`` values with the same sign are accumulated; as soon as the
    absolute accumulated value reaches ``threshold`` the whole run (from the
    row preceding its first value) is labelled. A sign change starts a new
    run. A gap of more than ``max_pause_minutes`` between two consecutive rows
    also starts a new run, seeded with +/-1 according to the sign of the
    following row's ``soc_diff``.

    Fixes compared with the previous version:
    * a pause landing on the last row no longer looks up the non-existent
      next row (this raised ``KeyError``); the row's own ``soc_diff`` is used;
    * a frame in which no run reaches ``threshold`` is returned with an
      all-missing ``phase`` column instead of raising ``IndexError``;
    * ``phase`` is created with object dtype so string labels are not written
      into a float column.

    Args:
        df: DataFrame with a ``date`` column (anything ``pd.to_datetime``
            accepts) and a numeric ``soc_diff`` column. Not modified.
        threshold: minimum absolute accumulated ``soc_diff`` for a run to be
            labelled.
        max_pause_minutes: gap, in minutes, above which the current run is
            restarted.

    Returns:
        A sorted, re-indexed copy of ``df`` with a ``phase`` column. Unlabelled
        rows inherit the previous label; leading unlabelled rows receive the
        first label found.
    """
    df = df.copy()
    df['date'] = pd.to_datetime(df['date'])
    df = df.sort_values('date').reset_index(drop=True)

    n_rows = len(df)
    df['phase'] = pd.Series(np.nan, index=df.index, dtype=object)
    cum_delta = 0
    start_idx = None
    current_trend = None
    current_phase = None
    in_phase = False

    for i in range(1, n_rows):
        diff = df.loc[i, 'soc_diff']

        if pd.isna(diff):
            continue

        time_gap = (df.loc[i, 'date'] - df.loc[i - 1, 'date']).total_seconds() / 60

        if time_gap > max_pause_minutes:
            # Restart the run after a pause. The seed follows the next row's
            # sign; on the last row there is no next row, so use this row.
            next_diff = df.loc[i + 1, 'soc_diff'] if i + 1 < n_rows else diff
            restart_trend = 1 if np.sign(next_diff) == 1 else -1
            cum_delta = restart_trend
            current_trend = restart_trend
            current_phase = None
            in_phase = False
            start_idx = i - 1

        direction = np.sign(diff)

        if current_trend is None or (direction != current_trend and diff != 0):
            # First movement, or a sign change: start accumulating a new run.
            current_trend = direction if diff != 0 else current_trend
            cum_delta = diff
            start_idx = i - 1
            current_phase = None
            in_phase = False
            continue

        cum_delta += diff

        if not in_phase and abs(cum_delta) >= threshold:
            current_phase = "charging" if current_trend > 0 else "discharging"
            in_phase = True
            # .loc slicing is inclusive: labels rows start_idx..i.
            df.loc[start_idx:i, 'phase'] = current_phase
        elif in_phase:
            df.loc[i, 'phase'] = current_phase

    df['phase'] = df['phase'].ffill()

    # After the forward fill only leading rows can still be missing.
    labelled = df['phase'].dropna()
    if not labelled.empty:
        df['phase'] = df['phase'].fillna(labelled.iloc[0])

    return df


# Convert the needed columns to pandas and label each row with its phase.
df_plot = ptss_filtered[['date', 'in_charge_idx', 'soc_diff', 'in_charge', 'soc', 'sec_time_diff']].to_pandas()
df_plot = label_phases_with_pause(df_plot)

df_plot = df_plot.sort_values('date').reset_index(drop=True)
df_plot['phase_change'] = (df_plot['phase'] != df_plot['phase'].shift(1))  # True where the phase changes
df_plot['phase_id'] = df_plot['phase_change'].cumsum()               # incremented at every change



import plotly.graph_objects as go

# Create the interactive figure, coloured by phase
fig = go.Figure()

# Colour of each phase
phase_colors = {
    'charging': 'green',
    'discharging': 'red', 
    'idle': 'gray'
}

# One scatter trace per phase, in its own colour
for phase in df_plot['phase'].unique():
    subset = df_plot[df_plot['phase'] == phase]
    fig.add_trace(go.Scatter(
        x=subset['date'],
        y=subset['soc'],
        mode='markers',
        name=f'{phase.capitalize()}',
        marker=dict(
            color=phase_colors[phase],
            size=8,
            opacity=0.7
        ),
        hovertemplate='<b>Date:</b> %{x}<br>' +
                     '<b>SOC:</b> %{y}<br>' +
                     '<b>Phase:</b> ' + phase + '<br>' +
                     '<b>Charge Index:</b> %{customdata}<br>' +
                     '<extra></extra>',
        customdata=subset['in_charge_idx']
    ))

# Add a black line showing phase_id on a secondary axis
fig.add_trace(go.Scatter(
    x=df_plot['date'],
    y=df_plot['phase_id'],
    mode='lines',
    name='Phase ID',
    line=dict(
        color='black',
        width=2,
        backoff=0.3
    ),
    yaxis='y2',  # use a second Y axis
    hovertemplate='<b>Date:</b> %{x}<br>' +
                 '<b>Phase ID:</b> %{y}<br>' +
                 '<extra></extra>'
))

# Customise the figure with its two Y axes
fig.update_layout(
    title='Soc par date - Coloré par phase avec Phase ID',
    xaxis_title='Date',
    yaxis_title='SOC',
    yaxis2=dict(
        title='Phase ID',
        overlaying='y',
        side='right',
        showgrid=False
    ),
    width=1200,
    height=600,
    hovermode='closest'
)

# Enable zoom and selection
fig.update_layout(
    dragmode='zoom',
    selectdirection='any'
)

fig.show()


In [None]:
from datetime import datetime
# Timestamp marking the start of the transform part (manual timing of the cell).
print('Transform part', datetime.now())

def label_phases_with_pause(df, threshold=0.5, max_pause_minutes=20):
    """Label every row "charging" or "discharging" and number the phases.

    Rows are sorted by ``date`` and scanned once. Consecutive non-null
    ``soc_diff`` values with the same sign are accumulated; as soon as the
    absolute accumulated value reaches ``threshold`` the whole run (from the
    row preceding its first value) is labelled. A sign change starts a new
    run. A gap of more than ``max_pause_minutes`` between two consecutive rows
    also starts a new run, whose trend is the sign of the current ``soc_diff``.
    Progress timestamps are printed at each stage.

    Fixes compared with the previous version:
    * the restart no longer reads row ``i + 1`` (``KeyError`` when the pause
      landed on the last row);
    * the restart seed was ``1 if ... else 1``; it is now +1 for an upward
      trend and -1 otherwise, matching the earlier definition of this function
      in the notebook, so a downward restart is not partly cancelled by a
      positive seed;
    * ``phase_id`` is computed after the leading missing labels are filled, so
      those rows no longer each receive their own id;
    * a frame in which no run reaches ``threshold`` is returned with an
      all-missing ``phase`` column instead of raising ``IndexError``;
    * ``phase`` is created with object dtype so string labels are not written
      into a float column.

    Args:
        df: DataFrame with a datetime ``date`` column and a numeric
            ``soc_diff`` column. Not modified.
        threshold: minimum absolute accumulated ``soc_diff`` for a run to be
            labelled.
        max_pause_minutes: gap, in minutes, above which the run is restarted.

    Returns:
        A sorted, re-indexed copy of ``df`` with ``phase`` and ``phase_id``
        (incremented each time ``phase`` changes) columns.
    """
    print('Sort', datetime.now())
    df = df.sort_values('date').reset_index(drop=True)

    df['phase'] = pd.Series(np.nan, index=df.index, dtype=object)
    cum_delta = 0
    start_idx = None
    current_trend = None
    current_phase = None
    in_phase = False

    print('Loop', datetime.now())
    for i in range(1, len(df)):
        diff = df.loc[i, 'soc_diff']

        if pd.isna(diff):
            continue

        time_gap = (df.loc[i, 'date'] - df.loc[i - 1, 'date']).total_seconds() / 60

        if time_gap > max_pause_minutes:
            # Restart the run after a pause, seeded in the trend's direction.
            current_trend = np.sign(diff) if diff != 0 else 0
            cum_delta = 1 if current_trend > 0 else -1
            current_phase = None
            in_phase = False
            start_idx = i - 1

        direction = np.sign(diff)

        if current_trend is None or (direction != current_trend and diff != 0):
            # First movement, or a sign change: start accumulating a new run.
            current_trend = direction if diff != 0 else current_trend
            cum_delta = diff
            start_idx = i - 1
            current_phase = None
            in_phase = False
            continue

        cum_delta += diff

        if not in_phase and abs(cum_delta) >= threshold:
            current_phase = "charging" if current_trend > 0 else "discharging"
            in_phase = True
            # .loc slicing is inclusive: labels rows start_idx..i.
            df.loc[start_idx:i, 'phase'] = current_phase
        elif in_phase:
            df.loc[i, 'phase'] = current_phase

    print('Ajsut cols', datetime.now())
    df['phase'] = df['phase'].ffill()

    # After the forward fill only leading rows can still be missing.
    labelled = df['phase'].dropna()
    if not labelled.empty:
        df['phase'] = df['phase'].fillna(labelled.iloc[0])

    # Number the phases once the labels are final.
    df['phase_id'] = (df['phase'] != df['phase'].shift(1)).cumsum()

    return df


"""def idle_status(df,threshold_charge:float=0.1, threshold_discharge:float=0.05, gap_mn:float=30):
    df_analysis = df.copy()
    df = df_analysis.sort_values('date').reset_index(drop=True)
    df['readable_date'] = df['date']
    df = df.set_index('readable_date')

    df['cumsum_diff'] = df['soc_diff'].rolling(
        window=f'{gap_mn}T', min_periods=1
    ).sum()

    df = df.reset_index(drop=True)
    
    # Parcourir chaque point
    for i in range(len(df)):
        current_phase = df.loc[i, 'phase']
        mn_since_phase = df.loc[i, 'cumsum_mn_since_phase']
        if current_phase == 'charging':
            current_cumsum = df.loc[i, 'cumsum_diff']
        else:
            current_cumsum = df.loc[i, 'cumsum_diff']
        # Si c'est une phase discharging et que le cumsum est au-dessus du seuil
        if mn_since_phase > gap_mn:
            if current_phase == 'discharging' and current_cumsum > - threshold_discharge:
                # Trouver l'index de début (30 minutes en arrière)
                current_time = df.loc[i, 'date']
                start_time = current_time - pd.Timedelta(minutes=gap_mn)
                
                # Trouver tous les points dans la fenêtre de 30 minutes précédentes
                mask = ((df['date'] >= start_time) & (df['date'] <= current_time) & (df['phase'] == 'discharging') )#& (df.cumsum_diff > threshold_charge))
                
                # Remplacer par 'idle'
                df.loc[mask, 'phase'] = 'idle'
                
            # Si c'est une phase charging et que le cumsum est en-dessous du seuil
            elif current_phase == 'charging' and current_cumsum < threshold_charge:
                # Trouver l'index de début (30 minutes en arrière)
                current_time = df.loc[i, 'date']
                start_time = current_time - pd.Timedelta(minutes=gap_mn)
                
                # Trouver tous les points dans la fenêtre de 30 minutes précédentes
                mask = ((df['date'] >= start_time) & (df['date'] <= current_time) & (df['phase'] == 'charging') )# & (df.cumsum_diff < threshold_charge))
                
                # Remplacer par 'idle'
                df.loc[mask, 'phase'] = 'idle'

    return df"""

def idle_status(df, threshold_charge: float = 0.1, threshold_discharge: float = 0.05, gap_mn: float = 30):
    """Relabel slow "charging" / "discharging" rows as "idle".

    For every row the sum of ``soc_diff`` over the trailing ``gap_mn``-minute
    time window is stored in ``cumsum_diff``. A row is relabelled "idle" when
    its phase has lasted more than ``gap_mn`` minutes
    (``cumsum_mn_since_phase``) and either
    * it is "discharging" and the window sum is above ``-threshold_discharge``, or
    * it is "charging" and the window sum is below ``threshold_charge``.

    Changes compared with the previous version: the window is passed as a
    ``pd.Timedelta`` instead of the deprecated ``'T'`` offset alias, and no
    temporary ``readable_date`` column is created (it used to replace, then
    drop, any column of that name already present in ``df``).

    Args:
        df: DataFrame with a datetime ``date`` column and ``soc_diff``,
            ``phase`` and ``cumsum_mn_since_phase`` columns. Not modified.
        threshold_charge: window sum below which a charging row is idle.
        threshold_discharge: magnitude of the (negative) window sum above
            which a discharging row is idle.
        gap_mn: window length and minimum phase duration, in minutes.

    Returns:
        A copy of ``df`` sorted by ``date`` and re-indexed, with the extra
        ``cumsum_diff`` column and the updated ``phase`` column.
    """
    # sort_values already returns a new frame, so the caller's data is untouched.
    df_analysis = df.sort_values('date').reset_index(drop=True)

    # Time-based rolling sum of soc_diff; values are assigned back by position
    # because the series shares the row order of df_analysis.
    window = pd.Timedelta(minutes=gap_mn)
    soc_by_time = df_analysis.set_index('date')['soc_diff']
    df_analysis['cumsum_diff'] = soc_by_time.rolling(window, min_periods=1).sum().to_numpy()

    is_discharge = df_analysis['phase'] == 'discharging'
    is_charge = df_analysis['phase'] == 'charging'
    enough_time = df_analysis['cumsum_mn_since_phase'] > gap_mn

    # Both masks are built before any relabelling so they see the original phases.
    idle_discharge_mask = is_discharge & enough_time & (df_analysis['cumsum_diff'] > -threshold_discharge)
    idle_charge_mask = is_charge & enough_time & (df_analysis['cumsum_diff'] < threshold_charge)

    df_analysis.loc[idle_discharge_mask | idle_charge_mask, 'phase'] = 'idle'

    return df_analysis





# Full pipeline with manual timing prints: convert to pandas, label the phases,
# compute the minutes elapsed inside each phase, then relabel idle periods.
print('TO pandas', datetime.now())
df_plot = ptss_filtered.to_pandas()
print('Label phase', datetime.now())
df_plot = label_phases_with_pause(df_plot)

df_plot = df_plot.sort_values('date').reset_index(drop=True) # keep rows in date order with a fresh index
# df_plot['cumsum_since_phase'] = df_plot.groupby('phase_id')['soc_diff'].cumsum()
print('Cumsum', datetime.now())
# Minutes elapsed since the start of each phase (sec_time_diff is divided by 60).
df_plot['cumsum_mn_since_phase'] = df_plot.groupby('phase_id')['sec_time_diff'].cumsum() / 60
print('Idle', datetime.now())
df_plot = idle_status(df_plot)

# Recompute the phase ids now that some rows may have been relabelled 'idle'.
df_plot['phase_id'] = (df_plot['phase'] != df_plot['phase'].shift(1)).cumsum()   


print('Plot part', datetime.now())


import plotly.graph_objects as go

# Create the interactive figure, coloured by phase
fig = go.Figure()

# Colour of each phase
phase_colors = {
    'charging': 'green',
    'discharging': 'red', 
    'idle': 'gray'
}

# One scatter trace per phase, in its own colour
for phase in df_plot['phase'].unique():
    subset = df_plot[df_plot['phase'] == phase]
    fig.add_trace(go.Scatter(
        x=subset['date'],
        y=subset['soc'],
        mode='markers',
        name=f'{phase.capitalize()}',
        marker=dict(
            color=phase_colors[phase],
            size=8,
            opacity=0.7
        ),
        hovertemplate='<b>Date:</b> %{x}<br>' +
                     '<b>SOC:</b> %{y}<br>' +
                     '<b>Phase:</b> ' + phase + '<br>' +
                     '<b>Charge Index:</b> %{customdata}<br>' +
                     '<extra></extra>',
        customdata=subset['in_charge_idx']
    ))

# Add a black line showing phase_id on a secondary axis
fig.add_trace(go.Scatter(
    x=df_plot['date'],
    y=df_plot['phase_id'],
    mode='lines',
    name='Phase ID',
    line=dict(
        color='black',
        width=2,
        backoff=0.3
    ),
    yaxis='y2',  # use a second Y axis
    hovertemplate='<b>Date:</b> %{x}<br>' +
                 '<b>Phase ID:</b> %{y}<br>' +
                 '<extra></extra>'
))

# Customise the figure with its two Y axes
fig.update_layout(
    title='Soc par date - Coloré par phase avec Phase ID',
    xaxis_title='Date',
    yaxis_title='SOC',
    yaxis2=dict(
        title='Phase ID',
        overlaying='y',
        side='right',
        showgrid=False
    ),
    width=1200,
    height=600,
    hovermode='closest'
)

# Enable zoom and selection
fig.update_layout(
    dragmode='zoom',
    selectdirection='any'
)

fig.show()

In [None]:
# Spark

In [None]:
# Inspect the phase-related columns of the rows that have a soc value on
# 2025-05-07 between 12:45 and 19:00.
# 'cumsum_since_phase' is not selected: the only line in this notebook that
# creates it is commented out, so selecting it raises KeyError. Add it back
# here if that line is re-enabled.
df_plot[['date', 'soc', 'soc_diff', 'cumsum_diff', 'cumsum_mn_since_phase', 'phase', 'phase_id']][(df_plot.soc.notna()) & (df_plot.date >= '2025-05-07 12:45:00') & (df_plot.date <= '2025-05-07 19:00:00')]

In [None]:
import pandas as pd

# Work on a date-sorted copy of the plotted data (adapt to your own frame).
df = df_plot.copy()
df['date'] = pd.to_datetime(df['date'])
df = df.sort_values('date').reset_index(drop=True)

# Report every pair of consecutive rows separated by more than this many minutes.
max_pause_minutes = 60

for i in range(1, len(df)):
    # Gap between row i-1 and row i, in minutes.
    delta_min = (df.loc[i, 'date'] - df.loc[i-1, 'date']).total_seconds() / 60
    if delta_min > max_pause_minutes:
        print(f"Pause détectée > {max_pause_minutes} minutes entre index {i-1} et {i}")


In [None]:
# Rows that have a soc value between 2025-04-28 09:00 and 2025-04-30 11:40.
df_plot[(df_plot.soc.notna()) & (df_plot.date >= '2025-04-28 09:00:00') & (df_plot.date <= '2025-04-30 11:40:00')]

In [None]:
# Columns of the non-Spark processed frame, then the number of rows per charge index.
# NOTE(review): `ptss_filtered_no_spark` is only assigned inside a triple-quoted
# (disabled) block in this file — this cell relies on earlier notebook state.
print(ptss_filtered_no_spark.columns)
ptss_filtered_no_spark.in_charge_idx.value_counts()

In [None]:
# Compute soc_diff on the rows that have a soc value, then attach it back to the full frame.
# NOTE(review): same statements as an earlier cell; running both merges soc_diff
# a second time into a frame that already has that column — confirm only one runs.
ptss_filtered_no_spark_socdiff = ptss_filtered_no_spark.dropna(subset=['soc']).copy()
# Per-VIN difference between consecutive non-null soc values.
ptss_filtered_no_spark_socdiff['soc_diff'] = ptss_filtered_no_spark_socdiff.groupby('vin', observed=True)['soc'].diff() 
# Left join on (soc, date, vin): rows without soc keep a missing soc_diff.
ptss_filtered_no_spark = ptss_filtered_no_spark.merge(ptss_filtered_no_spark_socdiff[["soc", "date", "vin", 'soc_diff']], 
                        on=["soc", "date", "vin"], how="left")

In [None]:
# Charge index and soc_diff from the Spark (`left`) and non-Spark (`right`) processed series.
left = ptss_filtered[['in_charge_idx', 'date', 'soc_diff']].to_pandas()
right = ptss_filtered_no_spark[['in_charge_idx', 'date', 'soc_diff']]

# Outer join on date; overlapping columns get the _spark / _no_spark suffixes.
test = left.merge(right, on=['date'], how='outer', suffixes=('_spark', '_no_spark'))

In [None]:
# Rows where the two soc_diff values differ, ignoring rows where both are missing.
test[(~((test.soc_diff_spark.isna()) & (test.soc_diff_no_spark.isna()))) & (test.soc_diff_spark != test.soc_diff_no_spark)]

In [None]:

# Disabled comparisons of the per-charge median AC / DC charging power:
# print(left.groupby('in_charge_idx').ac_charging_power.median().sum())
# print(right.groupby('in_charge_idx').ac_charging_power.median().sum())

# print(left.groupby('in_charge_idx').dc_charging_power.median().sum())
# print(right.groupby('in_charge_idx').dc_charging_power.median().sum())


# soc_diff summed per charge index, for each side.
print(left.groupby('in_charge_idx').soc_diff.sum())
print(right.groupby('in_charge_idx').soc_diff.sum())

# Total of the strictly positive soc_diff values, for each side.
print(left[left.soc_diff > 0].groupby('in_charge_idx').soc_diff.sum().sum())
print(right[right.soc_diff > 0].groupby('in_charge_idx').soc_diff.sum().sum())

In [None]:
# Rows of the Spark-processed series at one specific timestamp.
ptss_filtered[ptss_filtered.date == '2025-05-01 20:47:42']

In [None]:
# Left join of `left` and `right` on date, then rows whose AC charging power
# differs between the two sides (at least one side being non-null).
# NOTE(review): the closest preceding cell builds `left`/`right` without the
# ac_charging_power column; this cell appears to expect the raw-results
# `left`/`right` built earlier in the notebook — confirm the execution order.
test = left.merge(right, on=['date'], how='left', suffixes=('_spark', '_no_spark'))
test[(test.ac_charging_power_spark != test.ac_charging_power_no_spark) & ((test.ac_charging_power_spark.notna()) | (test.ac_charging_power_no_spark.notna()))]
#test[(test.dc_charging_power_spark != test.dc_charging_power_no_spark) & ((test.dc_charging_power_spark.notna()) | (test.dc_charging_power_no_spark.notna()))]

In [None]:
# Compare the charge-index range and the shape of both processed frames.
# Spark-processed frame:
print(ptss_filtered.in_charge_idx.min())
print(ptss_filtered.in_charge_idx.max())
print(ptss_filtered.shape)

# Non-Spark frame:
print(ptss_filtered_no_spark.in_charge_idx.min())
print(ptss_filtered_no_spark.in_charge_idx.max())
print(ptss_filtered_no_spark.shape)

# RTSS

In [None]:
from pyspark.sql.functions import col, lit

# Read one parquet part of the Spark raw time series for a single VIN.
tss_spark = bucket.read_parquet_df_spark(spark_session, "raw_ts/tesla-fleet-telemetry/time_series/spark_raw_tss.parquet/vin=5YJSA7E52RF541858/part-00000-7596fcef-b984-4438-8839-d7a16a9b11d7.c000.snappy.parquet")
# Add the VIN as a literal column, then keep readable_date between 2025-04-24 and 2025-05-01 (bounds included).
tss_spark = tss_spark.withColumn("vin", lit('5YJSA7E52RF541858')).filter(col("readable_date") >= '2025-04-24').filter(col("readable_date") <= '2025-05-01')
"""tss_spark = tss_spark.to_pandas_on_spark()

date_mask = (
    (tss_spark['readable_date'] >= '2025-05-01') & 
    (tss_spark['readable_date'] <= '2025-05-08')
)

# Appliquer le filtre
tss_filtered = tss_spark[date_mask]
tss_filtered

tss_no_spark = bucket.read_parquet_df("raw_ts/tesla-fleet-telemetry/time_series/raw_tss.parquet", filters=[("vin", "==", "5YJSA7E52RF541858")])

date_mask = (
    (tss_no_spark['readable_date'] >= '2025-05-01') & 
    (tss_no_spark['readable_date'] <= '2025-05-08') &
    (tss_no_spark.vin == '5YJSA7E52RF541858')
)
# Appliquer le filtre
tss_filtered_no_spark = tss_no_spark[date_mask]
tss_filtered_no_spark"""

In [None]:
# Collect the filtered Spark raw time series into a pandas DataFrame.
rtss = tss_spark.toPandas()

In [None]:
# Number of rows per DetailedChargeState value.
rtss.DetailedChargeState.value_counts()





In [None]:
# Raw rows at one specific timestamp.
df = rtss[rtss.readable_date == '2025-05-01 20:47:42']

# Columns that hold at least one non-null value among those rows.
non_null_columns = df.columns[df.notna().any()].tolist()
print("Colonnes avec au moins une valeur non nulle:")
print(non_null_columns)

# Display the frame restricted to those columns
df[non_null_columns]

In [None]:
from transform.processed_tss.config import *
from pyspark.sql import DataFrame as DF, Window
from pyspark.sql.types import DoubleType
from pyspark.sql import SparkSession
from scipy.integrate import cumulative_trapezoid
import pandas as pd
from core.console_utils import main_decorator
from core.constants import KJ_TO_KWH
from core.caching_utils import CachedETLSpark
from core.logging_utils import set_level_of_loggers_with_prefix
from core.spark_utils import (
    safe_astype_spark_with_error_handling,
    create_spark_session,
    timedelta_to_interval,
)
from transform.fleet_info.main import fleet_info
from transform.processed_tss.config import *
from transform.raw_tss.RawTss import RawTss
from core.s3_utils import S3_Bucket

from pyspark.sql.functions import (
    col,
    lag,
    lead,
    unix_timestamp,
    when,
    lit,
    last,
    first,
    expr,
    coalesce,
    sum as spark_sum,
    pandas_udf,
)

def normalize_units_to_metric(tss):
    """Return ``tss`` with its ``odometer`` column multiplied by 1.609.

    NOTE(review): the factor looks like a rounded miles-to-kilometres
    conversion — confirm the unit of the incoming column.
    """
    scaled_odometer = col("odometer") * 1.609
    return tss.withColumn("odometer", scaled_odometer)


def compute_date_vars(tss: DF) -> DF:
    """Add ``prev_date`` and ``sec_time_diff`` columns, computed per ``vin``.

    ``prev_date`` is the ``date`` of the previous row of the same ``vin`` when
    rows are ordered by ``date``. ``sec_time_diff`` is the difference between
    the unix timestamps of ``date`` and ``prev_date``, cast to double.
    """
    per_vin_by_date = Window.partitionBy("vin").orderBy("date")
    elapsed_seconds = unix_timestamp(col("date")) - unix_timestamp(col("prev_date"))

    with_previous = tss.withColumn("prev_date", lag(col("date")).over(per_vin_by_date))
    return with_previous.withColumn("sec_time_diff", elapsed_seconds.cast("double"))

def compute_charge_n_discharge_masks(
    tss: DF, in_charge_vals: list, in_discharge_vals: list
) -> DF:
    """Add the ``in_charge`` / ``in_discharge`` masks for 'tesla-fleet-telemetry'.

    The masks come from the ``charging_status`` column when the make is listed
    in ``CHARGE_MASK_WITH_CHARGING_STATUS_MAKES``, otherwise from the evolution
    of the soc when it is listed in ``CHARGE_MASK_WITH_SOC_DIFFS_MAKES``.
    Debug information is printed on the soc-based path.

    NOTE: a later definition with the same name in this file rebinds this name.

    Raises:
        ValueError: if the make is in neither list.
    """
    make = 'tesla-fleet-telemetry'

    if make in CHARGE_MASK_WITH_CHARGING_STATUS_MAKES:
        return charge_n_discharging_masks_from_charging_status(
            tss, in_charge_vals, in_discharge_vals
        )

    print(CHARGE_MASK_WITH_SOC_DIFFS_MAKES)
    if make not in CHARGE_MASK_WITH_SOC_DIFFS_MAKES:
        raise ValueError(MAKE_NOT_SUPPORTED_ERROR.format(make=make))

    print('Here1')
    return charge_n_discharging_masks_from_soc_diff(tss)

def charge_n_discharging_masks_from_soc_diff(tss):
    """Derive the ``in_charge`` / ``in_discharge`` masks from the soc evolution.

    Per ``vin`` and in ``date`` order: the soc is forward-filled
    (``soc_ffilled``), the sign of its row-to-row difference is stored in
    ``soc_diff``, and that sign is filled forward (``soc_diff_ffill``) and
    backward (``soc_diff_bfill``). A row is ``in_charge`` when both filled
    signs are positive and ``in_discharge`` when both are negative.
    Debug information is printed along the way.

    NOTE(review): ``abs`` is applied to a Column; it must resolve to a
    Column-aware function (possibly provided by a star import) — confirm.
    NOTE(review): an unchanged soc gives ``0 / 0`` for the sign; this
    presumably evaluates to null and is then covered by the fills — confirm.
    """
    per_vin = Window.partitionBy("vin")
    by_date = per_vin.orderBy("date")
    # Running frames: everything up to the current row, in either direction.
    up_to_now = per_vin.orderBy("date").rowsBetween(Window.unboundedPreceding, 0)
    from_now_on = per_vin.orderBy(col("date").desc()).rowsBetween(
        Window.unboundedPreceding, 0
    )

    # Forward-fill the soc.
    tss = tss.withColumn("soc_ffilled", last("soc", ignorenulls=True).over(up_to_now))
    print(tss.columns)
    print('soc_ffilled' in tss.columns)

    # Sign of the soc change; the first row of each vin (no previous value) gets 0.
    delta = col("soc_ffilled") - lag("soc_ffilled").over(by_date)
    sign = when(delta.isNull(), lit(0)).otherwise(delta / abs(delta))

    tss = (
        tss.withColumn("soc_diff", sign)
        .withColumn("soc_diff_ffill", last("soc_diff", ignorenulls=True).over(up_to_now))
        .withColumn("soc_diff_bfill", last("soc_diff", ignorenulls=True).over(from_now_on))
    )

    # Masks: both neighbouring signs must agree.
    rising = (col("soc_diff_ffill") > 0) & (col("soc_diff_bfill") > 0)
    falling = (col("soc_diff_ffill") < 0) & (col("soc_diff_bfill") < 0)
    return tss.withColumn("in_charge", rising).withColumn("in_discharge", falling)

def charge_n_discharging_masks_from_charging_status(
    tss: DF, in_charge_vals: list, in_discharge_vals: list
) -> DF:
    """Add ``in_charge`` / ``in_discharge`` from membership of ``charging_status``.

    ``in_charge`` is ``charging_status.isin(in_charge_vals)`` and
    ``in_discharge`` is ``charging_status.isin(in_discharge_vals)``.
    Asserts that the ``charging_status`` column is present.

    NOTE: a later definition with the same name in this file rebinds this name.
    """
    assert "charging_status" in tss.columns, NO_CHARGING_STATUS_COL_ERROR

    status = col("charging_status")
    with_charge = tss.withColumn("in_charge", status.isin(in_charge_vals))
    return with_charge.withColumn("in_discharge", status.isin(in_discharge_vals))

def trim_leading_n_trailing_soc_off_masks(tss: DF, masks: list[str]) -> DF:
    """Add a ``trimmed_<mask>`` column excluding each period's first and last soc.

    For every mask, rows are grouped by ``vin`` and ``<mask>_idx``. Within a
    group, ``trailing_soc`` is the first and ``leading_soc`` the last non-null
    soc among the rows where the mask is true. ``trimmed_<mask>`` is true when
    the mask is true and the row's soc differs from both.

    Fix: the window now spans the whole group explicitly. With only an
    ``orderBy`` the frame ends at the current row, so ``last`` returned the
    running value instead of the group's last one, and a masked row's soc
    always equalled its own ``leading_soc``.

    Args:
        tss: Spark DataFrame with ``vin``, ``date``, ``soc`` and, for every
            mask, the ``<mask>`` and ``<mask>_idx`` columns.
        masks: names of the boolean mask columns to trim.

    Returns:
        ``tss`` with one ``trimmed_<mask>`` column per mask. The helper columns
        ``trailing_soc`` / ``leading_soc`` are left in place and hold the
        values computed for the last mask processed.
    """
    for mask in masks:
        # soc where the mask is true, null elsewhere.
        tss = tss.withColumn("naned_soc", when(col(mask), col("soc")))

        # Whole group, ordered by date, so first/last see every row of the group.
        w = (
            Window.partitionBy("vin", col(f"{mask}_idx"))
            .orderBy("date")
            .rowsBetween(Window.unboundedPreceding, Window.unboundedFollowing)
        )

        trailing_soc = first("naned_soc", ignorenulls=True).over(w)
        leading_soc = last("naned_soc", ignorenulls=True).over(w)

        tss = (
            tss.withColumn("trailing_soc", trailing_soc)
            .withColumn("leading_soc", leading_soc)
            .withColumn(
                f"trimmed_{mask}",
                (col(mask))
                & (col("soc") != col("trailing_soc"))
                & (col("soc") != col("leading_soc")),
            )
            .drop("naned_soc")
        )

    return tss

def compute_idx_from_masks(tss, masks: list[str]):
    """Number the consecutive periods of each boolean mask, per ``vin``.

    For every mask a ``<mask>_idx`` column is added. Rows are ordered by
    ``date`` within each ``vin``; a new period starts when the mask differs
    from the previous row (or there is no previous row) and, when ``MAX_TD``
    is set, when ``sec_time_diff`` exceeds ``timedelta_to_interval(MAX_TD)``.
    The index is the running count of period starts.

    Args:
        tss (DataFrame): Spark DataFrame.
        masks (list): names of the boolean columns to index.

    Returns:
        DataFrame: ``tss`` with the added ``<mask>_idx`` columns.
    """
    per_vin_by_date = Window.partitionBy("vin").orderBy("date")

    for mask in masks:
        previous_value = lag(col(mask), 1).over(per_vin_by_date)
        starts_new_period = previous_value.isNull() | (previous_value != col(mask))

        if MAX_TD is not None:
            gap_too_long = col("sec_time_diff") > lit(timedelta_to_interval(MAX_TD))
            starts_new_period = starts_new_period | gap_too_long

        # 1 on the first row of a period, 0 elsewhere; its running sum is the index.
        start_flag = when(starts_new_period, lit(1)).otherwise(lit(0))
        tss = (
            tss.withColumn("new_period_start_mask", start_flag)
            .withColumn(
                f"{mask}_idx", spark_sum("new_period_start_mask").over(per_vin_by_date)
            )
            .drop("new_period_start_mask")
        )

    return tss

def compute_status_col(tss):
    """Add a ``status`` column derived from ``in_charge`` and the odometer.

    ``status`` is "charging" when ``in_charge`` is true. When ``in_charge`` is
    false it is "moving" if the odometer increased since the previous row of
    the same ``vin`` (ordered by ``date``), otherwise "idle_discharging". When
    ``in_charge`` is null it is "unknown".

    The earlier intermediate ``status`` expression, which was overwritten
    before being used, has been removed; the result is unchanged.
    """
    # Odometer change since the previous row of the same vin.
    w = Window.partitionBy("vin").orderBy("date")
    delta_odo = col("odometer") - lag("odometer").over(w)

    status = (
        when(col("in_charge") == True, lit("charging"))
        .when(
            col("in_charge") == False,
            when(delta_odo > 0, lit("moving")).otherwise(lit("idle_discharging")),
        )
        .otherwise(lit("unknown"))
    )

    return tss.withColumn("status", status)


def compute_cum_var(tss, var_col: str, cum_var_col: str):
    """Add ``cum_var_col``: the cumulative time integral of ``var_col`` per ``vin``.

    For each ``vin`` the rows are sorted by ``date`` and ``var_col`` (nulls
    replaced by 0) is integrated over time with the trapezoidal rule, then
    multiplied by ``KJ_TO_KWH``. The integral starts at 0 on the first row.

    Fixes compared with the previous version:
    * the output schema is built as a new ``StructType``; ``tss.schema.add``
      appended the field to the schema object held by ``tss`` itself;
    * the grouped map is applied with ``applyInPandas`` instead of a
      ``pandas_udf`` declared with ``functionType="grouped_map"`` (a string,
      where the documented values are ``PandasUDFType`` members) and
      ``groupBy().apply``.

    Args:
        tss: Spark DataFrame with ``vin`` and ``date`` columns.
        var_col: name of the column to integrate.
        cum_var_col: name of the column to add.

    Returns:
        ``tss`` unchanged when ``var_col`` is absent, otherwise a DataFrame
        with the extra double column ``cum_var_col``.
    """
    if var_col not in tss.columns:
        return tss

    # Local import: only DoubleType is imported at file level.
    from pyspark.sql.types import StructField, StructType

    schema = StructType(
        tss.schema.fields + [StructField(cum_var_col, DoubleType(), True)]
    )

    def integrate_trapezoid(df: pd.DataFrame) -> pd.DataFrame:
        """Integrate ``var_col`` over time for the rows of one ``vin``."""
        df = df.sort_values("date").copy()

        # assumes `date` is datetime64[ns] so the integer value is nanoseconds — TODO confirm
        x = df["date"].astype("int64") // 10**9
        y = df[var_col].fillna(0).astype("float64")

        cum = cumulative_trapezoid(y=y.values, x=x.values, initial=0) * KJ_TO_KWH

        # Make the series start at zero.
        cum = cum - cum[0]

        df[cum_var_col] = cum
        return df

    return tss.groupBy("vin").applyInPandas(integrate_trapezoid, schema=schema)

def compute_charge_n_discharge_vars(tss: DF) -> DF:
    """Add the charge / discharge masks, then the charge index columns.

    Calls ``compute_charge_n_discharge_masks`` with the configured
    charging-status values, then ``compute_charge_idx_bis`` on its result.
    """
    masked = compute_charge_n_discharge_masks(
        tss, IN_CHARGE_CHARGING_STATUS_VALS, IN_DISCHARGE_CHARGING_STATUS_VALS
    )
    return compute_charge_idx_bis(masked)

def compute_charge_n_discharge_masks(
    tss: DF, in_charge_vals: list, in_discharge_vals: list
) -> DF:
    """Add the ``in_charge`` / ``in_discharge`` masks from ``charging_status``.

    Only the charging-status strategy is handled here, for the
    'tesla-fleet-telemetry' make.

    Fix: when the make is not listed in
    ``CHARGE_MASK_WITH_CHARGING_STATUS_MAKES`` the function used to fall off
    the end and return ``None``, which only failed later, in the caller. It
    now raises immediately.

    Raises:
        ValueError: if 'tesla-fleet-telemetry' is not configured for
            charging-status based masks.
    """
    make = 'tesla-fleet-telemetry'
    if make not in CHARGE_MASK_WITH_CHARGING_STATUS_MAKES:
        raise ValueError(
            f"Make {make!r} is not configured for charging-status based charge masks."
        )
    return charge_n_discharging_masks_from_charging_status(
        tss, in_charge_vals, in_discharge_vals
    )

def charge_n_discharging_masks_from_charging_status(
    tss: DF, in_charge_vals: list, in_discharge_vals: list
) -> DF:
    """Add strictly boolean ``in_charge`` / ``in_discharge`` columns.

    Each mask is True when ``charging_status`` is one of the given values and
    False in every other case. Asserts that the ``charging_status`` column is
    present.
    """
    assert "charging_status" in tss.columns, NO_CHARGING_STATUS_COL_ERROR

    status = col("charging_status")

    def _is_one_of(accepted_vals):
        # True on membership, False otherwise (the when/otherwise never yields null).
        return when(status.isin(accepted_vals), lit(True)).otherwise(lit(False))

    with_charge = tss.withColumn("in_charge", _is_one_of(in_charge_vals))
    return with_charge.withColumn("in_discharge", _is_one_of(in_discharge_vals))

def compute_energy_added(tss: DF) -> DF:
    """Add ``charge_energy_added``, preferring the DC counter over the AC one.

    The DC value is used when it is non-null and strictly positive; otherwise
    the AC value is used.
    """
    dc_energy = col("dc_charge_energy_added")
    ac_energy = col("ac_charge_energy_added")
    has_dc_energy = dc_energy.isNotNull() & (dc_energy > 0)

    return tss.withColumn(
        "charge_energy_added",
        when(has_dc_energy, dc_energy).otherwise(ac_energy),
    )

def compute_charge_idx_bis(tss: DF) -> DF:
    """Assign a per-VIN running index (``in_charge_idx``) driven by SoC trend changes.

    The function first adds ``charge_energy_added`` via ``compute_energy_added``.
    On the rows with a non-null ``soc`` it then computes, per ``vin`` ordered by
    ``date``, the SoC difference with the previous row, its sign (``trend``) and
    a ``trend_change`` flag; the cumulative sum of that flag is ``in_charge_idx``.
    ``soc_diff`` and ``in_charge_idx`` are joined back onto the full frame, and
    ``odometer`` and ``in_charge_idx`` are forward-filled per ``vin``.

    Args:
        tss: Spark DataFrame with at least ``vin``, ``date``, ``soc``,
            ``odometer`` and the AC/DC ``*_charge_energy_added`` columns.

    Returns:
        ``tss`` with ``charge_energy_added``, ``soc_diff`` and
        ``in_charge_idx`` added and ``odometer`` forward-filled.

    NOTE(review): the join back uses (vin, date, soc) as keys; if those are not
    unique in ``tss`` the join would multiply rows - confirm uniqueness upstream.
    """

    tss = compute_energy_added(tss)

    # 1. Keep only the rows where soc is not null
    tss_na = tss.filter(col("soc").isNotNull())

    # 2. Window over each VIN, ordered by date
    vin_window = Window.partitionBy("vin").orderBy("date")

    # 3. Differences, trend sign, lagged trends and time gaps.
    #    trend_change is 1 when the current and previous trends agree, the
    #    previous trend differs from the one before it, and the two trends
    #    before that agree with each other - or when more than 60 minutes
    #    separate the row from the previous one. Otherwise it is 0.
    tss_na = (
        tss_na.withColumn("soc_diff", col("soc") - lag("soc", 1).over(vin_window))
        .withColumn(
            "trend",
            when(col("soc_diff") > 0, lit(1))
            .when(col("soc_diff") < 0, lit(-1))
            .otherwise(lit(0)),
        )
        .withColumn("prev_trend", lag("trend", 1).over(vin_window))
        .withColumn("prev_prev_trend", lag("trend", 2).over(vin_window))
        .withColumn("prev_prev_prev_trend", lag("trend", 3).over(vin_window))
        .withColumn("prev_date", lag("date", 1).over(vin_window))
        .withColumn(
            "time_diff_min",
            (unix_timestamp(col("date")) - unix_timestamp(col("prev_date"))) / 60,
        )
        .withColumn("time_gap", col("time_diff_min") > 60)
        .withColumn(
            "trend_change",
            when(
                (
                    (col("trend") == col("prev_trend"))
                    & (col("prev_trend") != col("prev_prev_trend"))
                    & (col("prev_prev_trend") == col("prev_prev_prev_trend"))
                )
                | col("time_gap"),
                lit(1),
            ).otherwise(lit(0)),
        )
    )

    # 4. Reset trend_change to 0 on rows whose date equals the previous row's
    #    date within the VIN window.
    #    NOTE(review): the original comment described this as "initialise the
    #    first rows to 0"; the condition shown here only matches repeated
    #    dates - confirm the intent.
    tss_na = tss_na.withColumn(
        "trend_change",
        when(col("date") == lag("date", 1).over(vin_window), lit(0)).otherwise(
            col("trend_change")
        ),
    )

    # 5. Cumulative sum (session index)
    tss_na = tss_na.withColumn(
        "in_charge_idx",
        spark_sum("trend_change").over(
            vin_window.rowsBetween(Window.unboundedPreceding, 0)
        ),
    )

    # 6. Join back onto the original DataFrame; rows that were filtered out
    #    in step 1 get null soc_diff / in_charge_idx from the left join.
    tss = tss.join(
        tss_na.select("vin", "date", "soc", "soc_diff", "in_charge_idx"),
        on=["vin", "date", "soc"],
        how="left",
    )

    # 7. Forward-fill `odometer` and `in_charge_idx` (no native forward-fill in
    #    Spark; emulated with last(..., ignoreNulls=true) over a running window)
    fill_window = (
        Window.partitionBy("vin")
        .orderBy("date")
        .rowsBetween(Window.unboundedPreceding, 0)
    )
    tss = tss.withColumn(
        "odometer",
        coalesce(col("odometer"), expr("last(odometer, true)").over(fill_window)),
    ).withColumn(
        "in_charge_idx",
        coalesce(
            col("in_charge_idx"),
            expr("last(in_charge_idx, true)").over(fill_window),
        ),
    )

    return tss

In [None]:
from transform.processed_tss.config import RENAME_COLS_DICT
from core.spark_utils import safe_astype_spark_with_error_handling
# NOTE: `min` / `max` below are the Spark aggregate functions; from this cell
# onward they shadow the Python builtins in the notebook namespace.
from pyspark.sql.functions import min, max, col

# Rebuild the processed time series from the raw Spark frame.
tss = tss_spark.withColumnsRenamed(RENAME_COLS_DICT)
tss = safe_astype_spark_with_error_handling(tss)
tss = normalize_units_to_metric(tss)
tss = tss.orderBy(["vin", "date"])
tss = compute_date_vars(tss)
tss = compute_charge_n_discharge_vars(tss)
tss = tss.join(spark_session.createDataFrame(fleet_info), "vin", "left")
# Sort by date within each VIN as well: the comparison cells below take
# row-to-row differences on `tss_pandas`, which is only meaningful when the
# rows are in chronological order. Sorting on "vin" alone does not guarantee
# any order among the rows of a given VIN.
tss = tss.sort(["vin", "date"], ascending=True)
print(tss.columns)
# Min and max of the session index
min_value = tss.agg(min(col("in_charge_idx"))).collect()
max_value = tss.agg(max(col("in_charge_idx"))).collect()

tss_pandas = tss.toPandas()

print(f"Min: {min_value}")
print(f"Max: {max_value}")

In [None]:
# Compare the row-to-row increments of `in_charge_idx` between the reference
# (non-Spark) result and the Spark pipeline output.
# .copy(): a new column is assigned on each selection just below; copying
# avoids writing into a slice of the source frames (SettingWithCopyWarning).
left = ptss_filtered_no_spark[['date', 'in_charge_idx', 'soc', 'sec_time_diff']].copy()
right = tss_pandas[['date', 'in_charge_idx', 'soc', 'soc_diff', 'sec_time_diff']].copy()

# Difference with the previous row (NaN on the first row of each frame).
left['in_charge_idx_minus_ind_charge_idx_lag'] = left['in_charge_idx'].diff()
right['in_charge_idx_minus_ind_charge_idx_lag'] = right['in_charge_idx'].diff()

merged = pd.merge(left, right, on=['date'], how='left', suffixes=('_left', '_right'))

# Rows where the two increments differ. NaN never compares equal, so rows with
# a NaN on either side (first row, dates missing from `right`) are listed too.
merged[merged.in_charge_idx_minus_ind_charge_idx_lag_left != merged.in_charge_idx_minus_ind_charge_idx_lag_right]

In [None]:
# Rows 600-629 (by position) of the Spark-pipeline output, for visual inspection.
tss_pandas[['date', 'in_charge_idx', 'soc', 'soc_diff', 'sec_time_diff']].iloc[600:630]

In [None]:
# Rows 600-629 (by position) of the reference (non-Spark) output, for visual inspection.
ptss_filtered_no_spark[['date', 'in_charge_idx', 'soc', 'sec_time_diff']].iloc[600:630]