# Preprocessing


## Imports

In [1]:
import pandas as pd
import posixpath
import data
import eda
import numpy as np
import matplotlib as plt
import plotly.express as px

DATA_PATH = "C:\\Users\\derar\\Documents\\Data Mining Project\\Code\\data"
OUTPUT_PATH = "C:\\Users\\derar\\Documents\\Data Mining Project\\Code\\outputs"

## Load Data

In [2]:
file_name = "sncb_data_challenge.csv"  
file_path = posixpath.join(DATA_PATH, file_name)
data_df = data.load_data_csv(file_path)
data_df = data.reformat_data(data_df)
data_df.drop(columns=["Column1"], inplace=True)

## Remove events < 4h before the incident and > 10min after the incident

In [3]:
def filter_events_out_of_interval(data_df, interval):
    def filter_events(row, interval, list_columns_indices):
        seconds_to_incident = row.iloc[3]
        too_low = len(seconds_to_incident[seconds_to_incident<interval[0]])
        too_high = len(seconds_to_incident[seconds_to_incident>interval[1]])
        for col in list_columns_indices:
            row.iloc[col] = row.iloc[col][too_low:][:-too_high]
        return row
        
    list_columns = ["vehicles_sequence", "events_sequence", "seconds_to_incident_sequence", "train_kph_sequence", "dj_ac_state_sequence", "dj_dc_state_sequence"]
    list_columns_indices = []
    for el in list_columns:
        list_columns_indices.append(data_df.columns.to_list().index(el))
    filtered_data_df = data_df.apply(lambda row: filter_events(row, interval, list_columns_indices), axis=1)
    return filtered_data_df

interval= [-3600*4, 360]
data_df = filter_events_out_of_interval(data_df, interval)

## Event filtering

In [51]:
def compute_events_freq(events_sequence):
    unique_events, events_count = np.unique(events_sequence, return_counts=True)
    total_events_count = np.sum(events_count)
    events_freq = events_count / total_events_count
    events_freq = np.column_stack((unique_events, events_freq))
    events_freq = {int(k): v for k, v in events_freq}
    return events_freq

def compute_events_to_remove(data_df, events_freq_all_classes, t):
    events_to_remove = {}
    events_freq_per_incident = {}
    for incident_type, events_sequences in list(data_df.groupby("incident_type")["events_sequence"]):
        events_sequence_per_incident = np.concatenate(events_sequences.to_list())
        events_freq_per_incident[incident_type] = compute_events_freq(events_sequence_per_incident)

    for incident_type, events_freq in events_freq_per_incident.items():
        events_to_remove[incident_type] = []
        for event_id, event_freq in events_freq.items():
            if event_freq/events_freq_all_classes[event_id] < t:
                events_to_remove[incident_type].append(event_id) 

    return events_to_remove

def remove_event_from_incidents(data_df, events_to_remove_per_incident, allowed_event=None):
    def remove_events(row, list_columns_indices, events_to_remove_per_incident, allowed_event=None):
        incident_type = row.iloc[-1]
        events_to_remove = set(events_to_remove_per_incident[incident_type]) 

        if allowed_event is not None:
            events_to_remove.discard(allowed_event)  # Faster than `remove`
    
        event_ids = row.iloc[2]  # Assuming this is the array of event IDs
        keep_mask = ~np.isin(event_ids, list(events_to_remove))  # Vectorized operation
    
        list_columns = np.array([row.iloc[col] for col in list_columns_indices])
        filtered_columns = list_columns[:, keep_mask]
        for i, col in enumerate(list_columns_indices):
            row.iloc[col] = filtered_columns[i]
        return row
    
    filtered_data_df = data_df.copy()
    list_columns = ["vehicles_sequence", "events_sequence", "seconds_to_incident_sequence", "train_kph_sequence", "dj_ac_state_sequence", "dj_dc_state_sequence"]
    list_columns_indices = []
    for el in list_columns:
        list_columns_indices.append(data_df.columns.to_list().index(el))
        
    filtered_data_df = filtered_data_df.apply(lambda row: remove_events(row, list_columns_indices, events_to_remove_per_incident, allowed_event), axis=1)
    return filtered_data_df
    

def filter_events(data_df, t, allowed_event=None):
    events_sequences_all_classes = np.concatenate(list(data_df["events_sequence"]))
    events_freq_all_classes = compute_events_freq(events_sequences_all_classes)
    events_to_remove_per_incident = compute_events_to_remove(data_df, events_freq_all_classes, t)
    filtered_data_df = remove_event_from_incidents(data_df, events_to_remove_per_incident, allowed_event).reset_index(drop=True)
    return filtered_data_df

    
filtered_data_df = filter_events(data_df, 2, 2)
filtered_data_df

Unnamed: 0,incident_id,vehicles_sequence,events_sequence,seconds_to_incident_sequence,approx_lat,approx_lon,train_kph_sequence,dj_ac_state_sequence,dj_dc_state_sequence,incident_type
0,4432881,"[609.0, 609.0, 609.0, 609.0, 609.0, 609.0, 609...","[1132.0, 2970.0, 4082.0, 4092.0, 2982.0, 3236....","[-5506.0, -3583.0, -3546.0, -3546.0, -3542.0, ...",50.876601,4.718143,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, ...",4
1,4432943,[],[],[],51.037435,4.431218,[],[],[],13
2,4432955,"[592.0, 592.0, 592.0, 592.0, 592.0, 592.0, 592...","[4114.0, 4168.0, 4168.0, 4114.0, 4168.0, 4168....","[-10932.0, -10932.0, -10913.0, -10472.0, -1047...",50.864083,4.162115,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[1.0, 1.0, 0.0, 1.0, 1.0, 0.0, 1.0, 1.0, 0.0, ...",14
3,4433021,"[576.0, 576.0, 576.0, 576.0, 576.0, 576.0, 576...","[358.0, 4056.0, 4054.0, 2740.0, 3528.0, 3506.0...","[-596.0, 418.0, 595.0, 699.0, 1122.0, 1151.0, ...",51.183220,4.276025,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]","[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0]",2
4,4433129,"[634.0, 634.0, 634.0, 634.0]","[4140.0, 4168.0, 4140.0, 4168.0]","[-139.0, -105.0, 944.0, 953.0]",50.818727,3.253601,"[0.0, 0.0, 0.0, 0.0]","[0.0, 0.0, 0.0, 0.0]","[0.0, 0.0, 0.0, 0.0]",14
...,...,...,...,...,...,...,...,...,...,...
1006,4611953,"[1016.0, 1016.0, 1016.0, 1016.0, 1016.0, 1016....","[4158.0, 4140.0, 4162.0, 4160.0, 4168.0, 4166....","[-13835.0, -13829.0, -13826.0, -13822.0, -1381...",50.159057,5.972059,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",14
1007,4611991,"[505.0, 505.0, 505.0, 505.0, 505.0, 505.0, 505...","[4054.0, 2736.0, 3240.0, 3532.0, 4056.0, 3540....","[-24.0, -23.0, 33.0, 33.0, 36.0, 41.0, 107.0, ...",50.767118,4.424321,"[0.2, 0.2, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, ...",2
1008,4612137,"[559.0, 559.0, 559.0, 559.0, 559.0, 559.0, 559...","[4048.0, 2736.0, 3506.0, 4050.0, 2740.0, 4122....","[-613.0, -612.0, 312.0, 317.0, 325.0, 350.0, 4...",51.164770,4.160534,"[0.9, 0.3, 0.0, 0.0, 0.0, 0.0, 0.6, 0.5, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, ...",2
1009,4612321,[],[],[],50.161550,4.976849,[],[],[],13


## Remove rows with less than x timestamps

In [52]:
def remove_short_rows(row, x):
    try:
        return len(row) > x
    except:
        return False

filtered_data_df = filtered_data_df[filtered_data_df["events_sequence"].apply(lambda row: remove_short_rows(row, x=1))].reset_index(drop=True)
filtered_data_df

Unnamed: 0,incident_id,vehicles_sequence,events_sequence,seconds_to_incident_sequence,approx_lat,approx_lon,train_kph_sequence,dj_ac_state_sequence,dj_dc_state_sequence,incident_type
0,4432881,"[609.0, 609.0, 609.0, 609.0, 609.0, 609.0, 609...","[1132.0, 2970.0, 4082.0, 4092.0, 2982.0, 3236....","[-5506.0, -3583.0, -3546.0, -3546.0, -3542.0, ...",50.876601,4.718143,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, ...",4
1,4432955,"[592.0, 592.0, 592.0, 592.0, 592.0, 592.0, 592...","[4114.0, 4168.0, 4168.0, 4114.0, 4168.0, 4168....","[-10932.0, -10932.0, -10913.0, -10472.0, -1047...",50.864083,4.162115,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[1.0, 1.0, 0.0, 1.0, 1.0, 0.0, 1.0, 1.0, 0.0, ...",14
2,4433021,"[576.0, 576.0, 576.0, 576.0, 576.0, 576.0, 576...","[358.0, 4056.0, 4054.0, 2740.0, 3528.0, 3506.0...","[-596.0, 418.0, 595.0, 699.0, 1122.0, 1151.0, ...",51.183220,4.276025,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]","[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0]",2
3,4433129,"[634.0, 634.0, 634.0, 634.0]","[4140.0, 4168.0, 4140.0, 4168.0]","[-139.0, -105.0, 944.0, 953.0]",50.818727,3.253601,"[0.0, 0.0, 0.0, 0.0]","[0.0, 0.0, 0.0, 0.0]","[0.0, 0.0, 0.0, 0.0]",14
4,4433267,"[1025.0, 1025.0, 1025.0, 1025.0, 1025.0, 1025....","[4028.0, 3620.0, 4028.0, 4076.0, 4028.0, 4076....","[-2328.0, -1970.0, -1970.0, -1970.0, -1498.0, ...",49.663705,5.698090,"[0.0, 22.8, 22.7, 22.8, 145.4, 145.1, 0.0, 26....","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, ...",11
...,...,...,...,...,...,...,...,...,...,...
640,4611895,"[539.0, 544.0, 544.0, 544.0, 544.0, 544.0, 544...","[2456.0, 1620.0, 1620.0, 1620.0, 1620.0, 1620....","[2101.0, -7670.0, -5337.0, -3986.0, -2678.0, -...",50.656246,4.421481,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, ...",9
641,4611931,"[1057.0, 1057.0, 1057.0, 1057.0, 1057.0, 1057....","[3238.0, 3512.0, 4048.0, 4050.0, 2740.0, 3528....","[-1463.0, -1463.0, -1463.0, -1456.0, -1174.0, ...",50.886315,4.400089,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.5, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, ...",2
642,4611953,"[1016.0, 1016.0, 1016.0, 1016.0, 1016.0, 1016....","[4158.0, 4140.0, 4162.0, 4160.0, 4168.0, 4166....","[-13835.0, -13829.0, -13826.0, -13822.0, -1381...",50.159057,5.972059,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",14
643,4611991,"[505.0, 505.0, 505.0, 505.0, 505.0, 505.0, 505...","[4054.0, 2736.0, 3240.0, 3532.0, 4056.0, 3540....","[-24.0, -23.0, 33.0, 33.0, 36.0, 41.0, 107.0, ...",50.767118,4.424321,"[0.2, 0.2, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, ...",2


## AC_DC failure attribute

In [56]:
columns_bool = ['dj_ac_state_sequence', 'dj_dc_state_sequence']
filtered_data_df[columns_bool] = filtered_data_df[columns_bool].map(lambda row: row.astype(int))
filtered_data_df

Unnamed: 0,incident_id,vehicles_sequence,events_sequence,seconds_to_incident_sequence,approx_lat,approx_lon,train_kph_sequence,dj_ac_state_sequence,dj_dc_state_sequence,incident_type
0,4432881,"[609.0, 609.0, 609.0, 609.0, 609.0, 609.0, 609...","[1132.0, 2970.0, 4082.0, 4092.0, 2982.0, 3236....","[-5506.0, -3583.0, -3546.0, -3546.0, -3542.0, ...",50.876601,4.718143,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",4
1,4432955,"[592.0, 592.0, 592.0, 592.0, 592.0, 592.0, 592...","[4114.0, 4168.0, 4168.0, 4114.0, 4168.0, 4168....","[-10932.0, -10932.0, -10913.0, -10472.0, -1047...",50.864083,4.162115,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 0, ...",14
2,4433021,"[576.0, 576.0, 576.0, 576.0, 576.0, 576.0, 576...","[358.0, 4056.0, 4054.0, 2740.0, 3528.0, 3506.0...","[-596.0, 418.0, 595.0, 699.0, 1122.0, 1151.0, ...",51.183220,4.276025,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]","[0, 0, 0, 0, 0, 0, 0, 0]","[1, 1, 1, 1, 1, 1, 1, 1]",2
3,4433129,"[634.0, 634.0, 634.0, 634.0]","[4140.0, 4168.0, 4140.0, 4168.0]","[-139.0, -105.0, 944.0, 953.0]",50.818727,3.253601,"[0.0, 0.0, 0.0, 0.0]","[0, 0, 0, 0]","[0, 0, 0, 0]",14
4,4433267,"[1025.0, 1025.0, 1025.0, 1025.0, 1025.0, 1025....","[4028.0, 3620.0, 4028.0, 4076.0, 4028.0, 4076....","[-2328.0, -1970.0, -1970.0, -1970.0, -1498.0, ...",49.663705,5.698090,"[0.0, 22.8, 22.7, 22.8, 145.4, 145.1, 0.0, 26....","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",11
...,...,...,...,...,...,...,...,...,...,...
640,4611895,"[539.0, 544.0, 544.0, 544.0, 544.0, 544.0, 544...","[2456.0, 1620.0, 1620.0, 1620.0, 1620.0, 1620....","[2101.0, -7670.0, -5337.0, -3986.0, -2678.0, -...",50.656246,4.421481,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0]","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1]",9
641,4611931,"[1057.0, 1057.0, 1057.0, 1057.0, 1057.0, 1057....","[3238.0, 3512.0, 4048.0, 4050.0, 2740.0, 3528....","[-1463.0, -1463.0, -1463.0, -1456.0, -1174.0, ...",50.886315,4.400089,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.5, 0.0, 0.0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",2
642,4611953,"[1016.0, 1016.0, 1016.0, 1016.0, 1016.0, 1016....","[4158.0, 4140.0, 4162.0, 4160.0, 4168.0, 4166....","[-13835.0, -13829.0, -13826.0, -13822.0, -1381...",50.159057,5.972059,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",14
643,4611991,"[505.0, 505.0, 505.0, 505.0, 505.0, 505.0, 505...","[4054.0, 2736.0, 3240.0, 3532.0, 4056.0, 3540....","[-24.0, -23.0, 33.0, 33.0, 36.0, 41.0, 107.0, ...",50.767118,4.424321,"[0.2, 0.2, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",2


In [None]:
def indicate_ac_dc_prob_timestamp(row):
    return (row.iloc[0] == row.iloc[1]).astype(int)

def indicate_ac_dc_prob(row):
    return (row.iloc[0] == row.iloc[1]).any().astype(int)

def indicate_ac_dc_num(row):
    return (row.iloc[0] == row.iloc[1]).sum()

def indicate_ac_dc_prob_events(row):
    return row.iloc[2][row.iloc[0] == row.iloc[1]]

filtered_data_df["ac_dc_prob_timestamp"] = filtered_data_df[["dj_ac_state_sequence", "dj_dc_state_sequence"]].apply(indicate_ac_dc_prob_timestamp, axis=1)
filtered_data_df["ac_dc_prob"] = filtered_data_df[["dj_ac_state_sequence", "dj_dc_state_sequence"]].apply(indicate_ac_dc_prob, axis=1)
filtered_data_df["ac_dc_prob_num"] = filtered_data_df[["dj_ac_state_sequence", "dj_dc_state_sequence"]].apply(indicate_ac_dc_num, axis=1)
filtered_data_df["ac_dc_prob_events"] = filtered_data_df[["dj_ac_state_sequence", "dj_dc_state_sequence", "events_sequence"]].apply(indicate_ac_dc_prob_events, axis=1)

filtered_data_df

## Save data to outputs

In [90]:
def np_to_list(row):
    for col in range(len(row)):
        if isinstance(row.iloc[col], np.ndarray):
            row.iloc[col] = list(row.iloc[col])
    return row        
    
filtered_data_df = filtered_data_df.apply(np_to_list, axis=1)
filtered_data_df

Unnamed: 0,incident_id,vehicles_sequence,events_sequence,seconds_to_incident_sequence,approx_lat,approx_lon,train_kph_sequence,dj_ac_state_sequence,dj_dc_state_sequence,incident_type,ac_dc_prob,ac_dc_prob_timestamp,ac_dc_prob_num,ac_dc_prob_events
0,4432881,"[609.0, 609.0, 609.0, 609.0, 609.0, 609.0, 609...","[1132.0, 2970.0, 4082.0, 4092.0, 2982.0, 3236....","[-5506.0, -3583.0, -3546.0, -3546.0, -3542.0, ...",50.876601,4.718143,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",4,1,"[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",5,"[1132.0, 2982.0, 2980.0, 2982.0, 2982.0]"
1,4432955,"[592.0, 592.0, 592.0, 592.0, 592.0, 592.0, 592...","[4114.0, 4168.0, 4168.0, 4114.0, 4168.0, 4168....","[-10932.0, -10932.0, -10913.0, -10472.0, -1047...",50.864083,4.162115,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 0, ...",14,1,"[0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, ...",83,"[4168.0, 4168.0, 4168.0, 2492.0, 2712.0, 4140...."
2,4433021,"[576.0, 576.0, 576.0, 576.0, 576.0, 576.0, 576...","[358.0, 4056.0, 4054.0, 2740.0, 3528.0, 3506.0...","[-596.0, 418.0, 595.0, 699.0, 1122.0, 1151.0, ...",51.183220,4.276025,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]","[0, 0, 0, 0, 0, 0, 0, 0]","[1, 1, 1, 1, 1, 1, 1, 1]",2,0,"[0, 0, 0, 0, 0, 0, 0, 0]",0,[]
3,4433129,"[634.0, 634.0, 634.0, 634.0]","[4140.0, 4168.0, 4140.0, 4168.0]","[-139.0, -105.0, 944.0, 953.0]",50.818727,3.253601,"[0.0, 0.0, 0.0, 0.0]","[0, 0, 0, 0]","[0, 0, 0, 0]",14,1,"[1, 1, 1, 1]",4,"[4140.0, 4168.0, 4140.0, 4168.0]"
4,4433267,"[1025.0, 1025.0, 1025.0, 1025.0, 1025.0, 1025....","[4028.0, 3620.0, 4028.0, 4076.0, 4028.0, 4076....","[-2328.0, -1970.0, -1970.0, -1970.0, -1498.0, ...",49.663705,5.698090,"[0.0, 22.8, 22.7, 22.8, 145.4, 145.1, 0.0, 26....","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",11,1,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",5,"[4162.0, 4160.0, 4028.0, 4162.0, 4160.0]"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
640,4611895,"[539.0, 544.0, 544.0, 544.0, 544.0, 544.0, 544...","[2456.0, 1620.0, 1620.0, 1620.0, 1620.0, 1620....","[2101.0, -7670.0, -5337.0, -3986.0, -2678.0, -...",50.656246,4.421481,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0]","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1]",9,0,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0]",0,[]
641,4611931,"[1057.0, 1057.0, 1057.0, 1057.0, 1057.0, 1057....","[3238.0, 3512.0, 4048.0, 4050.0, 2740.0, 3528....","[-1463.0, -1463.0, -1463.0, -1456.0, -1174.0, ...",50.886315,4.400089,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.5, 0.0, 0.0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",2,0,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",0,[]
642,4611953,"[1016.0, 1016.0, 1016.0, 1016.0, 1016.0, 1016....","[4158.0, 4140.0, 4162.0, 4160.0, 4168.0, 4166....","[-13835.0, -13829.0, -13826.0, -13822.0, -1381...",50.159057,5.972059,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",14,1,"[1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, ...",56,"[4158.0, 4140.0, 4162.0, 4160.0, 4168.0, 4112...."
643,4611991,"[505.0, 505.0, 505.0, 505.0, 505.0, 505.0, 505...","[4054.0, 2736.0, 3240.0, 3532.0, 4056.0, 3540....","[-24.0, -23.0, 33.0, 33.0, 36.0, 41.0, 107.0, ...",50.767118,4.424321,"[0.2, 0.2, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",2,0,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",0,[]


In [91]:
def save_data(data_df, filepath):
    data_df.to_csv(filepath, index=False)
    
filepath = posixpath.join(OUTPUT_PATH, "preprocessed_data.csv")
save_data(filtered_data_df, filepath)