In [6]:
import pandas as pd
from pandasgui import show  # used in the visualisation step to open the frame in a separate PandasGUI window
import warnings
# Ignore every FutureWarning raised for the rest of the session.
# NOTE(review): presumably meant to hide library deprecation notices — confirm; the filter is global,
# so it also hides warnings that announce upcoming behaviour changes.
warnings.simplefilter(action='ignore', category=FutureWarning)


# Format data for analysis




## 1. Import dataset
- Set correct delimiter (e.g., `;` or `,`)  
- Specify encoding (e.g., `utf-8`)  
- Optional: `on_bad_lines='skip'` to ignore problematic rows  
- Optional: `engine='python'` for complex CSV parsing

In [7]:
# Load the raw SNCF export into `data`.
# - The file is semicolon-separated.
# - "utf-8-sig" decodes UTF-8 and discards a leading BOM when one is present.
# - If parsing ever fails, `on_bad_lines='skip'` and/or `engine='python'`
#   can be added to the call (see the notes in the section above).
raw_csv_path = "SNCF_dataset_raw.csv"
data = pd.read_csv(raw_csv_path, sep=";", encoding="utf-8-sig")

# Preview the first three rows.
data.head(3)

Unnamed: 0,Date,Service,Gare de départ,Gare d'arrivée,Durée moyenne du trajet,Nombre de circulations prévues,Nombre de trains annulés,Commentaire annulations,Nombre de trains en retard au départ,Retard moyen des trains en retard au départ,...,Nombre trains en retard > 15min,Retard moyen trains en retard > 15 (si liaison concurrencée par vol),Nombre trains en retard > 30min,Nombre trains en retard > 60min,Prct retard pour causes externes,Prct retard pour cause infrastructure,Prct retard pour cause gestion trafic,Prct retard pour cause matériel roulant,Prct retard pour cause gestion en gare et réutilisation de matériel,"Prct retard pour cause prise en compte voyageurs (affluence, gestions PSH, correspondances)"
0,2018-01,National,BORDEAUX ST JEAN,PARIS MONTPARNASSE,141,870,5,,289,11.25,...,110,6.51,44,8,36.13,31.09,10.92,15.97,5.04,0.84
1,2018-01,National,LA ROCHELLE VILLE,PARIS MONTPARNASSE,165,222,0,,8,2.88,...,22,5.7,5,0,15.38,30.77,38.46,11.54,3.85,0.0
2,2018-01,National,PARIS MONTPARNASSE,QUIMPER,220,248,1,,37,9.5,...,26,7.55,17,7,26.92,38.46,15.38,19.23,0.0,0.0


## 2. Clean & prepare data

Strip spaces from column names and string values

Rename columns for clarity

Drop unused or irrelevant columns

Convert date columns to datetime

Add Year (numeric) and Month (string) columns

Convert numeric columns and round decimals where needed

## Column descriptions

| Column                          | Description |
|---------------------------------|-------------|
| `year`                           | Year of the record (numeric). |
| `month`                          | Month abbreviation (`Jan`, `Feb`, …). |
| `date`                           | Date of the record (first day of the month); dropped before the CSV export once `year` and `month` have been derived. |
| `service`                        | Type of service (e.g., `National`). |
| `depart_st`                      | Departure station name. |
| `arrival_st`                      | Arrival station name. |
| `avg_duration`                   | Average duration of the trip (in minutes). |
| `exp_monthly_trains`             | Expected number of train departures in the month. |
| `cancelled_nb`                   | Number of trains cancelled in the month. |
| `delay>15`                       | Number of trains delayed more than 15 minutes. |
| `delay>30`                       | Number of trains delayed more than 30 minutes. |
| `delay>60`                       | Number of trains delayed more than 60 minutes. |
| `delay_external`                  | Percentage of delay due to external causes (%). |
| `delay_infrastructure`            | Percentage of delay due to infrastructure (%). |
| `delay_traffic_management`        | Percentage of delay due to traffic management (%). |
| `delay_rolling_stock`             | Percentage of delay due to rolling stock (%). |
| `delay_station_management`        | Percentage of delay due to station operations (%). |
| `delay_passenger_related`         | Percentage of delay caused by passenger-related issues (%). |

In [None]:
# Clean & prepare the raw frame `data` into `national_trains_data_1`:
#   1. strip whitespace from the headers,
#   2. rename the kept French columns to short English names,
#   3. drop the columns that are not analysed,
#   4. round the percentage columns and strip whitespace from string values,
#   5. parse `date` and derive `year` / `month` from it.

# French header -> short English name for every column that is kept.
column_renames = {
    "Gare de départ": "depart_st",
    "Date": "date",
    "Service": "service",
    "Gare d'arrivée": "arrival_st",
    "Durée moyenne du trajet": "avg_duration",
    "Nombre de circulations prévues": "exp_monthly_trains",
    "Nombre de trains annulés": "cancelled_nb",
    "Nombre trains en retard > 15min": "delay>15",
    "Nombre trains en retard > 30min": "delay>30",
    "Nombre trains en retard > 60min": "delay>60",
    "Prct retard pour causes externes": "delay_external",
    "Prct retard pour cause infrastructure": "delay_infrastructure",
    "Prct retard pour cause gestion trafic": "delay_traffic_management",
    "Prct retard pour cause matériel roulant": "delay_rolling_stock",
    "Prct retard pour cause gestion en gare et réutilisation de matériel": "delay_station_management",
    "Prct retard pour cause prise en compte voyageurs (affluence, gestions PSH, correspondances)": "delay_passenger_related",
}

# Strip header whitespace BEFORE renaming/dropping: both steps match on exact
# names, so a padded header would be skipped silently by rename() or make
# drop() raise a KeyError.
# `data` is rebound instead of being mutated in place; it still ends up with
# the renamed columns, and re-running this cell is a no-op for this step.
data = data.rename(columns=str.strip).rename(columns=column_renames)

# Columns that are not needed for the analysis (still under their French names).
columns_to_drop = [
    "Commentaire annulations",
    "Commentaire retards au départ",
    "Commentaire retards à l'arrivée",
    "Nombre de trains en retard au départ",
    "Nombre de trains en retard à l'arrivée",
    "Retard moyen trains en retard > 15 (si liaison concurrencée par vol)",
    "Retard moyen des trains en retard au départ",
    "Retard moyen de tous les trains au départ",
    "Retard moyen de tous les trains à l'arrivée",
    "Retard moyen des trains en retard à l'arrivée",
]

# drop() returns a new frame, so `data` keeps all of its columns.
national_trains_data_1 = data.drop(columns=columns_to_drop)

# Show floats with two decimals in rendered output (display option only).
pd.set_option('display.float_format', '{:.2f}'.format)

# Percentage columns whose stored values are rounded to two decimals.
columns_to_format = [
    'delay_external',
    'delay_rolling_stock',
    'delay_infrastructure',
    'delay_traffic_management',
    'delay_station_management',
    'delay_passenger_related',
]
national_trains_data_1[columns_to_format] = national_trains_data_1[columns_to_format].round(2)

# Strip surrounding whitespace from the values of every object-dtype column
# (this still includes `date`, which is a string until it is parsed below).
strip_cols = national_trains_data_1.select_dtypes(include='object').columns
national_trains_data_1[strip_cols] = national_trains_data_1[strip_cols].apply(lambda col: col.str.strip())

# Parse the "YYYY-MM" strings; each value becomes the first day of that month.
# `date` is kept here only as the source for `year` / `month`.
national_trains_data_1['date'] = pd.to_datetime(national_trains_data_1['date'], format='%Y-%m')

national_trains_data_1['year'] = national_trains_data_1['date'].dt.year   # e.g. 2023
# month_name() returns English names by default, so the abbreviation is always
# 'Jan', 'Feb', ... whereas strftime('%b') follows the locale of the process.
national_trains_data_1['month'] = national_trains_data_1['date'].dt.month_name().str[:3]

# Move `year` and `month` to the front, keeping the other columns in order.
cols = ['year', 'month'] + [c for c in national_trains_data_1.columns if c not in ['year', 'month']]
national_trains_data_1 = national_trains_data_1[cols]

# display() renders the frame as a rich table instead of plain text.
display(national_trains_data_1.head(10))


Unnamed: 0,year,month,date,service,depart_st,arrival_st,avg_duration,exp_monthly_trains,cancelled_nb,delay>15,delay>30,delay>60,delay_external,delay_infrastructure,delay_traffic_management,delay_rolling_stock,delay_station_management,delay_passenger_related
0,2018,Jan,2018-01-01,National,BORDEAUX ST JEAN,PARIS MONTPARNASSE,141,870,5,110,44,8,36.13,31.09,10.92,15.97,5.04,0.84
1,2018,Jan,2018-01-01,National,LA ROCHELLE VILLE,PARIS MONTPARNASSE,165,222,0,22,5,0,15.38,30.77,38.46,11.54,3.85,0.0
2,2018,Jan,2018-01-01,National,PARIS MONTPARNASSE,QUIMPER,220,248,1,26,17,7,26.92,38.46,15.38,19.23,0.0,0.0
3,2018,Jan,2018-01-01,National,PARIS MONTPARNASSE,ST MALO,156,102,0,8,6,4,23.08,46.15,7.69,15.38,7.69,0.0
4,2018,Jan,2018-01-01,National,PARIS MONTPARNASSE,ST PIERRE DES CORPS,61,391,2,17,6,0,21.21,42.42,9.09,21.21,6.06,0.0
5,2018,Jan,2018-01-01,National,QUIMPER,PARIS MONTPARNASSE,223,256,1,21,9,2,27.78,55.56,0.0,5.56,5.56,5.56
6,2018,Jan,2018-01-01,National,RENNES,LYON PART DIEU,233,99,0,6,2,1,0.0,50.0,25.0,25.0,0.0,0.0
7,2018,Jan,2018-01-01,National,ST PIERRE DES CORPS,PARIS MONTPARNASSE,61,410,1,26,9,0,22.73,45.45,16.67,12.12,1.52,1.52
8,2018,Jan,2018-01-01,National,TOURS,PARIS MONTPARNASSE,75,194,0,7,1,0,25.0,56.25,6.25,12.5,0.0,0.0
9,2018,Jan,2018-01-01,National,NANTES,STRASBOURG,303,39,0,5,2,1,0.0,0.0,33.33,33.33,0.0,33.33


In [10]:
# `year` and `month` now carry the date information, so the original `date`
# column is removed. The frame is rebound (no in-place mutation) and a missing
# column is tolerated, so re-running this cell does not raise a KeyError.
national_trains_data_1 = national_trains_data_1.drop(columns=['date'], errors='ignore')

## 3. pandasGUI visualisation

In [11]:
# Open the cleaned frame in PandasGUI (runs in a separate window).
# `show` is already imported in the first cell, so it is not imported again here.
# The trailing semicolon keeps the returned PandasGui object from being echoed as cell output.
show(national_trains_data_1);

PandasGUI INFO — pandasgui.gui — Opening PandasGUI


<pandasgui.gui.PandasGui at 0x2679a5c7380>

## 4. Save as new CSV


In [12]:
# Write the cleaned table to disk. ';' is the same delimiter the raw file was
# read with, and index=False keeps the row index out of the CSV.
national_trains_data_1.to_csv(
    "Dataset_national_trains.csv",
    sep=";",
    index=False,
)