In [20]:
import pandas as pd
import numpy as np
import glob
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import zscore
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.dummy import DummyClassifier
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
import joblib
from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import Pipeline as ImbPipeline

In [2]:
dossier_csv = 'data/'

fichiers_csv = glob.glob(dossier_csv + '*.csv')

dataframes = []

for fichier in fichiers_csv:
    try:
        df = pd.read_csv(fichier, on_bad_lines='skip')
        dataframes.append(df)
    except pd.errors.ParserError as e:
        print(f"Erreur lors de la lecture du fichier {fichier}: {e}")

colonnes = set()
for df in dataframes:
    colonnes.update(df.columns)

dataframes = [df.reindex(columns=colonnes) for df in dataframes]

df_concatene = pd.concat(dataframes, ignore_index=True)

print(df_concatene)

  df = pd.read_csv(fichier, on_bad_lines='skip')
  df = pd.read_csv(fichier, on_bad_lines='skip')
  df = pd.read_csv(fichier, on_bad_lines='skip')
  df = pd.read_csv(fichier, on_bad_lines='skip')
  df = pd.read_csv(fichier, on_bad_lines='skip')
  df = pd.read_csv(fichier, on_bad_lines='skip')
  df = pd.read_csv(fichier, on_bad_lines='skip')
  df = pd.read_csv(fichier, on_bad_lines='skip')


         TAXI_IN DEST_AIRPORT_ID  CARRIER_DELAY DEP_TIME_BLK  DISTANCE_GROUP  \
0            5.0           12478            NaN    0600-0659             1.0   
1            7.0           12478            NaN    0600-0659             1.0   
2            8.0           12478            NaN    0600-0659             1.0   
3            6.0           12478            NaN    0600-0659             1.0   
4            5.0           12478            NaN    0600-0659             1.0   
...          ...             ...            ...          ...             ...   
5635973     10.0           12892            NaN    0600-0659             2.0   
5635974      4.0           13232            NaN    0700-0759             6.0   
5635975      3.0           14679            NaN    2000-2059             2.0   
5635976      2.0           14679            NaN    1400-1459             2.0   
5635977      5.0           14679            NaN    1000-1059             2.0   

            FL_DATE ORIGIN  ARR_DELAY_N

In [3]:
colonnes_a_retirer = [
    'QUARTER', 'FL_DATE', 'AIRLINE_ID', 'CARRIER', 'ORIGIN_AIRPORT_ID', 'CRS_DEP_HOUR', 'ORIGIN_AIRPORT_SEQ_ID',
    'ORIGIN_CITY_MARKET_ID', 'ORIGIN_STATE_ABR', 'ORIGIN_STATE_FIPS',
    'ORIGIN_STATE_NM', 'ORIGIN_WAC', 'DEST_AIRPORT_ID', 'DEST_AIRPORT_SEQ_ID', 'DEST_CITY_MARKET_ID',
    'DEST_STATE_ABR', 'DEST_STATE_FIPS', 'DEST_STATE_NM', 'DEST_WAC', 'DEP_DELAY_NEW',
    'DEP_DEL15', 'DEP_DELAY_GROUP', 'DEP_TIME_BLK', 'WHEELS_OFF', 'WHEELS_ON',
    'ARR_DEL15', 'ARR_DELAY_GROUP', 'ARR_TIME_BLK', 'FLIGHTS', 'FIRST_DEP_TIME', 'TOTAL_ADD_GTIME',
    'LONGEST_ADD_GTIME', 'DISTANCE_GROUP', 'TAIL_NUM', 'WEATHER_DELAY', 'LATE_AIRCRAFT_DELAY',
    'SECURITY_DELAY', 'TAXI_IN', 'FL_NUM', 'DEST', 'CRS_DEP_TIME', 'CRS_ARR_TIME', 'AIR_TIME',
    'DEP_TIME', 'DISTANCE', 'ACTUAL_ELAPSED_TIME', 'DIVERTED', 'CANCELLATION_CODE', 'DEP_DELAY', 'ARR_TIME',
    'NAS_DELAY', 'TAXI_OUT', 'ORIGIN', 'CANCELLED', 'Unnamed: 64', 'CARRIER_DELAY', 'CRS_ELAPSED_TIME', "DAY_OF_WEEK"
]

df_concatene = df_concatene.drop(columns=colonnes_a_retirer, errors='ignore')

print("Colonnes disponibles après suppression :")
print(df_concatene.columns)

Colonnes disponibles après suppression :
Index(['ARR_DELAY_NEW', 'ARR_DELAY', 'YEAR', 'DAY_OF_MONTH', 'UNIQUE_CARRIER',
       'ORIGIN_CITY_NAME', 'DEST_CITY_NAME', 'MONTH'],
      dtype='object')


In [4]:
missing_values_percentage = df_concatene.isnull().mean() * 100
high_missing_cols = missing_values_percentage[missing_values_percentage > 50]
df_cleaned = df_concatene.drop(columns=high_missing_cols.index)

In [5]:
def remove_outliers(df, column):
    df['z_score'] = zscore(df[column])
    df_filtered = df[(df['z_score'] > -3) & (df['z_score'] < 3)]
    df_filtered = df_filtered.drop(columns=['z_score'])
    return df_filtered

In [6]:
df_cleaned = remove_outliers(df_cleaned, 'ARR_DELAY')

In [7]:
df_cleaned = df_cleaned.dropna()

In [8]:
numeric_columns = df_cleaned.select_dtypes(include=[np.number]).columns.tolist()
categorical_columns = df_cleaned.select_dtypes(include=[object]).columns.tolist()

print("\nColonnes numériques :")
print(numeric_columns)

print("\nColonnes catégorielles :")
print(categorical_columns)

if 'ARR_DELAY' in numeric_columns:
    numeric_columns.remove('ARR_DELAY')

for col in numeric_columns:
    df_cleaned[col] = pd.to_numeric(df[col], errors='coerce')

for col in categorical_columns:
    df_cleaned[col] = df[col].astype(str)

print("\nTypes des colonnes après conversion :")
print(df_cleaned.dtypes)


Colonnes numériques :
['ARR_DELAY_NEW', 'ARR_DELAY', 'MONTH']

Colonnes catégorielles :
['YEAR', 'DAY_OF_MONTH', 'UNIQUE_CARRIER', 'ORIGIN_CITY_NAME', 'DEST_CITY_NAME']

Types des colonnes après conversion :
ARR_DELAY_NEW       float64
ARR_DELAY           float64
YEAR                 object
DAY_OF_MONTH         object
UNIQUE_CARRIER       object
ORIGIN_CITY_NAME     object
DEST_CITY_NAME       object
MONTH                 int64
dtype: object


In [9]:
print(df_concatene.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5635978 entries, 0 to 5635977
Data columns (total 8 columns):
 #   Column            Dtype  
---  ------            -----  
 0   ARR_DELAY_NEW     float64
 1   ARR_DELAY         float64
 2   YEAR              object 
 3   DAY_OF_MONTH      object 
 4   UNIQUE_CARRIER    object 
 5   ORIGIN_CITY_NAME  object 
 6   DEST_CITY_NAME    object 
 7   MONTH             int64  
dtypes: float64(2), int64(1), object(5)
memory usage: 344.0+ MB
None


In [10]:
# Convertir les retards en classes (0: pas de retard, 1: retard)
df_cleaned['Delayed'] = (df_cleaned['ARR_DELAY_NEW'] > 0).astype(int)

In [11]:
delayed_counts = df_cleaned['Delayed'].value_counts()
print("\nNombre de valeurs dans la colonne 'Delayed' :")
print(delayed_counts)


Nombre de valeurs dans la colonne 'Delayed' :
0    300414
1    123475
Name: Delayed, dtype: int64


In [12]:
# Séparer les caractéristiques et la variable cible
X = df_cleaned[['ORIGIN_CITY_NAME', 'DEST_CITY_NAME', 'MONTH', 'DAY_OF_MONTH']]
y = df_cleaned['Delayed']

# Diviser les données en ensembles d'entraînement et de test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Identifier les colonnes numériques et catégorielles
numeric_features = X.select_dtypes(include=[np.number]).columns.tolist()
categorical_features = X.select_dtypes(include=[object]).columns.tolist()

# Convertir toutes les colonnes catégorielles en chaînes de caractères
X_train[categorical_features] = X_train[categorical_features].astype(str)
X_test[categorical_features] = X_test[categorical_features].astype(str)

# Créer les transformateurs pour les colonnes numériques et catégorielles
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Créer le préprocesseur avec ColumnTransformer
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ])

# Créer le pipeline avec le préprocesseur et le classificateur
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', DummyClassifier(strategy='most_frequent'))
])

# Entraîner le modèle
pipeline.fit(X_train, y_train)

# Prédire sur l'ensemble de test
y_pred = pipeline.predict(X_test)

# Évaluer le modèle
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:")
print(classification_report(y_test, y_pred))

Accuracy: 0.7083677369128784
Classification Report:
              precision    recall  f1-score   support

           0       0.71      1.00      0.83     60054
           1       0.00      0.00      0.00     24724

    accuracy                           0.71     84778
   macro avg       0.35      0.50      0.41     84778
weighted avg       0.50      0.71      0.59     84778



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [22]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Identifier les colonnes numériques et catégorielles
numeric_features = X.select_dtypes(include=[np.number]).columns.tolist()
categorical_features = X.select_dtypes(include=[object]).columns.tolist()

# Convertir toutes les colonnes catégorielles en chaînes de caractères
X_train[categorical_features] = X_train[categorical_features].astype(str)
X_test[categorical_features] = X_test[categorical_features].astype(str)

# Créer les transformateurs pour les colonnes numériques et catégorielles
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Créer le préprocesseur avec ColumnTransformer
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ])

In [24]:
def train_and_evaluate_model(model):
    pipeline = ImbPipeline(steps=[
        ('preprocessor', preprocessor),
        ('undersampler', RandomUnderSampler(random_state=42)),
        ('classifier', model)
    ])

    pipeline.fit(X_train, y_train)

    y_pred = pipeline.predict(X_test)

    # Évaluer le modèle
    print(f"\nModèle : {model.__class__.__name__}")
    print("Accuracy:", accuracy_score(y_test, y_pred))
    print("Classification Report:")
    print(classification_report(y_test, y_pred))
    print("Matrice de confusion:")
    print(confusion_matrix(y_test, y_pred))

logistic_regression = LogisticRegression(max_iter=1000)
train_and_evaluate_model(logistic_regression)


Modèle : LogisticRegression
Accuracy: 0.5927717096416523
Classification Report:
              precision    recall  f1-score   support

           0       0.78      0.59      0.67     60054
           1       0.37      0.59      0.46     24724

    accuracy                           0.59     84778
   macro avg       0.58      0.59      0.57     84778
weighted avg       0.66      0.59      0.61     84778

Matrice de confusion:
[[35557 24497]
 [10027 14697]]


In [25]:
joblib.dump(pipeline, 'flight_delay_model.pkl')

['flight_delay_model.pkl']

In [26]:
# Sauvegarder les listes uniques de villes de départ et d'arrivée
origins = df['ORIGIN_CITY_NAME'].unique()
destinations = df['DEST_CITY_NAME'].unique()

joblib.dump(origins, 'origins.pkl')
joblib.dump(destinations, 'destinations.pkl')

['destinations.pkl']