In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import mean_squared_error, r2_score

# Load your data
df = pd.read_csv('Filghts TEC_Valid.csv')

# Convert 'STD' to datetime
df['STD'] = pd.to_datetime(df['STD'])

# Filter data for the year 2023
df = df[df['STD'].dt.year == 2023]

# Initial drop of rows with NaN values in specific critical columns
df.dropna(subset=['STD', 'DepartureStation', 'ArrivalStation', 'Passengers'], inplace=True)

# Report if any NaNs remain in any other critical column
if df.isnull().any().any():
    print("Columns with NaNs:", df.columns[df.isnull().any()].tolist())
    df.dropna(inplace=True)  # Optionally remove all rows with any NaNs across the DataFrame

# Check again if there are still NaNs after cleaning
if df.isnull().any().any():
    raise ValueError("NaNs remain after thorough cleaning and filtering.")

# Create necessary time-based features
df['Month'] = df['STD'].dt.month
df['Day'] = df['STD'].dt.day
df['Weekday'] = df['STD'].dt.weekday
df['Hour'] = df['STD'].dt.hour
# Create a 'Route' feature combining 'DepartureStation' and 'ArrivalStation'

# One-hot encode categorical variables 'Route' and 'Weekday'
encoder = OneHotEncoder(sparse_output=False)
categorical_features = encoder.fit_transform(df[['Weekday', 'DepartureStation', 'ArrivalStation']])
categorical_feature_names = encoder.get_feature_names_out(['Weekday', 'DepartureStation', 'ArrivalStation'])

# Convert encoded features into a DataFrame
categorical_features_df = pd.DataFrame(categorical_features, columns=categorical_feature_names, index=df.index)

# Concatenate with the original DataFrame
df = pd.concat([df, categorical_features_df], axis=1)

# Select only the features needed for modeling
features = list(categorical_feature_names) + ['Month', 'Day', 'Hour', 'Capacity']
X = df[features]
y = df['Passengers']




Columns with NaNs: ['Aeronave']


In [2]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import r2_score
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.regularizers import l2

# Assume 'X' and 'y' are already defined and split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Scaling the features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Model configuration
input_dim = X_train_scaled.shape[1]  # Number of features
output_dim = 1  # Regression output

# Create a Sequential model
model = Sequential()
model.add(Dense(128, input_dim=input_dim, activation='relu'))
model.add(Dense(64, activation='relu'))
model.add(Dense(64, activation='relu'))
model.add(Dense(output_dim))

# Compile the model
model.compile(optimizer=Adam(learning_rate=0.001), loss='mean_squared_error')

# Early stopping to prevent overfitting
early_stopping = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)

# Train the model with early stopping
model.fit(X_train_scaled, y_train, epochs=100, batch_size=10, verbose=1, validation_split=0.2, callbacks=[early_stopping])

# Evaluate the model on test data
loss = model.evaluate(X_test_scaled, y_test, verbose=0)
print(f'Test MSE: {loss}')

# Predictions and calculate R^2 Score
predictions = model.predict(X_test_scaled)
nn_r2 = r2_score(y_test, predictions)
print("NN R^2 Score:", nn_r2)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Test MSE: 515.6856079101562
NN R^2 Score: 0.6336396213065971


In [3]:
model.save('passengerReg.hdf5')

  saving_api.save_model(


In [5]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import mean_squared_error, r2_score

# Load your data
df2 = pd.read_csv('Filghts TEC_Valid.csv')

# Convert 'STD' to datetime
df2['STD'] = pd.to_datetime(df2['STD'])

# Filter data for the year 2023
df2 = df2[df2['STD'].dt.year == 2024]

# Initial drop of rows with NaN values in specific critical columns
df2.dropna(subset=['STD', 'DepartureStation', 'ArrivalStation', ], inplace=True)


# Create necessary time-based features
df2['Month'] = df2['STD'].dt.month
df2['Day'] = df2['STD'].dt.day
df2['Weekday'] = df2['STD'].dt.weekday
df2['Hour'] = df2['STD'].dt.hour
# Create a 'Route' feature combining 'DepartureStation' and 'ArrivalStation'

# One-hot encode categorical variables 'Route' and 'Weekday'
encoder = OneHotEncoder(sparse_output=False)
categorical_features = encoder.fit_transform(df2[['Weekday', 'DepartureStation', 'ArrivalStation']])
categorical_feature_names = encoder.get_feature_names_out(['Weekday', 'DepartureStation', 'ArrivalStation'])

# Convert encoded features into a DataFrame
categorical_features_df = pd.DataFrame(categorical_features, columns=categorical_feature_names, index=df2.index)

# Concatenate with the original DataFrame
df2 = pd.concat([df2, categorical_features_df], axis=1)

# Select only the features needed for modeling
features = list(categorical_feature_names) + ['Month', 'Day', 'Hour', 'Capacity']
X = df2[features]
y = df2['Passengers']
X_scaled = scaler.transform(X)
predictions = model.predict(X_scaled)
df2['Passengers'] = predictions




In [22]:
df2.columns

Index(['Flight_ID', 'Aeronave', 'DepartureStation', 'ArrivalStation',
       'Destination_Type', 'Origin_Type', 'STD', 'STA', 'Capacity',
       'Passengers',
       ...
       'ArrivalStation_BJ', 'ArrivalStation_BK', 'ArrivalStation_BL',
       'ArrivalStation_BM', 'ArrivalStation_BN', 'ArrivalStation_BO',
       'ArrivalStation_BP', 'ArrivalStation_BQ', 'ArrivalStation_BS',
       'ArrivalStation_BT'],
      dtype='object', length=104)

In [6]:
df3 = df2[['Flight_ID', 'Aeronave', 'DepartureStation', 'ArrivalStation',
       'Destination_Type', 'Origin_Type', 'STD', 'STA', 'Capacity',
       'Passengers']]

In [8]:
df3.to_csv('Preds2.csv')