In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from sklearn.model_selection import train_test_split

In [2]:
# Load the railway ticket dataset from the notebook's working directory.
# A plain relative filename resolves the same way on Windows, macOS and Linux; the
# earlier ".\railway.csv" spelling only works where "\" is a path separator.
path = "railway.csv"
# low_memory=False makes pandas infer each column's dtype from the whole file rather
# than chunk by chunk, which avoids mixed-type columns.
df = pd.read_csv(path, low_memory=False)
# Show the first rows as the cell output.
df.head()

Unnamed: 0,Transaction ID,Date of Purchase,Time of Purchase,Purchase Type,Payment Method,Railcard,Ticket Class,Ticket Type,Price,Departure Station,Arrival Destination,Date of Journey,Departure Time,Arrival Time,Actual Arrival Time,Journey Status,Reason for Delay,Refund Request
0,da8a6ba8-b3dc-4677-b176,2023-12-08,12:41:11,Online,Contactless,Adult,Standard,Advance,43,London Paddington,Liverpool Lime Street,2024-01-01,11:00:00,13:30:00,13:30:00,On Time,,No
1,b0cdd1b0-f214-4197-be53,2023-12-16,11:23:01,Station,Credit Card,Adult,Standard,Advance,23,London Kings Cross,York,2024-01-01,09:45:00,11:35:00,11:40:00,Delayed,Signal Failure,No
2,f3ba7a96-f713-40d9-9629,2023-12-19,19:51:27,Online,Credit Card,,Standard,Advance,3,Liverpool Lime Street,Manchester Piccadilly,2024-01-02,18:15:00,18:45:00,18:45:00,On Time,,No
3,b2471f11-4fe7-4c87-8ab4,2023-12-20,23:00:36,Station,Credit Card,,Standard,Advance,13,London Paddington,Reading,2024-01-01,21:30:00,22:30:00,22:30:00,On Time,,No
4,2be00b45-0762-485e-a7a3,2023-12-27,18:22:56,Online,Contactless,,Standard,Advance,76,Liverpool Lime Street,London Euston,2024-01-01,16:45:00,19:00:00,19:00:00,On Time,,No


In [3]:
import numpy as np
import tensorflow as tf
from tensorflow.keras import layers, models
from sklearn.preprocessing import LabelEncoder

# Categorical columns of the training DataFrame `df` that are label-encoded
# and used as the model's features.
categorical_cols = [
    'Purchase Type',
    'Payment Method',
    'Railcard',
    'Ticket Class',
    'Ticket Type',
    'Departure Station',
    'Arrival Destination',
    'Journey Status',
    'Reason for Delay',
    'Refund Request',
]

# Token that stands in for NaN in the categorical columns before encoding.
missing_value_placeholder = 'Missing'

# Function to handle missing values in categorical columns
def handle_missing_values(df, categorical_cols, placeholder='Missing'):
    """Return a copy of ``df`` with NaN in the given columns replaced by ``placeholder``.

    The input frame is left untouched: the fill is applied to a copy, so a frame
    that was displayed or is reused elsewhere does not silently change.

    Args:
        df: DataFrame containing every column named in ``categorical_cols``.
        categorical_cols: Iterable of column names whose NaN values are filled.
        placeholder: Value written in place of NaN (default ``'Missing'``).

    Returns:
        A new DataFrame; columns not listed in ``categorical_cols`` are copied as-is.

    Raises:
        KeyError: If a name in ``categorical_cols`` is not a column of ``df``.
    """
    filled = df.copy()
    for col in categorical_cols:
        # Replace NaN with the placeholder value
        filled[col] = filled[col].fillna(placeholder)
    return filled

# Function to safely fit and transform categorical columns
def safe_label_encoder(df, categorical_cols):
    """Fit one ``LabelEncoder`` per categorical column and return the encoded frame.

    NaN values are first replaced by the module-level ``missing_value_placeholder``
    and every value is cast to ``str`` before fitting, so each encoder sees a single
    homogeneous type. Encoding is applied to a copy; the frame passed in is not
    modified.

    Args:
        df: DataFrame containing every column named in ``categorical_cols``.
        categorical_cols: Iterable of column names to encode.

    Returns:
        A ``(label_encoders, encoded_df)`` tuple: a dict mapping each column name to
        its fitted encoder, and a copy of ``df`` whose listed columns hold integer codes.
    """
    encoded = df.copy()
    label_encoders = {}
    for col in categorical_cols:
        encoder = LabelEncoder()
        # Handle missing values, then normalise to str exactly once before fitting.
        values = encoded[col].fillna(missing_value_placeholder).astype(str)
        encoded[col] = encoder.fit_transform(values)  # Encode
        label_encoders[col] = encoder
    return label_encoders, encoded

# Fit one label encoder per categorical column and replace the strings in `df`
# with their integer codes.
label_encoders, df = safe_label_encoder(df, categorical_cols)

# Features are the encoded categorical columns; the regression target is the price.
X, y = df[categorical_cols], df['Price']

# Hold out 20% of the rows for testing; the fixed random_state keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)
# Define the combined model with embedding layers and regression layers
def create_combined_model(input_shape):
    """Build and compile the embedding-based price regression network.

    One single-integer input and one embedding layer are created for every column
    of the module-level ``X_train``; each vocabulary size is the number of distinct
    values that column takes in the module-level ``df``. The embeddings are
    concatenated, flattened and passed through a 128-64-32 ReLU stack with dropout
    and a single linear output unit.

    Args:
        input_shape: Kept for interface compatibility; the body does not read it.

    Returns:
        A Keras model compiled with the Adam optimizer, MSE loss and an MAE metric,
        expecting one input array per column of ``X_train`` in column order.
    """
    model_inputs, embedded = [], []

    for col in X_train.columns:
        # Number of distinct codes in this column = size of the embedding table.
        vocab_size = len(df[col].unique())
        # Embedding width: fourth root of the vocabulary size, rounded up.
        width = int(np.ceil(vocab_size ** 0.25))

        col_input = layers.Input(shape=(1,), dtype=tf.int32, name=f"input_{col}")
        col_embedding = layers.Embedding(input_dim=vocab_size, output_dim=width)(col_input)

        embedded.append(col_embedding)
        model_inputs.append(col_input)

    # Merge the per-column embeddings into one flat feature vector.
    merged = layers.Concatenate()(embedded)
    hidden = layers.Flatten()(merged)

    # Fully connected regression head.
    for units in (128, 64, 32):
        hidden = layers.Dense(units, activation='relu')(hidden)
    hidden = layers.Dropout(0.3)(hidden)  # regularization
    price = layers.Dense(1)(hidden)  # single linear unit: predicted price

    model = models.Model(inputs=model_inputs, outputs=price)
    model.compile(optimizer='adam', loss='mse', metrics=['mae'])
    return model

# Seed the Python, NumPy and TensorFlow random generators before any layer is created,
# so weight initialisation, dropout and batch shuffling repeat on "Restart & Run All".
# NOTE: this reseeds the global `random` and `numpy.random` state as well; GPU kernels
# may still introduce small run-to-run differences.
tf.keras.utils.set_random_seed(42)

# Instantiate and summarize the model
model = create_combined_model(X_train.shape)
model.summary()

# Keras takes one array per model input, in the order the inputs were created
# (the column order of X_train).
train_inputs = [X_train[col].values for col in X_train.columns]

# Train the model. validation_split=0.3 reserves 30% of the training rows for
# validation; Keras takes them from the end of the arrays, before shuffling.
model.fit(train_inputs, y_train, epochs=200, batch_size=64, validation_split=0.3)

# Now the model is trained and ready for inference on new data


# Assuming df_new is your new (unseen) data for inference
df_new = pd.DataFrame({
    'Purchase Type': ['Online'],
    'Payment Method': ['Credit Card'],
    'Railcard': ['Adult'],
    'Ticket Class': ['Standard'],
    'Ticket Type': ['Advance'],
    'Departure Station': ['London Kings Cross'],
    'Arrival Destination': ['Liverpool Lime Street'],
    'Journey Status': ['Delayed'],
    'Reason for Delay': ['Signal Failure'],
    'Refund Request': ['No']
})

# Step 1: Handle missing values in new data
df_new = handle_missing_values(df_new, categorical_cols, placeholder=missing_value_placeholder)

# Step 2: Apply the label encoding for the new data using the same label encoders.
# Each row is checked on its own, so frames with more than one row are encoded correctly.
for col in categorical_cols:
    encoder = label_encoders[col]
    # Categories the encoder never saw are mapped to the placeholder class. Columns that
    # had no missing values during training have no such class, and asking the encoder
    # to transform it would raise, so those fall back to the encoder's first class (code 0).
    if missing_value_placeholder in encoder.classes_:
        fallback = missing_value_placeholder
    else:
        fallback = encoder.classes_[0]
    # Cast to str, matching how the values were prepared when the encoders were fitted.
    col_values = df_new[col].astype(str)
    col_values = col_values.where(col_values.isin(encoder.classes_), fallback)
    df_new[col] = encoder.transform(col_values)

# Step 3: Prepare the input data for the model. Iterate over categorical_cols (the
# training column order) so the arrays line up with the model inputs even if df_new
# lists its columns differently or carries extra ones.
input_data = [df_new[col].values for col in categorical_cols]

# Step 4: Make a prediction using the trained model
prediction = model.predict(input_data)

# Step 5: Output the prediction (predicted price)
print("Predicted Price:", prediction)





Model: "model"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 input_Purchase Type (Input  [(None, 1)]                  0         []                            
 Layer)                                                                                           
                                                                                                  
 input_Payment Method (Inpu  [(None, 1)]                  0         []                            
 tLayer)                                                                                          
                                                                                                  
 input_Railcard (InputLayer  [(None, 1)]                  0         []                            
 )                                                                                         

In [4]:
from sklearn.metrics import mean_absolute_error, mean_squared_error

# Split the held-out features into one array per model input, in column order.
test_inputs = [X_test[name].values for name in X_test.columns]

# Score the test set and flatten the predictions to 1-D so they line up with y_test.
y_pred = model.predict(test_inputs).flatten()

# Error metrics on the held-out rows; RMSE is derived from MSE.
mse = mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
rmse = mse ** 0.5

# Report the metrics to four decimal places.
print(f"Mean Absolute Error (MAE): {mae:.4f}")
print(f"Mean Squared Error (MSE): {mse:.4f}")
print(f"Root Mean Squared Error (RMSE): {rmse:.4f}")


Mean Absolute Error (MAE): 0.7914
Mean Squared Error (MSE): 1.8518
Root Mean Squared Error (RMSE): 1.3608


In [5]:


# Assuming df_new is your new (unseen) data for inference
df_new = pd.DataFrame({
    'Purchase Type': ['Online'],
    'Payment Method': ['Debit Card'],
    'Railcard': ['Adult'],
    'Ticket Class': ['First Class'],
    'Ticket Type': ['Advance'],
    'Departure Station': ['London Kings Cross'],
    'Arrival Destination': ['Manchester Piccadilly'],
    'Journey Status': ['On Time'],
    'Reason for Delay': ['Missing'],
    'Refund Request': ['Yes']
})

# Step 1: Handle missing values in new data
df_new = handle_missing_values(df_new, categorical_cols, placeholder=missing_value_placeholder)

# Step 2: Apply the label encoding for the new data using the same label encoders.
# Each row is checked on its own, so frames with more than one row are encoded correctly.
for col in categorical_cols:
    encoder = label_encoders[col]
    # Categories the encoder never saw are mapped to the placeholder class. Columns that
    # had no missing values during training have no such class, and asking the encoder
    # to transform it would raise, so those fall back to the encoder's first class (code 0).
    if missing_value_placeholder in encoder.classes_:
        fallback = missing_value_placeholder
    else:
        fallback = encoder.classes_[0]
    # Cast to str, matching how the values were prepared when the encoders were fitted.
    col_values = df_new[col].astype(str)
    col_values = col_values.where(col_values.isin(encoder.classes_), fallback)
    df_new[col] = encoder.transform(col_values)

# Step 3: Prepare the input data for the model. Iterate over categorical_cols (the
# training column order) so the arrays line up with the model inputs even if df_new
# lists its columns differently or carries extra ones.
input_data = [df_new[col].values for col in categorical_cols]

# Step 4: Make a prediction using the trained model
prediction = model.predict(input_data)

# Step 5: Output the prediction (predicted price)
print("Predicted Price:", prediction)


Predicted Price: [[70.13487]]


In [6]:
# Persist the trained model to 'trained_model.h5' in the working directory.
# NOTE(review): the '.h5' suffix presumably selects Keras' legacy HDF5 format, which
# would explain the saving_api warning in this cell's output — confirm, and consider
# the native '.keras' format if nothing downstream requires HDF5.
model.save('trained_model.h5')


  saving_api.save_model(


In [8]:
import joblib

# Save the fitted label encoders (dict: column name -> LabelEncoder) to
# 'label_encoders.pkl' in the working directory.
# Inference has to reuse these exact encoders so that category strings map to the
# same integer codes the model was trained on.
# NOTE(review): joblib files are pickle-based — only load this artifact from a trusted source.
joblib.dump(label_encoders, 'label_encoders.pkl')


['label_encoders.pkl']