In [1]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
import seaborn as sns

In [2]:
# Load the dataset from the CSV file in the current working directory.
df = pd.read_csv(filepath_or_buffer="analysis2.csv")

In [3]:
# Preview the first five rows of the loaded frame.
df.head(n=5)

Unnamed: 0,train_number,station_code,third_ac,zone,chair_car,first_class,sleeper,second_ac,type,first_ac,distance,duration,month,month_3,month_6,year,week,train_delay_level,station_delay_level
0,10103,CSMT,1,KR,0,0,1,1,Exp,1,765.0,11.833333,7,14,20,16,12,4,1
1,10104,CSMT,1,KR,0,0,1,1,Exp,1,765.0,12.0,10,14,16,17,12,2,1
2,11007,CSMT,0,CR,1,0,0,0,Exp,0,192.0,4.0,1,1,1,3,1,0,1
3,11008,CSMT,0,CR,1,0,0,0,Exp,0,192.0,4.166667,13,15,16,13,9,0,1
4,11009,CSMT,0,CR,1,0,0,0,Exp,0,192.0,4.166667,1,1,1,2,0,0,1


In [4]:
# Show the frame's dimensions as a (rows, columns) tuple.
df.shape

(27320, 19)

In [5]:
# List the column labels of the loaded frame.
df.columns

Index(['train_number', 'station_code', 'third_ac', 'zone', 'chair_car',
       'first_class', 'sleeper', 'second_ac', 'type', 'first_ac', 'distance',
       'duration', 'month', 'month_3', 'month_6', 'year', 'week',
       'train_delay_level', 'station_delay_level'],
      dtype='object')

In [6]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OrdinalEncoder, LabelEncoder, StandardScaler

In [7]:
from sklearn.pipeline import Pipeline

In [8]:
# Preprocessing step used inside the model pipeline:
#   * "zone" and "type" are mapped to integer codes by OrdinalEncoder;
#   * the listed numeric columns are standardised by StandardScaler;
#   * every other input column is passed through unchanged.
ct = ColumnTransformer(
    [
        (
            "encoder",
            # By default OrdinalEncoder raises on a category it did not see
            # during fit. Encoding such values as -1 lets the fitted (and
            # later persisted) pipeline still predict on rows with a new
            # zone or train type instead of failing.
            OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1),
            ["zone", "type"],
        ),
        (
            "scaler",
            StandardScaler(),
            ["distance", "duration", "month", "month_3", "month_6", "year"],
        ),
    ],
    remainder="passthrough",
)

In [9]:
# Feature matrix: an independent copy of the model input columns. Of the
# frame's columns, "train_number", "station_code" and "week" are left out.
# NOTE(review): "month", "month_3", "month_6" and "year" may be aggregates
# related to the "week" target — confirm they do not leak it.
X = df.loc[
    :,
    [
        "third_ac",
        "zone",
        "chair_car",
        "first_class",
        "sleeper",
        "second_ac",
        "type",
        "first_ac",
        "distance",
        "duration",
        "month",
        "month_3",
        "month_6",
        "year",
        "train_delay_level",
        "station_delay_level",
    ],
].copy()

In [10]:
# Regression target: an independent copy of the "week" column.
y = df.loc[:, "week"].copy()

In [11]:
from sklearn.model_selection import train_test_split

In [12]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# identical between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [13]:
from sklearn.linear_model import LinearRegression

In [14]:
from sklearn.ensemble import RandomForestRegressor

In [15]:
# Chain the column preprocessing and the regressor so both are fitted, scored
# and persisted as a single estimator.
# random_state pins the forest's randomness (bootstrap samples and feature
# sub-sampling); without it every re-run fits a different model and produces
# different scores.
pipeline = Pipeline(
    [
        ("transformer", ct),
        ("model", RandomForestRegressor(random_state=42)),
    ]
)

In [16]:
# Fit the preprocessing step and the regressor on the training split.
pipeline.fit(X_train, y=y_train)

In [17]:
# Score of the fitted pipeline on the data it was trained on
# (R^2 for a regressor as the final step).
pipeline.score(X_train, y=y_train)

0.9854649544575648

In [18]:
# Score of the fitted pipeline on the held-out split
# (R^2 for a regressor as the final step).
pipeline.score(X_test, y=y_test)

0.910013896839051

In [19]:
from sklearn.metrics import mean_squared_error

In [20]:
# Predictions for the held-out rows, used for the error metric that follows.
y_pred = pipeline.predict(X=X_test)

In [21]:
# Mean squared error between the held-out targets and the predictions
# (expressed in squared units of the target).
mean_squared_error(y_true=y_test, y_pred=y_pred)

438.51915204573527

In [22]:
import joblib

In [23]:
# Persist the fitted pipeline (preprocessing + model) to disk.
# NOTE: joblib files are pickle-based; only load this file from a trusted
# source, since unpickling can execute arbitrary code.
joblib.dump(value=pipeline, filename="delay_pred_2.sav")

['delay_pred_2.sav']