In [19]:
#Import package pandas for data analysis
import pandas as pd
# Import package numpy for numeric computing
import numpy as np
import seaborn as sns
# Import package matplotlib for visualisation/plotting
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches

from patsy import dmatrices
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.model_selection import cross_validate
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score

import pymysql

import matplotlib.pyplot as plt

#For showing plots directly in the notebook run the command below
%matplotlib inline

# For saving multiple plots into a single pdf file
from matplotlib.backends.backend_pdf import PdfPages

import scipy.stats as ss

# ignore warnings
import warnings
warnings.filterwarnings('ignore')
import pickle

In [2]:
# Load the line-65 segment records; default NA markers stay enabled.
df = pd.read_csv('line_65_segments.csv', sep=',', keep_default_na=True)


In [None]:
df

In [3]:
# Segments with fewer than 100 rows are dropped before modelling.
# The counts are computed once and reused for both the comparison and the index.
segment_counts = df["segment_id"].value_counts()
segs_to_drop = list(segment_counts[segment_counts < 100].index)

# A boolean mask avoids building a query string from the list's repr,
# which would break on IDs containing quote characters.
df = df[~df["segment_id"].isin(segs_to_drop)]
df

Unnamed: 0.1,Unnamed: 0,DAYOFSERVICE,TRIPID,LINEID,PROGRNUMBER,STOPPOINTID,DIRECTION,ACTUALTIME_DEP,ACTUALTIME_ARR,hour,...,temp,pressure,humidity,wind_speed,wind_dir,sun,visibility,cloud_height,cloud_cover,holiday
0,0,2018-01-01,5956287,65,2,4521,1,34810,34797,9,...,4.6,7.1,7,14,240,0.2,30000,999,3,1
1,1,2018-01-01,5956287,65,3,1283,1,34887,34887,9,...,4.6,7.1,7,14,240,0.2,30000,999,3,1
2,2,2018-01-01,5956287,65,4,4456,1,34926,34926,9,...,4.6,7.1,7,14,240,0.2,30000,999,3,1
3,3,2018-01-01,5956287,65,5,1284,1,34957,34948,9,...,4.6,7.1,7,14,240,0.2,30000,999,3,1
4,4,2018-01-01,5956287,65,6,1285,1,35009,35009,9,...,4.6,7.1,7,14,240,0.2,30000,999,3,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
738657,738657,2018-12-31,8590288,65,75,7250,1,23204,23193,6,...,9.3,9.2,9,7,230,0.0,30000,25,7,0
738658,738658,2018-12-31,8590288,65,76,7248,1,23349,23349,6,...,9.3,9.2,9,7,230,0.0,30000,25,7,0
738659,738659,2018-12-31,8590288,65,77,7207,1,23418,23418,6,...,9.3,9.2,9,7,230,0.0,30000,25,7,0
738660,738660,2018-12-31,8590288,65,78,7288,1,23520,23520,6,...,9.3,9.2,9,7,230,0.0,30000,25,7,0


In [4]:
# Remove the columns not used for modelling and mark DIRECTION as categorical,
# in a single chained expression.
df = (
    df
    .drop(columns=['Unnamed: 0', "TRIPID", "LINEID", "STOPPOINTID"])
    .astype({"DIRECTION": 'category'})
)

In [5]:
df.nunique()

DAYOFSERVICE          360
PROGRNUMBER            87
DIRECTION               2
ACTUALTIME_DEP      66252
ACTUALTIME_ARR      66266
hour                   22
dayofweek               2
journey_time         1788
dwell_time            433
prev_stop_id          184
prev_progrnumber       87
prev_dept_time      66096
segment_id            187
rain                   43
temp                  294
pressure              173
humidity               19
wind_speed             34
wind_dir               36
sun                    11
visibility             55
cloud_height           73
cloud_cover             9
holiday                 3
dtype: int64

In [6]:
#check if any null column
df.isnull().sum()

DAYOFSERVICE        0
PROGRNUMBER         0
DIRECTION           0
ACTUALTIME_DEP      0
ACTUALTIME_ARR      0
hour                0
dayofweek           0
journey_time        0
dwell_time          0
prev_stop_id        0
prev_progrnumber    0
prev_dept_time      0
segment_id          0
rain                0
temp                0
pressure            0
humidity            0
wind_speed          0
wind_dir            0
sun                 0
visibility          0
cloud_height        0
cloud_cover         0
holiday             0
dtype: int64

In [7]:
df.dtypes

DAYOFSERVICE          object
PROGRNUMBER            int64
DIRECTION           category
ACTUALTIME_DEP         int64
ACTUALTIME_ARR         int64
hour                   int64
dayofweek              int64
journey_time           int64
dwell_time             int64
prev_stop_id           int64
prev_progrnumber       int64
prev_dept_time         int64
segment_id            object
rain                 float64
temp                 float64
pressure             float64
humidity               int64
wind_speed             int64
wind_dir               int64
sun                  float64
visibility             int64
cloud_height           int64
cloud_cover            int64
holiday                int64
dtype: object

In [8]:
# Change the datatypes of some features.
# DAYOFSERVICE: parse to datetime, then convert to the proleptic Gregorian ordinal
# so it becomes a plain integer feature. pd.to_datetime is used because pandas 2.x
# rejects the unit-less astype('datetime64') cast.
df['DAYOFSERVICE'] = pd.to_datetime(df['DAYOFSERVICE']).apply(lambda ts: ts.toordinal())
# dayofweek and hour are treated as categories rather than as ordered numbers.
df['dayofweek'] = df['dayofweek'].astype('category')
df['hour'] = df['hour'].astype('category')
df.dtypes


DAYOFSERVICE           int64
PROGRNUMBER            int64
DIRECTION           category
ACTUALTIME_DEP         int64
ACTUALTIME_ARR         int64
hour                category
dayofweek           category
journey_time           int64
dwell_time             int64
prev_stop_id           int64
prev_progrnumber       int64
prev_dept_time         int64
segment_id            object
rain                 float64
temp                 float64
pressure             float64
humidity               int64
wind_speed             int64
wind_dir               int64
sun                  float64
visibility             int64
cloud_height           int64
cloud_cover            int64
holiday                int64
dtype: object

In [9]:
# Target kept as a one-column DataFrame; every other column is a descriptive feature.
y = pd.DataFrame(df["journey_time"])
# `columns=` is used because a positional axis argument to drop() is no longer
# accepted by pandas 2.x.
X = df.drop(columns=["journey_time"])

In [10]:
# 70/30 train/test split with a fixed seed so the partition is repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1, test_size=0.3)

# Report the partition sizes.
print("original range is: ", len(df))
print("training range (70%):\t rows 0 to", len(X_train))
print("test range (30%): \t rows", len(X_train), "to", len(X_train) + len(X_test))

original range is:  738338
training range (70%):	 rows 0 to 516836
test range (30%): 	 rows 516836 to 738338


In [11]:
# Names of the int64/float64 feature columns, treated as continuous.
continuous_columns = X.select_dtypes(include=['int64', 'float64']).columns
X.loc[:, continuous_columns].dtypes

DAYOFSERVICE          int64
PROGRNUMBER           int64
ACTUALTIME_DEP        int64
ACTUALTIME_ARR        int64
dwell_time            int64
prev_stop_id          int64
prev_progrnumber      int64
prev_dept_time        int64
rain                float64
temp                float64
pressure            float64
humidity              int64
wind_speed            int64
wind_dir              int64
sun                 float64
visibility            int64
cloud_height          int64
cloud_cover           int64
holiday               int64
dtype: object

In [12]:
# Names of the columns that were cast to the pandas 'category' dtype.
categorical_columns = df.select_dtypes(include=['category']).columns
df.loc[:, categorical_columns].dtypes

DIRECTION    category
hour         category
dayofweek    category
dtype: object

In [None]:
# Correlation matrix using code found on https://stanford.edu/~mwaskom/software/seaborn/examples/many_pairwise_correlations.html
sns.set(style="white")

# Pairwise correlation of all continuous features (training rows only)
corr = X_train[continuous_columns].corr()

# Mask the upper triangle (diagonal included) so each pair is drawn once.
# The builtin `bool` is used because the `np.bool` alias was removed in NumPy 1.24.
mask = np.triu(np.ones_like(corr, dtype=bool))

# Set up the matplotlib figure
f, ax = plt.subplots(figsize=(20, 20))

# Draw the heatmap with the mask and correct aspect ratio
sns.heatmap(corr, annot=True, mask=mask, vmax=1, vmin=-1,
            square=True, xticklabels=True, yticklabels=True,
            linewidths=.5, cbar_kws={"shrink": .5}, ax=ax)
plt.yticks(rotation = 0)
plt.xticks(rotation = 90)

In [None]:
# Correlation of each continuous feature with the target, keyed by feature name
corr_dict = {}

# One scatter plot per continuous feature against the target
for column in continuous_columns:
    # feature and target side by side in a temporary frame
    df_temp = pd.concat([X_train[column], y_train], axis="columns")
    # off-diagonal entry of the 2x2 correlation matrix
    correlation = df_temp[[column, "journey_time"]].corr().iloc[0, 1]
    # scatter of feature vs target, labelled with the correlation value
    df_temp.plot.scatter(x=column, y="journey_time", label=f"{correlation:.3f}")
    corr_dict[column] = correlation

# Correlations sorted from most positive to most negative to aid interpretation
corr_df = (
    pd.DataFrame.from_dict(corr_dict, orient='index', columns=['journey_time'])
    .sort_values(by='journey_time', ascending=False)
)
corr_df

In [13]:
# Columns removed from the frame before each modelling attempt below.
low_information_gain = (
    # weather readings
    ["rain", "wind_dir", "cloud_cover", "wind_speed", "pressure", "humidity", "sun"]
    # trip / stop bookkeeping columns
    + ["PROGRNUMBER", "ACTUALTIME_DEP", "ACTUALTIME_ARR", "dwell_time",
       "prev_stop_id", "prev_progrnumber", "DAYOFSERVICE", "DIRECTION"]
)

In [None]:
# Distribution of journey_time per dayofweek value.
# No plt.figure() call: df.boxplot(..., figsize=...) opens its own figure, so a
# preceding plt.figure() would only leave an extra empty figure in the output.
flierprops = dict(marker='o', markerfacecolor='green', markersize=6,
                  linestyle='none')
df.boxplot(column=['journey_time'], by=['dayofweek'], flierprops=flierprops, figsize=(10,7))

In [None]:
# Distribution of journey_time per hour value.
# No plt.figure() call: df.boxplot(..., figsize=...) opens its own figure, so a
# preceding plt.figure() would only leave an extra empty figure in the output.
flierprops = dict(marker='o', markerfacecolor='green', markersize=6,
                  linestyle='none')
df.boxplot(column=['journey_time'], by=['hour'], flierprops=flierprops, figsize=(10,7))

### First attempt using one-hot encoding for the segment IDs
This results in a very large number of columns, because every distinct segment ID becomes its own indicator column.

In [14]:
# Working frame for the first attempt: df without the low value features.
# drop() returns a new frame, so df itself is left untouched; `columns=` is used
# because a positional axis argument is no longer accepted by pandas 2.x.
df_rev1 = df.drop(columns=low_information_gain)

In [15]:
# One-hot encode the object/category columns (segment_id among them).
df_rev1 = pd.get_dummies(data=df_rev1)
df_rev1.dtypes

journey_time              int64
prev_dept_time            int64
temp                    float64
visibility                int64
cloud_height              int64
                         ...   
segment_id_7287-7208      uint8
segment_id_7288-7286      uint8
segment_id_7289-7280      uint8
segment_id_7395-6124      uint8
segment_id_7564-4521      uint8
Length: 217, dtype: object

In [None]:
# y is the target
y = df_rev1["journey_time"]
# X is everything else (`columns=` because the positional axis argument to
# drop() is no longer accepted by pandas 2.x)
X = df_rev1.drop(columns=["journey_time"])
# Split the dataset into two datasets: 70% training and 30% test
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1,  test_size=0.3)

print("original range is: ",df_rev1.shape[0])
print("training range (70%):\t rows 0 to", round(X_train.shape[0]))
print("test range (30%): \t rows", round(X_train.shape[0]), "to", round(X_train.shape[0]) + X_test.shape[0])

In [None]:
# Preview the first five training rows of the features and of the target.
print("\nDescriptive features in X:\n", X_train.head())
print("\nTarget feature in y:\n", y_train.head())

In [None]:
X_train.head(5)

In [None]:
# Give every split a fresh 0..n-1 index so predictions (which come back as
# positional arrays) can be concatenated next to the actual values.
X_train = X_train.reset_index(drop=True)
y_train = y_train.reset_index(drop=True)
X_test = X_test.reset_index(drop=True)
y_test = y_test.reset_index(drop=True)
X_train.head()

In [None]:
# Fit a linear regression on every remaining column of the training split
# (continuous features plus the one-hot encoded categorical ones).
multiple_linreg = LinearRegression()
multiple_linreg = multiple_linreg.fit(X_train, y_train)

In [None]:
# Show the learned weights: feature names, raw coefficient array, intercept,
# and finally each feature paired with its coefficient.
print("\nFeatures are: \n", X_train.columns)
print("\nCoeficients are: \n", multiple_linreg.coef_)
print("\nIntercept is: \n", multiple_linreg.intercept_)
print(
    "\nFeatures and coeficients: \n",
    [(feature, weight) for feature, weight in zip(X_train.columns, multiple_linreg.coef_)],
)

In [None]:
# Predict on the training split and show actual vs predicted side by side.
multiple_linreg_predictions_train = multiple_linreg.predict(X_train)

print("\nPredictions with multiple linear regression: \n")
actual_vs_predicted_multiplelinreg = pd.concat(
    [
        y_train,
        pd.DataFrame(multiple_linreg_predictions_train, columns=['Predicted']),
    ],
    axis="columns",
)
print(actual_vs_predicted_multiplelinreg.head(100))

In [None]:
# Shared helper used to report the regression metrics for a set of predictions.
def printMetrics(testActualVal, predictions):
    """Print MAE, RMSE and R2 for `predictions` against `testActualVal`.

    Both arguments are passed unchanged to the sklearn.metrics functions.
    Nothing is returned; the values are only printed.
    """
    print('\n==============================================================================')
    # Each entry is evaluated lazily, right before it is printed.
    report = (
        ("MAE: ", lambda: metrics.mean_absolute_error(testActualVal, predictions)),
        # RMSE is the square root of the mean squared error
        ("RMSE: ", lambda: metrics.mean_squared_error(testActualVal, predictions)**0.5),
        ("R2: ", lambda: metrics.r2_score(testActualVal, predictions)),
    )
    for label, compute in report:
        print(label, compute())

In [None]:
printMetrics(y_train, multiple_linreg_predictions_train)

In [None]:
# Predict on the held-out test split and show actual vs predicted side by side.
# NOTE(review): despite the `_train` suffix this variable now holds TEST
# predictions; the name is kept because the following cell reads it.
multiple_linreg_predictions_train = multiple_linreg.predict(X_test)

print("\nPredictions with multiple linear regression: \n")
actual_vs_predicted_multiplelinreg = pd.concat(
    [
        y_test,
        pd.DataFrame(multiple_linreg_predictions_train, columns=['Predicted']),
    ],
    axis="columns",
)
print(actual_vs_predicted_multiplelinreg.head(100))

In [None]:
printMetrics(y_test, multiple_linreg_predictions_train)

In [None]:
# 5-fold cross-validated MAE on the full data set. The scorer returns the
# negated error, so the sign is flipped back to positive.
scores = np.negative(
    cross_val_score(LinearRegression(), X, y, scoring='neg_mean_absolute_error', cv=5)
)
scores

In [None]:
# 5-fold cross-validation reporting several metrics at once.
metrics_list = ['neg_mean_absolute_error', 'neg_mean_squared_error', 'r2']
# `scoring` must receive the list of scorer names; passing the sklearn
# `metrics` module here (as before) is not a valid scoring argument.
scores = cross_validate(LinearRegression(), X, y, scoring=metrics_list, cv=5)
scores

### Second attempt by creating individual models for each segment
This means a large number of models will need to be created and stored

In [None]:
# Fit one LinearRegression per segment and collect, for each segment,
# the training-set MAE/RMSE/R2 plus the mean 5-fold cross-validated scores.

# drop low value features (drop returns a new frame, df itself is untouched;
# `columns=` because the positional axis argument is rejected by pandas 2.x)
df_rev2 = df.drop(columns=low_information_gain)
metrics_list = ['neg_mean_absolute_error', 'neg_mean_squared_error', 'r2']
results_dict = {"MAE": [],
                "RMSE": [],
                "R2": [],
                "cv_neg_mean_absolute_error": [],
                "cv_neg_mean_squared_error": [],
                "cv_r2": []}

segments = df.segment_id.unique()

for i, seg in enumerate(segments):
    print(i, seg)
    # Boolean indexing already yields a new frame, so the whole of df_rev2
    # does not need to be copied on every iteration.
    seg_df = df_rev2[df_rev2["segment_id"] == seg].drop(columns=["segment_id"])
    seg_df = pd.get_dummies(seg_df)

    # y is the target
    y = seg_df["journey_time"]
    # X is everything else
    X = seg_df.drop(columns=["journey_time"])
    # Split the dataset into two datasets: 70% training and 30% test
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1,  test_size=0.3)

    # reset the index so the actual values line up with the positional
    # prediction array when concatenated below
    X_train = X_train.reset_index(drop=True)
    y_train = y_train.reset_index(drop=True)
    X_test = X_test.reset_index(drop=True)
    y_test = y_test.reset_index(drop=True)

    # Train aka fit, a model using all continuous and categorical features.
    multiple_linreg = LinearRegression().fit(X_train, y_train)
    multiple_linreg_predictions_train = multiple_linreg.predict(X_train)

    # NOTE: these three are measured on the TRAINING split
    results_dict["MAE"].append(metrics.mean_absolute_error(y_train, multiple_linreg_predictions_train))
    results_dict["RMSE"].append(metrics.mean_squared_error(y_train, multiple_linreg_predictions_train)**0.5)
    results_dict["R2"].append(metrics.r2_score(y_train, multiple_linreg_predictions_train))

    actual_vs_predicted_multiplelinreg = pd.concat([y_train, pd.DataFrame(multiple_linreg_predictions_train, columns=['Predicted'])], axis=1)
    print(actual_vs_predicted_multiplelinreg.head(10))
    # cross-validation runs on the segment's full X/y, independent of the split above
    scores = cross_validate(LinearRegression(), X, y, scoring=metrics_list, cv=5)
    for metric in metrics_list:
        print( metric, np.average(scores["test_" + metric]) )
        results_dict["cv_" + metric].append(np.average(scores["test_" + metric]))
    print("================================================")


In [None]:
# Collapse each per-segment list of scores into its mean across segments.
for m in list(results_dict):
    results_dict[m] = np.mean(results_dict[m])
results_dict

In [None]:
import xgboost as xgb

# Quick XGBoost check; score() reports R^2 on the test split.
# NOTE(review): X_train/X_test/y_train/y_test are whatever the previous cell
# left behind, presumably the split for the last segment of the loop. Confirm
# that this is the intended data.
model = xgb.XGBRegressor(random_state=42, objective="reg:squarederror")
model.fit(X_train, y_train)
model.score(X_test, y_test)

In [None]:
# Fit an XGBoost regressor and compare its error on the train and test splits.
model = xgb.XGBRegressor(objective="reg:squarederror", random_state=42)

# fit the model with the training data
model.fit(X_train, y_train)

# predict the target on the train dataset
predict_train = model.predict(X_train)
print('\nTarget on train data', predict_train)

# accuracy_score is a classification metric and raises on continuous
# predictions, so regression metrics are reported instead.
mae_train = metrics.mean_absolute_error(y_train, predict_train)
r2_train = metrics.r2_score(y_train, predict_train)
print('\nMAE on train dataset : ', mae_train)
print('R2 on train dataset : ', r2_train)

# predict the target on the test dataset
predict_test = model.predict(X_test)
print('\nTarget on test data', predict_test)

mae_test = metrics.mean_absolute_error(y_test, predict_test)
r2_test = metrics.r2_score(y_test, predict_test)
print('\nMAE on test dataset : ', mae_test)
print('R2 on test dataset : ', r2_test)

In [None]:
# Fit one XGBRegressor per segment and collect, for each segment,
# the training-set MAE/RMSE/R2 plus the mean 5-fold cross-validated scores.

# drop low value features (drop returns a new frame, df itself is untouched;
# `columns=` because the positional axis argument is rejected by pandas 2.x)
df_rev2 = df.drop(columns=low_information_gain)
metrics_list = ['neg_mean_absolute_error', 'neg_mean_squared_error', 'r2']
results_dict = {"MAE": [],
                "RMSE": [],
                "R2": [],
                "cv_neg_mean_absolute_error": [],
                "cv_neg_mean_squared_error": [],
                "cv_r2": []}

segments = df.segment_id.unique()

for i, seg in enumerate(segments):
    print(i, seg)
    # Boolean indexing already yields a new frame, so the whole of df_rev2
    # does not need to be copied on every iteration.
    seg_df = df_rev2[df_rev2["segment_id"] == seg].drop(columns=["segment_id"])
    seg_df = pd.get_dummies(seg_df)

    # y is the target
    y = seg_df["journey_time"]
    # X is everything else
    X = seg_df.drop(columns=["journey_time"])
    # Split the dataset into two datasets: 70% training and 30% test
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1,  test_size=0.3)

    # reset the index so the actual values line up with the positional
    # prediction array when concatenated below
    X_train = X_train.reset_index(drop=True)
    y_train = y_train.reset_index(drop=True)
    X_test = X_test.reset_index(drop=True)
    y_test = y_test.reset_index(drop=True)

    # Train aka fit, a model using all continuous and categorical features.
    # (variable names are inherited from the linear-regression cell; the
    # estimator here is XGBoost)
    multiple_linreg = xgb.XGBRegressor(objective="reg:squarederror", random_state=42).fit(X_train, y_train)
    multiple_linreg_predictions_train = multiple_linreg.predict(X_train)

    # NOTE: these three are measured on the TRAINING split
    results_dict["MAE"].append(metrics.mean_absolute_error(y_train, multiple_linreg_predictions_train))
    results_dict["RMSE"].append(metrics.mean_squared_error(y_train, multiple_linreg_predictions_train)**0.5)
    results_dict["R2"].append(metrics.r2_score(y_train, multiple_linreg_predictions_train))

    actual_vs_predicted_multiplelinreg = pd.concat([y_train, pd.DataFrame(multiple_linreg_predictions_train, columns=['Predicted'])], axis=1)
    print(actual_vs_predicted_multiplelinreg.head(10))
    # cross-validation runs on the segment's full X/y, independent of the split above
    scores = cross_validate(xgb.XGBRegressor(objective="reg:squarederror", random_state=42), X, y, scoring=metrics_list, cv=5)
    for metric in metrics_list:
        print( metric, np.average(scores["test_" + metric]) )
        results_dict["cv_" + metric].append(np.average(scores["test_" + metric]))
    print("================================================")

In [None]:
# Collapse each per-segment list of scores into its mean across segments.
for m in list(results_dict):
    results_dict[m] = np.mean(results_dict[m])
results_dict

In [None]:
results_dict

In [None]:
from sklearn.neural_network import MLPRegressor

# Fit one MLPRegressor per segment and collect, for each segment,
# the training-set MAE/RMSE/R2 plus the mean 5-fold cross-validated scores.

# drop low value features (drop returns a new frame, df itself is untouched;
# `columns=` because the positional axis argument is rejected by pandas 2.x)
df_rev2 = df.drop(columns=low_information_gain)
metrics_list = ['neg_mean_absolute_error', 'neg_mean_squared_error', 'r2']
results_dict = {"MAE": [],
                "RMSE": [],
                "R2": [],
                "cv_neg_mean_absolute_error": [],
                "cv_neg_mean_squared_error": [],
                "cv_r2": []}

segments = df.segment_id.unique()

for i, seg in enumerate(segments):
    print(i, seg)
    # Boolean indexing already yields a new frame, so the whole of df_rev2
    # does not need to be copied on every iteration.
    seg_df = df_rev2[df_rev2["segment_id"] == seg].drop(columns=["segment_id"])
    seg_df = pd.get_dummies(seg_df)

    # y is the target
    y = seg_df["journey_time"]
    # X is everything else
    X = seg_df.drop(columns=["journey_time"])
    # Split the dataset into two datasets: 70% training and 30% test
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1,  test_size=0.3)

    # reset the index so the actual values line up with the positional
    # prediction array when concatenated below
    X_train = X_train.reset_index(drop=True)
    y_train = y_train.reset_index(drop=True)
    X_test = X_test.reset_index(drop=True)
    y_test = y_test.reset_index(drop=True)

    # Train aka fit, a model using all continuous and categorical features.
    # (variable names are inherited from the linear-regression cell; the
    # estimator here is an MLP)
    multiple_linreg = MLPRegressor(random_state=1, max_iter=300).fit(X_train, y_train)
    multiple_linreg_predictions_train = multiple_linreg.predict(X_train)

    # NOTE: these three are measured on the TRAINING split
    results_dict["MAE"].append(metrics.mean_absolute_error(y_train, multiple_linreg_predictions_train))
    results_dict["RMSE"].append(metrics.mean_squared_error(y_train, multiple_linreg_predictions_train)**0.5)
    results_dict["R2"].append(metrics.r2_score(y_train, multiple_linreg_predictions_train))

    actual_vs_predicted_multiplelinreg = pd.concat([y_train, pd.DataFrame(multiple_linreg_predictions_train, columns=['Predicted'])], axis=1)
    print(actual_vs_predicted_multiplelinreg.head(10))
    # cross-validation runs on the segment's full X/y, independent of the split above
    scores = cross_validate(MLPRegressor(random_state=1, max_iter=300), X, y, scoring=metrics_list, cv=5)
    for metric in metrics_list:
        print( metric, np.average(scores["test_" + metric]) )
        results_dict["cv_" + metric].append(np.average(scores["test_" + metric]))
    print("================================================")

In [None]:
import xgboost as xgb

In [None]:
import xgboost as xgb

# Train the final per-segment models on ALL rows of each segment (no hold-out)
# and keep them in a dict keyed by segment id.
# The metrics_list/results_dict re-initialisation copied from the evaluation
# cells was removed: nothing in this cell reads them, and resetting
# results_dict here discarded the scores gathered by the previous loop.

# drop low value features (drop returns a new frame, df itself is untouched;
# `columns=` because the positional axis argument is rejected by pandas 2.x)
df_rev2 = df.drop(columns=low_information_gain)

segment_models = {}

segments = df.segment_id.unique()

for i, seg in enumerate(segments):
    print(i, seg)
    # Boolean indexing already yields a new frame, so the whole of df_rev2
    # does not need to be copied on every iteration.
    seg_df = df_rev2[df_rev2["segment_id"] == seg].drop(columns=["segment_id"])
    seg_df = pd.get_dummies(seg_df)

    # y is the target
    y = seg_df["journey_time"]
    # X is everything else
    X = seg_df.drop(columns=["journey_time"])

    # Train aka fit, a model using all continuous and categorical features.
    segment_models[seg] = xgb.XGBRegressor(objective="reg:squarederror", random_state=42).fit(X, y)

In [None]:
segment_models['4436-5008']

In [None]:
import pickle


In [None]:
# Write every trained segment model to its own pickle file named after the segment id.
for seg, fitted_model in segment_models.items():
    with open(f'C:/Users/cls15/Google Drive/Comp Sci/Research Practicum/Code/dublin-bus-app/DataAnalytics/Conor/pickels/{seg}.pickle', 'wb') as f:
        # Serialise the model with the highest pickle protocol available.
        pickle.dump(fitted_model, f, protocol=pickle.HIGHEST_PROTOCOL)

In [None]:
# Read every pickle back as a round-trip check. `data` is overwritten on each
# pass, so after the loop it holds only the model of the last segment.
for seg in segment_models.keys():
    pickle_path = f'C:/Users/cls15/Google Drive/Comp Sci/Research Practicum/Code/dublin-bus-app/DataAnalytics/Conor/pickels/{seg}.pickle'
    with open(pickle_path, 'rb') as f:
        data = pickle.load(f)

In [None]:
# Check a pickled model by predicting for one segment.
seg = '4436-5008'
# Boolean indexing yields a new frame; `columns=` because the positional axis
# argument to drop() is rejected by pandas 2.x.
seg_df = df_rev2[df_rev2["segment_id"] == seg].drop(columns=["segment_id"])
seg_df = pd.get_dummies(seg_df)

# y is the target
y = seg_df["journey_time"]
# X is everything else
X = seg_df.drop(columns=["journey_time"])

# Load this segment's own model. Previously `data` was whatever the load loop
# left behind (the model of the last segment), which only matches `seg` by
# coincidence.
with open(f'C:/Users/cls15/Google Drive/Comp Sci/Research Practicum/Code/dublin-bus-app/DataAnalytics/Conor/pickels/{seg}.pickle', 'rb') as f:
    data = pickle.load(f)

data.predict(X)

In [28]:
def get_prediction(segment, x, model_dir='C:/Users/cls15/Google Drive/Comp Sci/Research Practicum/Code/dublin-bus-app/DataAnalytics/Conor/pickels'):
    """Load the pickled model for `segment` and return its predictions for `x`.

    Parameters
    ----------
    segment : segment id; the model is read from `<model_dir>/<segment>.pickle`.
    x : feature rows, passed unchanged to the model's `predict` method.
    model_dir : directory holding the per-segment pickle files. Defaults to the
        location the models were originally written to, so existing
        two-argument calls behave as before.

    Returns
    -------
    Whatever the loaded model's `predict(x)` returns.

    Raises
    ------
    FileNotFoundError if no pickle exists for `segment` in `model_dir`.
    """
    # NOTE: unpickling can execute arbitrary code; only point this at pickle
    # files produced by this project.
    with open(f'{model_dir}/{segment}.pickle', 'rb') as f:
        model = pickle.load(f)
    # The file is only needed for loading, so predict after it is closed.
    return model.predict(x)

# Predict a single row for one segment through get_prediction.
# The former leading `data = pd.get_dummies(seg_df)` line was removed: it read
# a `seg_df` left over from an earlier cell and its result was never used.
seg = '4436-5008'
# `columns=` because the positional axis argument to drop() is rejected by pandas 2.x
seg_df = df_rev2[df_rev2["segment_id"] == seg].drop(columns=["segment_id"])
seg_df = pd.get_dummies(seg_df)
X = seg_df.drop(columns=["journey_time"])
get_prediction(seg, X.head(1))

# Predict journey time for every row, segment by segment, using the pickled models.
# drop() returns a new frame; `columns=` because the positional axis argument
# is rejected by pandas 2.x.
df_rev2 = df.drop(columns=low_information_gain)
predictions = {}
for seg in df.segment_id.unique():
    # Boolean indexing already yields a new frame, so df_rev2 is not copied
    # in full on every iteration.
    seg_df = df_rev2[df_rev2["segment_id"] == seg].drop(columns=["segment_id"])
    seg_df = pd.get_dummies(seg_df)

    # X is everything except the target (the target itself is not needed here)
    X = seg_df.drop(columns=["journey_time"])
    # Keep the original row labels so the predictions can be joined back later.
    predictions[seg] = pd.Series(get_prediction(seg, X), index=X.index, name="predicted_journey")

# One Series covering all segments, indexed by the original row labels
result = pd.concat(list(predictions.values()))

# Attach the predictions to the modelling frame. `result` is a Series named
# "predicted_journey" indexed by the original row labels, so join() aligns it.
# No placeholder "predicted_journey" column is created first: joining onto a
# frame that already has a column of that name raises ValueError
# (overlapping columns without a suffix).
df_predictions = df_rev2.join(result)

# Add back two of the dropped columns for display next to the predictions.
df_predictions.join(df[["DAYOFSERVICE","PROGRNUMBER"]])

Unnamed: 0,hour,dayofweek,journey_time,prev_dept_time,segment_id,temp,visibility,cloud_height,holiday,predicted_journey,DAYOFSERVICE,PROGRNUMBER
0,9,0,130,34667,7564-4521,4.6,30000,999,1,145.713287,736695,2
1,9,0,77,34810,4521-1283,4.6,30000,999,1,109.635406,736695,3
2,9,0,39,34887,1283-4456,4.6,30000,999,1,43.710220,736695,4
3,9,0,22,34926,4456-1284,4.6,30000,999,1,31.118746,736695,5
4,9,0,52,34957,1284-1285,4.6,30000,999,1,64.406311,736695,6
...,...,...,...,...,...,...,...,...,...,...,...,...
738657,6,0,40,23153,4027-7250,9.3,30000,25,0,45.694683,737059,75
738658,6,0,145,23204,7250-7248,9.3,30000,25,0,152.858292,737059,76
738659,6,0,69,23349,7248-7207,9.3,30000,25,0,69.631256,737059,77
738660,6,0,102,23418,7207-7288,9.3,30000,25,0,98.871460,737059,78
