# Traffic Management Main File

    This notebook consists of
        1) Part 1 - Validation - the recorded train/test performance, obtained with Time Series Validation that splits the data into consecutive windows of days
        2) Part 2 - How to Use the Model for your Testing - Validation Process - a tutorial on how to use the model.
        3) Part 3 - Your Own Dataset - where you can run testing and validation on your own dataset, following the tutorial shown in Part 2

The necessary classes are shown below

In [1]:
# importing necessary libraries

import pandas as pd
import numpy as np
# import geohash
from itertools import combinations
# from sklearn.neighbors import NearestNeighbors
import time

# from sklearn.linear_model import Lasso
# from sklearn.linear_model import LinearRegression

from xgboost import XGBRegressor
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error as mse


class TrainTestPreprocessor:
    """
    Turn raw demand records into mean-encoded train / test feature matrices.

    The input is two pandas dataframes with features:

        geohash6 : object
        day: int with the starting value from 1
        timestamp: str in the form "hour:minute" (for example "20:0" or "5:45")

    and one target column:

        demand: normalised / scaled (float) value ranging from 0 to 1

    After ``preprocessing()`` has run, the attributes ``X_train``, ``y_train``,
    ``X_test`` and ``y_test`` hold the results: missing
    (geohash6, day, hour, minute) combinations are added with demand = 0,
    coupled features are built, and every feature is mean encoded.
    ``training_df`` and ``test_df`` are rebound to the fully processed frames;
    the dataframes passed in by the caller are left untouched.

    Example Usage:

    >>> train_test_preprocessor = TrainTestPreprocessor(training_df = testing_df, test_df = validation_df)
    >>> train_test_preprocessor.preprocessing()

    >>> X_test = train_test_preprocessor.X_train
    >>> y_test = train_test_preprocessor.y_train
    >>> X_val = train_test_preprocessor.X_test
    >>> y_val = train_test_preprocessor.y_test
    """

    # Features that are combined and mean encoded.
    _ENCODED_FEATURES = ['geohash6', 'day_of_week', 'hour', 'minute']
    # Raw columns that are dropped from the final feature matrices.
    _RAW_FEATURES = ["geohash6", "day", "hour", "minute", "day_of_week"]
    # Columns whose observed values span the grid of rows to be filled.
    _GRID_COLUMNS = ["geohash6", "day", "hour", "minute"]

    def __init__(self, training_df, test_df):
        self.training_df = training_df
        self.test_df = test_df
        self.X_train = None
        self.X_test = None
        self.y_train = None
        self.y_test = None

    def preprocessing(self):
        """
        Build ``X_train``, ``y_train``, ``X_test`` and ``y_test``.

        Takes no arguments and returns nothing; results are stored on the
        instance. The steps are:

        1) feature engineering of the original features: ``timestamp`` (str)
           is converted into ``hour`` (int) and ``minute`` (int).
        2) filling missing demand data that is actually 0 in value: every
           combination of the observed geohash6, day, hour and minute values
           that has no record gets a row with demand = 0. ``day_of_week``
           (1..7) is then derived from ``day``.
        3) mean encoding: for every combination of one, two or three of
           geohash6, day_of_week, hour and minute, the mean training demand of
           each group is added as a column named
           ``"<feature>-<feature>(encoded)"``.

        The encodings are computed from the training frame only and then
        joined onto the test frame, so a test row whose group never occurs in
        the training data gets NaN in that encoded column.
        """
        training = self._prepare(self.training_df)
        test = self._prepare(self.test_df)

        # Combination sizes 1 .. len(features) - 1; the full four-way
        # combination is deliberately not encoded (as in the original code).
        for size in range(1, len(self._ENCODED_FEATURES)):
            for comb in combinations(self._ENCODED_FEATURES, size):
                keys = list(comb)
                encoded_name = "-".join(keys) + "(encoded)"
                encoding = (
                    training.groupby(keys, as_index = False)["demand"].mean()
                    .rename(columns = {"demand": encoded_name})
                )
                # The only columns shared with `encoding` are `keys`, so the
                # default merge joins on exactly those.
                training = training.merge(encoding, how = "left")
                test = test.merge(encoding, how = "left")

        self.training_df = training
        self.test_df = test

        dropped = ["demand"] + self._RAW_FEATURES
        self.X_train = training.drop(columns = dropped)
        self.y_train = training["demand"].copy()
        self.X_test = test.drop(columns = dropped)
        self.y_test = test["demand"].copy()

    @classmethod
    def _prepare(cls, frame):
        """Run steps 1 and 2 on one frame and add the day_of_week column."""
        filled = cls._fill_missing_demand(cls._split_timestamp(frame))
        # Maps day 1..7 to 1..7, day 8 to 1, and so on (a multiple of 7 gives 7).
        filled["day_of_week"] = ((filled["day"] - 1) % 7) + 1
        return filled

    @staticmethod
    def _split_timestamp(frame):
        """Return a new frame with timestamp replaced by int hour / minute."""
        # `assign` works on a copy, so the caller's dataframe is not modified.
        with_time = frame.assign(
            hour = frame["timestamp"].apply(lambda stamp: int(stamp.split(":")[0])),
            minute = frame["timestamp"].apply(lambda stamp: int(stamp.split(":")[1])),
        )
        return with_time[["geohash6", "day", "hour", "minute", "demand"]]

    @classmethod
    def _fill_missing_demand(cls, frame):
        """Add a demand = 0 row for every unobserved grid combination."""
        grid = None
        for column in cls._GRID_COLUMNS:
            levels = frame[[column]].drop_duplicates()
            # A constant helper key turns the merge into a cross join.
            levels.insert(0, "key", 0)
            grid = levels if grid is None else grid.merge(levels, how = "outer")
        # Combinations without a record come out of the left join as NaN.
        return grid.drop(columns = ["key"]).merge(frame, how = "left").fillna(0)

class Model:
    """
    XGBRegressor wrapper with a fixed set of hyperparameters
    (described by the author as Bayesian optimised).

    Usage:
    >>> import numpy as np
    >>> regressor = Model()
    >>> regressor.fit(X_train, y_train)
    >>> y_pred = regressor.predict(X_test)
    >>> print("root mean square = {}".format(np.sqrt(mse(y_test, y_pred))))
    root mean square = 0.041972593025907085
    """

    def __init__(self):
        # Hyperparameters are hard-coded; they are not configurable per instance.
        tuned_params = {
            "colsample_bytree": 0.4,
            "gamma": 0.21756976616440335,
            "min_child_weight": 10.0,
            "learning_rate": 0.05358746065589267,
            "max_depth": 4,
            "reg_alpha": 0.6870315939145919,
            "reg_lambda": 0.24937700167665464,
            "subsample": 0.845934912033431,
        }
        self.regressor = XGBRegressor(**tuned_params)

    def fit(self, X, y):
        """Fit the wrapped regressor on features X and target y; returns None."""
        self.regressor.fit(X, y)

    def predict(self, X):
        """Return the wrapped regressor's predictions for X."""
        return self.regressor.predict(X)

    def rms(self, X_test, y_test):
        """Return the root mean squared error of the predictions for X_test against y_test."""
        squared_error = mse(y_test, self.regressor.predict(X_test))
        return np.sqrt(squared_error)

                
class timeSeriesValidator:
    """
    Sliding-window (time series) validation of ``Model`` on a demand dataset.

    For every start day ``i`` between ``minimum`` and ``maximum`` that leaves
    room for a complete window, the model is fitted on ``training_window``
    consecutive days starting at ``i`` and evaluated on the ``test_window``
    days that follow. With the defaults this is 14 training days followed by
    5 test days.

    Usage:

    >>> dataset = pd.read_csv("training.csv")
    >>> score = timeSeriesValidator(dataset = dataset, minimum = 1, maximum = 60)
    >>> result_performance = score.performance()
    """

    def __init__(self, dataset, minimum = 1, maximum = 60, training_window = 14, test_window = 5):
        """
        dataset: pandas DataFrame with the columns expected by
                 TrainTestPreprocessor (geohash6, day, timestamp, demand)
        minimum: first day that may start a training window
        maximum: last day that may be used (inclusive)
        training_window: number of consecutive training days per window
        test_window: number of consecutive test days per window

        Raises ValueError when a window length is smaller than 1.
        """
        if training_window < 1 or test_window < 1:
            raise ValueError("training_window and test_window must be at least 1")
        self.minimum = minimum
        self.maximum = maximum
        self.training_window = training_window
        self.test_window = test_window
        self.performance_results = {}
        self.dataset = dataset

    def _day_windows(self):
        """Yield (start_day, training_days, test_days) for every complete window."""
        span = self.training_window + self.test_window
        # The last usable start day is the one whose test window ends on `maximum`.
        for first_day in range(self.minimum, self.maximum - span + 2):
            split_day = first_day + self.training_window
            yield (
                first_day,
                list(range(first_day, split_day)),
                list(range(split_day, first_day + span)),
            )

    def performance(self):
        """
        Run the validation and return the collected metrics.

        Prints the days and the r2 / mse scores of every window, then the
        total run time. Returns ``self.performance_results``, a dict with the
        keys "mse_train", "mse_test", "r2train" and "r2test", each holding one
        value per evaluated window. The dict lives on the instance, so calling
        this method again appends to the existing lists.
        """
        start = time.time()

        # Index the rows by day so that whole days can be picked with .loc.
        # The index is renamed to "id" so it does not share the name of the
        # "day" column, which is still needed as a regular column.
        datasetcopy = self.dataset.copy()
        datasetcopy.index = datasetcopy["day"]
        datasetcopy.index = datasetcopy.index.set_names(["id"])

        for i, training_index, test_index in self._day_windows():
            print("{})with indexes given below".format(i))
            print("="*60)
            print("training = {}".format(training_index))
            print("test = {}".format(test_index))

            training_set = datasetcopy.loc[training_index,:]
            test_set = datasetcopy.loc[test_index,:]

            # preprocessing the training and testing data
            # including filling missing demand (by adding demand = 0),
            # adding coupling features, and mean encoding
            train_test_preprocessor = TrainTestPreprocessor(training_set, test_set)
            train_test_preprocessor.preprocessing()
            X_train = train_test_preprocessor.X_train
            X_test = train_test_preprocessor.X_test
            y_train = train_test_preprocessor.y_train
            y_test = train_test_preprocessor.y_test

            model = Model()
            model.fit(X_train.values, y_train.values.reshape(-1,1))

            y_pred = model.predict(X_test.values)
            r2test = r2_score(y_test.values.reshape(-1,1), y_pred)
            mse_test = mse(y_test.values.reshape(-1,1), y_pred)
            print("test r2_score = {}".format(r2test))
            print("mse test = {}".format(mse_test))

            y_train_pred = model.predict(X_train.values)
            r2train = r2_score(y_train.values.reshape(-1,1), y_train_pred)
            mse_train = mse(y_train.values.reshape(-1,1), y_train_pred)
            print("training r2_score = {}".format(r2train))
            print("mse train = {}".format(mse_train))
            print("="*60)

            window_metrics = (
                ("mse_train", mse_train),
                ("mse_test", mse_test),
                ("r2train", r2train),
                ("r2test", r2test),
            )
            for metric_name, metric_value in window_metrics:
                self.performance_results.setdefault(metric_name, []).append(metric_value)

        end = time.time()
        print("time taken {} seconds".format(end - start))
        return self.performance_results

class SampleTestValidation:
    """
    description : split a pandas dataframe into two parts by picking days.

    The input dataframe has features consisting of

    geohash6: (str, object) string of geohashes
    day: (int) ordinal feature of day
    timestamp: (str, object) string of timestamp in the form of hh:mm, hh = hour and mm = minute

    using it by calling

    validation = SampleTestValidation(dataset = df) # df is a pandas dataframe with features above
    training_df, test_df = validation.trainTestByDayPicked(training_day = list(range(42, 55 + 1)),\
                            test_day = list(range(56,60 + 1)))

    """
    def __init__(self, dataset):
        self.dataset = dataset

    def trainTestByDayPicked(self, training_day = None, test_day = None):
        """
        description: pick the rows of the training and test dataset by their `day` value

        # input:

        training_day: list of `day` values for the training part, for example
                      14 integers in increasing order.
                      default (None): list(range(42,55+1))

        test_day: list of `day` values for the test part, for example
                      5 integers in increasing order.
                      default (None): list(range(56,60+1))

        # output:
        (training_df, test_df) = two pandas DataFrames with the same columns as
        the dataset and a fresh 0..n-1 index. Rows are grouped in the order in
        which the days are listed. The stored dataset itself is not modified.
        """
        # None sentinels replace the former mutable list defaults.
        if training_day is None:
            training_day = list(range(42, 55 + 1))
        if test_day is None:
            test_day = list(range(56, 60 + 1))

        # One day-indexed frame serves both selections; drop = False keeps
        # `day` available as a regular column in the results.
        by_day = self.dataset.set_index("day", drop = False)

        training_df = by_day.loc[training_day,:]
        test_df = by_day.loc[test_day,:]

        return training_df.reset_index(drop = True), test_df.reset_index(drop = True)
        

### Part 1 - Validation

The model, with the best-tuned features and hyperparameters, is evaluated below using Time Series Validation on the training data - try running the cells.

Reading the dataset from the CSV file

In [2]:
# Load the full dataset; "training.csv" is read from the current working directory.
dataset = pd.read_csv("training.csv")

In [3]:
# Set up the time series validation over days 1 to 60 (inclusive).
score = timeSeriesValidator(dataset = dataset, minimum = 1, maximum = 60)

In [4]:
# Run the validation: prints the scores of every window and returns a dict of
# per-window metric lists ("mse_train", "mse_test", "r2train", "r2test").
result_performance = score.performance()

1)with indexes given below
training = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14]
test = [15, 16, 17, 18, 19]
test r2_score = 0.836441150124991
mse test = 0.0026662764614753982
training r2_score = 0.9380071236889248
mse train = 0.0009506698110609654
2)with indexes given below
training = [2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]
test = [16, 17, 18, 19, 20]
test r2_score = 0.8263859402958267
mse test = 0.002575917369383597
training r2_score = 0.9439506836700813
mse train = 0.0008740689635174653
3)with indexes given below
training = [3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16]
test = [17, 18, 19, 20, 21]
test r2_score = 0.8224765779737464
mse test = 0.0024324749045046257
training r2_score = 0.9426059533344497
mse train = 0.0009012832533783139
4)with indexes given below
training = [4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17]
test = [18, 19, 20, 21, 22]
test r2_score = 0.8101727766927032
mse test = 0.002453279849600445
training r2_score = 0.9421053906602095
mse train =

test r2_score = 0.9193663362506614
mse test = 0.0015861590731810849
training r2_score = 0.9584662789392465
mse train = 0.0006490961555089359
23)with indexes given below
training = [23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36]
test = [37, 38, 39, 40, 41]
test r2_score = 0.9216879981140038
mse test = 0.0014516661810515033
training r2_score = 0.9585803215702043
mse train = 0.0006586669708066569
24)with indexes given below
training = [24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37]
test = [38, 39, 40, 41, 42]
test r2_score = 0.9169773524933547
mse test = 0.0015608492951038608
training r2_score = 0.9555309268200347
mse train = 0.0007138889900611478
25)with indexes given below
training = [25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38]
test = [39, 40, 41, 42, 43]
test r2_score = 0.8992528331965055
mse test = 0.0018344284439059545
training r2_score = 0.9579071264117461
mse train = 0.0006790636177567926
26)with indexes given below
training = [26, 27, 28, 29, 30, 31, 

In [5]:
# Convert the MSE of every window into an RMSE first, then average the RMSEs.
rmse_train_per_window = np.sqrt(result_performance["mse_train"])
rmse_test_per_window = np.sqrt(result_performance["mse_test"])
average_rmse_train = np.mean(rmse_train_per_window)
average_rmse_test = np.mean(rmse_test_per_window)

train_report = "the train dataset's average root mean square for all of the time series train-testing is {}"
test_report = "the test dataset's average root mean square for all of the time series train-testing is {}"
print(train_report.format(average_rmse_train))
print(test_report.format(average_rmse_test))

the train dataset's average root mean square for all of the time series train-testing is 0.030236219003556015
the test dataset's average root mean square for all of the time series train-testing is 0.0426028202358991


### Part 2 - How to Use the Model for your Testing - Validation Process

    ===================================================================================
    I prepared a testing dataframe and a validation dataframe

In [6]:
# Days 42-55 (inclusive) form the testing part, days 56-60 (inclusive) the validation part.
testing_day = list(range(42, 56))
validation_day = list(range(56, 61))

test_val = SampleTestValidation(dataset = dataset)
testing_df, validation_df = test_val.trainTestByDayPicked(
    training_day = testing_day,
    test_day = validation_day,
)

    Shown below are the original dataset, the input testing dataset (testing_df) and the validation dataset (validation_df)

In [7]:
# first rows of the original, unsplit dataset, shown for comparison

dataset.head()

Unnamed: 0,geohash6,day,timestamp,demand
0,qp03wc,18,20:0,0.020072
1,qp03pn,10,14:30,0.024721
2,qp09sw,9,6:15,0.102821
3,qp0991,32,5:0,0.088755
4,qp090q,15,4:0,0.074468


In [8]:
# testing_df holds the rows of the chosen testing_day values; here days 42 to 55 (inclusive)
testing_df.head()

Unnamed: 0,geohash6,day,timestamp,demand
0,qp03m3,42,12:45,0.012462
1,qp03zn,42,5:45,0.049627
2,qp03z5,42,15:30,0.010035
3,qp03nz,42,5:15,0.221294
4,qp03xd,42,16:45,0.020314


In [9]:
# validation_df holds the rows of the chosen validation_day values; here days 56 to 60 (inclusive)
validation_df.head()

Unnamed: 0,geohash6,day,timestamp,demand
0,qp091w,56,10:0,0.008772
1,qp09de,56,15:30,0.090775
2,qp09gy,56,14:30,0.035722
3,qp0901,56,13:15,0.055648
4,qp09gy,56,11:15,0.062912


    Applying Preprocessing to Our Data

In [10]:
# Preprocess both frames: testing_df takes the "training" slot of the
# preprocessor and validation_df takes the "test" slot.
train_test_preprocessor = TrainTestPreprocessor(
    training_df = testing_df,
    test_df = validation_df,
)
train_test_preprocessor.preprocessing()

# Naming in this tutorial: X_test / y_test are the data the model is fitted on,
# X_val / y_val are the held-out data it is scored on.
X_test, y_test = train_test_preprocessor.X_train, train_test_preprocessor.y_train
X_val, y_val = train_test_preprocessor.X_test, train_test_preprocessor.y_test

    Applying Machine Learning Algorithm

In [11]:
# Fit on the testing part, then score the predictions for the validation part.
regressor = Model()
target_column = y_test.values.reshape(-1,1)
regressor.fit(X_test, target_column)

y_pred = regressor.predict(X_val)
validation_rmse = np.sqrt(mse(y_val, y_pred))
print("root mean square = {}".format(validation_rmse))

root mean square = 0.041972593025907085


### Part 3 - Your Own Dataset