# Split the Training Dataset

In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

## 1. Split into Train and Test datasets
a. 75% training set, 25% test set.

b. 80% training set, 20% test set.

In [3]:
# Load the full dataset from training.csv (path is relative to the working directory).
df = pd.read_csv('training.csv')

In [4]:
# Show the column names, dtypes and memory footprint of the loaded frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4206321 entries, 0 to 4206320
Data columns (total 4 columns):
geohash6     object
day          int64
timestamp    object
demand       float64
dtypes: float64(1), int64(1), object(2)
memory usage: 128.4+ MB


In [5]:
# Features: the first three columns (geohash6, day, timestamp).
X = df.iloc[:, 0:3].values
# Target: the last column (demand), kept 2-D so it can be joined back onto X later.
y = df.iloc[:, [-1]].values

### a. 75% training set, 25% test set.

In [6]:
# Hold out 25% of the rows as the test set. random_state fixes the shuffle so
# the same split is produced on every run. train_test_split comes from the
# notebook's top import cell rather than being re-imported in each section.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=101
)

In [7]:
# Put the target column back next to the features (column-wise join).
train_set = np.hstack((X_train, y_train))

In [8]:
# Wrap the joined array in a DataFrame with the original column names.
train = pd.DataFrame(train_set, columns=['geohash6', 'day', 'timestamp', 'demand'])

In [51]:
# Save the 75% training split as train.csv, with a header row.
# NOTE(review): the destination is an absolute, user-specific path; consider a
# configurable output directory so the notebook runs on other machines.
export_traincsv = train.to_csv(
    r'C:\Users\tengchm\Desktop\DS project\grab\final\dataset\train.csv',
    index=None,
    header=True,
)

In [10]:
# Put the target column back next to the test features (column-wise join).
test_set = np.hstack((X_test, y_test))

In [11]:
# Wrap the joined test array in a DataFrame with the original column names.
test = pd.DataFrame(test_set, columns=['geohash6', 'day', 'timestamp', 'demand'])

In [12]:
# Save the 25% test split as test.csv, with a header row.
export_testcsv = test.to_csv(
    r'C:\Users\tengchm\Desktop\DS project\grab\final\dataset\test.csv',
    index=None,
    header=True,
)

### b. 80% training set, 20% test set.

In [13]:
# Hold out 20% of the rows as the test set, using the same fixed seed so the
# split is reproducible. train_test_split is already in scope from the
# notebook's imports, so it is not re-imported here.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.20, random_state=101
)

In [14]:
# Rebuild the 80% training split as a labelled frame and save it as train08.csv.
train_set = np.hstack((X_train, y_train))
train = pd.DataFrame(train_set, columns=['geohash6', 'day', 'timestamp', 'demand'])
export_traincsv = train.to_csv(
    r'C:\Users\tengchm\Desktop\DS project\grab\final\dataset\train08.csv',
    index=None,
    header=True,
)

In [15]:
# Rebuild the 20% test split as a labelled frame and save it as test02.csv.
test_set = np.hstack((X_test, y_test))
test = pd.DataFrame(test_set, columns=['geohash6', 'day', 'timestamp', 'demand'])
export_testcsv = test.to_csv(
    r'C:\Users\tengchm\Desktop\DS project\grab\final\dataset\test02.csv',
    index=None,
    header=True,
)

___

## 2. Split into Train, Test (first 60 days) and Hold out test (day 61)

* The train and test datasets use the data from day 1 to day 60, with 25% of the rows held out as test data.
* The hold-out dataset uses only the day 61 data.

In [16]:
# Reload training.csv under its own name so this section does not depend on `df`.
traffic = pd.read_csv('training.csv')

In [17]:
# Show the column names, dtypes and memory footprint of the loaded frame.
traffic.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4206321 entries, 0 to 4206320
Data columns (total 4 columns):
geohash6     object
day          int64
timestamp    object
demand       float64
dtypes: float64(1), int64(1), object(2)
memory usage: 128.4+ MB


### Export my own hold out test dataset for day 61

In [18]:
# Keep only the day-61 rows; they form the hold-out set.
traffic_day61 = traffic.loc[traffic['day'] == 61]
traffic_day61.head()

Unnamed: 0,geohash6,day,timestamp,demand
77,qp03xu,61,5:30,0.252223
335,qp09hy,61,1:45,0.017997
368,qp096z,61,14:0,0.917029
382,qp09vv,61,4:0,0.011494
636,qp09cw,61,1:0,0.005531


In [19]:
# Save the day-61 hold-out rows as holdout.csv, with a header row.
export_train_csv = traffic_day61.to_csv(
    r'C:\Users\tengchm\Desktop\DS project\grab\final\dataset\holdout.csv',
    index=None,
    header=True,
)

### Create the train and test dataset

In [21]:
# Everything except day 61 is used for the train/test split.
traffic_t = traffic.loc[traffic['day'] != 61]

In [22]:
# Features: the first three columns (geohash6, day, timestamp).
X = traffic_t.iloc[:, 0:3].values
# Target: the last column (demand), kept 2-D so it can be joined back onto X later.
y = traffic_t.iloc[:, [-1]].values

In [23]:
# Sanity check: (rows, 3) expected for the feature matrix.
X.shape

(4132706, 3)

In [24]:
# Sanity check: same row count as X, with a single target column.
y.shape

(4132706, 1)

In [25]:
# Hold out 25% of the day 1-60 rows as the test set, using a fixed seed so the
# split is reproducible. train_test_split is already in scope from the
# notebook's imports, so it is not re-imported here.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=101
)

In [26]:
# Put the target column back next to the training features (column-wise join).
train_set = np.hstack((X_train, y_train))

In [27]:
# Sanity check: four columns (three features plus the target).
train_set.shape

(3099529, 4)

In [28]:
# Wrap the joined array in a DataFrame; columns are still positional (0-3) here.
train = pd.DataFrame(data=train_set)
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3099529 entries, 0 to 3099528
Data columns (total 4 columns):
0    object
1    object
2    object
3    object
dtypes: object(4)
memory usage: 94.6+ MB


In [29]:
# Restore the original column names, then preview the first rows.
train.columns = ['geohash6', 'day', 'timestamp', 'demand']
train.head()

Unnamed: 0,geohash6,day,timestamp,demand
0,qp09bw,52,12:15,0.0223766
1,qp099y,8,10:45,0.0972637
2,qp03yx,2,0:15,0.0340235
3,qp091q,7,1:15,0.0188847
4,qp09gh,23,1:0,0.0511041


### Export the train data set

In [31]:
# Save the 75% training split of days 1-60 as train60_075.csv, with a header row.
export_train_csv = train.to_csv(
    r'C:\Users\tengchm\Desktop\DS project\grab\final\dataset\train60_075.csv',
    index=None,
    header=True,
)

### Export the test data set

In [32]:
# Put the target column back next to the test features (column-wise join).
test_set = np.hstack((X_test, y_test))

In [33]:
# Sanity check: four columns (three features plus the target).
test_set.shape

(1033177, 4)

In [34]:
# Wrap the joined test array in a DataFrame; columns are still positional (0-3) here.
test = pd.DataFrame(data=test_set)
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1033177 entries, 0 to 1033176
Data columns (total 4 columns):
0    1033177 non-null object
1    1033177 non-null object
2    1033177 non-null object
3    1033177 non-null object
dtypes: object(4)
memory usage: 31.5+ MB


In [35]:
# Restore the original column names, then preview the first rows.
test.columns = ['geohash6', 'day', 'timestamp', 'demand']
test.head()

Unnamed: 0,geohash6,day,timestamp,demand
0,qp09cu,47,9:30,0.0218187
1,qp03mr,19,1:45,0.00178543
2,qp09ds,30,7:45,0.00747842
3,qp093v,37,22:0,0.0136314
4,qp09b0,50,3:0,0.158475


In [36]:
# Save the 25% test split of days 1-60 as test60_025.csv, with a header row.
export_csv = test.to_csv(
    r'C:\Users\tengchm\Desktop\DS project\grab\final\dataset\test60_025.csv',
    index=None,
    header=True,
)

___

## 3. Split into train data with first 60 days, test with day 61 and various timestamps

In [37]:
# Reload training.csv under its own name so this section does not depend on earlier frames.
data = pd.read_csv('training.csv')

**train data with first 60 days**

In [38]:
# Training portion: every row that is not from day 61.
data_day60 = data.loc[data['day'] != 61]

In [39]:
# Save the non-day-61 rows as train60.csv, with a header row.
export_csv = data_day60.to_csv(
    r'C:\Users\tengchm\Desktop\DS project\grab\final\dataset\train60.csv',
    index=None,
    header=True,
)

**test data with day 61**

In [40]:
# Test portion: only the rows from day 61.
data_day61 = data.loc[data['day'] == 61]

In [41]:
# Save the day-61 rows as test61.csv, with a header row.
export_csv = data_day61.to_csv(
    r'C:\Users\tengchm\Desktop\DS project\grab\final\dataset\test61.csv',
    index=None,
    header=True,
)

**test data with day 61, timestamp 0:0**

In [42]:
# Day-61 rows whose timestamp is exactly '0:0' (the first interval of the day).
data_t0 = data.loc[(data['timestamp'] == '0:0') & (data['day'] == 61)]

In [43]:
# Save the day-61, timestamp-0:0 rows as test61_t0.csv, with a header row.
export_csv = data_t0.to_csv(
    r'C:\Users\tengchm\Desktop\DS project\grab\final\dataset\test61_t0.csv',
    index=None,
    header=True,
)

**test data with day 61, timestamp less than or equal to 0:15**

In [44]:
# Select day-61 rows at or before 0:15.
# 'timestamp' holds unpadded 'H:M' strings (e.g. '14:0', '5:30'), so comparing
# the raw strings orders them lexicographically, not chronologically. That
# happens to give the right rows for the '0:15' cut-off but is fragile, so the
# comparison is done on minutes since midnight instead.
hour_minute = data['timestamp'].str.split(':', expand=True).astype(int)
minutes_since_midnight = hour_minute[0] * 60 + hour_minute[1]
data_t15 = data[(data['day']==61) & (minutes_since_midnight <= 15)]

In [45]:
# Save the day-61 rows up to 0:15 as test61_t15.csv, with a header row.
export_csv = data_t15.to_csv(
    r'C:\Users\tengchm\Desktop\DS project\grab\final\dataset\test61_t15.csv',
    index=None,
    header=True,
)

**test data with day 61, timestamp less than 12:00**

In [46]:
# Select day-61 rows at or before 11:45 (i.e. before 12:00).
# 'timestamp' holds unpadded 'H:M' strings (e.g. '14:0', '5:30'). Comparing
# them as strings is lexicographic: '1:0' > '11:45' because ':' sorts after
# '1', so every row from 1:00 through 9:45 was silently excluded. Convert to
# minutes since midnight and compare numerically instead.
hour_minute = data['timestamp'].str.split(':', expand=True).astype(int)
minutes_since_midnight = hour_minute[0] * 60 + hour_minute[1]
data_t1145 = data[(data['day']==61) & (minutes_since_midnight <= 11 * 60 + 45)]

In [47]:
# Save the day-61 rows up to 11:45 as test61_t1145.csv, with a header row.
export_csv = data_t1145.to_csv(
    r'C:\Users\tengchm\Desktop\DS project\grab\final\dataset\test61_t1145.csv',
    index=None,
    header=True,
)

___

## 4. Split into train data with first 47 days, test with last 14 days

In [48]:
# Reload training.csv so this section starts from the full, unfiltered dataset.
df = pd.read_csv('training.csv')

In [49]:
# Training portion: days 1 through 47, saved as train47.csv with a header row.
data_train47 = df.loc[df['day'] <= 47]
export_csv = data_train47.to_csv(
    r'C:\Users\tengchm\Desktop\DS project\grab\final\dataset\train47.csv',
    index=None,
    header=True,
)

In [50]:
# Test portion: every day after day 47, saved as test14.csv with a header row.
data_test14 = df.loc[df['day'] > 47]
export_csv = data_test14.to_csv(
    r'C:\Users\tengchm\Desktop\DS project\grab\final\dataset\test14.csv',
    index=None,
    header=True,
)

___