# Split train & val sets

In [1]:
import os
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.model_selection import ShuffleSplit

In [2]:
# Seed NumPy's global random generator once, up front, so that anything
# drawing from np.random is repeatable between runs of the notebook.
SEED = 42
np.random.seed(SEED)

In [3]:
# Directory (relative path) that the features CSV is read from and that the
# .npy arrays are saved into further down.
data_path = '../../data'

In [4]:
# Load the full feature table (features plus the 'target' column) from disk.
features_csv = os.path.join(data_path, 'features_all.csv')
df = pd.read_csv(features_csv)

In [5]:
# Preview the first rows of the loaded frame.
df.head()

Unnamed: 0,number_of_vehicles,number_of_casualties,speed_limit,Age_of_Driver,Age_of_Vehicle,road_type_1,road_type_2,road_type_3,road_type_4,road_type_5,...,hour_15,hour_16,hour_17,hour_18,hour_19,hour_20,hour_21,hour_22,hour_23,target
0,2,1,30.0,74.0,3.0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
1,1,1,30.0,36.0,1.0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
2,1,1,30.0,24.0,5.0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
3,1,1,30.0,55.0,7.0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
4,2,1,40.0,48.0,10.0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [6]:
# (rows, columns) of the loaded frame; the column count includes 'target'.
df.shape

(137599, 135)

### Feature names

In [7]:
# Names of the model input columns: every column of `df` except the label
# column 'target'. The columns to remove are passed via `columns=` because
# pandas 2.0 no longer accepts `axis` as a positional argument to drop().
feature_names = list(df.drop(
    columns=[
        'target',
        # 'all_vehicles',
    ]).columns)

In [8]:
# Number of feature columns, i.e. every column of df except 'target'.
len(feature_names)

134

In [9]:
# Spot-check the first feature name.
feature_names[0]

'number_of_vehicles'

In [10]:
# Persist the feature names, one per line (no trailing newline), inside
# `data_path`. The path is built from `data_path` with os.path.join, the same
# way the input CSV is located and the arrays are saved, so that changing
# `data_path` moves every artifact together.
with open(os.path.join(data_path, 'feature_names.txt'), 'w') as f:
    f.write('\n'.join(feature_names))

### X and y

In [11]:
# Feature matrix as a NumPy array: all columns of `df` except the label
# column 'target'. The columns to remove are passed via `columns=` because
# pandas 2.0 no longer accepts `axis` as a positional argument to drop().
X = np.array(df.drop(columns=[
    'target',
    # 'all_vehicles'
]))

In [12]:
# (rows, features) of the feature matrix.
X.shape

(137599, 134)

In [13]:
# Show the feature count with a thousands separator.
n_features = X.shape[1]
'Number of features: {:,}'.format(n_features)

'Number of features: 134'

In [14]:
# X_text = df['all_vehicles']

In [15]:
# X_text.shape

In [16]:
# X_text[0]

In [17]:
# Label vector: the 'target' column of the loaded frame.
target_column = 'target'
y = df[target_column]

In [18]:
# Shape of the label vector: one entry per row of df.
y.shape

(137599,)

### Split

In [19]:
# Hold out 20% of the rows as the validation set. random_state is fixed so
# the same rows land in each split on every run.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.20,
    random_state=42,
)

In [20]:
# Number of rows that ended up in the training split.
len(X_train)

110079

In [21]:
# X_text_train, X_text_val, y_text_train, y_text_val = train_test_split(X_text, y, test_size=0.20, random_state=42)

In [22]:
# len(X_text_train)

### Save

Save the full arrays and the train/validation splits to `data_path` in NumPy's binary `.npy` format.

In [23]:
# Persist the complete (unsplit) feature matrix and label vector as
# <name>.npy files inside data_path.
for stem, values in (('X', X), ('y', y)):
    np.save(os.path.join(data_path, stem + '.npy'), values)

In [24]:
# Persist each train/validation array as <name>.npy inside data_path.
split_arrays = (
    ('X_train', X_train),
    ('X_val', X_val),
    ('y_train', y_train),
    ('y_val', y_val),
)
for stem, values in split_arrays:
    np.save(os.path.join(data_path, stem + '.npy'), values)

In [25]:
# np.save(os.path.join(data_path, 'X_text_train.npy'), X_text_train)
# np.save(os.path.join(data_path, 'X_text_val.npy'), X_text_val)
# np.save(os.path.join(data_path, 'y_text_train.npy'), y_text_train)
# np.save(os.path.join(data_path, 'y_text_val.npy'), y_text_val)