## Train test split

To avoid leaking information about the test data into training we split the data now, before engineering and selecting features.

In [1]:
# Import packages
import pandas as pd
import numpy as np
import pyarrow.parquet as pq
from sklearn.model_selection import train_test_split

### Load data

In [2]:
# Load parquet data
parquet_table = pq.read_table('/home/jovyan/work/data/data.parquet')

# Read data into a pandas DataFrame : df
df = parquet_table.to_pandas()

### Filter rows  
Only use 3 minute blitz games, as stated in the problem definition

In [3]:
# Drop games not in the 3 minute blitz time class
df = df.loc[df['time_class'] == 'blitz'].copy()

### Train test split

In [4]:
# Split and put back together for ease of using sklearn's train_test_split
y = df['is_loss']
X = df.drop('is_loss', axis=1)

X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.20,
    random_state=1)

train_data = X_train.merge(y_train, left_index=True, right_index=True)
test_data = X_test.merge(y_test, left_index=True, right_index=True)

### Save data

In [5]:
destination_folder = '/home/jovyan/work/data/'
train_data.to_csv(destination_folder + 'train_data.csv', index=False)
test_data.to_csv(destination_folder + 'test_data.csv', index=False)