# Split train & val sets

In [1]:
import os
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.model_selection import ShuffleSplit

In [2]:
# Seed NumPy's global random number generator so that any NumPy-based
# randomness in this notebook is repeatable from run to run.
np.random.seed(42)

In [3]:
# Relative path of the directory the input CSV is read from and the
# resulting .npy arrays are written to.
data_path = '../../data'

In [4]:
# Load the feature table; besides the model inputs it holds the
# 'Interviewed' column that is used as the target further down.
df = pd.read_csv(os.path.join(data_path, 'features.csv'))

In [5]:
df.head()

Unnamed: 0,Age,GPA,Gender_F,Gender_M,Gender_X,Qualification_type_1,Qualification_type_2,Qualification_type_3,Qualification_type_4,Qualification_type_5,...,Job_Type_Sales,Job_Type_Services,Job_Type_Support,Job_Type_Training,Race_Asian,Race_Black,Race_Latino,Race_Other,Race_White,Interviewed
0,29,3.7,0,1,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,1,True
1,59,1.0,0,1,0,0,1,0,0,0,...,0,0,0,0,0,0,1,0,0,False
2,56,4.0,0,1,0,1,0,0,0,0,...,0,0,0,1,0,0,0,1,0,False
3,51,3.0,0,1,0,0,1,0,0,0,...,0,0,0,0,0,1,0,0,0,False
4,46,1.0,0,1,0,1,0,0,0,0,...,0,0,1,0,0,0,1,0,0,True


In [6]:
df.shape

(7000, 28)

### Feature names

In [7]:
# Names of the model input columns: every column of `df` except the
# 'Interviewed' target, in their original order.
# `columns=` is used instead of passing the axis positionally
# (`df.drop([...], 1)`), which pandas 2.0 no longer accepts.
feature_names = list(df.drop(columns=['Interviewed']).columns)

In [8]:
len(feature_names)

27

In [9]:
feature_names[0:15]

['Age',
 'GPA',
 'Gender_F',
 'Gender_M',
 'Gender_X',
 'Qualification_type_1',
 'Qualification_type_2',
 'Qualification_type_3',
 'Qualification_type_4',
 'Qualification_type_5',
 'Job_Type_Accounting',
 'Job_Type_Business Development',
 'Job_Type_Engineering',
 'Job_Type_Human Resources',
 'Job_Type_Legal']

In [10]:
# Persist the feature names, one per line, next to the other outputs.
# The target directory comes from `data_path` (same location as before)
# so the path is configured in exactly one place.
with open(os.path.join(data_path, 'feature_names.txt'), 'w') as f:
    f.write('\n'.join(feature_names))

### X and y

In [11]:
# Feature matrix: all columns of `df` except the 'Interviewed' target,
# converted to a NumPy array (one row per sample, one column per feature).
# `columns=` replaces the positional axis argument (`df.drop([...], 1)`),
# which pandas 2.0 no longer accepts.
X = np.array(df.drop(columns=['Interviewed']))

In [12]:
X.shape

(7000, 27)

In [13]:
'Number of features: {:,}'.format(X.shape[1])

'Number of features: 27'

In [14]:
# Target vector: the 'Interviewed' column, kept as a pandas Series.
y = df['Interviewed']

In [15]:
y.shape

(7000,)

### Split

In [16]:
# Hold out 20% of the rows for validation. The split is stratified on `y`
# and uses a fixed random_state so the same partition is produced each run.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.20,
    random_state=42,
    stratify=y,
)

In [17]:
X_train.shape, X_val.shape

((5600, 27), (1400, 27))

In [18]:
y_train.shape, y_val.shape

((5600,), (1400,))

### Save 

Save the train and validation arrays to the data directory in NumPy's binary `.npy` format.

In [19]:
# Write each array of the stratified split to its own .npy file in `data_path`.
for filename, array in (
    ('X_train.npy', X_train),
    ('X_val.npy', X_val),
    ('y_train.npy', y_train),
    ('y_val.npy', y_val),
):
    np.save(os.path.join(data_path, filename), array)

### Custom Split

In [42]:
# Gender-skewed ("bias") split:
#   train = (male and interviewed) or (not male and not interviewed)
#   val   = every other row
# "Not male" means Gender_M == 0, so it covers Gender_F *and* Gender_X rows.
# A row whose Gender_M value is neither 0 nor 1 ends up in the validation set.
gender_m_col = feature_names.index('Gender_M')  # column 3 of X
is_male = X[:, gender_m_col] == 1
is_not_male = X[:, gender_m_col] == 0

# Work on a positional boolean array so the result does not depend on the
# Series index labels happening to equal the row positions.
interviewed = np.asarray(y, dtype=bool)

bias_train_mask = (is_male & interviewed) | (is_not_male & ~interviewed)

# Boolean-mask indexing keeps the original row order within each subset.
X_bias_train, X_bias_val = X[bias_train_mask], X[~bias_train_mask]
y_bias_train, y_bias_val = interviewed[bias_train_mask], interviewed[~bias_train_mask]

In [43]:
# Convert the collected training rows into a single NumPy array.
X_bias_train = np.array(X_bias_train)

In [44]:
X_bias_train.shape

(3483, 27)

In [45]:
# Convert the collected validation rows into a single NumPy array.
X_bias_val = np.array(X_bias_val)

In [46]:
X_bias_val.shape

(3517, 27)

In [49]:
# Turn both label collections of the bias split into NumPy arrays.
y_bias_train, y_bias_val = (
    np.array(labels) for labels in (y_bias_train, y_bias_val)
)

In [50]:
y_bias_train.shape

(3483,)

In [51]:
y_bias_val.shape

(3517,)

In [52]:
# Write each array of the bias split to its own .npy file in `data_path`.
for filename, array in (
    ('X_bias_train.npy', X_bias_train),
    ('X_bias_val.npy', X_bias_val),
    ('y_bias_train.npy', y_bias_train),
    ('y_bias_val.npy', y_bias_val),
):
    np.save(os.path.join(data_path, filename), array)