# Imports

In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Seed NumPy's global random generator so stochastic steps are repeatable across runs.
np.random.seed(seed=23)

In [2]:
# Read the cleaned dataset into the working frame `df`.
# NOTE(review): the directory here is spelled '../Data/' but the export cell at the
# end of the notebook writes to '../data/'. On a case-sensitive filesystem these are
# different directories — confirm which spelling matches the repository.
# NOTE(review): the saved output under this cell echoes the read_csv line, which is
# how Python displays the source line of a warning; presumably a DtypeWarning (mixed
# column types) — confirm, and consider passing explicit dtypes if so.
df = pd.read_csv('../Data/cleaned_data.csv')

  df = pd.read_csv('../Data/cleaned_data.csv')


In [3]:
# Restrict the frame to the target (`time_bin`) plus the predictors used below.
keep_cols = [
    'time_bin',
    'number_volunteers',
    'number_subjects',
    'area_type',
    'total_aircrafts',
    'children',
    'seniors',
    'mental',
    'winter',
    'daylight',
    'state',
]
df = df[keep_cols]


# Preprocessing

In [4]:
# One-hot encode the categorical predictors; drop_first=True omits the first level
# of each variable so the remaining indicator columns are not redundant.
categorical_cols = ['area_type', 'state']
dummies = pd.get_dummies(data=df[categorical_cols], drop_first=True)

# total_aircrafts is encoded on its own, as a Series, with an explicit column prefix.
# (presumably it is stored as a number, which a frame-level get_dummies call would
# leave unencoded — confirm against the cleaned data)
aircrafts = pd.get_dummies(data=df['total_aircrafts'],
                           prefix='aircrafts',
                           drop_first=True)

In [5]:
# Remove the raw categorical columns now that their one-hot encodings live in
# `dummies` and `aircrafts`.
# Rebinding `df` (instead of inplace=True) avoids mutating a frame that was itself
# produced by column selection, which is where pandas' SettingWithCopy checks apply,
# and matches the `df = ...` style used by the surrounding cells.
df = df.drop(columns=['area_type', 'state', 'total_aircrafts'])

In [6]:
# Append the indicator columns to the remaining features; rows are aligned on the index.
df = pd.concat(objs=[df, aircrafts, dummies], axis='columns')

In [7]:
# Inspect the column set after the indicator columns were appended.
df.columns

Index(['time_bin', 'number_volunteers', 'number_subjects', 'children',
       'seniors', 'mental', 'winter', 'daylight', 'aircrafts_1', 'aircrafts_2',
       'area_type_unknown', 'area_type_urban_rural', 'area_type_water',
       'area_type_wilderness', 'state_Arizona', 'state_California',
       'state_Colorado', 'state_Idaho', 'state_Nevada', 'state_New Jersey',
       'state_New Mexico', 'state_Oregon', 'state_Pennsylvania', 'state_Utah',
       'state_Vermont', 'state_Washington'],
      dtype='object')

In [8]:
# Preview the first rows of the encoded frame.
df.head()

Unnamed: 0,time_bin,number_volunteers,number_subjects,children,seniors,mental,winter,daylight,aircrafts_1,aircrafts_2,...,state_Colorado,state_Idaho,state_Nevada,state_New Jersey,state_New Mexico,state_Oregon,state_Pennsylvania,state_Utah,state_Vermont,state_Washington
0,0-11hr,3.0,1.0,0,1,1,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
1,0-11hr,10.0,4.0,0,0,0,0,1,1,0,...,0,0,0,0,0,0,0,0,0,0
2,11-32hr,10.0,1.0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0-11hr,2.0,1.0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
4,11-32hr,6.0,1.0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0


## Label Encoding Target - `time_bin`

In [9]:
# Encode the target as integers: '0-11hr' -> 0, '11-32hr' -> 1, anything else -> 2.
# NOTE(review): the default is a catch-all, so missing or unexpected labels are
# also encoded as 2 — confirm the column only ever holds three distinct bins.
raw_bins = df['time_bin']
df['time_bin'] = np.select(
    [raw_bins == '0-11hr', raw_bins == '11-32hr'],
    [0, 1],
    default=2,
)

In [10]:
# Class balance check: number of rows per encoded label, ordered by label.
df['time_bin'].value_counts().sort_index()

0    3563
1    3504
2    3683
Name: time_bin, dtype: int64

## Train Test Split

In [11]:
# Separate the encoded target from the predictors.
X = df.drop(columns='time_bin')
y = df['time_bin']

# Hold out 20% of the rows for testing; the fixed random_state keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.20, random_state=23
)

## Standard Scale

In [12]:
# Only these two columns are standardised; every other predictor is left as-is.
to_scale = ['number_subjects', 'number_volunteers']

ss = StandardScaler()

# The scaler is fitted on the training split only, and the same fitted scaler is
# then applied to the test split.
train_scaled = ss.fit_transform(X_train[to_scale])
test_scaled = ss.transform(X_test[to_scale])

# Wrap the scaled arrays back into frames, restoring the original row index and
# column names so they can be joined to the untouched columns.
X_train_ss = pd.DataFrame(train_scaled, index=X_train.index, columns=to_scale)
X_test_ss = pd.DataFrame(test_scaled, index=X_test.index, columns=to_scale)

In [13]:
# Swap the unscaled columns for their standardised versions (joined on the row index).
# The scaled columns end up at the right-hand side of each frame.
X_train_unscaled = X_train.drop(columns=to_scale)
X_test_unscaled = X_test.drop(columns=to_scale)

X_train_final = X_train_unscaled.join(X_train_ss)
X_test_final = X_test_unscaled.join(X_test_ss)

# Exports

In [14]:
# Write the processed splits to CSV without the row index.
# NOTE(review): these paths use '../data/' while the input was read from '../Data/';
# confirm the directory spelling on case-sensitive filesystems.
exports = [
    (y_train, '../data/y_train_processed.csv'),
    (y_test, '../data/y_test_processed.csv'),
    (X_train_final, '../data/x_train_processed.csv'),
    (X_test_final, '../data/x_test_processed.csv'),
]

for frame, out_path in exports:
    frame.to_csv(out_path, index=False)