# Pre-Processing

In [29]:
import numpy as np
import pandas as pd
import pickle
import csv

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Seed NumPy's global random number generator so that any code drawing from
# it produces the same sequence on every fresh run of the notebook.
SEED = 42
np.random.seed(SEED)

### Load the Data

In [30]:
# Read the dataset prepared for pre-processing into a DataFrame.
final_df = pd.read_csv(filepath_or_buffer='../data/final_for_preprocessing.csv')

In [31]:
# Remove the 'Unnamed: 0' column that came in with the CSV.
# Non-inplace drop with errors='ignore' keeps this cell idempotent: running it
# a second time no longer raises KeyError once the column is already gone.
final_df = final_df.drop(columns='Unnamed: 0', errors='ignore')

#### Update column types

In [32]:
# Columns that hold numbers but should be treated as categories.
# Casting them to object makes pd.get_dummies (used later in this notebook)
# one-hot encode them, since by default it only expands object/string/category
# columns.
categorical_cols = ['zone', 'game_year', 'age']
final_df[categorical_cols] = final_df[categorical_cols].astype(object)

In [33]:
# Display each column's dtype to confirm the object casts above took effect.
final_df.dtypes

pitch_type             object
release_speed         float64
release_pos_x         float64
release_pos_z         float64
player_name            object
zone                   object
p_throws               object
bb_type                object
game_year              object
pfx_x                 float64
pfx_z                 float64
plate_x               float64
plate_z               float64
vx0                   float64
vy0                   float64
vz0                   float64
ax                    float64
ay                    float64
az                    float64
sz_top                float64
sz_bot                float64
hit_distance_sc       float64
launch_speed          float64
launch_angle          float64
effective_speed       float64
release_spin_rate     float64
release_extension     float64
release_pos_y         float64
height                  int64
weight                  int64
age                    object
weight_(oz)           float64
circumference_(in)    float64
avg_seam_h

In [34]:
# Drop the 'player_name' column before building dummy variables.
# Non-inplace drop with errors='ignore' keeps this cell idempotent: re-running
# it after the column has been removed no longer raises KeyError.
final_df = final_df.drop(columns='player_name', errors='ignore')

### Setup Dummy Variables and Modeling Dataframe

In [35]:
# One-hot encode the categorical (object) columns of final_df.
# drop_first=True omits the first level of every encoded column, so each
# k-level category contributes k-1 indicator columns.
model_df = pd.get_dummies(data=final_df, drop_first=True)

# Show (rows, columns) of the modeling frame.
model_df.shape

(16388, 81)

### Setup X and y

In [36]:
# Target: launch_speed as a plain NumPy array.
y = model_df['launch_speed'].values

# Features: every remaining column of the modeling frame.
# NOTE(review): hit_distance_sc and launch_angle stay in X; they look like
# outcomes of the same event as launch_speed -- confirm they are meant to be
# predictors and not target leakage.
X = model_df.drop(columns='launch_speed')

### Train / Test Split

In [37]:
# Split features and target into train and test partitions (default 75% / 25%).
# An explicit random_state makes the split depend only on this cell: without it
# the shuffle draws from NumPy's global RNG, so re-running the cell (or running
# cells out of order) would produce a different split than the one already
# saved to disk. 42 is the same seed value the notebook passes to
# np.random.seed.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [38]:
# Display the (rows, columns) of the train and test feature frames.
X_train.shape, X_test.shape

((12291, 80), (4097, 80))

In [39]:
# Display the shapes of the train and test target arrays.
y_train.shape, y_test.shape

((12291,), (4097,))

### Scale the Data

In [40]:
# Learn the scaling parameters from the training features only, so that no
# statistics from the test partition influence the scaling.
ss = StandardScaler().fit(X_train)

# Apply the same fitted scaler to both partitions.
X_train_sc = ss.transform(X_train)
X_test_sc = ss.transform(X_test)

### CSVs and Pickles

To reuse this standard scaling with my model in other notebooks, I will save CSV copies of the dataset and of the scaled values produced by the standard scaler above, and pickle a copy of the fitted standard scaler.

In [41]:
# Persist the full modeling frame (features and target together).
# index=True is pandas' default, spelled out here: the row index is written as
# an unnamed leading column.
# NOTE(review): readers of this file presumably drop that column (this notebook
# drops 'Unnamed: 0' from its own input) -- verify before switching to
# index=False.
model_df.to_csv('../data/modeling_data.csv', index=True)

In [42]:
# Write the scaled training features to CSV, one row per sample, no header.
# newline='' is required by the csv module: without it, platforms that
# translate '\n' on write (e.g. Windows) emit '\r\r\n' line endings, which show
# up as blank rows when the file is read back.
with open('../data/X_train_sc.csv', 'w+', newline='') as f:
    csv_writer = csv.writer(f)
    csv_writer.writerows(X_train_sc)

In [43]:
# Write the scaled test features to CSV, one row per sample, no header.
# newline='' is required by the csv module: without it, platforms that
# translate '\n' on write (e.g. Windows) emit '\r\r\n' line endings, which show
# up as blank rows when the file is read back.
with open('../data/X_test_sc.csv', 'w+', newline='') as f:
    csv_writer = csv.writer(f)
    csv_writer.writerows(X_test_sc)

In [44]:
# Save the unscaled training features, with column headers and without the
# row index.
X_train.to_csv(path_or_buf='../data/X_train.csv', index=False)

In [45]:
# Save the unscaled test features, with column headers and without the
# row index.
X_test.to_csv(path_or_buf='../data/X_test.csv', index=False)

In [46]:
# Serialize the training target array so other notebooks can load it.
with open('../pickles/y_train.pkl', 'wb+') as pkl_file:
    pickle.dump(y_train, pkl_file)

In [47]:
# Serialize the test target array so other notebooks can load it.
with open('../pickles/y_test.pkl', 'wb+') as pkl_file:
    pickle.dump(y_test, pkl_file)

In [48]:
# Serialize the fitted StandardScaler so the identical scaling can be applied
# in other notebooks.
with open('../pickles/standard_scaler.pkl', 'wb+') as pkl_file:
    pickle.dump(ss, pkl_file)