**Pre-Processing**

In [0]:
# Mount Google Drive at /content/drive (this import is only available inside Google Colab).
# NOTE(review): the data paths used further down are relative ('./dataset/...'), so the
# working directory presumably has to point at the project folder — confirm.
from google.colab import drive
drive.mount('/content/drive')

### Required libraries

In [0]:
import os

import numpy as np
import pandas as pd

from sklearn.impute import SimpleImputer

#zscore / outlier removal
from scipy import stats

# scale
from sklearn.preprocessing import StandardScaler

# random oversampling
from imblearn.over_sampling import RandomOverSampler, SMOTE, ADASYN
from imblearn.under_sampling import RandomUnderSampler

# save models
# Prefer the standalone joblib package: sklearn.externals.joblib was removed in
# scikit-learn 0.23. The old import is kept as a fallback for legacy environments.
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib

### Load Data

In [0]:
# Original data: both files are space-delimited and have no header row,
# so pandas assigns integer column labels.
df_train, df_test = (
    pd.read_csv(path, delimiter=' ', header=None)
    for path in ("./dataset/shuttle.trn", "./dataset/shuttle.tst")
)

In [0]:
# Give every integer column label a readable name ('Var 0', 'Var 1', ...),
# then relabel 'Var 9' as the prediction target.
columns = {label: 'Var ' + str(label) for label in df_train.columns}

df_train = (
    df_train
    .rename(columns=columns)
    .rename(columns={'Var 9': 'target'})
)


### Checking missing data

In [0]:
# Total number of missing cells in the training frame (per-column counts, summed).
print(df_train.isna().sum().sum())

0


In [0]:
# Replace any NaN with the mean of its column.
# SimpleImputer.transform returns a bare NumPy array, so the DataFrame is rebuilt
# with the original column labels and index. Without this the 'Var n' / 'target'
# names are lost and the name-based column selection in the outlier-removal
# step raises KeyError on a fresh run.
# NOTE(review): the 'target' column is passed through the imputer as well and is
# therefore converted to float — confirm that is acceptable downstream.
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
imputed_values = imputer.fit_transform(df_train)
df_train = pd.DataFrame(imputed_values, columns=df_train.columns, index=df_train.index)
df_train.describe()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
count,43500.0,43500.0,43500.0,43500.0,43500.0,43500.0,43500.0,43500.0,43500.0,43500.0
mean,48.249747,-0.205126,85.341563,0.262736,34.528782,1.298276,37.074552,50.899862,13.964598,1.700529
std,12.252618,78.14277,8.908602,41.004131,21.703409,179.48676,13.135557,21.46325,25.648404,1.354648
min,27.0,-4821.0,21.0,-3939.0,-188.0,-13839.0,-48.0,-353.0,-356.0,1.0
25%,38.0,0.0,79.0,0.0,26.0,-5.0,31.0,37.0,0.0,1.0
50%,45.0,0.0,83.0,0.0,42.0,0.0,39.0,44.0,2.0,1.0
75%,55.0,0.0,89.0,0.0,46.0,5.0,42.0,60.0,14.0,1.0
max,126.0,5075.0,149.0,3830.0,436.0,13148.0,105.0,270.0,266.0,7.0


### Outlier removal
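   We use the z-score to remove outliers from the classes: every row whose value lies more than 3 standard deviations from the column mean in `Var 1`, `Var 3` or `Var 5` is dropped, so that we can get a better intuition from the visualization.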

In [0]:
# Drop every row that is an outlier (|z-score| > threshold) in at least one of
# the columns at positions 1, 3 and 5 (the ones labelled 'Var 1', 'Var 3', 'Var 5').
# The columns are picked by position so the selection works whether or not the
# frame still carries its 'Var n' labels at this point.
threshold = 3
z = np.abs(np.asarray(stats.zscore(df_train.iloc[:, [1, 3, 5]])))

# z is already an absolute value, so one comparison covers both tails; a separate
# `z < -threshold` pass can never match anything.
is_outlier_row = (z > threshold).any(axis=1)

# Filter with a boolean mask instead of feeding np.where() positions to drop(),
# which would interpret them as index labels.
df_train = df_train.loc[~is_outlier_row]

In [0]:
# Every column except the last is a feature; the last column is the class label.
X_train = df_train.iloc[:, :-1]
y_train = df_train.iloc[:, -1]

X_test = df_test.iloc[:, :-1]
y_test = df_test.iloc[:, -1]

### Random OverSampling

In [0]:
# Random over-sampling with the 'minority' strategy and a fixed seed so the
# resampled set is reproducible.
ros = RandomOverSampler(
    sampling_strategy='minority',
    random_state=42,
)
X_ros, y_ros = ros.fit_resample(X_train, y_train)

In [0]:
# Row counts before and after random over-sampling.
print(len(X_train), len(X_ros))


43364 77362


### ADASYN OverSampling

In [0]:
# ADASYN over-sampling with the 'minority' strategy and a fixed seed.
# fit_resample replaces the deprecated fit_sample alias (removed from
# imbalanced-learn in 0.8) and matches the other samplers in this notebook.
adasyn = ADASYN(sampling_strategy='minority', random_state=42)
X_adasyn, y_adasyn = adasyn.fit_resample(X_train, y_train)


In [0]:
# Row counts before and after ADASYN over-sampling.
print(len(X_train), len(X_adasyn))

43364 77363


### SMOTE OverSampling

In [0]:
# SMOTE over-sampling with the 'minority' strategy and a fixed seed.
smote = SMOTE(
    sampling_strategy='minority',
    random_state=42,
)
X_smt, y_smt = smote.fit_resample(X_train, y_train)


In [0]:
# Feature/label row counts before and after SMOTE over-sampling.
print(len(X_train), len(y_train), len(X_smt), len(y_smt))

43364 43364 77362 77362


### Scale

In [0]:
# Standardise all feature matrices with ONE scaler fitted on the real
# (non-resampled) training rows.
# Re-fitting the same scaler on each resampled set leaves `scaler` (and therefore
# the scaled test set and the pickled scaler) with only the statistics of the
# last fit, so the other training sets would be scaled differently from the test
# data they are evaluated on. Fitting once keeps every matrix on the same scale.
# Inputs are converted to plain arrays so the scaler behaves the same whether
# the samplers returned DataFrames or ndarrays.
scaler = StandardScaler()
scaler.fit(np.asarray(X_train))

#Random_OS
X_train_ros = scaler.transform(np.asarray(X_ros))
#Smote_OS
X_train_smt = scaler.transform(np.asarray(X_smt))
#ADASYN_OS
X_train_adasyn = scaler.transform(np.asarray(X_adasyn))

# Build the scaled test matrix from df_test (same slice as the train/test split
# cell) instead of overwriting X_test with a transform of itself, so re-running
# this cell does not scale already-scaled values.
X_test = scaler.transform(np.asarray(df_test.iloc[:, :-1]))


### Save

In [0]:
# Every artefact goes into one directory. The directory name is kept exactly as
# spelled because downstream code presumably reads from this path — confirm
# before renaming.
out_dir = './Pre-preocess'
# Create the directory up front; otherwise the first write fails with
# FileNotFoundError on a fresh checkout.
os.makedirs(out_dir, exist_ok=True)

# save Scaler
joblib.dump(scaler, f'{out_dir}/scaler.pkl')

# save the resampled (unscaled) feature matrices and their labels
resampled_sets = [
    ('X_ros', X_ros), ('y_ros', y_ros),
    ('X_smt', X_smt), ('y_smt', y_smt),
    ('X_adasyn', X_adasyn), ('y_adasyn', y_adasyn),
]
for file_name, values in resampled_sets:
    np.savetxt(f'{out_dir}/{file_name}', values, delimiter=' ')

# save the scaled train data with its labels, followed by the test data;
# all are written space-separated without index or header row
csv_sets = [
    ### train data
    ('X_train_adasyn', X_train_adasyn),
    ('X_train_ros', X_train_ros),
    ('X_train_smt', X_train_smt),
    ('y_train_ros', y_ros),
    ('y_train_smt', y_smt),
    ('y_train_adasyn', y_adasyn),
    ### test data
    ('X_test', X_test),
    ('y_test', y_test),
]
for file_name, values in csv_sets:
    pd.DataFrame(values).to_csv(f'{out_dir}/{file_name}', index=False, header=None, sep=' ')
