# DATA 1030 Midterm Project Notebook - Part II, Data Preprocessing

In [1]:
import pandas as pd
import numpy as np
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, KFold
from sklearn.preprocessing import MinMaxScaler, StandardScaler

### 1. Read in data and feature names with descriptions; separate data and target columns

In [2]:
# Load the labelled raw data.
df = pd.read_csv('../data/raw_data_plus_labeled_targets.csv')

# Load the feature descriptions, index them by feature code, and discard the
# two feature codes (S5 and D21) that are excluded from the feature matrix.
features = (
    pd.read_csv('../data/features_plus_descriptions.csv')
    .set_index('Feature Type and Number')
    .drop(['S5', 'D21'], axis=0)
)

# Feature matrix: every remaining feature column; target: 'Best Heuristic'.
X = df.loc[:, features.index]
y = df['Best Heuristic']

In [3]:
# Tag every feature as 'lengthlike' by default, then override the features
# listed in `fraclike` as 'fractionlike'.
tell_types = dict.fromkeys(features.index, 'lengthlike')
fraclike = ['S1','S3', 'S4', 'S6', 'S8','S11', 'S12', 'D3', 'D39']

for feat in fraclike:
    tell_types[feat] = 'fractionlike'

# Build the lookup table once, outside the loop, so it does not get rebuilt on
# every iteration and is still defined when `fraclike` is empty.
coltypes = pd.DataFrame.from_dict(tell_types, orient='index', columns=['Column Type'])
coltypes.head()

Unnamed: 0,Column Type
S1,fractionlike
S2,lengthlike
S3,fractionlike
S4,fractionlike
S6,fractionlike


### 2. Split Data into testing and other sets
* 20% for testing
* 80% to other
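* KFold with 4 splits on the other set, so that in each fold 75% of the points in other (60% of all data) are used for training and the remaining 25% (20% of all data) for validation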

In [4]:
# Fixed seed so the split is reproducible.
random_state = 431

# Hold out a test set; the remaining rows form the "other" set.
X_other, X_test, y_other, y_test = train_test_split(
    X,
    y,
    train_size=0.8,
    random_state=random_state,
)

# Sanity check: fraction of all rows that landed in the "other" set.
print(len(X_other) / len(X))

0.7999346191565871


In [5]:
# Shuffled 4-fold splitter, seeded with the same random_state as the
# train/test split.
kf = KFold(
    n_splits=4,
    shuffle=True,
    random_state=random_state,
)

### 3. Specify the columns for each scaler

In [6]:
# Walk the column-type table once per scaler and collect the matching feature
# names, preserving the table's row order.
minmax_feats = [
    feat
    for feat, kind in coltypes['Column Type'].items()
    if kind == 'fractionlike'
]
std_feats = [
    feat
    for feat, kind in coltypes['Column Type'].items()
    if kind == 'lengthlike'
]

print('MinMax Scaler Features:  ',minmax_feats)
print('Standard Scaler Features:  ', std_feats)
        


MinMax Scaler Features:   ['S1', 'S3', 'S4', 'S6', 'S8', 'S11', 'S12', 'D3', 'D39']
Standard Scaler Features:   ['S2', 'S7', 'S9', 'S10', 'S13', 'S14', 'D1', 'D2', 'D4', 'D5', 'D6', 'D7', 'D8', 'D9', 'D10', 'D11', 'D12', 'D13', 'D14', 'D15', 'D16', 'D17', 'D18', 'D19', 'D20', 'D22', 'D23', 'D24', 'D25', 'D26', 'D27', 'D28', 'D29', 'D30', 'D31', 'D32', 'D33', 'D34', 'D35', 'D36', 'D37', 'D38']


### 4. Create preprocessor pipeline

In [7]:
# Route each feature group to its scaler: the fraction-like columns go through
# MinMaxScaler, the length-like columns through StandardScaler.
scaler_assignments = [
    ('mm_scaler', MinMaxScaler(), minmax_feats),
    ('std_scaler', StandardScaler(), std_feats),
]
preprocessor = ColumnTransformer(transformers=scaler_assignments)

# Single-step pipeline wrapping the preprocessor.
clf = Pipeline(steps=[('preprocessor', preprocessor)])

### 5. Run the splitting process and transform pipeline on each subset of the data

In [8]:
# For each of the KFold splits of the "other" set: slice out the training and
# validation rows, fit the preprocessing pipeline on the training rows only,
# and apply the fitted transform to the validation and test sets.
for train_index, val_index in kf.split(X_other, y_other):
    X_train = X_other.iloc[train_index]
    y_train = y_other.iloc[train_index]
    X_valid = X_other.iloc[val_index]
    y_valid = y_other.iloc[val_index]

    # Class counts of this fold's training labels. Series.value_counts() is
    # used because the top-level pd.value_counts() is deprecated.
    print(y_train.value_counts())

    # fit_transform on the training fold; transform-only on everything else.
    X_train_prep = clf.fit_transform(X_train)
    X_valid_prep = clf.transform(X_valid)
    X_test_prep = clf.transform(X_test)

    # Spot check: largest value in column 10 of the transformed train and test
    # arrays. NOTE(review): presumably column 10 is one of the standard-scaled
    # features (the min-max block comes first) -- confirm which one is meant.
    print(X_train_prep[:, 10].max())
    print(X_test_prep[:, 10].max())

0    1568
1     633
3     449
5     372
4     365
2     283
Name: Best Heuristic, dtype: int64
7.301381967304095
5.345019179319519
0    1551
1     654
3     444
4     362
5     360
2     299
Name: Best Heuristic, dtype: int64
7.553495541120039
5.5316202473237315
0    1576
1     631
3     441
4     376
5     354
2     293
Name: Best Heuristic, dtype: int64
5.931662926906916
5.363875953265935
0    1542
1     656
3     448
5     381
4     355
2     289
Name: Best Heuristic, dtype: int64
7.4656064025775555
5.468316521730148
