# Preprocessing and Training Data

## 1.0 Importing libraries and loading data

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from fast_ml.model_development import train_valid_test_split
import time
from pycaret.regression import *

In [2]:
# Read the cleaned steel dataset from its absolute location on disk.
# NOTE(review): this path is specific to one machine; adjust it before running elsewhere.
path = '/Users/chinmayasukumar/Documents/Springboard/Capstone #2/data/raw/steel_clean.csv'
steel = pd.read_csv(filepath_or_buffer=path)

In [3]:
# List the column labels of the loaded table.
steel.columns

Index(['c', 'si', 'mn', 'p', 's', 'ni', 'cr', 'mo', 'cu', 'v', 'al', 'n',
       'nb+ta', 'temp', 'yield', 'tensile', 'elongation', 'red_area'],
      dtype='object')

#### X and y datasets are created from the steel DataFrame. The target variable is yield strength (`yield`). The other mechanical properties (`tensile`, `elongation`, `red_area`) are removed.

In [4]:
# Keep every column up to and including the `yield` target (the first 15 columns);
# the columns that follow it are discarded.
steel = steel.loc[:, :'yield']
steel.head()

Unnamed: 0,c,si,mn,p,s,ni,cr,mo,cu,v,al,n,nb+ta,temp,yield
0,0.12,0.36,0.52,0.009,0.003,0.089,0.97,0.61,0.04,0.0,0.003,0.0066,0.0,27,342
1,0.12,0.36,0.52,0.009,0.003,0.089,0.97,0.61,0.04,0.0,0.003,0.0066,0.0,100,338
2,0.12,0.36,0.52,0.009,0.003,0.089,0.97,0.61,0.04,0.0,0.003,0.0066,0.0,200,337
3,0.12,0.36,0.52,0.009,0.003,0.089,0.97,0.61,0.04,0.0,0.003,0.0066,0.0,300,346
4,0.12,0.36,0.52,0.009,0.003,0.089,0.97,0.61,0.04,0.0,0.003,0.0066,0.0,400,316


In [5]:
# Confirm the row and column counts after trimming the table.
steel.shape

(618, 15)

In [6]:
# Features are every column except the target; the target is the `yield` column.
target_name = 'yield'
X = steel.drop(columns=target_name)
y = steel[target_name]

In [7]:
# Check which columns ended up in the feature matrix.
X.columns

Index(['c', 'si', 'mn', 'p', 's', 'ni', 'cr', 'mo', 'cu', 'v', 'al', 'n',
       'nb+ta', 'temp'],
      dtype='object')

In [8]:
# Summary statistics for each feature column.
X.describe()

Unnamed: 0,c,si,mn,p,s,ni,cr,mo,cu,v,al,n,nb+ta,temp
count,618.0,618.0,618.0,618.0,618.0,618.0,618.0,618.0,618.0,618.0,618.0,618.0,618.0,618.0
mean,0.176372,0.308608,0.82377,0.014702,0.010613,0.139414,0.390672,0.41149,0.078252,0.054709,0.012691,0.00759,5e-05,253.341424
std,0.059249,0.086605,0.345591,0.005301,0.003992,0.172316,0.445676,0.387079,0.058897,0.091903,0.013205,0.002359,0.000286,149.200218
min,0.09,0.18,0.42,0.006,0.003,0.0,0.0,0.005,0.0,0.0,0.002,0.0025,0.0,27.0
25%,0.13,0.24,0.5,0.011,0.008,0.02,0.04,0.03,0.04,0.0,0.004,0.006,0.0,100.0
50%,0.16,0.3,0.7,0.014,0.01,0.05,0.1,0.49,0.065,0.0,0.006,0.0075,0.0,300.0
75%,0.2,0.36,1.23,0.018,0.012,0.21,0.9675,0.54,0.1,0.07,0.016,0.008975,0.0,400.0
max,0.34,0.52,1.48,0.03,0.022,0.6,1.31,1.35,0.25,0.3,0.05,0.015,0.0017,450.0


In [9]:
# Preview the first 15 rows of the feature matrix.
X.head(15)

Unnamed: 0,c,si,mn,p,s,ni,cr,mo,cu,v,al,n,nb+ta,temp
0,0.12,0.36,0.52,0.009,0.003,0.089,0.97,0.61,0.04,0.0,0.003,0.0066,0.0,27
1,0.12,0.36,0.52,0.009,0.003,0.089,0.97,0.61,0.04,0.0,0.003,0.0066,0.0,100
2,0.12,0.36,0.52,0.009,0.003,0.089,0.97,0.61,0.04,0.0,0.003,0.0066,0.0,200
3,0.12,0.36,0.52,0.009,0.003,0.089,0.97,0.61,0.04,0.0,0.003,0.0066,0.0,300
4,0.12,0.36,0.52,0.009,0.003,0.089,0.97,0.61,0.04,0.0,0.003,0.0066,0.0,400
5,0.12,0.36,0.52,0.009,0.003,0.089,0.97,0.61,0.04,0.0,0.003,0.0066,0.0,450
6,0.09,0.37,0.49,0.006,0.005,0.055,0.98,0.57,0.07,0.0,0.005,0.008,0.0,27
7,0.09,0.37,0.49,0.006,0.005,0.055,0.98,0.57,0.07,0.0,0.005,0.008,0.0,100
8,0.09,0.37,0.49,0.006,0.005,0.055,0.98,0.57,0.07,0.0,0.005,0.008,0.0,200
9,0.09,0.37,0.49,0.006,0.005,0.055,0.98,0.57,0.07,0.0,0.005,0.008,0.0,300


In [10]:
# Preview the first few target values.
y.head()

0    342
1    338
2    337
3    346
4    316
Name: yield, dtype: int64

## 2.0 Splitting and scaling features

In [11]:
# Print the table dimensions before splitting.
print(steel.shape)

(618, 15)


In [12]:
# Split the steel table into training (70%), validation (10%), and test (20%) sets,
# separating the `yield` target from the features. The seed is fixed for repeatability.
(X_train, y_train,
 X_valid, y_valid,
 X_test, y_test) = train_valid_test_split(
    steel,
    target='yield',
    train_size=0.7,
    valid_size=0.1,
    test_size=0.2,
    random_state=123,
)

In [13]:
# Show the shapes of all six split outputs (features first, then targets).
X_train.shape, X_valid.shape, X_test.shape, y_train.shape, y_valid.shape, y_test.shape

((432, 14), (62, 14), (124, 14), (432,), (62,), (124,))

In [20]:
# Standardize the features. The scaler is fit on the training split only; the
# validation, test, and full feature sets are then transformed with that same fitted scaler.
scaler = StandardScaler()
scaler.fit(X_train)

X_train, X_valid, X_test, X = (
    scaler.transform(part) for part in (X_train, X_valid, X_test, X)
)

In [21]:
# Print the shapes of the full, training, and test data on one line
# (the validation shapes are not included here).
print(*(part.shape for part in (X, y, X_train, y_train, X_test, y_test)))

(618, 14) (618,) (432, 14) (432,) (124, 14) (124,)


In [22]:
# Every split plus the full feature/target set, keyed by the stem of the CSV file it is written to.
files = {
    'X_train': X_train,
    'y_train': y_train,
    'X_test': X_test,
    'y_test': y_test,
    'X_valid': X_valid,
    'y_valid': y_valid,
    'X': X,
    'y': y,
}

# Wrapping in a DataFrame gives every object the same to_csv call; the index is not written.
for name, data in files.items():
    out_path = '/Users/chinmayasukumar/Documents/Springboard/Capstone #2/data/interim/' + name + '.csv'
    pd.DataFrame(data).to_csv(out_path, index=False)

# Save the trimmed steel table alongside the splits.
steel.to_csv('/Users/chinmayasukumar/Documents/Springboard/Capstone #2/data/interim/steel_clean_final.csv', index=False)