# 4 Pre-Processing

## 4.1 Import Modules and Data

In [1]:
import pandas as pd
import numpy as np
import pickle

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [2]:
filename = '../Step 2 - Data Wrangling/predict-volcanic-eruptions/Data.csv'
df = pd.read_csv(filename)

segmentIDFile = open('segment_idDF.p','wb')
pickle.dump(df[['segment_id']],segmentIDFile)
segmentIDFile.close()

df.head()

Unnamed: 0,segment_id,sensor_1_mean,sensor_1_01percentile,sensor_1_05percentile,sensor_1_10percentile,sensor_1_20percentile,sensor_1_25percentile,sensor_1_35percentile,sensor_1_45percentile,sensor_1_65percentile,...,sensor_10_var,sensor_10_max,sensor_10_min,sensor_10_range,sensor_10_kurtosis,sensor_10_skew,sensor_10_sum,sensor_10_meanAbsDev,sensor_10_localExtremaCount,time_to_eruption
0,1000015382,0.382244,-277.0,-174.0,-130.0,-83.0,-66.0,-38.0,-12.0,37.0,...,62011.52,3179.0,-2961.0,6140.0,14.978788,0.058227,53806.0,163.679382,16748,16258654
1,1000554676,-3.82812,-1252.0,-878.0,-686.0,-446.0,-356.0,-206.0,-67.0,199.0,...,1112280.0,4442.0,-4329.0,8771.0,0.160791,0.004739,-445008.0,835.125977,5452,6347792
2,1000745424,8.291928,-1392.0,-989.0,-765.0,-497.0,-400.0,-230.0,-74.0,234.0,...,1510302.0,5230.0,-5040.0,10270.0,0.193508,-0.02548,-89519.0,972.49646,4705,5120693
3,1001461087,2.071582,-1017.0,-645.0,-485.0,-310.0,-246.0,-141.0,-46.0,140.0,...,542006.6,5788.0,-4634.0,10422.0,2.73242,-0.051502,-82408.0,548.506348,9983,10393161
4,1001732002,0.904102,-702.0,-465.0,-358.0,-233.0,-187.0,-104.0,-33.0,105.0,...,823971.1,4574.0,-3909.0,8483.0,1.135692,0.375558,1922895.0,691.087891,5633,20549733


# 4.2 Dummy Features

We have no categorical features so we do not need to create any dummy data.

# 4.3 Scaling Data

In [3]:
scaler = StandardScaler()
scaler.fit(df.drop('segment_id', axis = 1))
scaledDf = scaler.transform(df.drop('segment_id', axis = 1))
scaledDf = pd.DataFrame(scaledDf, columns = df.drop('segment_id', axis = 1).columns)

scalerFile = open('ScalerModel.p','wb')
pickle.dump(scaler,scalerFile)
scalerFile.close()

scaledDf.head()

Unnamed: 0,sensor_1_mean,sensor_1_01percentile,sensor_1_05percentile,sensor_1_10percentile,sensor_1_20percentile,sensor_1_25percentile,sensor_1_35percentile,sensor_1_45percentile,sensor_1_65percentile,sensor_1_75percentile,...,sensor_10_var,sensor_10_max,sensor_10_min,sensor_10_range,sensor_10_kurtosis,sensor_10_skew,sensor_10_sum,sensor_10_meanAbsDev,sensor_10_localExtremaCount,time_to_eruption
0,0.0507,0.373274,0.346874,0.334557,0.334845,0.335849,0.334529,0.335511,-0.336552,-0.335706,...,-0.149147,-0.396287,0.439218,-0.419381,1.412547,0.482361,0.03665,-0.444575,1.201989,-0.488787
1,-0.184393,0.09366,0.067894,0.058006,0.05534,0.055175,0.048506,0.045034,-0.059614,-0.061946,...,-0.113164,-0.152069,0.176533,-0.164955,-0.279843,0.04854,-0.212055,0.013029,-0.652658,-1.223858
2,0.492353,0.05351,0.023908,0.018712,0.016071,0.01259,0.007646,0.008065,0.000218,0.002353,...,-0.099527,0.000302,0.040006,-0.019996,-0.276106,-0.196551,-0.03481,0.10665,-0.775305,-1.31487
3,0.145028,0.161054,0.160227,0.157983,0.160058,0.161638,0.15917,0.155944,-0.160474,-0.155473,...,-0.132702,0.108199,0.117967,-0.005298,0.013867,-0.40761,-0.031265,-0.182308,0.091269,-0.92382
4,0.079839,0.251391,0.231557,0.221152,0.219347,0.21874,0.222163,0.224602,-0.220307,-0.221721,...,-0.123041,-0.126545,0.257182,-0.192805,-0.168498,3.056105,0.968564,-0.085136,-0.62294,-0.170526


We scale our data due to the large range of values between all the columns. The scaler model will also be exported to be used in our final model to get the unscaled output.

# 4.4 Creating Feature and Target Variables

In [4]:
Xscale = scaledDf.drop(['time_to_eruption'],axis =1 )
yscale = scaledDf[['time_to_eruption']]
X = df.drop(['segment_id','time_to_eruption'],axis =1 )
y = df[['time_to_eruption']]

print(X.shape)
print(y.shape)
X.head()

(4431, 240)
(4431, 1)


Unnamed: 0,sensor_1_mean,sensor_1_01percentile,sensor_1_05percentile,sensor_1_10percentile,sensor_1_20percentile,sensor_1_25percentile,sensor_1_35percentile,sensor_1_45percentile,sensor_1_65percentile,sensor_1_75percentile,...,sensor_10_std,sensor_10_var,sensor_10_max,sensor_10_min,sensor_10_range,sensor_10_kurtosis,sensor_10_skew,sensor_10_sum,sensor_10_meanAbsDev,sensor_10_localExtremaCount
0,0.382244,-277.0,-174.0,-130.0,-83.0,-66.0,-38.0,-12.0,37.0,67.0,...,249.021118,62011.52,3179.0,-2961.0,6140.0,14.978788,0.058227,53806.0,163.679382,16748
1,-3.82812,-1252.0,-878.0,-686.0,-446.0,-356.0,-206.0,-67.0,199.0,348.0,...,1054.646729,1112280.0,4442.0,-4329.0,8771.0,0.160791,0.004739,-445008.0,835.125977,5452
2,8.291928,-1392.0,-989.0,-765.0,-497.0,-400.0,-230.0,-74.0,234.0,414.0,...,1228.943604,1510302.0,5230.0,-5040.0,10270.0,0.193508,-0.02548,-89519.0,972.49646,4705
3,2.071582,-1017.0,-645.0,-485.0,-310.0,-246.0,-141.0,-46.0,140.0,252.0,...,736.210999,542006.6,5788.0,-4634.0,10422.0,2.73242,-0.051502,-82408.0,548.506348,9983
4,0.904102,-702.0,-465.0,-358.0,-233.0,-187.0,-104.0,-33.0,105.0,184.0,...,907.728516,823971.1,4574.0,-3909.0,8483.0,1.135692,0.375558,1922895.0,691.087891,5633


# 4.5 Train Test Split

In [5]:
X_train70,X_test30,y_train70,y_test30 = train_test_split(X,y,test_size = 0.3)
X_train50,X_test50,y_train50,y_test50 = train_test_split(X,y,test_size = 0.5)
X_train70scale,X_test30scale,y_train70scale,y_test30scale = train_test_split(Xscale,yscale,test_size = 0.3)
X_train50scale,X_test50scale,y_train50scale,y_test50scale = train_test_split(Xscale,yscale,test_size = 0.5)

print(X_train70.shape,y_train70.shape)
print(X_train70.shape,y_train70.shape)

(3101, 240) (3101, 1)
(3101, 240) (3101, 1)


Two splits are created to test our models when presented with more data. A 70/30 train test split and a 50/50 train test split have been created and both will be used to evaluate performance.

# 4.6 Export Data

In [6]:
Train70 = open('Train70Data.p','wb')
pickle.dump((X_train70,y_train70),Train70)
Train70.close()

Test30 = open('Test30Data.p','wb')
pickle.dump((X_test30,y_test30),Test30)
Test30.close()

Train50 = open('Train50Data.p','wb')
pickle.dump((X_train50,y_train50),Train50)
Train50.close()

Test50 = open('Test50Data.p','wb')
pickle.dump((X_test50,y_test50),Test50)
Test50.close()

Train70scale = open('Train70Datascale.p','wb')
pickle.dump((X_train70scale,y_train70scale),Train70scale)
Train70scale.close()

Test30scale = open('Test30Datascale.p','wb')
pickle.dump((X_test30scale,y_test30scale),Test30scale)
Test30scale.close()

Train50scale = open('Train50Data.p','wb')
pickle.dump((X_train50scale,y_train50scale),Train50scale)
Train50scale.close()

Test50scale = open('Test50Datascale.p','wb')
pickle.dump((X_test50scale,y_test50scale),Test50scale)
Test50scale.close()

We have exported two train test splits to test different scenarios, as well as a scaler model to invert our output from the model.