# 4 Pre-Processing

## 4.1 Import Modules and Data

In [1]:
import pandas as pd
import numpy as np
import pickle

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [2]:
filename = '../Step 2 - Data Wrangling/FlareDataClean.csv'
df = pd.read_csv(filename)

df.head()

Unnamed: 0,Zurich_Class,Largest_Spot_Size,Spot_Distribution,Activity,Evolution,Prev_24h_Activity,Historically_Complex,Become_Historically_Complex,Area,Area_Of_Largest,C_Class_Flares,M_Class_Flares,X_Class_Flares
0,8,4,1,1,3,1,1,1,1,1,0,0,0
1,4,2,2,1,3,1,1,2,1,1,0,0,0
2,3,3,2,1,3,1,1,2,1,1,0,0,0
3,8,2,1,1,2,1,1,1,1,1,0,0,0
4,8,3,1,1,1,1,1,2,1,1,0,0,0


In [4]:
X = df.drop(['C_Class_Flares','M_Class_Flares','X_Class_Flares'],axis =1 )
y = df[['C_Class_Flares','M_Class_Flares','X_Class_Flares']]

print(X.shape)
print(y.shape)
X.head()

(1066, 10)
(1066, 3)


Unnamed: 0,Zurich_Class,Largest_Spot_Size,Spot_Distribution,Activity,Evolution,Prev_24h_Activity,Historically_Complex,Become_Historically_Complex,Area,Area_Of_Largest
0,8,4,1,1,3,1,1,1,1,1
1,4,2,2,1,3,1,1,2,1,1
2,3,3,2,1,3,1,1,2,1,1
3,8,2,1,1,2,1,1,1,1,1
4,8,3,1,1,1,1,1,2,1,1


# 4.2 Dummy Features

All our categorical data has already been converted into numeric values, so we will be skipping this step.

# 4.3 Scaling Data

In [8]:
scalerReg = StandardScaler()
scalerReg.fit(df)
scaledReg = scalerReg.transform(df)
scaledReg = pd.DataFrame(scaledReg, columns = df.columns)

scaledReg.head()

Unnamed: 0,Zurich_Class,Largest_Spot_Size,Spot_Distribution,Activity,Evolution,Prev_24h_Activity,Historically_Complex,Become_Historically_Complex,Area,Area_Of_Largest,C_Class_Flares,M_Class_Flares,X_Class_Flares
0,1.357423,0.914018,-1.196352,-0.426401,0.964865,-0.184586,-0.823857,-2.648592,-0.161203,0.0,-0.359337,-0.154969,-0.06511
1,-0.415877,-0.757239,0.044223,-0.426401,0.964865,-0.184586,-0.823857,0.377559,-0.161203,0.0,-0.359337,-0.154969,-0.06511
2,-0.859202,0.078389,0.044223,-0.426401,0.964865,-0.184586,-0.823857,0.377559,-0.161203,0.0,-0.359337,-0.154969,-0.06511
3,1.357423,-0.757239,-1.196352,-0.426401,-0.647276,-0.184586,-0.823857,-2.648592,-0.161203,0.0,-0.359337,-0.154969,-0.06511
4,1.357423,0.078389,-1.196352,-0.426401,-2.259418,-0.184586,-0.823857,0.377559,-0.161203,0.0,-0.359337,-0.154969,-0.06511


Our features are scaled and our target variables are unaltered. We do not want to scale our target variables since it will lead to scaled outputs

# 4.4 Creating Feature and Target Variables

In [9]:
X = df.drop(['C_Class_Flares','M_Class_Flares','X_Class_Flares'],axis =1 )
y = df[['C_Class_Flares','M_Class_Flares','X_Class_Flares']]

print(X.shape)
print(y.shape)
X.head()

(1066, 10)
(1066, 3)


Unnamed: 0,Zurich_Class,Largest_Spot_Size,Spot_Distribution,Activity,Evolution,Prev_24h_Activity,Historically_Complex,Become_Historically_Complex,Area,Area_Of_Largest
0,8,4,1,1,3,1,1,1,1,1
1,4,2,2,1,3,1,1,2,1,1
2,3,3,2,1,3,1,1,2,1,1
3,8,2,1,1,2,1,1,1,1,1
4,8,3,1,1,1,1,1,2,1,1


# 4.5 Train Test Split

In [10]:
X_train70,X_test30,y_train70,y_test30 = train_test_split(X,y,test_size = 0.3)
X_train50,X_test50,y_train50,y_test50 = train_test_split(X,y,test_size = 0.5)

print(X_train70.shape,y_train70.shape)
print(X_train70.shape,y_train70.shape)

(746, 10) (746, 3)
(746, 10) (746, 3)


Two splits are created to test our models when presented with more data. A 70/30 train test split and a 50/50 train test split have been created and both will be used to evaluate performance.

# 4.6 Export Data

In [11]:
Train70 = open('Train70Data.p','wb')
pickle.dump((X_train70,y_train70),Train70)
Train70.close()

Test30 = open('Test30Data.p','wb')
pickle.dump((X_test30,y_test30),Test30)
Test30.close()

Train50 = open('Train50Data.p','wb')
pickle.dump((X_train50,y_train50),Train50)
Train50.close()

Test50 = open('Test50Data.p','wb')
pickle.dump((X_test50,y_test50),Test50)
Test50.close()

All the data created will be exported in case any portion of it needs to be referenced later on, without needing to come back and export it later on. Exporting all the test train splits that were created on the scaled dummy feature data. The full scale data and the unscaled dummy feature data is also being exported. Lastly, the target variables and the feature values are being exported so we have the original unaltered data in case something else needs to be performed later on. Pickle is being used to export this data so that the object is preserved across projects without needing to convert to csv or other file formats.