# Data Preprocessing

In [1]:
# Importing the libraries
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

In [2]:
# Importing the dataset
# Features are every column except the last; the target is the last column
# ('Characteristics'). Both slices are anchored to the end of the frame so
# they stay complementary if the number of feature columns ever changes.
dataset = pd.read_csv('SeaStateCode.csv')
X = dataset.iloc[:, :-1].values
y = dataset.iloc[:, -1].values

In [3]:
# Print the whole frame as plain text (it is only ten rows).
# Note the single missing value: WaveHeight_Max (m) is NaN in the last row.
print(dataset)

   WMOSeaStateCode  WaveHeight_Min (m)  WaveHeight_Max (m) Characteristics
0                0                0.00                0.00      GlassyCalm
1                1                0.00                0.10     RippledCalm
2                2                0.10                0.50  SmoothWavelets
3                3                0.50                1.25          Slight
4                4                1.25                2.50        Moderate
5                5                2.50                4.00           Rough
6                6                4.00                6.00       VeryRough
7                7                6.00                9.00            High
8                8                9.00               14.00        VeryHigh
9                9               14.00                 NaN      Phenomenal


In [4]:
# Feature matrix: all columns of the frame except the last, as a NumPy array.
print(X)

[[ 0.    0.    0.  ]
 [ 1.    0.    0.1 ]
 [ 2.    0.1   0.5 ]
 [ 3.    0.5   1.25]
 [ 4.    1.25  2.5 ]
 [ 5.    2.5   4.  ]
 [ 6.    4.    6.  ]
 [ 7.    6.    9.  ]
 [ 8.    9.   14.  ]
 [ 9.   14.     nan]]


In [5]:
# Target vector: the sea-state description strings.
print(y)

['GlassyCalm' 'RippledCalm' 'SmoothWavelets' 'Slight' 'Moderate' 'Rough'
 'VeryRough' 'High' 'VeryHigh' 'Phenomenal']


In [6]:
# First two rows of the frame
dataset.head(2)

Unnamed: 0,WMOSeaStateCode,WaveHeight_Min (m),WaveHeight_Max (m),Characteristics
0,0,0.0,0.0,GlassyCalm
1,1,0.0,0.1,RippledCalm


In [7]:
# Last two rows of the frame (the final row holds the missing WaveHeight_Max value)
dataset.tail(2)

Unnamed: 0,WMOSeaStateCode,WaveHeight_Min (m),WaveHeight_Max (m),Characteristics
8,8,9.0,14.0,VeryHigh
9,9,14.0,,Phenomenal


In [8]:
# Summary statistics for the numeric columns.
# A count below the number of rows signals missing values in that column.
dataset.describe()

Unnamed: 0,WMOSeaStateCode,WaveHeight_Min (m),WaveHeight_Max (m)
count,10.0,10.0,9.0
mean,4.5,3.735,4.15
std,3.02765,4.685323,4.770613
min,0.0,0.0,0.0
25%,2.25,0.2,0.5
50%,4.5,1.875,2.5
75%,6.75,5.5,6.0
max,9.0,14.0,14.0


In [9]:
# Taking care of missing data, attempt 1: mean imputation.
# Every NaN is replaced by the mean of its own column, learned by fit().
# https://scikit-learn.org/stable/modules/generated/sklearn.impute.SimpleImputer.html

from sklearn.impute import SimpleImputer
imp = SimpleImputer(missing_values=np.nan, strategy='mean')
imp.fit(X)
SimpleImputer()

In [10]:
print(imp.transform(X))
# The imputed value is physically wrong: the column mean (4.15 m) lands in the
# WaveHeight_Max slot of the last row, below that row's WaveHeight_Min of 14 m.
# A maximum smaller than the minimum is impossible, so mean imputation does not fit here.

[[ 0.    0.    0.  ]
 [ 1.    0.    0.1 ]
 [ 2.    0.1   0.5 ]
 [ 3.    0.5   1.25]
 [ 4.    1.25  2.5 ]
 [ 5.    2.5   4.  ]
 [ 6.    4.    6.  ]
 [ 7.    6.    9.  ]
 [ 8.    9.   14.  ]
 [ 9.   14.    4.15]]


In [11]:
# Re-slice X from the frame so it is certain to hold the original NaN again.
# (The transform() result above was only printed, never assigned back to X.)
X = dataset.iloc[:, :-1].values
X

array([[ 0.  ,  0.  ,  0.  ],
       [ 1.  ,  0.  ,  0.1 ],
       [ 2.  ,  0.1 ,  0.5 ],
       [ 3.  ,  0.5 ,  1.25],
       [ 4.  ,  1.25,  2.5 ],
       [ 5.  ,  2.5 ,  4.  ],
       [ 6.  ,  4.  ,  6.  ],
       [ 7.  ,  6.  ,  9.  ],
       [ 8.  ,  9.  , 14.  ],
       [ 9.  , 14.  ,   nan]])

In [12]:
# Taking care of missing data, attempt 2: constant imputation.
# https://scikit-learn.org/stable/modules/generated/sklearn.impute.SimpleImputer.html
# SimpleImputer is already imported by the mean-imputation cell earlier in the
# notebook, so it is not imported a second time here.

# Placeholder written into every NaN cell. The only NaN is the open-ended upper
# wave height of the last row, so the value just has to exceed that row's 14 m
# minimum. NOTE(review): 20 is the author's choice - confirm it suits downstream use.
MISSING_WAVE_HEIGHT_FILL_M = 20
imp = SimpleImputer(missing_values=np.nan, strategy='constant', fill_value=MISSING_WAVE_HEIGHT_FILL_M)
imp.fit(X)
SimpleImputer()

In [13]:
# Keep the imputed matrix instead of only printing it: X itself still holds the
# NaN, so later steps need a NaN-free array to work from.
X_imputed = imp.transform(X)
# Guard: constant imputation must leave no missing values behind.
assert not np.isnan(X_imputed).any()
print(X_imputed)

[[ 0.    0.    0.  ]
 [ 1.    0.    0.1 ]
 [ 2.    0.1   0.5 ]
 [ 3.    0.5   1.25]
 [ 4.    1.25  2.5 ]
 [ 5.    2.5   4.  ]
 [ 6.    4.    6.  ]
 [ 7.    6.    9.  ]
 [ 8.    9.   14.  ]
 [ 9.   14.   20.  ]]


In [14]:
# Inspect y: a 1-D object array holding one description string per sea state
y

array(['GlassyCalm', 'RippledCalm', 'SmoothWavelets', 'Slight',
       'Moderate', 'Rough', 'VeryRough', 'High', 'VeryHigh', 'Phenomenal'],
      dtype=object)

In [15]:
# Encoding categorical data
# Encoding the dependent variable (the target y), not an independent one
from sklearn.preprocessing import LabelEncoder
# integer encode
labelencoder_y = LabelEncoder()

# Fit the label encoder on y and return one integer label per entry.
# Labels are assigned by alphabetical order of the class names, so the integers
# in the output do not follow the row order and are not a sea-state ranking.
integer_encoded = labelencoder_y.fit_transform(y)
integer_encoded

array([0, 4, 7, 6, 2, 5, 9, 1, 8, 3])

In [16]:
# Encode categorical integer features as a one-hot numeric array.
# Representation of categorical data to be more expressive
# binary encode
# https://scikit-learn.org/stable/modules/generated/sklearn.compose.ColumnTransformer.html
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
ct = ColumnTransformer([("Characteristics", OneHotEncoder(), [0])], remainder = 'passthrough')
# The last arg ([0]) is the list of columns you want to transform in this step
# 'remainder' argument determines what to do with unmodified columns if you have many
ct

In [17]:
# Split the frame's columns by pandas dtype:
#   numerical   -> 'int64' / 'float64'
#   categorical -> 'object' / 'bool'
numerical_ix = dataset.select_dtypes(include=['int64', 'float64']).columns
categorical_ix = dataset.select_dtypes(include=['object', 'bool']).columns
# One index per line, numerical first
print(numerical_ix, categorical_ix, sep='\n')

Index(['WMOSeaStateCode', 'WaveHeight_Min (m)', 'WaveHeight_Max (m)'], dtype='object')
Index(['Characteristics'], dtype='object')


In [18]:
# Column 3 ('Characteristics') kept as a pandas Series
output_data1 = dataset.iloc[:,3]
output_data1

0        GlassyCalm
1       RippledCalm
2    SmoothWavelets
3            Slight
4          Moderate
5             Rough
6         VeryRough
7              High
8          VeryHigh
9        Phenomenal
Name: Characteristics, dtype: object

In [19]:
# The Series is one-dimensional: a single-element shape tuple
output_data1.shape

(10,)

In [20]:
# Same column, this time as a NumPy array via .values
output_data2 = dataset.iloc[:, 3].values
# This is the categorical output (target) that will be encoded;
# there is no categorical input data in this dataset.
output_data2

array(['GlassyCalm', 'RippledCalm', 'SmoothWavelets', 'Slight',
       'Moderate', 'Rough', 'VeryRough', 'High', 'VeryHigh', 'Phenomenal'],
      dtype=object)

In [21]:
# Still one-dimensional at this point
output_data2.shape

(10,)

In [22]:
# Reshape to a single-column 2-D array (n_rows, 1) before calling fit_transform.
# The NumPy array (output_data2) is used for this, not the pandas Series (output_data1).
# NOTE(review): the original note reported an error when trying this on the Series -
# presumably from calling reshape on it rather than .shape; confirm.
output_data = output_data2.reshape(-1, 1)
output_data

array([['GlassyCalm'],
       ['RippledCalm'],
       ['SmoothWavelets'],
       ['Slight'],
       ['Moderate'],
       ['Rough'],
       ['VeryRough'],
       ['High'],
       ['VeryHigh'],
       ['Phenomenal']], dtype=object)

In [23]:
# Now two-dimensional: one row per sample, one column
output_data.shape

(10, 1)

In [24]:
# One-hot encode the single-column target with the ColumnTransformer `ct`.
# The result prints as (row, column) value entries, i.e. only the non-zero cells.
z = ct.fit_transform(output_data)
print(z)

  (0, 0)	1.0
  (1, 4)	1.0
  (2, 7)	1.0
  (3, 6)	1.0
  (4, 2)	1.0
  (5, 5)	1.0
  (6, 9)	1.0
  (7, 1)	1.0
  (8, 8)	1.0
  (9, 3)	1.0


In [25]:
# Expand to a dense matrix to see the full 0/1 indicator layout
z.todense()

matrix([[1., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 1., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 1., 0., 0.],
        [0., 0., 0., 0., 0., 0., 1., 0., 0., 0.],
        [0., 0., 1., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 1., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 1.],
        [0., 1., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 1., 0.],
        [0., 0., 0., 1., 0., 0., 0., 0., 0., 0.]])

In [26]:
# Display the original frame again; it is unchanged by the steps above
dataset

Unnamed: 0,WMOSeaStateCode,WaveHeight_Min (m),WaveHeight_Max (m),Characteristics
0,0,0.0,0.0,GlassyCalm
1,1,0.0,0.1,RippledCalm
2,2,0.1,0.5,SmoothWavelets
3,3,0.5,1.25,Slight
4,4,1.25,2.5,Moderate
5,5,2.5,4.0,Rough
6,6,4.0,6.0,VeryRough
7,7,6.0,9.0,High
8,8,9.0,14.0,VeryHigh
9,9,14.0,,Phenomenal


In [27]:
# pandas offers a shorter route to one-hot encoding: get_dummies replaces the
# listed column with one indicator column per category and keeps the others.
# It starts from the raw frame, so the NaN in WaveHeight_Max (m) is still present.
datasetNew = pd.get_dummies(dataset,columns=['Characteristics'])
datasetNew

Unnamed: 0,WMOSeaStateCode,WaveHeight_Min (m),WaveHeight_Max (m),Characteristics_GlassyCalm,Characteristics_High,Characteristics_Moderate,Characteristics_Phenomenal,Characteristics_RippledCalm,Characteristics_Rough,Characteristics_Slight,Characteristics_SmoothWavelets,Characteristics_VeryHigh,Characteristics_VeryRough
0,0,0.0,0.0,1,0,0,0,0,0,0,0,0,0
1,1,0.0,0.1,0,0,0,0,1,0,0,0,0,0
2,2,0.1,0.5,0,0,0,0,0,0,0,1,0,0
3,3,0.5,1.25,0,0,0,0,0,0,1,0,0,0
4,4,1.25,2.5,0,0,1,0,0,0,0,0,0,0
5,5,2.5,4.0,0,0,0,0,0,1,0,0,0,0
6,6,4.0,6.0,0,0,0,0,0,0,0,0,0,1
7,7,6.0,9.0,0,1,0,0,0,0,0,0,0,0
8,8,9.0,14.0,0,0,0,0,0,0,0,0,1,0
9,9,14.0,,0,0,0,1,0,0,0,0,0,0
