<a href="https://colab.research.google.com/github/GECarlo/CodingDojo/blob/main/Abalone_Preprocessing_Exercise_(Core).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# mount drive
from google.colab import drive
drive.mount ('/content/drive')

Mounted at /content/drive


In [2]:
# import libraries
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import make_column_transformer, make_column_selector
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import train_test_split
from sklearn import set_config
set_config(display = 'diagram')

In [3]:
# import file
data = '/content/drive/MyDrive/Colab Notebooks/abalone.data'
df = pd.read_csv(data)
df.head()

Unnamed: 0,M,0.455,0.365,0.095,0.514,0.2245,0.101,0.15,15
0,M,0.35,0.265,0.09,0.2255,0.0995,0.0485,0.07,7
1,F,0.53,0.42,0.135,0.677,0.2565,0.1415,0.21,9
2,M,0.44,0.365,0.125,0.516,0.2155,0.114,0.155,10
3,I,0.33,0.255,0.08,0.205,0.0895,0.0395,0.055,7
4,I,0.425,0.3,0.095,0.3515,0.141,0.0775,0.12,8


In [4]:
# add column names from file
df.columns = ['Sex', 'Length', 'Diameter', 'Height', 'Whole', 'Shucked', 'Viscera', 'Shell', 'Rings']
df.head()

Unnamed: 0,Sex,Length,Diameter,Height,Whole,Shucked,Viscera,Shell,Rings
0,M,0.35,0.265,0.09,0.2255,0.0995,0.0485,0.07,7
1,F,0.53,0.42,0.135,0.677,0.2565,0.1415,0.21,9
2,M,0.44,0.365,0.125,0.516,0.2155,0.114,0.155,10
3,I,0.33,0.255,0.08,0.205,0.0895,0.0395,0.055,7
4,I,0.425,0.3,0.095,0.3515,0.141,0.0775,0.12,8


7. Attribute information:

   Given is the attribute name, attribute type, the measurement unit and a
   brief description.  The number of rings is the value to predict: either
   as a continuous value or as a classification problem.

	Name		Data Type	Meas.	Description
	----		---------	-----	-----------
	Sex		nominal			M, F, and I (infant)

	Length		continuous	mm	Longest shell measurement

	Diameter	continuous	mm	perpendicular to length

	Height		continuous	mm	with meat in shell

	Whole weight	continuous	grams	whole abalone

	Shucked weight	continuous	grams	weight of meat

	Viscera weight	continuous	grams	gut weight (after bleeding)

	Shell weight	continuous	grams	after being dried
  
	Rings		integer			+1.5 gives the age in years

In [5]:
# check shape
df.shape

(4176, 9)

In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4176 entries, 0 to 4175
Data columns (total 9 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Sex       4176 non-null   object 
 1   Length    4176 non-null   float64
 2   Diameter  4176 non-null   float64
 3   Height    4176 non-null   float64
 4   Whole     4176 non-null   float64
 5   Shucked   4176 non-null   float64
 6   Viscera   4176 non-null   float64
 7   Shell     4176 non-null   float64
 8   Rings     4176 non-null   int64  
dtypes: float64(7), int64(1), object(1)
memory usage: 293.8+ KB


In [7]:
# check for missing values
df.isna().sum().sum()

0

In [8]:
# check for duplicates
df.duplicated().sum().sum()

0

In [9]:
# check for outliers
df.describe()

Unnamed: 0,Length,Diameter,Height,Whole,Shucked,Viscera,Shell,Rings
count,4176.0,4176.0,4176.0,4176.0,4176.0,4176.0,4176.0,4176.0
mean,0.524009,0.407892,0.139527,0.828818,0.3594,0.180613,0.238852,9.932471
std,0.120103,0.09925,0.041826,0.490424,0.22198,0.10962,0.139213,3.223601
min,0.075,0.055,0.0,0.002,0.001,0.0005,0.0015,1.0
25%,0.45,0.35,0.115,0.4415,0.186,0.093375,0.13,8.0
50%,0.545,0.425,0.14,0.79975,0.336,0.171,0.234,9.0
75%,0.615,0.48,0.165,1.15325,0.502,0.253,0.329,11.0
max,0.815,0.65,1.13,2.8255,1.488,0.76,1.005,29.0


1) Separate your data into the features matrix (X) and target vector (y).

2) Create a train, test, split on the data. Please use random number 42 for consistency.

3) Use column transformers to transform the appropriate columns

In [10]:
# Create dictionary for ordinal datatypes before assigning target (y) and features (X)
# sex dictionary
sex_dictionary = {'M': 2, 'F': 1, 'I': 0}
df['Sex'].replace(sex_dictionary, inplace = True)
df.head()

Unnamed: 0,Sex,Length,Diameter,Height,Whole,Shucked,Viscera,Shell,Rings
0,2,0.35,0.265,0.09,0.2255,0.0995,0.0485,0.07,7
1,1,0.53,0.42,0.135,0.677,0.2565,0.1415,0.21,9
2,2,0.44,0.365,0.125,0.516,0.2155,0.114,0.155,10
3,0,0.33,0.255,0.08,0.205,0.0895,0.0395,0.055,7
4,0,0.425,0.3,0.095,0.3515,0.141,0.0775,0.12,8


In [11]:
# Separate your data into the features matrix (X) and target vector (y) Rings
# target y
y = df['Rings']

# features X
X = df.drop(columns = ['Rings'])
X.head()

Unnamed: 0,Sex,Length,Diameter,Height,Whole,Shucked,Viscera,Shell
0,2,0.35,0.265,0.09,0.2255,0.0995,0.0485,0.07
1,1,0.53,0.42,0.135,0.677,0.2565,0.1415,0.21
2,2,0.44,0.365,0.125,0.516,0.2155,0.114,0.155
3,0,0.33,0.255,0.08,0.205,0.0895,0.0395,0.055
4,0,0.425,0.3,0.095,0.3515,0.141,0.0775,0.12


In [12]:
# Create a train, test, split on the data. Please use random number 42 for consistency.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 42)

**For the column transformations:**

    a) Use column selectors to select the categorical columns and the numerical columns

    b) Use a OneHotEncoder to encode the categorical columns

    c) Use a StandardScaler to scale the numeric columns

    d) Use a ColumnTransformer to match the transformation to the type of column

    e) Transform the data and display the resulting Numpy array.

In [20]:
# Use column transformers to transform the appropriate columns
# use pipeline and column transformer
cat_selector = make_column_selector(dtype_include = 'object')
num_selector = make_column_selector(dtype_include = 'number')

In [18]:
# Use a OneHotEncoder to encode the categorical columns
ohe = OneHotEncoder(handle_unknown = 'ignore', sparse = False)

# scale numeric columns
scaler = StandardScaler()

In [30]:
# Use a StandardScaler to scale the numeric columns
# create tuple
cat_tuple = (ohe, cat_selector)
num_tuple = (scaler, num_selector)

In [31]:
# Use a ColumnTransformer to match the transformation to the type of column
preprocessor = make_column_transformer(cat_tuple, num_tuple, remainder = 'drop')

In [32]:
# Transform the data and display the resulting Numpy array.
preprocessor.fit(X_train)

In [34]:
X_train_processed = preprocessor.transform(X_train)
X_test_processed = preprocessor.transform(X_test)

In [35]:
X_train_processed

array([[ 1.16585869, -1.54642176, -1.5561698 , ..., -1.26063323,
        -1.33785174, -1.21234133],
       [ 1.16585869,  0.79572536,  0.52191671, ...,  0.78946343,
         0.7495838 ,  0.40911675],
       [-0.04604619,  0.25201264,  0.31917656, ...,  0.60206472,
         0.04012858,  0.17235517],
       ...,
       [-0.04604619, -0.04075575,  0.21780648, ..., -0.26267869,
        -0.21454765, -0.02853344],
       [-0.04604619,  0.41930886,  0.52191671, ..., -0.12269412,
         0.23568355,  0.42705323],
       [-0.04604619,  0.58660508,  0.57260174, ...,  0.71043988,
         0.57222   ,  0.47368809]])