<a href="https://colab.research.google.com/github/dvisionst/Abalone_Exercise/blob/main/Abalone_Exercise.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Abalone Core Exercise
- Jose Flores
- 22 July 2022

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.compose import make_column_selector
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler






Prepare the Abalone Dataset for Modeling
The rings column will be your target column.

Note: Similar to trees, the number of rings for Abalone can be used to determine the age.  

In [None]:
# Load the raw abalone data into a DataFrame and preview the first 5 rows.
# The file has no header row, so header=None keeps the first record as data
# instead of consuming it as column labels (which would silently drop a sample).
data = '/content/abalone.data'
df = pd.read_csv(data, header=None)
df.head()

Unnamed: 0,M,0.455,0.365,0.095,0.514,0.2245,0.101,0.15,15
0,M,0.35,0.265,0.09,0.2255,0.0995,0.0485,0.07,7
1,F,0.53,0.42,0.135,0.677,0.2565,0.1415,0.21,9
2,M,0.44,0.365,0.125,0.516,0.2155,0.114,0.155,10
3,I,0.33,0.255,0.08,0.205,0.0895,0.0395,0.055,7
4,I,0.425,0.3,0.095,0.3515,0.141,0.0775,0.12,8


In [None]:
# Label the columns with the names listed in the downloaded names file,
# then summarize dtypes and non-null counts.
df.columns = [
    'Sex',
    'Length',
    'Diameter',
    'Height',
    'Whole_weight',
    'Shucked_weight',
    'Viscera_weight',
    'Shell_weight',
    'Rings',
]
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4176 entries, 0 to 4175
Data columns (total 9 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Sex             4176 non-null   object 
 1   Length          4176 non-null   float64
 2   Diameter        4176 non-null   float64
 3   Height          4176 non-null   float64
 4   Whole_weight    4176 non-null   float64
 5   Shucked_weight  4176 non-null   float64
 6   Viscera_weight  4176 non-null   float64
 7   Shell_weight    4176 non-null   float64
 8   Rings           4176 non-null   int64  
dtypes: float64(7), int64(1), object(1)
memory usage: 293.8+ KB


## 1) Separate your data into the features matrix (X) and target vector (y).

In [None]:
# df.info() reports no missing values, so we can go straight to building
# the target vector (y, the Rings column) and the features matrix (X).

y = df['Rings']
X = df.drop(columns=['Rings'])

## 2) Train/test split the data. Please use 42 as the random state for consistency.

In [None]:
# Split into train and test sets; random_state=42 keeps the split reproducible.

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
)

## 3) Create a ColumnTransformer to preprocess the data. Remember to:

    a) Create column selectors for the numeric and categorical columns

    b) Create a OneHotEncoder for one-hot encoding the categorical columns

    c) Create a StandardScaler for scaling numeric columns

    d) Match each transformer with the appropriate selector in a tuple

    e) Use the tuples to create a ColumnTransformer to preprocess the data.

In [None]:
# a) Column selectors: one for numeric dtypes, one for object (categorical) dtypes.

num_selector = make_column_selector(dtype_include='number')
cat_selector = make_column_selector(dtype_include='object')

In [None]:
# b) Pull the categorical columns out of each split; these are the inputs
#    to the one-hot encoder built in the next cell.
train_cat_data = X_train.loc[:, cat_selector(X_train)]
test_cat_data = X_test.loc[:, cat_selector(X_test)]
train_cat_data.head()

Unnamed: 0,Sex
2428,M
3823,M
3956,F
3623,F
0,M


In [None]:
# Build the one-hot encoder (dense output, unknown categories ignored) and fit
# it on the training categorical data only.
# The dense-output keyword is `sparse_output` in newer scikit-learn releases and
# `sparse` in older ones, so try the newer spelling first and fall back.
try:
    ohe_encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
except TypeError:
    ohe_encoder = OneHotEncoder(sparse=False, handle_unknown='ignore')
ohe_encoder.fit(train_cat_data)

OneHotEncoder(handle_unknown='ignore', sparse=False)

In [None]:
# c) Pull the numeric columns out of each split; these are the inputs
#    to the standard scaler built in the next cell.

train_num_data = X_train.loc[:, num_selector(X_train)]
test_num_data = X_test.loc[:, num_selector(X_test)]
train_num_data.head()


Unnamed: 0,Length,Diameter,Height,Whole_weight,Shucked_weight,Viscera_weight,Shell_weight
2428,0.34,0.255,0.095,0.213,0.081,0.034,0.07
3823,0.62,0.46,0.17,1.127,0.535,0.2635,0.296
3956,0.555,0.44,0.155,1.016,0.4935,0.1855,0.263
3623,0.665,0.5,0.175,1.4355,0.643,0.345,0.37
0,0.35,0.265,0.09,0.2255,0.0995,0.0485,0.07


In [None]:
# Create the standard scaler and fit it on the training numeric data only.
scaler = StandardScaler().fit(train_num_data)
scaler

StandardScaler()

In [None]:
# d) Apply the fitted transformers to both splits.

# training split
train_ohe = ohe_encoder.transform(train_cat_data)
train_scaled = scaler.transform(train_num_data)

# testing split
test_ohe = ohe_encoder.transform(test_cat_data)
test_scaled = scaler.transform(test_num_data)



In [None]:
# Convert the transformed arrays back into DataFrames.
# The one-hot column names come from the fitted encoder; get_feature_names_out
# is the current scikit-learn API for this (get_feature_names is no longer
# available in recent releases).
ohe_column_names = ohe_encoder.get_feature_names_out(train_cat_data.columns)
train_ohe = pd.DataFrame(train_ohe, columns=ohe_column_names)
test_ohe = pd.DataFrame(test_ohe, columns=ohe_column_names)

# Build the numeric frames from the *scaled* arrays (not the raw X_train/X_test
# columns) so the StandardScaler output actually reaches the processed data.
# Both frames get a fresh RangeIndex, which lines up with the one-hot frames.
train_nums = pd.DataFrame(train_scaled, columns=train_num_data.columns)
test_nums = pd.DataFrame(test_scaled, columns=test_num_data.columns)





In [None]:
# Join the numeric and one-hot encoded columns side by side for each split.
X_train_processed = pd.concat(
    [train_nums, train_ohe],
    axis='columns',
)
X_test_processed = pd.concat(
    [test_nums, test_ohe],
    axis='columns',
)



## Displaying Results of the transformed data

In [None]:
# Display the processed training features: the numeric columns followed by
# the one-hot encoded Sex columns.
X_train_processed

Unnamed: 0,Length,Diameter,Height,Whole_weight,Shucked_weight,Viscera_weight,Shell_weight,Sex_F,Sex_I,Sex_M
0,0.340,0.255,0.095,0.2130,0.0810,0.0340,0.0700,0.0,0.0,1.0
1,0.620,0.460,0.170,1.1270,0.5350,0.2635,0.2960,0.0,0.0,1.0
2,0.555,0.440,0.155,1.0160,0.4935,0.1855,0.2630,1.0,0.0,0.0
3,0.665,0.500,0.175,1.4355,0.6430,0.3450,0.3700,1.0,0.0,0.0
4,0.350,0.265,0.090,0.2255,0.0995,0.0485,0.0700,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...
3127,0.495,0.400,0.145,0.5780,0.2545,0.1305,0.1645,0.0,1.0,0.0
3128,0.655,0.530,0.195,1.3880,0.5670,0.2735,0.4100,0.0,0.0,1.0
3129,0.520,0.430,0.150,0.7280,0.3020,0.1575,0.2350,1.0,0.0,0.0
3130,0.575,0.460,0.150,0.9270,0.3330,0.2070,0.2985,1.0,0.0,0.0


In [None]:
# Display the processed testing features: the numeric columns followed by
# the one-hot encoded Sex columns.
X_test_processed

Unnamed: 0,Length,Diameter,Height,Whole_weight,Shucked_weight,Viscera_weight,Shell_weight,Sex_F,Sex_I,Sex_M
0,0.615,0.500,0.175,1.3770,0.5585,0.3300,0.2920,1.0,0.0,0.0
1,0.590,0.465,0.150,1.1510,0.6130,0.2390,0.2515,1.0,0.0,0.0
2,0.535,0.420,0.145,0.9260,0.3980,0.1965,0.2500,0.0,1.0,0.0
3,0.640,0.505,0.165,1.2235,0.5215,0.2695,0.3600,1.0,0.0,0.0
4,0.500,0.380,0.155,0.6550,0.2405,0.1430,0.2050,1.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...
1039,0.550,0.415,0.135,0.7750,0.3020,0.1790,0.2600,1.0,0.0,0.0
1040,0.435,0.335,0.110,0.3800,0.1695,0.0860,0.1100,1.0,0.0,0.0
1041,0.580,0.480,0.180,1.2495,0.4945,0.2700,0.3710,1.0,0.0,0.0
1042,0.640,0.515,0.205,1.5335,0.6635,0.3345,0.4025,1.0,0.0,0.0
