# Import dependencies

In [None]:
# For Data science
import pandas as pd
import seaborn as sns

import matplotlib.pyplot as plt
from sklearn import (
    model_selection,
    linear_model,
    svm,
)

# For modules
from sources import (
    get_heatmap,
    check_is_na,
    get_category_encoded,
    three_sigma_cleared,
    get_count_plot,
    evaluate_model,
)

# Getting data, observations
Get dataset

In [None]:
# Column labels applied to the raw file, listed in file order
abalone_columns = [
    "sex",
    "Length",
    "Diameter",
    "Height",
    "Whole weight",
    "Shucked weight",
    "Viscera weight",
    "Shell weight",
    "Rings",
]

# Read the comma-separated file and attach the labels above
data = pd.read_csv("../data/abalone.data", sep=",", names=abalone_columns)

Look for missing values

In [None]:
# Check if the dataset has missing values.
# `check_is_na` is a project helper imported from `sources`; presumably it
# reports missing values per column — TODO confirm against its definition.
check_is_na(data)

Get dataset info

In [None]:
# Get dataset info: prints a summary of the columns, their non-null counts
# and dtypes, which shows which columns are non-numeric.
data.info()

There is a categorical column in the dataset.

In [None]:
# Keep only the object-dtype columns, i.e. the categorical ones
df = data.select_dtypes(include=["object"])

# Preview the first five rows
df.head()

Encode category to numerical values

In [None]:
# Encode the categorical 'sex' column with the project's ordinal encoder
encoded_dataset = get_category_encoded(data, "sex", "ordinal")

# Drop the first column by position.
# NOTE(review): presumably this removes the original, un-encoded column left
# in place by the helper — confirm against `get_category_encoded`.
encoded_dataset = encoded_dataset.iloc[:, 1:]

# Show a preview instead of dumping the whole frame into the cell output
encoded_dataset.head()

In [None]:
# Get heatmap for prepared dataset.
# `get_heatmap` is a project helper from `sources`; presumably it draws the
# feature correlation matrix and uses the second argument as a title/label —
# TODO confirm.
get_heatmap(encoded_dataset, "encoded_dataset")

There are a lot of strongly correlated features in the dataset.

Deal with correlated features

In [None]:
# Weight combination: the summed component weights (shell, shucked meat,
# viscera) relative to the whole weight.
# The sum is parenthesised on purpose: `/` binds tighter than `+`, so without
# the parentheses only "Viscera weight" would be divided by "Whole weight".
component_weight = (
    encoded_dataset["Shell weight"]
    + encoded_dataset["Shucked weight"]
    + encoded_dataset["Viscera weight"]
)
weight_index = component_weight / encoded_dataset["Whole weight"]

# Geometry combination: product of the three linear dimensions
geometry_index = (
    encoded_dataset["Length"] * encoded_dataset["Diameter"] * encoded_dataset["Height"]
)

# Add the major index (geometry relative to weight) and drop its components.
# `assign`/`drop` build a new frame rather than writing columns into the
# `iloc` slice created earlier and dropping in place, which can trigger
# pandas' SettingWithCopyWarning.
encoded_dataset = encoded_dataset.assign(index=geometry_index / weight_index).drop(
    columns=[
        "Shell weight",
        "Shucked weight",
        "Viscera weight",
        "Whole weight",
        "Length",
        "Diameter",
        "Height",
    ]
)

# Get new heatmap
get_heatmap(encoded_dataset, "encoded_dataset")

The highly correlated features are removed.

Let's analyse the effect of the main feature on the target.

In [None]:
# Regression plot: the combined 'index' feature on x against the target
# 'Rings' on y, before any outlier filtering.
# The trailing semicolon suppresses the textual repr of the returned object.
sns.regplot(data=encoded_dataset, x="index", y="Rings");

Outliers should be removed from the 'index' feature.

In [None]:
# Clear 'index' feature.
# `three_sigma_cleared` is a project helper from `sources`; judging by its
# name it presumably drops rows whose 'index' value lies more than three
# standard deviations from the mean — TODO confirm against its definition.
encoded_dataset_filtered = three_sigma_cleared(encoded_dataset, "index")

In [None]:
# Regression plot: same 'index' vs 'Rings' view as before, now drawn from
# the filtered dataset so the effect of the clean-up is visible.
# The trailing semicolon suppresses the textual repr of the returned object.
sns.regplot(data=encoded_dataset_filtered, x="index", y="Rings");

Looks much better.

In [None]:
# Plot distribution for sex feature: one box of 'Rings' values per encoded
# 'sex' category, taken from the filtered dataset.
# The trailing semicolon suppresses the textual repr of the returned object.
sns.boxplot(data=encoded_dataset_filtered, x="sex", y="Rings");

The effect of the 'sex' feature does not look statistically significant.

Now there are no strongly correlated features left.

Scale the new combined feature (currently a placeholder: the cell below passes the filtered dataset through unchanged).

In [None]:
# NOTE(review): no scaling is performed here — `scaled_dataset` is only a
# second name bound to the same object as `encoded_dataset_filtered`
# (not a copy), so changes made through either name affect both.
scaled_dataset = encoded_dataset_filtered

# Regression
Split dataset to train and test

In [None]:
# Set feature and target subsets.
# Select the target by name rather than by column position, so the split
# does not silently pick a wrong column if the column order ever changes.
# The target is kept as a single-column DataFrame (double brackets), the same
# shape the positional slice `iloc[:, :1]` produced.
abalone_features = scaled_dataset.drop(columns=["Rings"])
abalone_target = scaled_dataset[["Rings"]]

# Get split subsets: 70 % train / 30 % test, with a fixed seed so the split
# is reproducible
X_train, X_test, y_train, y_test = model_selection.train_test_split(
    abalone_features, abalone_target, test_size=0.3, random_state=42
)

Check if split target data is balanced

In [None]:
# Re-wrap both target subsets as frames restricted to the 'Rings' column
train_df, test_df = (
    pd.DataFrame(subset, columns=["Rings"]) for subset in (y_train, y_test)
)

# Count plot of the target: train subset first, then test subset
for target_frame in (train_df, test_df):
    get_count_plot(target_frame, "Rings")

The target distributions do not look normal, but they are quite similar in the train and test subsets, so the split is balanced.

In [None]:
# Build the linear regression estimator
abalone_regression = linear_model.LinearRegression()

# 3-fold cross-validation scores, computed on the training subset only
cross_validation_score = model_selection.cross_val_score(
    estimator=abalone_regression, X=X_train, y=y_train, cv=3
)

# Fit on the whole training subset
abalone_regression.fit(X_train, y_train)

# Score the fitted model on the train subset, then on the test subset
abalone_train_score, abalone_test_score = (
    abalone_regression.score(subset_X, subset_y)
    for subset_X, subset_y in ((X_train, y_train), (X_test, y_test))
)

# Report all three results
print(f"Cross validation score: {cross_validation_score}")
print(f"Train score: {abalone_train_score:.2f}")
print(f"Test score: {abalone_test_score:.2f}")

Model looks stable.

Let's evaluate model.

In [None]:
# Get metrics for fit model.
# `evaluate_model` is a project helper from `sources`; presumably it reports
# error metrics for both the train and the test subsets — TODO confirm
# against its definition.
evaluate_model(abalone_regression, X_train, X_test, y_train, y_test)

Metrics for the test and train fit look quite similar.

In [None]:
# Get numbers from model
model = abalone_regression
intercept = model.intercept_
coefficients = model.coef_
features = model.feature_names_in_
score = model.score(X_test, y_test)

# Reduce the fitted parameters to plain scalars / a flat vector.
# The model was fitted on a single-column 2-D target, so `intercept_` is a
# 1-element array and `coef_` is 2-D. Calling `float()` on a non-0-d array is
# deprecated in NumPy >= 1.25; `.item()` extracts the single value explicitly
# (and raises if there is more than one target).
intercept_value = intercept.item()
flat_coefficients = coefficients.ravel()

# One "<coefficient> * <feature>" term per fitted feature, so the printout
# does not depend on the model having exactly two features.
terms = " + ".join(
    f"{coefficient:.2f} * {name}"
    for coefficient, name in zip(flat_coefficients, features)
)

# Print model
print(
    f"Score:\nR^2 = {score:.2f}\n\n"
    f"The model is:\ny = {intercept_value:.2f} + {terms}"
)

In [None]:
# Get density plot.
# Both inputs are flattened to 1-D and passed as `x`: the regression target
# is a single-column DataFrame and the prediction a 2-D array, and seaborn
# treats 2-D input as wide-form data, colouring by column and ignoring the
# explicit `color` argument.

# for test data
sns.kdeplot(
    x=y_test.to_numpy().ravel(),
    fill=False,
    color="r",
    label="test subset",
)

# for predicted data
sns.kdeplot(
    x=abalone_regression.predict(X_test).ravel(),
    fill=True,
    color="b",
    label="predicted",
)

# Plot
plt.title("Distribution of observations in test dataset and predicted dataset")
plt.xlabel("Rings")
plt.legend();

# Classification

Make additional imports

Encode category to numerical values

In [None]:
# Re-encode the raw data for the classification part and drop the first
# column by position in a single chained expression.
# NOTE(review): this rebinds `encoded_dataset`, replacing the frame used in
# the regression section.
encoded_dataset = get_category_encoded(data, "sex", "ordinal").iloc[:, 1:]

# Preview the first three rows
encoded_dataset.head(3)

Make data

In [None]:
# Target: the 'Rings' column as a Series
y = encoded_dataset.loc[:, "Rings"]

# Features: every remaining column
X = encoded_dataset.drop(columns=["Rings"])

Split dataset to test and train data.

In [None]:
# 70 % train / 30 % test split with a fixed seed for reproducibility
split_options = {"test_size": 0.3, "random_state": 42}
X_train, X_test, y_train, y_test = model_selection.train_test_split(
    X, y, **split_options
)

In [None]:
# Create the linear support-vector classifier and train it on the train
# subset in one step (`fit` returns the fitted estimator itself)
classifier = svm.LinearSVC(dual="auto").fit(X_train, y_train)

In [None]:
# Score model: mean accuracy of the classifier on the held-out test subset,
# displayed as the cell output.
classifier.score(X_test, y_test)

Visualize model

In [None]:
# Get density plot comparing the true and the predicted 'Rings' values

# for test data (outline only)
sns.kdeplot(
    y_test,
    fill=False,
    color="r",
    label="test subset",
)

# for predicted data (filled)
sns.kdeplot(
    classifier.predict(X_test),
    fill=True,
    color="b",
    label="predicted",
)

# Plot (title fixed: the word "and" was duplicated)
plt.title("Distribution of observations in test dataset and predicted dataset")
plt.legend();

# Summary
1. Correlated features removed.
2. New combined feature created.
3. Regression model created.
4. Additional linear classification done on all numerical features.
5. Metrics looked poor.