# Import libraries

In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, Lasso, Ridge, HuberRegressor, TheilSenRegressor
from sklearn.preprocessing import PolynomialFeatures, OrdinalEncoder, RobustScaler
from sklearn.ensemble import StackingRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_predict
import warnings
# NOTE(review): this silences every warning for the rest of the notebook, which
# would include any convergence or chained-assignment warnings raised by the
# cells below -- consider narrowing the filter to specific categories.
warnings.filterwarnings("ignore")


# Load data

In [4]:
# Read both splits from the working directory, using the first CSV column as
# the row index of each frame.
train, test = (
    pd.read_csv(csv_path, index_col=0)
    for csv_path in ("train.csv", "test.csv")
)

# Feature Engineering

## Creating new features

In [5]:
# Smallest Length in each split; every test row that sits exactly at the test
# minimum is moved to the train minimum.
min_length_train, min_length_test = train["Length"].min(), test["Length"].min()
test["Length"] = test["Length"].replace(
    to_replace=min_length_test, value=min_length_train
)

# A recorded Height of exactly 0 is swapped for 0.0125 in both splits.
for split in (test, train):
    split['Height'] = split['Height'].replace(to_replace=0, value=0.0125)

In [6]:
def create_new_columns(df):
    """Append nine derived ratio/product/sum columns to ``df`` in place.

    Reads the columns ``Length``, ``Diameter``, ``Height``, ``Weight``,
    ``Shucked Weight``, ``Viscera Weight`` and ``Shell Weight``; every new
    column is computed from those inputs only, never from another derived
    column.  Nothing is returned -- the caller's frame is mutated.
    """
    length, diameter, height = df['Length'], df['Diameter'], df['Height']
    whole = df['Weight']
    shucked, viscera, shell = (
        df['Shucked Weight'], df['Viscera Weight'], df['Shell Weight']
    )

    # Insertion order of this dict is the order in which the columns are added.
    derived = {
        'Weight_to_Length': whole / length,
        'Volume': length * diameter * height,
        'Shucked_Weight_Ratio': shucked / whole,
        'Shell_Weight_Ratio': shell / whole,
        'Shell_Weight_to_Length_Ratio': shell / length,
        'Shell_Weight_to_Diameter_Ratio': shell / diameter,
        'Total_Weight_Components': shucked + viscera + shell,
        'Shucked_to_Shell_Weight_Ratio': shucked / shell,
        'Viscera_to_Shell_Weight_Ratio': viscera / shell,
    }
    for column_name, values in derived.items():
        df[column_name] = values

# Add the engineered columns to both splits; each frame is modified in place.
for split_frame in (train, test):
    create_new_columns(split_frame)


## "Sex" feature

In [7]:
# Keep only the rows whose Sex is not the literal string "Diameter"
# (presumably a malformed/misaligned record -- TODO confirm against the raw CSV).
# The explicit .copy() makes `train` an independent frame, so the column
# assignments performed on it in later cells are not writes to a slice of the
# previous frame (the pattern pandas reports as SettingWithCopyWarning, which
# the global warnings filter would otherwise hide).
train = train.loc[train["Sex"] != "Diameter"].copy()

In [8]:
def OrdEnc(data, col_1, col_2):
    """Add an ordinal encoding of ``col_1`` to ``data`` as ``<col_1>_encoded``.

    Categories are ranked by the mean of ``col_2`` within each category, lowest
    mean first.  The encoder's 0-based codes are shifted by one, so the stored
    codes start at 1.  ``data`` is modified in place.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame holding both ``col_1`` and ``col_2``.
    col_1 : str
        Name of the categorical column to encode.
    col_2 : str
        Name of the column whose per-category mean defines the category order.

    Returns
    -------
    OrdinalEncoder
        The encoder fitted on ``data``, so the same category order can be
        applied to another frame with ``transform``.  Callers that ignore the
        return value behave exactly as before.
    """
    categ = list(data.groupby([col_1])[col_2].mean().sort_values().index)
    oe = OrdinalEncoder(categories=[categ])
    data[col_1 + "_encoded"] = oe.fit_transform(data[[col_1]]) + 1
    return oe


# Fit the encoder once, on train, and reuse that fitted encoder for test rather
# than recomputing the category order and fitting a second encoder on the test
# frame.  With the default handle_unknown setting, a Sex value in test that
# never occurs in train raises ValueError.
oe = OrdEnc(train, "Sex", "Age")
categ = list(oe.categories_[0])  # train-derived category order, lowest mean Age first
test["Sex" + "_encoded"] = oe.transform(test[["Sex"]]) + 1

# Training and validating the model locally

In [9]:
# Target and numeric predictors for the local hold-out evaluation.
y = train['Age']
X = train.select_dtypes('number').drop(columns="Age")

# Hold out 25% of the rows; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=42
)

# Base learners for the stack.  Every pipeline has the same shape:
# RobustScaler -> PolynomialFeatures -> linear-family regressor.
# Ridge and plain least squares get a degree-2 expansion; the other three keep
# degree 1.

pipeline_ridge = Pipeline([
    ('scaler', RobustScaler()),
    ('poly', PolynomialFeatures(degree=2)),
    ('ridge', Ridge(alpha=16))
])

pipeline_lasso = Pipeline([
    ('scaler', RobustScaler()),
    ('poly', PolynomialFeatures(degree=1)),
    ('lasso', Lasso(alpha=0.00093))
])

pipeline_huber = Pipeline([
    ('scaler', RobustScaler()),
    ('poly', PolynomialFeatures(degree=1)),
    ('huber', HuberRegressor(epsilon=1.35, alpha=0.001))
])

pipeline_theilsen = Pipeline([
    ('scaler', RobustScaler()),
    ('poly', PolynomialFeatures(degree=1)),
    # Theil-Sen draws random subsamples of the rows; without a seed the fitted
    # coefficients (and therefore the reported MAE) change from run to run.
    ('theilsen', TheilSenRegressor(random_state=42))
])

pipeline_linear = Pipeline([
    ('scaler', RobustScaler()),
    ('poly', PolynomialFeatures(degree=2)),
    ('linear', LinearRegression())
])


# (name, estimator) pairs in the form StackingRegressor expects.
estimators = [
    ('ridge', pipeline_ridge),
    ('lasso', pipeline_lasso),
    ('huber', pipeline_huber),
    ('theilsen', pipeline_theilsen),
    ('linear', pipeline_linear)
]

# Stack the base learners under a Huber meta-regressor and fit on the training
# part of the hold-out split.
meta_learner = HuberRegressor(alpha=0.001)
stacking_model = StackingRegressor(
    estimators=estimators,
    final_estimator=meta_learner,
)
stacking_model.fit(X_train, y_train)

# Score on the held-out 25%.
y_pred = stacking_model.predict(X_test)
holdout_mae = mean_absolute_error(y_test, y_pred)
print("Test MAE for Stacking Model:", holdout_mae)


Test MAE for Stacking Model: 1.301562693351867


# Train the final model and predict on the test set

In [10]:
# The final fit uses every training row: all columns except the target and the
# raw Sex column (its ordinal encoding is the Sex_encoded column).
y_train = train['Age']
X_train = train.drop(columns=['Age', "Sex"])

# Select the test features by the training column names, so both frames are
# guaranteed to carry the same features in the same order.  A training column
# that is missing from `test` raises KeyError here instead of failing (or being
# silently misaligned) at prediction time.
X_test = test.loc[:, X_train.columns].copy()

In [11]:
# Base learners for the final stack.  Every pipeline has the same shape:
# RobustScaler -> PolynomialFeatures -> linear-family regressor.
# Ridge and plain least squares get a degree-2 expansion; the other three keep
# degree 1.

pipeline_ridge = Pipeline([
    ('scaler', RobustScaler()),
    ('poly', PolynomialFeatures(degree=2)),
    ('ridge', Ridge(alpha=16))
])

pipeline_lasso = Pipeline([
    ('scaler', RobustScaler()),
    ('poly', PolynomialFeatures(degree=1)),
    ('lasso', Lasso(alpha=0.00093))
])

pipeline_huber = Pipeline([
    ('scaler', RobustScaler()),
    ('poly', PolynomialFeatures(degree=1)),
    ('huber', HuberRegressor(epsilon=1.35, alpha=0.001))
])

pipeline_theilsen = Pipeline([
    ('scaler', RobustScaler()),
    ('poly', PolynomialFeatures(degree=1)),
    # Theil-Sen draws random subsamples of the rows; seeding it makes the
    # submitted predictions reproducible across kernel restarts.
    ('theilsen', TheilSenRegressor(random_state=42))
])

pipeline_linear = Pipeline([
    ('scaler', RobustScaler()),
    ('poly', PolynomialFeatures(degree=2)),
    ('linear', LinearRegression())
])

# (name, estimator) pairs in the form StackingRegressor expects.
estimators = [
    ('ridge', pipeline_ridge),
    ('lasso', pipeline_lasso),
    ('huber', pipeline_huber),
    ('theilsen', pipeline_theilsen),
    ('linear', pipeline_linear)
]

# Fit the stack (Huber meta-regressor on top of the base learners) on the
# current X_train / y_train, then predict for X_test.
meta_learner = HuberRegressor(alpha=0.001)
stacking_model = StackingRegressor(
    estimators=estimators,
    final_estimator=meta_learner,
)
stacking_model.fit(X_train, y_train)

y_pred = stacking_model.predict(X_test)

In [12]:
# Build the submission frame.  Only the "id" column is parsed from the test
# file (the other columns are not needed here).  Predictions are attached by
# position, which relies on no test rows having been dropped or reordered
# since the file was first loaded.
df = pd.read_csv("test.csv", usecols=["id"])
df["Age"] = y_pred
df

Unnamed: 0,id,Age
0,15000,6.364673
1,15001,8.877134
2,15002,5.448133
3,15003,8.193297
4,15004,6.506409
...,...,...
9995,24995,8.639782
9996,24996,7.885621
9997,24997,10.125700
9998,24998,8.997649


In [13]:
# Save the submission; index=False keeps the frame's row index out of the file.
df.to_csv(path_or_buf="my_submission.csv", index=False)