In [1]:
import pandas as pd
import numpy as np
from sklearn.linear_model import (
    LinearRegression,
    Ridge,
    HuberRegressor,
    TheilSenRegressor,
    RANSACRegressor
)
from sklearn.model_selection import (
    cross_val_score,
    KFold,
    train_test_split
)
from sklearn.metrics import mean_absolute_error as MAE
from sklearn.ensemble import StackingRegressor
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import (
    RobustScaler, 
    LabelEncoder, 
    PolynomialFeatures
)
from sklearn.feature_selection import RFECV
import warnings
warnings.filterwarnings('ignore')


In [2]:
# Load the two input files: `df` carries the 'Age' target used for training,
# `test` is the frame predictions are later produced for.
df = pd.read_csv('data/train.csv')
test = pd.read_csv('data/test.csv')

# Any 'Diameter' entry in the Sex column is treated as 'I'
# (presumably a data-entry artifact in the training file -- TODO confirm).
df['Sex'] = df['Sex'].replace('Diameter', 'I')

# Numeric codes for Sex. `.map` always yields a numeric column; a label outside
# the mapping becomes NaN and is rejected right here instead of surviving as a
# string and failing later inside scikit-learn with a less helpful message.
sex_codes = df['Sex'].map({'I': 0, 'M': 1, 'F': 2})
if sex_codes.isna().any():
    unexpected = sorted(df.loc[sex_codes.isna(), 'Sex'].astype(str).unique())
    raise ValueError(f"Unexpected values in train 'Sex' column: {unexpected}")

# Feature matrix: everything except the identifier, the raw Sex label and the
# target, with the numeric Sex code appended as the last column.
X = df.drop(columns=['id', 'Sex', 'Age'])
X['Sex_num'] = sex_codes
y = df['Age']


In [3]:
# Expand X with all degree-2 terms (squares and pairwise products);
# include_bias=False leaves out the constant column.
poly_converter = PolynomialFeatures(degree=2, include_bias=False)
# fit_transform fits the converter on X and returns the expanded matrix in one
# call, so `poly_features` is never temporarily bound to the transformer object.
poly_features = poly_converter.fit_transform(X)

In [4]:
# Recursive feature elimination with cross-validation on the polynomial
# features: a Ridge model is refit while features are dropped one at a time
# (step=1), and each feature count is scored by negative MAE over 5 shuffled,
# seeded folds.
kf = KFold(n_splits=5, shuffle=True, random_state=0)
model_r = Ridge()

rfecv = RFECV(
    estimator=model_r,
    step=1,
    cv=kf,
    scoring='neg_mean_absolute_error',
    n_jobs=8,
)
rfecv.fit(poly_features, y)

# Number of features kept, per-feature ranking, and the boolean keep-mask.
optimal_features, feature_ranking, important_features = (
    rfecv.n_features_,
    rfecv.ranking_,
    rfecv.support_,
)

optimal_features, feature_ranking, important_features

(40,
 array([1, 1, 1, 1, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 4, 1, 1, 1, 5, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1]),
 array([ True,  True,  True,  True, False,  True,  True,  True,  True,
         True,  True,  True,  True,  True,  True,  True,  True,  True,
         True,  True,  True,  True,  True,  True,  True,  True,  True,
         True,  True, False,  True,  True,  True, False,  True,  True,
         True,  True,  True, False,  True,  True,  True,  True]))

In [5]:
# Shape of the full polynomial feature matrix, before selection.
print(poly_features.shape)

# Wrap the matrix in a DataFrame and keep only the columns RFECV selected.
poly_df = pd.DataFrame(poly_features)
kept_columns = poly_df.columns[important_features]
poly_features_rfe = poly_df[kept_columns]

# Shape after selection.
poly_features_rfe.shape

(15000, 44)


(15000, 40)

In [6]:
# Build the test feature matrix with the same steps used for training.

# Same Sex handling as the training frame: 'Diameter' is folded into 'I'
# (presumably a data-entry artifact -- TODO confirm it can occur in test),
# then labels are mapped to numeric codes. Unknown labels fail loudly here.
test_sex = test['Sex'].replace('Diameter', 'I').map({'I': 0, 'M': 1, 'F': 2})
if test_sex.isna().any():
    unexpected_test = sorted(test.loc[test_sex.isna(), 'Sex'].astype(str).unique())
    raise ValueError(f"Unexpected values in test 'Sex' column: {unexpected_test}")

test_for = test.drop(columns=['id', 'Sex'])
test_for['Sex_num'] = test_sex
# Enforce the training column order: the polynomial expansion and the RFECV
# mask are positional, so a different order would silently select the wrong
# features. A missing column raises KeyError here.
test_for = test_for[X.columns]

# Reuse the converter fitted on the training data rather than fitting a second
# one on the test set, so train and test are guaranteed to share one layout.
poly_converter2 = poly_converter  # alias kept so the original name still resolves
poly_features2 = poly_converter.transform(test_for)

# Apply the RFECV keep-mask computed on the training features.
poly2_df = pd.DataFrame(poly_features2)
poly_features2_rfe = poly2_df.loc[:, important_features]
poly_features2_rfe.shape

(10000, 40)

In [7]:
# Seed for the estimators that use randomness, so that a fresh
# "Restart & Run All" reproduces the same predictions. Same value as the
# KFold seed used for feature selection.
RANDOM_STATE = 0


def _robust_pipeline(estimator):
    """Return a Pipeline that applies RobustScaler and then fits `estimator`.

    The step names ('scaler', 'model') are the same for every pipeline.
    """
    return Pipeline([
        ('scaler', RobustScaler()),
        ('model', estimator),
    ])


linear_pipeline = _robust_pipeline(LinearRegression())
ridge_pipeline = _robust_pipeline(Ridge())
huber_pipeline = _robust_pipeline(HuberRegressor())
# TheilSenRegressor and RANSACRegressor draw random subsets of the samples;
# without a fixed random_state every run yields different fitted models.
theilsen_pipeline = _robust_pipeline(TheilSenRegressor(random_state=RANDOM_STATE))
ransac_pipeline = _robust_pipeline(RANSACRegressor(random_state=RANDOM_STATE))

In [8]:
# Stack the five scaled linear-family pipelines; their out-of-fold predictions
# (5-fold CV) are the inputs of a HuberRegressor meta-model.
base_estimators = [
    ('linear', linear_pipeline),
    ('ridge', ridge_pipeline),
    ('huber', huber_pipeline),
    ('theilsen', theilsen_pipeline),
    ('ransac', ransac_pipeline),
]
meta_model = HuberRegressor()

stacking_regressor = StackingRegressor(
    estimators=base_estimators,
    final_estimator=meta_model,
    cv=5,
)

# Train on the selected training features, then predict the selected test features.
stacking_regressor.fit(poly_features_rfe, y)
y_stacking = stacking_regressor.predict(poly_features2_rfe)

In [9]:
# Quick check: sum of the test predictions after rounding each one to the
# nearest integer (no offset applied here).
np.round(y_stacking).sum()

96252.0

In [10]:
# Predictions are shifted down by 0.1 and then rounded to whole numbers.
# NOTE(review): the 0.1 offset is undocumented -- presumably a deliberate
# downward bias chosen for the evaluation metric; confirm before reuse.
rounded_age = np.round(y_stacking - 0.1)

# One row per test id, written without the DataFrame index.
csv_data = pd.DataFrame({'id': test['id'], 'age': rounded_age})
csv_data.to_csv('final_test_wfd.csv', index=False)