In [2]:
import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import RobustScaler
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import  train_test_split, GridSearchCV
from sklearn.linear_model import LinearRegression, Lasso, Ridge, HuberRegressor
from sklearn.preprocessing import  PolynomialFeatures,StandardScaler
from sklearn.ensemble import StackingRegressor
import warnings
warnings.filterwarnings("ignore")


### Malumotlarni oqib olish
### Load the data

In [3]:
# Read the three competition CSV files from the working directory.
train_data = pd.read_csv(filepath_or_buffer="train.csv")
test_data = pd.read_csv(filepath_or_buffer="test.csv")
sample_data = pd.read_csv(filepath_or_buffer="sample_submission.csv")


### Kategorik malumotlarni numerik malumotlarga ozgartirib olish
### Convert categorical data to numerical data

In [4]:
# Integer codes for the categorical 'Sex' column, shared by train and test so
# both frames are guaranteed to use the same encoding.
# NOTE(review): 'Diameter' is the name of a feature column, not an obvious Sex
# category; the entry is kept so the encoding is unchanged — confirm whether it
# can be dropped.
SEX_CODES = {'I': 1, 'F': 2, 'M': 3, 'Diameter': 4}

for frame in (train_data, test_data):
    # Series.map turns every value that is missing from the mapping into NaN,
    # so re-running this cell on an already-encoded column would wipe it out.
    # Only encode while the column still holds the raw (non-numeric) labels.
    if not pd.api.types.is_numeric_dtype(frame['Sex']):
        frame['Sex'] = frame['Sex'].map(SEX_CODES)

### Bor ustunlardan foydalanib yangi ustunlarni yaratib olish
### Create new columns using existing columns

In [5]:
def feature_engineering(df):
    """Return a copy of ``df`` with four derived columns added.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the numeric columns 'Length', 'Diameter', 'Height' and
        'Weight'.

    Returns
    -------
    pandas.DataFrame
        A new frame; ``df`` itself is not modified. Added columns:

        * 'Surface Area' - 2 * (Length*Diameter + Length*Height + Diameter*Height)
        * 'Volume'       - Length * Diameter * Height
        * 'Log Weight'   - log(1 + Weight) where Weight > 0, NaN otherwise
        * 'Length Bins'  - integer quartile code of 'Length' from ``pd.qcut``;
          the cut points are taken from the rows of ``df`` itself, so two
          different frames generally get different bin edges.

    Raises
    ------
    KeyError
        If one of the required columns is missing.
    ValueError
        Raised by ``pd.qcut`` when the quartile edges of 'Length' are not unique.
    """
    # Work on a copy so the caller's frame is never mutated as a side effect.
    out = df.copy()

    length, diameter, height = out['Length'], out['Diameter'], out['Height']

    out['Surface Area'] = 2 * (length * diameter + length * height + diameter * height)

    out['Volume'] = length * diameter * height

    # Mask non-positive weights to NaN first, then apply log(1 + x) in one
    # vectorised call instead of a Python-level lambda per row.
    weight = out['Weight']
    out['Log Weight'] = np.log1p(weight.where(weight > 0))

    out['Length Bins'] = pd.qcut(length, q=4, labels=False)

    return out


### Yangi featureslarni qoshib olish
### Add the new features

In [6]:
train_data = feature_engineering(train_data)
test_data = feature_engineering(test_data)

# feature_engineering bins 'Length' with the quartiles of whichever frame it is
# given, so the same length could receive different codes in train and test.
# Re-bin the test rows with the training quartile edges so the feature means
# the same thing in both frames.
_, length_edges = pd.qcut(train_data['Length'], q=4, labels=False, retbins=True)
# Open the outer edges so test lengths outside the training range still fall
# into the first / last bin instead of becoming NaN.
length_edges[0], length_edges[-1] = -np.inf, np.inf
test_data['Length Bins'] = pd.cut(test_data['Length'], bins=length_edges, labels=False)

### prediction va featureslarni ustunlarini ajaratib olish
### Separate the prediction target and the feature columns

In [7]:
# Feature matrices: every column except the row id (and, for train, the target).
X_train_df = train_data.drop(columns=["id", "Age"])
X_test_df = test_data.drop(columns=["id"])

# Re-selecting train_data[X_train_df.columns] yields the same columns again,
# so these names simply refer to the frames built above.
cols_train_data = X_train_df
cols_test_data = X_test_df

X = X_train_df
# Keep the target 1-D (a Series), matching how the final-fit cell builds
# y_train; a single-column DataFrame only works with some estimators via an
# implicit conversion whose warning is silenced at the top of the notebook.
y = train_data["Age"]


### Pipeline orqali Linear regression modeli uchun eng yaxshi bolgan polynomial degree ni aniqlab olish
### Determining the best polynomial degree for a linear regression model through a pipeline

In [8]:

# Hold out 25% of the labelled rows (fixed seed) to score each degree.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, test_size=0.25
)

# Held-out MAE for polynomial degrees 1..4, stored in that order.
LinearRegression_mae = []
for degree in (1, 2, 3, 4):
    steps = [
        ('poly', PolynomialFeatures(degree=degree)),
        ('linear', LinearRegression()),
    ]
    pipeline = Pipeline(steps=steps).fit(X_train, y_train)
    y_pred = pipeline.predict(X_test)
    mae_score = mean_absolute_error(y_test, y_pred)
    LinearRegression_mae += [mae_score]
LinearRegression_mae

[np.float64(1.3977637609543163),
 np.float64(1.357627351760384),
 np.float64(1.379660555648401),
 np.float64(1.9555297120404818)]

### Pipeline orqali Huber regression modeli uchun eng yaxshi bolgan polynomial degree ni aniqlab olish
### Determining the best polynomial degree for a Huber regression model through a pipeline

In [9]:

# Same 75/25 split as the other degree sweeps (identical seed).
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, test_size=0.25
)

# Held-out MAE of a Huber regressor for polynomial degrees 1..4, in order.
HuberRegressor_mae = []
for degree in (1, 2, 3, 4):
    steps = [
        ('poly', PolynomialFeatures(degree=degree)),
        # The step keeps the name 'linear' although it holds a HuberRegressor.
        ('linear', HuberRegressor()),
    ]
    pipeline = Pipeline(steps=steps).fit(X_train, y_train)
    y_pred = pipeline.predict(X_test)
    mae_score = mean_absolute_error(y_test, y_pred)
    HuberRegressor_mae += [mae_score]
HuberRegressor_mae

[np.float64(1.3655812367170297),
 np.float64(1.5380356085139826),
 np.float64(2.80864295344847),
 np.float64(4.106638965965048)]

### Pipeline orqali Ridge regression modeli uchun eng yaxshi bolgan polynomial degree ni aniqlab olish
### Determining the best polynomial degree for a Ridge regression model through a pipeline

In [10]:

# Same 75/25 split as the other degree sweeps (identical seed).
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, test_size=0.25
)

# NOTE(review): unlike the LinearRegression/Huber sweeps, this list interleaves
# each MAE with its degree ([mae_1, 1, mae_2, 2, ...]), so min()/plotting on it
# mixes scores with degrees — confirm whether a {degree: mae} mapping was meant.
Ridge_mae = []
for degree in (1, 2, 3, 4):
    steps = [
        ('poly', PolynomialFeatures(degree=degree)),
        ('linear', Ridge(alpha=4.921544346900318832, max_iter=1000)),
    ]
    pipeline = Pipeline(steps=steps).fit(X_train, y_train)
    y_pred = pipeline.predict(X_test)
    mae_score = mean_absolute_error(y_test, y_pred)
    Ridge_mae.extend([mae_score, degree])
Ridge_mae

[np.float64(1.402195001247534),
 1,
 np.float64(1.359420710651855),
 2,
 np.float64(1.3399002942007063),
 3,
 np.float64(1.4032616803889915),
 4]

### Pipeline orqali Lasso regression modeli uchun eng yaxshi bolgan polynomial degree ni aniqlab olish
### Determining the best polynomial degree for a Lasso regression model through a pipeline

In [11]:

# Same 75/25 split as the other degree sweeps (identical seed).
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, test_size=0.25
)

# NOTE(review): the list interleaves each MAE with its degree
# ([mae_1, 1, mae_2, 2, ...]) rather than holding scores only — confirm
# whether a {degree: mae} mapping was meant.
Lasso_mae = []
for degree in (1, 2, 3, 4):
    steps = [
        ('poly', PolynomialFeatures(degree=degree)),
        # Lasso is used with its default settings here.
        ('linear', Lasso()),
    ]
    pipeline = Pipeline(steps=steps).fit(X_train, y_train)
    y_pred = pipeline.predict(X_test)
    mae_score = mean_absolute_error(y_test, y_pred)
    Lasso_mae.extend([mae_score, degree])
Lasso_mae

[np.float64(1.6646121240764733),
 1,
 np.float64(1.4777246008517657),
 2,
 np.float64(1.4109674135739405),
 3,
 np.float64(1.4084545774308461),
 4]

### pipeline orqali Ridge regression uchun eng yaxshi estimatorlarni topish
### Finding the best hyperparameters for Ridge regression with a pipeline grid search

In [12]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, test_size=0.25
)

# Cubic feature expansion -> standardisation -> ridge regression.
pipeline = Pipeline(steps=[
    ('poly', PolynomialFeatures(degree=3)),
    ('scaler', StandardScaler()),
    ('ridge', Ridge()),
])

# np.logspace takes exponents, so the alpha candidates run from 10**0.01
# (about 1.02) to 10**0.2 (about 1.58).
# NOTE(review): the recorded best alpha is the smallest candidate, which
# suggests the range may be too narrow — confirm the intended bounds.
# NOTE(review): with Ridge's default solver on dense input, max_iter is
# presumably a no-op, which would triple the number of fits — confirm.
param_grid = dict(
    ridge__alpha=np.logspace(0.01, 0.2, 20),
    ridge__max_iter=[1000, 5000, 10000],
)

# 5-fold CV on the training split, scored by (negated) mean absolute error.
grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    scoring='neg_mean_absolute_error',
    cv=5,
    n_jobs=-1,
)
grid_search.fit(X_train, y_train)

best_params = grid_search.best_params_
print(f"Eng yaxshi parametrlar: {best_params}")

# Score the refitted best pipeline on the held-out 25%.
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)
mae_score = mean_absolute_error(y_test, y_pred)
print(f"Test setdagi MAE: {mae_score}")


Eng yaxshi parametrlar: {'ridge__alpha': np.float64(1.023292992280754), 'ridge__max_iter': 1000}
Test setdagi MAE: 1.3423301645853538


### Tepada eng yaxshi deb topilgan regresiyalar uchun polynominal degrelarni qollagan holda malumotlarni scaling qilib modelga yakun yasaymiz
### We finalize the model by scaling the data and applying the polynomial degrees found to be best for each regression above

In [13]:
# Validation run: tune four polynomial pipelines on a 75% training split, stack
# them, and report the stacked model's MAE on the held-out 25%.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

# Every base pipeline is: polynomial expansion -> RobustScaler -> regressor.
# The alpha given to Ridge here is replaced by each 'ridge__alpha' candidate of
# param_grid_ridge during the grid search.
pipeline_ridge = Pipeline([
    ('poly', PolynomialFeatures(degree=3)),
    ('scaler', RobustScaler()),
    ('ridge', Ridge(alpha=0.021544346900318832, max_iter=1000))
])

pipeline_lasso = Pipeline([
    ('poly', PolynomialFeatures(degree=3)),
    ('scaler', RobustScaler()),
    ('lasso', Lasso())
])

# NOTE(review): the recorded In [8] sweep (run without a scaler) shows its
# lowest MAE at degree 2, while degree 3 is used here — confirm this is
# intentional.
pipeline_linear = Pipeline([
    ('poly', PolynomialFeatures(degree=3)),
    ('scaler', RobustScaler()),
    ('linear', LinearRegression())
])

pipeline_huber = Pipeline([
    ('poly', PolynomialFeatures(degree=1)),
    ('scaler', RobustScaler()),
    ('huber', HuberRegressor())
])

# Hyper-parameter candidates, keyed as '<step name>__<parameter>'.
param_grid_linear = {
    'linear__fit_intercept': [True, False]
}

param_grid_ridge = {
    'ridge__alpha': [0.1, 1.0, 10.0],
    'ridge__fit_intercept': [True, False]
}

param_grid_lasso = {
    'lasso__alpha': [0.01, 0.1, 1.0],
    'lasso__fit_intercept': [True, False]
}

param_grid_huber = {
    'huber__epsilon': [1.05, 1.1, 1.2],
    'huber__alpha': [0.0001, 0.001, 0.01]
}

# 5-fold CV per pipeline, scored by (negated) mean absolute error.
grid_linear = GridSearchCV(pipeline_linear, param_grid_linear, cv=5, scoring='neg_mean_absolute_error')
grid_ridge = GridSearchCV(pipeline_ridge, param_grid_ridge, cv=5, scoring='neg_mean_absolute_error')
grid_lasso = GridSearchCV(pipeline_lasso, param_grid_lasso, cv=5, scoring='neg_mean_absolute_error')
grid_huber = GridSearchCV(pipeline_huber, param_grid_huber, cv=5, scoring='neg_mean_absolute_error')

grid_linear.fit(X_train, y_train)
grid_ridge.fit(X_train, y_train)
grid_lasso.fit(X_train, y_train)
grid_huber.fit(X_train, y_train)

best_linear_model = grid_linear.best_estimator_
best_ridge_model = grid_ridge.best_estimator_
best_lasso_model = grid_lasso.best_estimator_
best_huber_model = grid_huber.best_estimator_

# Combine the four tuned pipelines; a plain linear regression blends their
# predictions.
stacking_model = StackingRegressor(
    estimators=[
        ('linear', best_linear_model),
        ('ridge', best_ridge_model),
        ('lasso', best_lasso_model),
        ('huber', best_huber_model),
    ],
    final_estimator=LinearRegression()
)

stacking_model.fit(X_train, y_train)

# Predictions are rounded to whole numbers before scoring.
stacking_pred = stacking_model.predict(X_test).round()

# mean_absolute_error is documented as (y_true, y_pred); pass the arguments in
# that order (the value itself is unaffected because MAE is symmetric).
stacking_mae = mean_absolute_error(y_test, stacking_pred)
print(f'Stacking Model MAE: {stacking_mae}')

print(f'Best parameters for Linear Regression: {grid_linear.best_params_}')
print(f'Best parameters for Ridge Regression: {grid_ridge.best_params_}')
print(f'Best parameters for Lasso Regression: {grid_lasso.best_params_}')
print(f'Best parameters for Huber Regression: {grid_huber.best_params_}')


Stacking Model MAE: 1.3112
Best parameters for Linear Regression: {'linear__fit_intercept': False}
Best parameters for Ridge Regression: {'ridge__alpha': 0.1, 'ridge__fit_intercept': True}
Best parameters for Lasso Regression: {'lasso__alpha': 0.01, 'lasso__fit_intercept': True}
Best parameters for Huber Regression: {'huber__alpha': 0.0001, 'huber__epsilon': 1.1}


### endi competition bizga bergan test malumotlarni predict qilib olamiz
### Now we predict on the test data provided by the competition

In [14]:

# Final fit: train on every labelled row and predict the competition test rows.
# Note that X_train / y_train / X_test are rebound here and no longer refer to
# the validation split used in the previous cells.
X_train = train_data.drop(columns=["id", "Age"])
y_train = train_data["Age"]
X_test = test_data.drop(columns=["id"])


def _scaled_poly_pipeline(degree, step_name, estimator):
    """Build: polynomial expansion -> RobustScaler -> ``estimator`` as ``step_name``."""
    return Pipeline([
        ('poly', PolynomialFeatures(degree=degree)),
        ('scaler', RobustScaler()),
        (step_name, estimator),
    ])


pipeline_ridge = _scaled_poly_pipeline(3, 'ridge', Ridge(alpha=0.021544346900318832, max_iter=1000))
pipeline_lasso = _scaled_poly_pipeline(3, 'lasso', Lasso())
pipeline_linear = _scaled_poly_pipeline(3, 'linear', LinearRegression())
pipeline_huber = _scaled_poly_pipeline(1, 'huber', HuberRegressor())

# Hyper-parameter candidates, keyed as '<step name>__<parameter>'.
param_grid_linear = {'linear__fit_intercept': [True, False]}
param_grid_ridge = {
    'ridge__alpha': [0.1, 1.0, 10.0],
    'ridge__fit_intercept': [True, False],
}
param_grid_lasso = {
    'lasso__alpha': [0.01, 0.1, 1.0],
    'lasso__fit_intercept': [True, False],
}
param_grid_huber = {
    'huber__epsilon': [1.05, 1.1, 1.2],
    'huber__alpha': [0.0001, 0.001, 0.01],
}

# Shared search settings: 5-fold CV scored by (negated) mean absolute error.
search_options = {'cv': 5, 'scoring': 'neg_mean_absolute_error'}
grid_linear = GridSearchCV(pipeline_linear, param_grid_linear, **search_options)
grid_ridge = GridSearchCV(pipeline_ridge, param_grid_ridge, **search_options)
grid_lasso = GridSearchCV(pipeline_lasso, param_grid_lasso, **search_options)
grid_huber = GridSearchCV(pipeline_huber, param_grid_huber, **search_options)

for search in (grid_linear, grid_ridge, grid_lasso, grid_huber):
    search.fit(X_train, y_train)

best_linear_model = grid_linear.best_estimator_
best_ridge_model = grid_ridge.best_estimator_
best_lasso_model = grid_lasso.best_estimator_
best_huber_model = grid_huber.best_estimator_

# Blend the four tuned pipelines with a plain linear regression on top.
base_estimators = [
    ('linear', best_linear_model),
    ('ridge', best_ridge_model),
    ('lasso', best_lasso_model),
    ('huber', best_huber_model),
]
stacking_model = StackingRegressor(
    estimators=base_estimators,
    final_estimator=LinearRegression(),
)
stacking_model.fit(X_train, y_train)

# Predictions are rounded to whole numbers.
stacking_pred = stacking_model.predict(X_test).round()

### predictlarni dataframega qoshib olamiz
### Add the predictions to a DataFrame

In [15]:
# Take the ids from the in-memory test frame that the predictions were made
# from, rather than re-reading test.csv, so ids and predictions always come
# from the same rows in the same order.
df = test_data[["id"]].copy()
# stacking_pred is a positional array: one prediction per test row.
df["Age"] = stacking_pred
df

Unnamed: 0,id,Age
0,15000,6.0
1,15001,9.0
2,15002,5.0
3,15003,8.0
4,15004,7.0
...,...,...
9995,24995,9.0
9996,24996,8.0
9997,24997,10.0
9998,24998,9.0


### data frameni jonatish uchun datasetga aylantiramiz
### Save the DataFrame as a CSV file for submission

In [16]:
# Write the id/Age table to disk without the pandas row-index column.
df.to_csv(path_or_buf="my_submission1.csv", index=False)