# **Machine Learning Project - Pawpularity - Metadata Analysis**
### Armando Fortes (2021403383), David Pissarra (2021403381)

#### Imports ####

In [None]:
from sklearn.model_selection import RepeatedStratifiedKFold, KFold, cross_validate, train_test_split
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.naive_bayes import BernoulliNB
from sklearn.metrics import mean_squared_error
from sklearn.decomposition import PCA
from matplotlib import pyplot as plt
from catboost import CatBoostRegressor
from lightgbm import LGBMRegressor
from xgboost import XGBRegressor
import pandas as pd
import numpy as np
import optuna

#### Constants and Hyperparameters ####

In [None]:
# Relative locations of the train/test metadata CSV files.
dataset_dir = '../Dataset/'
train_metadata_path = f'{dataset_dir}train.csv'
test_metadata_path = f'{dataset_dir}test.csv'

In [None]:
# Seed shared by the seeded estimator, the hold-out split and the XGBoost search.
RANDOM_SEED = 0
# Cross-validation layout: SPLITS folds, repeated REPEATS times.
SPLITS, REPEATS = 4, 2
# Number of principal components for the PCA feature step.
PCA_COMPONENTS = 4

#### Data Loading

In [None]:
# Load the training metadata and put the rows in a fixed order (sorted by 'Id').
train_metadata = pd.read_csv(train_metadata_path)
train_metadata = train_metadata.sort_values(by='Id')

# Features are picked by position: every column from index 2 up to, but not
# including, the last column.
# NOTE(review): if column 0 is 'Id' and the last column is 'Pawpularity', then
# column 1 is silently excluded from the features -- confirm this is intended.
X_train = train_metadata.iloc[:, 2:-1]

# Regression target.
y_train = train_metadata['Pawpularity']

In [None]:
# Load the test metadata and put the rows in a fixed order (sorted by 'Id').
test_metadata = pd.read_csv(test_metadata_path)
test_metadata = test_metadata.sort_values(by='Id')

# Features are picked by position: every column from index 2 onwards.
# NOTE(review): this skips the first two columns; verify that it selects the
# same feature columns as the training slice.
X_test = test_metadata.iloc[:, 2:]

#### Data Analysis

In [None]:
def attr_distribution(df, attr, value, color="dodgerblue"):
    """Plot a Pawpularity histogram for the rows where ``df[attr] == value``.

    The histogram is drawn on the current matplotlib axes with 20 bins, and
    the subset mean is marked with a dashed vertical line plus a text label.

    Args:
        df: DataFrame holding the ``attr`` column and a 'Pawpularity' column.
        attr: Name of the column to filter on.
        value: Value of ``attr`` that selects the rows to plot.
        color: Bar colour of the histogram.
    """
    scores = df.loc[df[attr] == value, 'Pawpularity']
    scores.plot(kind='hist', bins=20, title=f'{attr} = {value}', color=color, xlabel='Pawpularity')

    mean_score = scores.mean()
    # Only the upper y-limit is needed, to place the label near the top.
    upper_ylim = plt.ylim()[1]

    plt.axvline(mean_score, color='k', linestyle='dashed', linewidth=1)
    # The label sits slightly to the right of the mean line.
    plt.text(mean_score * 1.1, upper_ylim * 0.9, 'Mean: {:.2f}'.format(mean_score))

In [None]:
# Grid of Pawpularity histograms: for each plotted attribute, one panel for the
# rows where it equals 0 (orange) and one for the rows where it equals 1.
fig = plt.figure(figsize=(13, 13))
columns = 4

# Every column is plotted except 'Id', 'Pawpularity' and 'Bin'.
plotted_attrs = [
    attr for attr in train_metadata.columns
    if attr not in ('Id', 'Pawpularity', 'Bin')
]

# Two panels per attribute; derive the row count (ceiling division) so the
# grid always has room instead of relying on a hard-coded number of rows.
rows = (2 * len(plotted_attrs) + columns - 1) // columns

# The counter used to be called `set`, which shadowed the builtin for every
# later cell; enumerate() removes the need for a manual counter.
for pair_idx, attr in enumerate(plotted_attrs):
    first_panel = 2 * pair_idx + 1  # subplot indices are 1-based
    fig.add_subplot(rows, columns, first_panel)
    attr_distribution(train_metadata, attr, 0, color='orange')
    fig.add_subplot(rows, columns, first_panel + 1)
    attr_distribution(train_metadata, attr, 1)

plt.tight_layout()

#### PCA Features

In [None]:
# Fit PCA on the training features only and append the projected components
# to both feature tables as extra columns named 'pca_0', 'pca_1', ...
# The component count comes from PCA_COMPONENTS instead of a duplicated literal.
pca = PCA(n_components=PCA_COMPONENTS)
pca.fit(X_train)

# Build each component frame on the original row index so join() aligns rows.
train_components = pd.DataFrame(pca.transform(X_train), index=X_train.index).add_prefix('pca_')
test_components = pd.DataFrame(pca.transform(X_test), index=X_test.index).add_prefix('pca_')

X_train = X_train.join(train_components)
X_test = X_test.join(test_components)

In [None]:
# Display the training feature table, including the appended 'pca_' columns.
X_train

#### Model

In [None]:
# Names of the estimators compared in this notebook.
names = (
    # Gradient-boosted tree ensembles
    'XGBRegressor', 'LGBMRegressor', 'CatBoostRegressor', 'GradientBoostingRegressor',
    # Other baselines
    'KNeighborsRegressor', 'BernoulliNB',
)

In [None]:
# Candidate estimators, keyed by their class name.
# NOTE(review): BernoulliNB is a classifier, so it treats every distinct
# target value as a class -- confirm this baseline is intended.
_candidate_estimators = [
    XGBRegressor(n_jobs=-1),
    LGBMRegressor(),
    CatBoostRegressor(iterations=900, depth=5, learning_rate=0.05, loss_function='RMSE'),
    GradientBoostingRegressor(random_state=RANDOM_SEED),
    KNeighborsRegressor(n_neighbors=10),
    BernoulliNB(),
]
models = {type(estimator).__name__: estimator for estimator in _candidate_estimators}

In [None]:
# Cross-validate every candidate model, keep the per-fold fitted estimators,
# and report the CV RMSE together with the fold-averaged test predictions.
trained = {}

# One splitter is shared by all models: it does not depend on the model, and
# seeding it with RANDOM_SEED (instead of a literal 0) gives every model the
# same folds.
cv = RepeatedStratifiedKFold(n_splits=SPLITS, n_repeats=REPEATS, random_state=RANDOM_SEED)

for name, model in models.items():
    scores = cross_validate(
        model, X_train, y_train,
        scoring='neg_mean_squared_error',
        cv=cv,
        return_estimator=True,
        n_jobs=-1,
    )

    # One fitted estimator per fold/repeat; reused later for in-sample checks.
    fold_estimators = scores['estimator']
    trained[name] = fold_estimators

    # Test prediction = plain average over all fold estimators.
    predictions = np.mean([estimator.predict(X_test) for estimator in fold_estimators], axis=0)

    # The scorer returns negated MSE, so flip the sign before the square root.
    rmse = np.sqrt(-scores['test_score'])

    # Centre the model name in a banner roughly 80 characters wide.
    banner = '=' * ((80 - len(name)) // 2)
    print(banner, name, banner)
    print('RMSE:', '{0:.4f}'.format(np.mean(rmse)), 'std:', '{0:.4f}'.format(np.std(rmse)))
    print('Predictions:', np.round(predictions), 'std:', '{0:.4f}'.format(np.std(predictions)))

In [None]:
# Average each model's fold estimators on the training features and print the
# resulting RMSE against the training target.
# NOTE: the estimators in `trained` were fitted on folds of this same
# X_train, so this is an in-sample figure, not a generalisation estimate.
#
# The result dict is built from `trained` itself rather than from a
# hand-copied list of model names, so it cannot drift out of sync with the
# models that were actually fitted.
predictions = {}

for name, instances in trained.items():
    predictions[name] = np.mean([instance.predict(X_train) for instance in instances], axis=0)
    # scikit-learn metrics take (y_true, y_pred) in that order.
    print(name, np.sqrt(mean_squared_error(y_train.to_numpy(), predictions[name])))

In [None]:
def xgboost_objective(trial):
    """Optuna objective: validation RMSE of an XGBRegressor for one trial.

    A fixed 80/20 hold-out split of ``X_train``/``y_train`` (seeded with
    ``RANDOM_SEED``) is used, so every trial is scored on the same rows.

    Args:
        trial: The ``optuna`` trial used to sample the hyperparameters.

    Returns:
        RMSE of the fitted model on the validation split (lower is better).
    """
    X_train_split, X_valid_split, y_train_split, y_valid_split = train_test_split(
        X_train, y_train, test_size=0.2, random_state=RANDOM_SEED
    )

    params = {
        'max_depth': trial.suggest_categorical('max_depth', [1, 2, 3, 4]),
        'n_estimators': trial.suggest_categorical('n_estimators', [200, 300, 400, 500]),
        'min_child_weight': trial.suggest_categorical('min_child_weight', [1, 2, 3, 4]),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.4, 0.7),
        'subsample': trial.suggest_float('subsample', 0.4, 0.7),
        'eta': trial.suggest_float('eta', 0.05, 0.5),
        'objective': 'reg:squarederror',
        'seed': RANDOM_SEED
    }

    # `eval_metric` and `early_stopping_rounds` are estimator parameters:
    # passing them to fit() was deprecated in XGBoost 1.6 and later removed,
    # so they are set on the constructor (requires xgboost >= 1.6).
    # NOTE(review): a patience of 400 rounds can only trigger when
    # n_estimators is 500 -- confirm the intended patience.
    model = XGBRegressor(
        **params,
        n_jobs=-1,
        eval_metric='rmse',
        early_stopping_rounds=400,
    )

    # The last eval_set entry (the validation split) drives early stopping.
    model.fit(
        X_train_split,
        y_train_split,
        eval_set=[(X_train_split, y_train_split), (X_valid_split, y_valid_split)],
        verbose=False,
    )

    # scikit-learn metrics take (y_true, y_pred) in that order.
    return np.sqrt(mean_squared_error(y_valid_split.to_numpy(), model.predict(X_valid_split)))

# Search for the XGBoost hyperparameters that minimise the validation RMSE.
# The sampler is seeded with RANDOM_SEED so the sequence of trials is
# reproducible; TPESampler is Optuna's default, so only the seed changes.
study = optuna.create_study(
    direction='minimize',
    sampler=optuna.samplers.TPESampler(seed=RANDOM_SEED),
)
study.optimize(xgboost_objective, n_trials=1000)