# **Machine Learning Project - Pawpularity**
### Armando Fortes (2021403383), David Pissarra (2021403381)

#### Imports ####

In [None]:
from sklearn.model_selection import RepeatedStratifiedKFold, cross_validate
from sklearn.ensemble import AdaBoostRegressor,GradientBoostingRegressor
from sklearn.ensemble import ExtraTreesRegressor, RandomForestRegressor
from sklearn.linear_model import Ridge, ElasticNet
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.naive_bayes import BernoulliNB
from sklearn.decomposition import PCA
from matplotlib import pyplot as plt
from catboost import CatBoostRegressor
from lightgbm import LGBMRegressor
from xgboost import XGBRegressor
import pandas as pd
import numpy as np

#### Constants and Hyperparameters ####

In [None]:
# Metadata CSV locations, expressed relative to the working directory.
dataset_dir = '../Dataset/'
train_metadata_path = f'{dataset_dir}train.csv'
test_metadata_path = f'{dataset_dir}test.csv'

In [None]:
# Number of folds per repetition (passed as n_splits to the CV splitter).
SPLITS = 4
# Number of times the K-fold procedure is repeated (passed as n_repeats).
REPEATS = 2
# Seed handed as random_state to several of the estimators below.
RANDOM_STATE = 0
# Value passed to the PCA constructor in the pre-processing step.
PCA_VARIANCE_RETAINED = 0.9

#### Data Loading

In [None]:
# Read the training metadata and order the rows by Id.
train_metadata = pd.read_csv(train_metadata_path)
train_metadata = train_metadata.sort_values(by='Id')

# Target column.
y_train = train_metadata['Pawpularity']

# Features are picked by position: everything from column index 2 up to, but
# excluding, the last column.
# NOTE(review): this skips the first TWO columns, not only the first one, and
# assumes the target is the last column - confirm against the CSV header that
# the column at index 1 is meant to be left out.
X_train = train_metadata.iloc[:, 2:-1]

In [None]:
# Read the test metadata and order the rows by Id.
test_metadata = pd.read_csv(test_metadata_path)
test_metadata = test_metadata.sort_values(by='Id')

# Features are picked by position: every column from index 2 onwards.
# NOTE(review): this skips the first TWO columns - confirm against the CSV
# header that the column at index 1 is meant to be left out.
X_test = test_metadata.iloc[:, 2:]

#### Data Analysis

In [None]:
def attr_distribution(df, attr, value, color="dodgerblue"):
    """Plot the Pawpularity histogram of the rows where ``df[attr] == value``.

    The histogram uses 20 bins and is titled ``'<attr> = <value>'``. A dashed
    vertical line and a text label mark the mean of the selected scores.

    Args:
        df: DataFrame containing the ``attr`` column and a 'Pawpularity' column.
        attr: Name of the column to filter on.
        value: Value ``attr`` must equal for a row to be included.
        color: Colour of the histogram bars.
    """
    selected_scores = df.loc[df[attr] == value, 'Pawpularity']
    mean_score = selected_scores.mean()

    selected_scores.plot(
        kind='hist',
        bins=20,
        title=f'{attr} = {value}',
        color=color,
        xlabel='Pawpularity',
    )

    # Annotate the mean: the label sits 10% to the right of the line and at
    # 90% of the upper y-limit reported by pyplot.
    upper_ylim = plt.ylim()[1]
    plt.axvline(mean_score, color='k', linestyle='dashed', linewidth=1)
    plt.text(mean_score * 1.1, upper_ylim * 0.9, 'Mean: {:.2f}'.format(mean_score))

In [None]:
# Grid of Pawpularity histograms: for every attribute, the distribution for
# value 0 (orange) is drawn next to the distribution for value 1.
fig = plt.figure(figsize=(13, 13))
columns = 4

# Every metadata column except 'Id', 'Pawpularity' and (if present) 'Strat'
# is plotted.
plot_attrs = [
    attr for attr in train_metadata.columns
    if attr not in ('Id', 'Pawpularity', 'Strat')
]

# Each attribute needs two subplots; round up so the grid always has enough
# rows instead of relying on a hard-coded row count.
rows = (2 * len(plot_attrs) + columns - 1) // columns

# A dedicated index name is used here: the previous counter was called `set`,
# which replaced the builtin `set` for every cell executed afterwards.
for position, attr in enumerate(plot_attrs):
    subplot_index = 2 * position + 1  # add_subplot positions are 1-based
    fig.add_subplot(rows, columns, subplot_index)
    attr_distribution(train_metadata, attr, 0, color='orange')
    fig.add_subplot(rows, columns, subplot_index + 1)
    attr_distribution(train_metadata, attr, 1)

plt.tight_layout()

#### Data Pre-processing (based on PCA)

In [None]:
# A fractional n_components asks PCA for the smallest number of principal
# components whose explained variance reaches PCA_VARIANCE_RETAINED.
# The projection is fitted on the training features only and then applied to
# both the training and the test features.
pca = PCA(n_components=PCA_VARIANCE_RETAINED).fit(X_train)
X_train = pca.transform(X_train)
X_test = pca.transform(X_test)

#### Model

In [None]:
# Candidate models, keyed by the label printed in the evaluation report.
# The stochastic scikit-learn tree models are seeded with RANDOM_STATE, like
# the other seeded entries, so that repeated runs of the comparison give the
# same scores.
models = {
    'GradientBoostingRegressor': GradientBoostingRegressor(random_state=RANDOM_STATE),
    'DecisionTreeRegressor': DecisionTreeRegressor(random_state=RANDOM_STATE),
    'RandomForestRegressor': RandomForestRegressor(
        n_estimators=50, max_depth=3, n_jobs=-1, random_state=RANDOM_STATE),
    'ExtraTreesRegressor': ExtraTreesRegressor(random_state=RANDOM_STATE),
    'KNeighborsRegressor': KNeighborsRegressor(n_neighbors=2),
    'AdaBoostRegressor': AdaBoostRegressor(random_state=RANDOM_STATE, n_estimators=100),
    'CatBoostRegressor': CatBoostRegressor(
        iterations=900, depth=5, learning_rate=0.05, loss_function='RMSE'),
    # NOTE(review): BernoulliNB is a classifier, so it treats every distinct
    # target value as a class label - confirm it belongs in a regression
    # comparison.
    'BernoulliNaiveBayes': BernoulliNB(),
    'LGBMRegressor': LGBMRegressor(),
    'XGBRegressor': XGBRegressor(n_jobs=-1),
    'ElasticNet': ElasticNet(random_state=RANDOM_STATE),
    'RidgeRegressor': Ridge(alpha=2.0),
}

# Cross-validate every candidate model and report its RMSE statistics.
# The splitter does not depend on the model, so it is built once; it is seeded
# with RANDOM_STATE so every model is scored on identical folds.
cv = RepeatedStratifiedKFold(n_splits=SPLITS, n_repeats=REPEATS, random_state=RANDOM_STATE)

for name, model in models.items():
    scores = cross_validate(
        model, X_train, y_train,
        scoring='neg_mean_squared_error',
        cv=cv,
        return_estimator=True,
        n_jobs=-1,
    )

    # The scores are negated MSE values, so the largest score marks the fold
    # estimator with the lowest validation error; predict with that one
    # rather than with whichever estimator happens to be first.
    best_fold = int(np.argmax(scores['test_score']))
    predictions = scores['estimator'][best_fold].predict(X_test)

    # Flip the sign back to MSE before taking the square root.
    rmse = np.sqrt(-scores['test_score'])

    print('='*30, name, '='*30)
    print('RMSE Mean:', f'{np.mean(rmse):.4f}')
    print('RMSE Std:', f'{np.std(rmse):.4f}')
    print('Predictions Std:', f'{np.std(predictions):.4f}')

#### Submission ####