In [1]:
# !pip install category-encoders
# !unzip pickle_objects.zip
# !unzip mercedes-benz-greener-manufacturing.zip
# !unzip train.csv.zip
# !unzip test.csv.zip

In [None]:
import os, pickle
import pandas as pd
import numpy as np
import category_encoders as ce
from xgboost import XGBRegressor
from sklearn.metrics import r2_score, make_scorer
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor, StackingRegressor, GradientBoostingRegressor
from sklearn.linear_model import SGDRegressor, RidgeCV
from sklearn.random_projection import GaussianRandomProjection, SparseRandomProjection
from sklearn.decomposition import PCA, FastICA, TruncatedSVD
import constants

In [3]:
class Submission:
    """Inference wrapper around pre-fitted, pickled preprocessing objects and a model.

    On construction, every name listed in ``constants.pickle_objects`` is loaded
    from ``<constants.pickle_objects_path>/<name>.pkl`` into ``self.pickle_objects``.
    The methods below look up the keys ``"target_encoder"``, ``"scaler"``,
    ``"pca"``, ``"ica"``, ``"tsvd"``, ``"grp"``, ``"srp"`` and ``"model"`` in
    that dict, so all of them must be present.
    """

    # Keys of the fitted projection objects; each key is also the column-name
    # prefix of the components it contributes (e.g. "pca" -> "pca_1", "pca_2", ...).
    _PROJECTION_KEYS = ("pca", "ica", "tsvd", "grp", "srp")

    def __init__(self):
        """Copy configuration from ``constants`` and unpickle every listed object."""
        self.categorical_columns = constants.categorical_columns
        self.binary_columns = constants.binary_columns
        self.final_features = constants.final_features
        self.n_comp = constants.n_comp
        self.pickle_objects_path = constants.pickle_objects_path
        self.pickle_objects = dict()
        for pickle_object in constants.pickle_objects:
            pickle_file = os.path.join(self.pickle_objects_path, pickle_object + ".pkl")
            # SECURITY: pickle.load can execute arbitrary code. Only point
            # pickle_objects_path at files you produced or otherwise trust.
            with open(pickle_file, "rb") as f:
                self.pickle_objects[pickle_object] = pickle.load(f)

    def _build_features(self, dataframe):
        """Turn an already-loaded raw frame into the model's feature matrix.

        Steps: add the ``"X314 + X315"`` sum column, select the categorical and
        binary columns, apply the pickled target encoder and scaler, then append
        the first ``n_comp`` components of each projection in
        ``_PROJECTION_KEYS``. The caller's frame is not modified.

        Returns:
            2-D ``numpy.ndarray`` with the scaled input columns first, followed
            by the projection components ordered by component index and then by
            projection key.
        """
        feature_columns = self.categorical_columns + self.binary_columns
        # assign() returns a new frame, so the frame handed in stays untouched.
        dataframe = dataframe.assign(**{"X314 + X315": dataframe["X314"] + dataframe["X315"]})
        X = dataframe[feature_columns]
        X = self.pickle_objects["target_encoder"].transform(X)
        X = pd.DataFrame(self.pickle_objects["scaler"].transform(X), columns=feature_columns)
        # Every projection must be computed from the scaled features alone,
        # i.e. before any component column is appended to X.
        projections = {
            key: self.pickle_objects[key].transform(X) for key in self._PROJECTION_KEYS
        }
        # Append decomposition components to the feature frame.
        for i in range(1, self.n_comp + 1):
            for key in self._PROJECTION_KEYS:
                X[key + "_" + str(i)] = projections[key][:, i - 1]
        return X.values

    def preprocess_data(self, file_path):
        """Read the CSV at ``file_path`` and build its feature matrix.

        Returns:
            ``(X, y)`` when the file has a ``"y"`` column, otherwise just ``X``.
        """
        dataframe = pd.read_csv(file_path)
        X = self._build_features(dataframe)
        if "y" in dataframe.columns:
            return X, dataframe["y"].values
        return X

    def metric_pipeline(self, file_path):
        """Return the R2 score of the pickled model on a labelled CSV.

        Raises:
            ValueError: if the file has no ``"y"`` column.
        """
        # Read once and reuse the frame for both the column check and the features.
        dataframe = pd.read_csv(file_path)
        if "y" not in dataframe.columns:
            raise ValueError("No target variable found to compute R2-score")
        X = self._build_features(dataframe)
        y = dataframe["y"].values
        # Predictions are exponentiated before scoring; presumably the model was
        # fitted on a log-transformed target -- TODO confirm against training code.
        y_pred = np.exp(self.pickle_objects["model"].predict(X))
        return r2_score(y, y_pred)

    def inference_pipeline(self, file_path):
        """Predict for every row of the CSV at ``file_path``.

        Works whether or not the file has a ``"y"`` column; any target present
        is ignored.

        Returns:
            ``pandas.DataFrame`` with columns ``"ID"`` (copied from the file)
            and ``"y"`` (exponentiated model predictions).
        """
        # Read once and reuse the frame for both the features and the IDs.
        dataframe = pd.read_csv(file_path)
        X = self._build_features(dataframe)
        y_pred = np.exp(self.pickle_objects["model"].predict(X))
        return pd.DataFrame({"ID": dataframe["ID"].values, "y": y_pred})

In [4]:
# Instantiate the pipeline (this unpickles every object listed in
# constants.pickle_objects) and score it on train.csv. metric_pipeline
# raises if the file has no "y" column.
submission = Submission()
R2_score = submission.metric_pipeline("train.csv")
print("Train R2-score:\t", R2_score)

Train R2-score:	 0.5903056235376779


In [5]:
# Build a fresh pipeline instance (re-loading the pickled objects) and predict
# for every row of test.csv; the result has an "ID" column and a "y" column.
submission = Submission()
test_submission = submission.inference_pipeline("test.csv")
# Bare last expression so the notebook renders the first rows as a table.
test_submission.head()

Unnamed: 0,ID,y
0,1,77.519561
1,2,94.29062
2,3,77.788318
3,4,77.406914
4,5,112.470019
