# Пути к данным

In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/cmc-ml-spotify-recommendations/sample_submission.csv
/kaggle/input/cmc-ml-spotify-recommendations/added_info.csv
/kaggle/input/cmc-ml-spotify-recommendations/audio_features.csv
/kaggle/input/cmc-ml-spotify-recommendations/simple_pipeline.ipynb
/kaggle/input/cmc-ml-spotify-recommendations/tracks_info.csv
/kaggle/input/cmc-ml-spotify-recommendations/train.csv
/kaggle/input/cmc-ml-spotify-recommendations/test.csv


# Код кодирования данных. Написан мной лично для научной работы.

In [2]:
from sklearn.datasets import load_breast_cancer
from sklearn.base import BaseEstimator, TransformerMixin
import sklearn.preprocessing as skpr
import category_encoders as ce
import numpy as np
import pandas as pd
import datetime
from copy import deepcopy
import math


class CircularEncoder(BaseEstimator, TransformerMixin):
    def __init__(self, limits=None, fit_replace=True, tol=1e-8):
        self.limits = limits
        self.fit_replace = fit_replace
        self.tol = tol
        self._shape = (0, 0)

    def __repr__(self):
        return "Circular Encoder"

    def __str__(self):
        return "Circular Encoder"

    def fit(self, X, y=None):

        # Shape setting
        self._shape = self.__set_shape(X)

        # Defining limit
        if self.fit_replace:
            self.limits = self.__set_limits(X)

        return self

    def transform(self, X, y=None):
        if self.limits is None:
            return deepcopy(X)

        # column_names only for DataFrame
        column_names = None

        # cast to numpy array
        if isinstance(X, pd.DataFrame):
            X_ndarray = X.to_numpy().reshape(self._shape)
            column_names = np.zeros(2 * X.columns.shape[0], dtype=object)
        else:
            X_ndarray = X.reshape(self._shape)

        # main encoding
        result_sin = np.sin((2 * np.pi * X_ndarray) / self.limits)
        result_cos = np.cos((2 * np.pi * X_ndarray) / self.limits)

        result_sin[np.abs(result_sin) < self.tol] = 0.0
        result_cos[np.abs(result_cos) < self.tol] = 0.0

        # combine encoded arrays
        result = np.zeros((self._shape[0], self._shape[1] * 2))
        result[:, np.arange(0, result.shape[1], 2)] = result_sin
        result[:, np.arange(1, result.shape[1], 2)] = result_cos

        # set column_names names and return result
        if column_names is not None:
            column_names[np.arange(0, result.shape[1], 2)] = np.array([f'sin_{col}' for col in X.columns])
            column_names[np.arange(1, result.shape[1], 2)] = np.array([f'cos_{col}' for col in X.columns])
            return pd.DataFrame(result, columns=column_names).infer_objects()
        else:
            return result

    @staticmethod
    def __set_shape(X):
        if len(X.shape) == 1:
            return (X.shape[0], 1)
        elif len(X.shape) > 2:
            raise ValueError(f"You need 2 dimensions instead of {len(X.shape)}")
        else:
            return X.shape

    @staticmethod
    def __set_limits(X):
        if isinstance(X, pd.DataFrame):
            return np.max(np.abs(X.to_numpy()), axis=0) + 1
        else:
            return np.max(np.abs(X), axis=0) + 1

        
class CategoricalEncoder(BaseEstimator, TransformerMixin):
    def __init__(self, cols=None, fit_replace=False, encoder='binary',
                 category_rate=0.1, rated_search=True, fast_mode=True,
                 confidence_level=0.99, worst_proportion=0.01, **encoder_params):

        self._hashing_enc_name = 'hashing'
        self._encoders_list = {'onehot': ce.OneHotEncoder,
                               'target_loo': ce.LeaveOneOutEncoder,
                               self._hashing_enc_name: ce.HashingEncoder,
                               'binary': ce.BinaryEncoder}
        self._target_encoders_list = np.array(['target_loo'])

        # stochastic approach
        eps = 1e-6
        confidence_level = max(eps, min(confidence_level, 1 - eps))
        worst_proportion = max(eps, min(worst_proportion, 1 - eps))
        self.fast_random_size = math.ceil(math.log(1 - confidence_level) / math.log(1 - worst_proportion))
        self.fast_mode = fast_mode

        self.cols = np.array(cols) if cols is not None else None
        self.fit_replace = fit_replace
        self.encoder_name = encoder
        self.category_rate = category_rate
        self.rated_search = rated_search
        self.encoder_params = encoder_params
        self._encoder = None

    def __repr__(self):
        return "CategoricalEncoder"

    def __str__(self):
        return "CategoricalEncoder"

    def fit(self, X, y=None, **fit_params):
        # defining categorical columns
        if self.cols is None or self.fit_replace:
            self.cols = self.define_cols(X)

        try:
            if not self.cols.shape[0]:
                return self
        except IndexError:
            raise IndexError("cols should be a numpy array")

        # save self.cols and get indices for X
        saved_cols = self.cols.copy()
        self.cols = self.cols_to_numeric(X)

        if not self.cols.shape[0]:
            self.cols = saved_cols
            return self

        # defining the encoder
        self.encoder_params['cols'] = X.columns[self.cols] if isinstance(X, pd.DataFrame) else self.cols
        self._encoder = self._encoders_list[self.encoder_name](**self.encoder_params)

        # transforming y if it's not numeric
        y_copy = None

        if y is not None:
            y_copy = deepcopy(y)
            if not self.is_y_approved(y_copy):
                y_copy = skpr.LabelEncoder().fit_transform(y_copy)

        # fitting the encoder
        self._encoder.fit(pd.DataFrame(X), y_copy, **fit_params)

        # self.cols backup
        self.cols = saved_cols
        return self

    def transform(self, X, y=None):
        try:
            if self.cols is None or not self.cols.shape[0] or self._encoder is None:
                return deepcopy(X)
        except IndexError:
            raise IndexError("cols should be a numpy array")

        # save self.cols and get indices for X
        saved_cols = self.cols.copy()
        self.cols = self.cols_to_numeric(X)

        if not self.cols.shape[0]:
            self.cols = saved_cols
            return deepcopy(X)

        # transforming y if it's not numeric
        y_copy = None

        if y is not None:
            y_copy = deepcopy(y)
            if not self.is_y_approved(y_copy):
                y_copy = skpr.LabelEncoder().fit_transform(y_copy)

        # checking whether it's a target encoder or not
        if self.encoder_name in self._target_encoders_list:
            result = self._encoder.transform(pd.DataFrame(X), y_copy, override_return_df=True)
        else:
            result = self._encoder.transform(pd.DataFrame(X), override_return_df=True)

        # adjusting the column names
        cols_before = self._encoder.get_feature_names_in()
        cols_after = self._encoder.get_feature_names_out()
        new_columns = self._rename_transformed_cols(cols_before, cols_after)

        result = result.rename(columns={cols_after[i]: new_columns[i] for i in range(cols_after.shape[0])})

        if isinstance(X, pd.DataFrame):
            # saving X dtypes
            not_cols = list(set(np.arange(X.shape[1])) - set(self.cols))
            saved_dtypes = X.dtypes.to_dict()
            unmodified_dtypes = dict([(X.columns[col], saved_dtypes[X.columns[col]]) for col in not_cols])

            # self.cols backup
            self.cols = saved_cols

            return result.astype(dtype=unmodified_dtypes)
        else:
            # self.cols backup
            self.cols = saved_cols

            return result.to_numpy()

    def fit_transform(self, X, y=None, **fit_params):
        # transforming y if it's not numeric
        y_copy = None

        if y is not None:
            y_copy = deepcopy(y)
            if not self.is_y_approved(y_copy):
                y_copy = skpr.LabelEncoder().fit_transform(y_copy)

        self.fit(X, y_copy, **fit_params)
        return self.transform(X, y_copy)

    def _rename_transformed_cols(self, before, after):
        if after is None:
            return np.array([])
        if self._encoder is None:
            return deepcopy(after)

        result = deepcopy(after)

        # hashing encoder unique renaming
        if self.encoder_name == self._hashing_enc_name:
            for i in range(self._encoder.n_components):
                result[i] = f'{self._hashing_enc_name}_{i}'
        else:
            # getting the columns transformed
            set_before = set(before)
            set_after = set(after)
            sample_names = set_before.intersection(set_after)

            # checking whether set is not empty
            if bool(sample_names):
                # renaming
                for num, col in enumerate(result):
                    if col not in sample_names:
                        result[num] = f'{self.encoder_name}_{result[num]}'

        return result

    def is_y_approved(self, y):
        try:
            y.astype(float)
            return True
        except (ValueError, TypeError):
            return False

    def cols_to_numeric(self, X):
        if self.cols is None or not self.cols.shape[0]:
            return np.array([])

        inds = self.cols.copy()

        try:
            inds = inds.astype(int)
            return inds
        except (ValueError, TypeError):
            if not isinstance(X, pd.DataFrame):
                return np.array([])
            else:
                columns_array = X.columns.to_numpy(dtype=object).copy()
                names_dict = dict((columns_array[i], i) for i in range(columns_array.shape[0]))
                inds = np.array([names_dict[name] for name in inds if name in names_dict])
                return inds

    def define_cols(self, X):
        # cast to pandas DataFrame with inferring object types
        if not isinstance(X, pd.DataFrame):
            X_df = pd.DataFrame(X).infer_objects()
        else:
            X_df = X.infer_objects()

        # check all the columns
        category_cols = []
        for num, col in enumerate(X_df):
            # if dtype is 'category'
            if pd.api.types.is_categorical_dtype(X_df[col]):
                category_cols.append(num)
            # checking object type (strings)
            elif pd.api.types.is_object_dtype(X_df[col]):
                # stochastic approach
                if self.fast_mode:
                    append_need = True
                    for i in range(self.fast_random_size):
                        if not self.is_one_word(X_df[col].iloc[np.random.randint(0, X_df.shape[0])]):
                            append_need = False
                            break
                    if append_need:
                        category_cols.append(num)
                else:
                    # basic approach
                    if np.all(np.vectorize(self.is_one_word)(X_df[col])):
                        category_cols.append(num)
            # checking numeric columns
            elif self.rated_search and pd.api.types.is_numeric_dtype(X_df[col]) and not pd.api.types.is_float_dtype(X_df[col]):
                if X_df[col].nunique() < self.category_rate * X_df.shape[0]:
                    category_cols.append(num)
        return np.array(category_cols)

    def is_one_word(self, s):
        if s is None or not isinstance(s, str):
            return False

        stripped_string = s.strip()
        if not stripped_string or ' ' in stripped_string:
            return False
        else:
            return True

    def get_cols(self):
        if self.cols is not None:
            return self.cols.copy()
        else:
            return np.array([])

    def get_encoder(self):
        return self.encoder_name

    def get_available_encoders(self):
        return np.array(list(self._encoders_list.keys()))


class NumericalEncoder(BaseEstimator, TransformerMixin):
    def __init__(self, cols=None, fit_replace=False, encoder='standard', numeric_rate=0.1,
                 rated_search=True, only_float=True, **encoder_params):

        self._encoders_list = {'standard': skpr.StandardScaler,
                               'min_max': skpr.MinMaxScaler,
                               'normalizer': skpr.Normalizer,
                               'max_abs': skpr.MaxAbsScaler}

        self.only_float = only_float
        self.cols = np.array(cols) if cols is not None else None
        self.fit_replace = fit_replace
        self.encoder_name = encoder
        self.numeric_rate = numeric_rate
        self.rated_search = rated_search
        self.encoder_params = encoder_params
        self._encoder = None

    def __repr__(self):
        return "NumericalEncoder"

    def __str__(self):
        return "NumericalEncoder"

    def fit(self, X, y=None, **fit_params):
        # defining numerical columns
        if self.cols is None or self.fit_replace:
            self.cols = self.define_cols(X)

        try:
            if not self.cols.shape[0]:
                return self
        except IndexError:
            raise IndexError("cols should be a numpy array")

        # save self.cols and get indices for X
        saved_cols = self.cols.copy()
        self.cols = self.cols_to_numeric(X)

        if not self.cols.shape[0]:
            self.cols = saved_cols
            return self

        # defining the encoder
        self._encoder = self._encoders_list[self.encoder_name](**self.encoder_params)

        # fitting the encoder
        X_ndarray = X.to_numpy()[:, self.cols] if isinstance(X, pd.DataFrame) else X[:, self.cols]
        self._encoder.fit(X_ndarray, y, **fit_params)

        # self.cols backup
        self.cols = saved_cols

        return self

    def transform(self, X, y=None):
        try:
            if self.cols is None or not self.cols.shape[0] or self._encoder is None:
                return deepcopy(X)
        except IndexError:
            raise IndexError("cols should be a numpy array")

        # save self.cols and get indices for X
        saved_cols = self.cols.copy()
        self.cols = self.cols_to_numeric(X)

        if not self.cols.shape[0]:
            self.cols = saved_cols
            return deepcopy(X)

        column_names = None

        # cast to numpy array
        if isinstance(X, pd.DataFrame):
            X_ndarray = X.iloc[:, self.cols].to_numpy().astype(float)
            column_names = X.columns.to_numpy(dtype=object).copy()
        else:
            X_ndarray = X[:, self.cols].astype(float)

        # getting the resulting column names
        if column_names is not None:
            column_names[self.cols] = np.array([f'{self.encoder_name}_{name}' for name in column_names[self.cols]])

        # only numerical columns here
        result = self._encoder.transform(X_ndarray)

        if isinstance(X, pd.DataFrame):
            # saving X dtypes
            not_cols = list(set(np.arange(X.shape[1])) - set(self.cols))
            saved_dtypes = X.dtypes.to_dict()
            unmodified_dtypes = dict([(X.columns[col], saved_dtypes[X.columns[col]]) for col in not_cols])

            X_ndarray = deepcopy(X.to_numpy(dtype=object))
            X_ndarray[:, self.cols] = result
            result = pd.DataFrame(X_ndarray, columns=column_names).astype(dtype=unmodified_dtypes).infer_objects()
        else:
            X_ndarray = deepcopy(X)
            X_ndarray[:, self.cols] = result
            result = X_ndarray

        # self.cols backup
        self.cols = saved_cols

        return result

    def define_cols(self, X):
        # cast to pandas DataFrame with inferring object types
        if not isinstance(X, pd.DataFrame):
            X_df = pd.DataFrame(X).infer_objects()
        else:
            X_df = X.infer_objects()

        # check all the columns
        numeric_cols = []
        for num, col in enumerate(X_df):
            # if dtype is 'numeric'
            if pd.api.types.is_numeric_dtype(X_df[col]):
                # appending if float
                if pd.api.types.is_float_dtype(X_df[col]):
                    numeric_cols.append(num)
                elif not self.only_float:
                    # checking rated_search
                    if self.rated_search:
                        if X_df[col].nunique() >= self.numeric_rate * X_df.shape[0]:
                            numeric_cols.append(num)
                    else:
                        numeric_cols.append(num)
        return np.array(numeric_cols)

    def cols_to_numeric(self, X):
        if self.cols is None or not self.cols.shape[0]:
            return np.array([])

        inds = self.cols.copy()

        try:
            inds = inds.astype(int)
            return inds
        except (ValueError, TypeError):
            if not isinstance(X, pd.DataFrame):
                return np.array([])
            else:
                columns_array = X.columns.to_numpy(dtype=object).copy()
                names_dict = dict((columns_array[i], i) for i in range(columns_array.shape[0]))
                inds = np.array([names_dict[name] for name in inds if name in names_dict])
                return inds

    def get_cols(self):
        if self.cols is not None:
            return self.cols.copy()
        else:
            return np.array([])

    def get_encoder(self):
        return self.encoder_name

    def get_available_encoders(self):
        return np.array(list(self._encoders_list.keys()))

# Первый взгляд на данные

[Ссылка на соревнование](https://www.kaggle.com/competitions/cmc-ml-spotify-recommendations/leaderboard)

In [3]:
train_df = pd.read_csv('/kaggle/input/cmc-ml-spotify-recommendations/train.csv')
test_df = pd.read_csv('/kaggle/input/cmc-ml-spotify-recommendations/test.csv')
tracks_info_df = pd.read_csv('/kaggle/input/cmc-ml-spotify-recommendations/tracks_info.csv')
audio_features_df = pd.read_csv('/kaggle/input/cmc-ml-spotify-recommendations/audio_features.csv')
added_info_df = pd.read_csv('/kaggle/input/cmc-ml-spotify-recommendations/added_info.csv')
sample_submission_df = pd.read_csv('/kaggle/input/cmc-ml-spotify-recommendations/sample_submission.csv')

In [4]:
train_df

Unnamed: 0,index,playlist_id,track_id,exists
0,1050194,32931,07vS8obfeZbr8H4MgQfXR7,1
1,645550,35527,1PEqh7awkpuepLBSq8ZwqD,1
2,2630414,12203,7E8pPgBY84oDaXRcqODavR,0
3,3168314,21850,0Atml4huw4Fgyk6YSHiK4M,0
4,3144464,21391,4WYDmIZrwxBHdBYdvi5oQO,0
...,...,...,...,...
2773057,2211111,4280,1CsJNkcfGXCUQhst9Wo4KL,0
2773058,3600703,29901,2VBGLp3N6JxIELDbbIhoSS,0
2773059,3879581,35199,5PEzhJIvmjFCFZcFZRze9S,0
2773060,1843050,29231,0kvWwRs634uP7K50NObw2h,1


In [5]:
test_df

Unnamed: 0,index,playlist_id,track_id
0,0,23073,09rm430z15znrby0txsnvx
1,1,27967,2cQvgkdRXcWLahNBM3luSD
2,2,20120,0lXbcnyo5T3VTa7gJ9fdGo
3,3,21054,0novBnPo3fjSPGK6RyHC56
4,4,5906,5yqevIi6idUAjBo67lJTtz
...,...,...,...
1188451,1188451,18736,31PPeK7rTkevgM3OYdBReA
1188452,1188452,17904,6OXilF9moyWn0YHX0iyKtr
1188453,1188453,27318,4oXMEXcjuypQAFFugKA7dB
1188454,1188454,31922,1KRu9rvn1NU4XCtUGyLGYm


In [6]:
audio_features_df

Unnamed: 0,id,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,time_signature
0,2roOWo3AxvYYFxVYSRl2Lw,0.725,0.91000,9,-5.175,1,0.0828,0.215000,0.02450,0.0747,0.8080,92.717,156293,4
1,33P76PS6skAOeQzYK77xss,0.520,0.96600,5,-5.845,0,0.0601,0.000090,0.19700,0.2970,0.0382,128.020,241875,4
2,7dS1UqsXYDkdVpA2zoHuY4,0.293,0.00546,8,-34.114,0,0.0660,0.962000,0.82900,0.0853,0.2200,99.164,80100,4
3,6eQFys5vvmGRvK0Gmb3QK8,0.414,0.00650,6,-34.234,1,0.0360,0.994000,0.94100,0.1060,0.1570,71.737,135187,4
4,6OOrm1Pi4ohW2IgxaPMXfL,0.702,0.80500,5,-5.432,0,0.0427,0.131000,0.00000,0.3430,0.7780,139.901,173520,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
899928,0vzWDHKtEUfbvyVZTiFg4r,0.602,0.65400,7,-6.592,1,0.0283,0.020400,0.00126,0.2310,0.0882,103.025,181893,4
899929,3VvGP9ldsvUyhSYwCh5GL3,0.444,0.13100,0,-17.910,1,0.0377,0.945000,0.60200,0.1210,0.1690,111.149,164773,4
899930,54HMQ6LzInyFQaxv6sK2TG,0.682,0.73600,1,-5.004,1,0.0458,0.017000,0.00000,0.1400,0.8120,150.021,209748,4
899931,3gp1UtKF9QwTwg6VUxNGmz,0.280,0.89600,10,-10.444,0,0.0703,0.000146,0.73300,0.1900,0.1670,87.261,97000,4


In [7]:
added_info_df

Unnamed: 0,playlist_id,track_id,added_at,added_by_id,added_by_type
0,254,2aQpISWUBToaF84DDiTeRV,2024-02-29T16:53:53Z,590,user
1,17014,6EipcpKMRufBSIGLZqmmwz,2021-01-29T10:04:19Z,3013,user
2,18570,5IJnj6Um7jVuRPT4WGj1z6,2016-02-15T14:57:19Z,399,user
3,14643,3vvvpZ3ez7uCEM5uhkj1ou,2024-01-25T16:19:12Z,2449,user
4,12131,0P0IfSuzTC6bLUsHjmDuvW,2023-10-07T14:02:31Z,1014,user
...,...,...,...,...,...
1399258,19350,0W7AbEauB7cP4pidLclApe,2022-02-09T00:40:20Z,3042,user
1399259,14587,6Vjk8MNXpQpi0F4BefdTyq,2022-07-05T15:46:03Z,2449,user
1399260,27434,7795WJLVKJoAyVoOtCWqXN,2022-12-22T19:32:22Z,857,user
1399261,8894,5xJ0KrDk6kjRP4GjAfG3VS,2022-01-10T11:26:03Z,1817,user


In [8]:
tracks_info_df.track_album_album[0]

"{'album_type': 'single', 'artists': [{'external_urls': {'spotify': 'https://open.spotify.com/artist/5FqTuN42w2zGw4Pzd50yaS'}, 'href': 'https://api.spotify.com/v1/artists/5FqTuN42w2zGw4Pzd50yaS', 'id': '5FqTuN42w2zGw4Pzd50yaS', 'name': 'Reckol', 'type': 'artist', 'uri': 'spotify:artist:5FqTuN42w2zGw4Pzd50yaS'}, {'external_urls': {'spotify': 'https://open.spotify.com/artist/6CPZWzcKiOKkHn4L2XI4i2'}, 'href': 'https://api.spotify.com/v1/artists/6CPZWzcKiOKkHn4L2XI4i2', 'id': '6CPZWzcKiOKkHn4L2XI4i2', 'name': 'cakal', 'type': 'artist', 'uri': 'spotify:artist:6CPZWzcKiOKkHn4L2XI4i2'}], 'available_markets': [], 'external_urls': {'spotify': 'https://open.spotify.com/album/6LpuMzcAiFIOBr4ijkpPuQ'}, 'href': 'https://api.spotify.com/v1/albums/6LpuMzcAiFIOBr4ijkpPuQ', 'id': '6LpuMzcAiFIOBr4ijkpPuQ', 'images': [{'height': 640, 'url': 'https://i.scdn.co/image/ab67616d0000b27360c5021f2e5cc64a8a700902', 'width': 640}, {'height': 300, 'url': 'https://i.scdn.co/image/ab67616d00001e0260c5021f2e5cc64a8a7

# Предобработка данных

In [9]:
import warnings, gc
warnings.filterwarnings("ignore", category=FutureWarning)

## added_info_df

In [10]:
# Удалим лишний столбец и проведём парсинг временных данных
added_info_df = added_info_df.drop('added_by_type', axis=1)
added_info_df['added_at'] = pd.to_datetime(added_info_df['added_at'])
added_info_df['added_at_year'] = added_info_df['added_at'].dt.year
added_info_df['added_at_month'] = added_info_df['added_at'].dt.month
added_info_df['added_at_day'] = added_info_df['added_at'].dt.day
added_info_df['added_at_hour'] = added_info_df['added_at'].dt.hour
added_info_df = added_info_df.drop('added_at', axis=1)

# Кодирование периодических признаков
encoded_month = CircularEncoder(12).fit_transform(added_info_df['added_at_month'].to_frame())
encoded_day = CircularEncoder(31).fit_transform(added_info_df['added_at_day'].to_frame())
encoded_hour =  CircularEncoder(24).fit_transform(added_info_df['added_at_hour'].to_frame())
added_info_df = pd.concat([added_info_df, encoded_month, encoded_day, encoded_hour], axis=1)
added_info_df = added_info_df.drop(['added_at_month', 'added_at_day', 'added_at_hour'], axis=1)

# Группируем по объектам обучающей выборки
result_added_info = added_info_df.groupby(['track_id', 'playlist_id']).agg(lambda x: x.tolist()).reset_index()
result_added_info.added_at_year = result_added_info.added_at_year.apply(lambda x: sum(x) / len(x)).astype(int)
result_added_info.added_by_id = result_added_info.added_by_id.apply(lambda x: sum(x) / len(x)).astype(int)

for col in result_added_info.columns:
    if col[:3] in ['cos', 'sin']:
        result_added_info[col] = result_added_info[col].apply(lambda x: sum(x) / len(x))
        
# Убираем дубликаты
result_added_info.drop_duplicates(keep='first', inplace=True)

In [11]:
# Проверяем наличие пропусков в данных
nan_columns = []
for col in result_added_info.columns:
    if result_added_info[col][result_added_info[col].isna() == True].shape[0] > 0:
        nan_columns.append(col)
if len(nan_columns) == 0:
    print('Пропусков нет')
else:
    print('Пропуски имеются в следующих признаках: ', ', '.join(nan_columns))

Пропусков нет


In [12]:
result_added_info

Unnamed: 0,track_id,playlist_id,added_by_id,added_at_year,sin_added_at_month,cos_added_at_month,sin_added_at_day,cos_added_at_day,sin_added_at_hour,cos_added_at_hour
0,0006Rv1e2Xfh6QooyKJqKS,24238,868,2023,-0.239316,-0.970942,0.980785,-0.195090,-0.965926,0.258819
1,0007ftg5erIndAnrITMGON,1334,1382,2019,0.992709,0.120537,0.831470,-0.555570,-0.965926,0.258819
2,000C3ZY8325A4yktxnnwCl,30244,2863,2017,0.663123,-0.748511,-0.195090,0.980785,0.500000,0.866025
3,000FetCrS9eDC1Ii1lCLfT,12634,2227,2020,-0.992709,0.120537,-0.923880,-0.382683,0.000000,1.000000
4,000HoCMkO5w5BuqgU6TXUD,9889,1751,2023,0.822984,0.568065,0.195090,-0.980785,-1.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...
1388818,7zzoxJbgjme3366mOp5UnH,20191,2446,2019,0.663123,-0.748511,-0.980785,0.195090,0.707107,-0.707107
1388819,7zzqH8TtAQExYITjk0b7e8,2054,437,2018,-0.464723,0.885456,0.707107,0.707107,-0.707107,0.707107
1388820,7zzrXzx0A7C7RriCE8tVIe,25797,1552,2023,0.992709,0.120537,-0.923880,-0.382683,-0.258819,-0.965926
1388821,7zztYl9vdJP0R8w7l9XEbk,8536,1664,2023,0.992709,0.120537,0.980785,-0.195090,0.866025,0.500000


# audio_features_df

In [13]:
# Убираем дубликаты
audio_features_df.drop_duplicates(keep='first', inplace=True)

# Проверяем наличие пропусков в данных
nan_columns = []
for col in audio_features_df.columns:
    if audio_features_df[col][audio_features_df[col].isna() == True].shape[0] > 0:
        nan_columns.append(col)
if len(nan_columns) == 0:
    print('Пропусков нет')
else:
    print('Пропуски имеются в следующих признаках: ', ', '.join(nan_columns))

# Переименование колонки id
audio_features_df = audio_features_df.rename(columns={'id': 'track_id'})

Пропусков нет


In [14]:
# Кодирование вещественных признаков
audio_features_nums = np.array(['danceability', 'energy', 'loudness', 'speechiness', 'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo', 'duration_ms'])
result_audio_features = NumericalEncoder(cols=audio_features_nums).fit_transform(audio_features_df)

In [15]:
result_audio_features

Unnamed: 0,track_id,standard_danceability,standard_energy,key,standard_loudness,mode,standard_speechiness,standard_acousticness,standard_instrumentalness,standard_liveness,standard_valence,standard_tempo,standard_duration_ms,time_signature
0,2roOWo3AxvYYFxVYSRl2Lw,0.915699,1.299411,9,0.815225,1,0.022212,-0.531288,-0.704860,-0.686636,1.368767,-0.868776,-0.566747,4
1,33P76PS6skAOeQzYK77xss,-0.158443,1.493671,5,0.718698,0,-0.219975,-1.103402,-0.264251,0.677099,-1.472436,0.288894,0.103000,4
2,7dS1UqsXYDkdVpA2zoHuY4,-1.347858,-1.838363,8,-3.354010,0,-0.157027,1.457301,1.350038,-0.621608,-0.801443,-0.657364,-1.163019,4
3,6eQFys5vvmGRvK0Gmb3QK8,-0.713853,-1.834755,6,-3.371298,1,-0.477099,1.542488,1.636114,-0.494621,-1.033965,-1.556761,-0.731919,4
4,6OOrm1Pi4ohW2IgxaPMXfL,0.795185,0.935175,5,0.778199,0,-0.405616,-0.754905,-0.767439,0.959293,1.258042,0.678500,-0.431932,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
899229,0vzWDHKtEUfbvyVZTiFg4r,0.271214,0.411369,7,0.611078,1,-0.559250,-1.049333,-0.764221,0.272212,-1.287895,-0.530752,-0.366407,4
899230,3VvGP9ldsvUyhSYwCh5GL3,-0.556661,-1.402875,0,-1.019503,1,-0.458961,1.412045,0.770222,-0.402601,-0.989675,-0.264347,-0.500385,4
899231,54HMQ6LzInyFQaxv6sK2TG,0.690391,0.695820,1,0.839861,1,-0.372542,-1.058384,-0.767439,-0.286042,1.383530,1.010360,-0.148419,4
899232,3gp1UtKF9QwTwg6VUxNGmz,-1.415974,1.250846,10,0.056121,0,-0.111151,-1.103251,1.104829,0.020691,-0.997057,-1.047692,-1.030763,4


## tracks_info_df

In [16]:
track_artists = [eval(dict_str) for dict_str in tracks_info_df.track_artists.to_list()]
album_album = [eval(dict_str) for dict_str in tracks_info_df.track_album_album.to_list()] 

In [17]:
for i in range(len(track_artists)):
    track_artists[i] = [elem['id'] for elem in track_artists[i]]

In [18]:
track_artists = [elem[0] for elem in track_artists]

In [19]:
track_artists = np.array(track_artists)

In [20]:
album_type = [elem['album_type'] for elem in album_album]
release_date = [elem['release_date'] for elem in album_album]
album_id = [elem['id'] for elem in album_album]

In [21]:
track_artists = pd.DataFrame(track_artists, columns=['artists_ids'])

In [22]:
album_type = pd.DataFrame(album_type, columns=['type_album'])

In [23]:
release_date = pd.DataFrame(release_date, columns=['release_date'])

In [24]:
album_id = pd.DataFrame(album_id, columns=['album_id'])

In [25]:
# Убираем ненужные признаки
drop_columns = ['track_album_type', 'track_album_track', 'track_album_is_local', 
                'track_album_episode', 'track_album_artists', 'track_album_name', 
                'track_album_id', 'track_album_disc_number', 'track_album_duration_ms', 
                'track_album_explicit', 'track_album_popularity', 'track_album_track_number',
                'track_type', 'track_episode', 'track_is_local', 'track_track', 'track_name',
                'track_disc_number', 'track_available_markets', 'track_album_album', 'track_artists']

tracks_info_df = tracks_info_df.drop(drop_columns, axis=1)

# Изменяем тип данных для категориальных признаков
for col in tracks_info_df.columns:
    if tracks_info_df[col].dtype == np.float64:
        tracks_info_df[col] = tracks_info_df[col].astype(int)
    elif tracks_info_df[col].dtype == bool:
        tracks_info_df[col] = tracks_info_df[col].astype(int)

# Проверяем наличие пропусков в данных
nan_columns = []
for col in tracks_info_df.columns:
    if tracks_info_df[col][tracks_info_df[col].isna() == True].shape[0] > 0:
        nan_columns.append(col)
if len(nan_columns) == 0:
    print('Пропусков нет')
else:
    print('Пропуски имеются в следующих признаках: ', ', '.join(nan_columns))

Пропусков нет


In [26]:
tracks_info_num = ['track_duration_ms']
result_tracks_info = NumericalEncoder(cols=tracks_info_num).fit_transform(tracks_info_df)

In [27]:
result_tracks_info = pd.concat([result_tracks_info, album_type, release_date, track_artists, album_id], axis=1)

In [28]:
result_tracks_info

Unnamed: 0,track_id,standard_track_duration_ms,track_explicit,track_popularity,track_track_number,type_album,release_date,artists_ids,album_id
0,5SrPeOqm3SBecyiAsI9HTT,-0.652868,1,0,1,single,2021-04-02,5FqTuN42w2zGw4Pzd50yaS,6LpuMzcAiFIOBr4ijkpPuQ
1,0YfvJaebmC1ch3vMik9yuA,2.112132,0,40,1,album,2021-02-03,4Cru1k3r3v36wCWfcJMK2y,094repVXEwkzPNg56jsJgs
2,6bSpavIeJURyGBNMUlJydq,0.188676,0,12,15,album,2015-02-10,1jjpkAHC8bd9fRFfgKyYLP,3WVvNhEvAjTKjYUNubfHtd
3,0fIBz9zLZGqrCMaAUskhk2,2.567852,0,20,4,album,2000-01-01,6MF58APd3YV72Ln2eVg710,47rGmuTbcsdAzRmyTlOvUp
4,2fG23IIy7rYN8eMNbAHcS2,-0.241374,0,52,9,album,2013-01-01,70kkdajctXSbqSMJbQO424,6IGpQUt0KNi5rBUXZZOFI6
...,...,...,...,...,...,...,...,...,...
899707,2r6pzLfa4huH3L3w7hzZXx,-0.049473,0,21,1,album,1991,4Csr2RlQUb4MODz2R3Usn7,3yaUYGJ1ol0Katez71WNSc
899708,0IEhU1bRVEqWCfb6Fy6w67,-0.647332,0,39,1,single,2018-07-28,0cIbGrllcQSKTM3AxUxHiL,7kwkWQEne0CZCmMTa7ReiW
899709,5WRNi2GaPQa1GBAKxKHJfS,0.363925,1,0,7,album,2018-05-18,6YxtMx8c5tjVRRHsqjEERp,3BcX5gCQZblSPagIBuoVBr
899710,0VwdSxxhCGc0MFvndnGw6T,0.034373,0,3,11,compilation,2015-11-09,3NUsiT2JSyaWAnWaXxDzhQ,3wICzSqLpPBjIawEPSqq8L


# Построение обучающей, валидационной и тестовой выборки

Синтезируем выборку для *train* и *test*

In [29]:
# Обучающая выборка
X_train = train_df.drop(['exists', 'index'], axis=1)
y_train = train_df['exists'].copy()

# Добавляем необходимые признаки
X_train = pd.merge(X_train, result_audio_features, on='track_id', how='left')
X_train = pd.merge(X_train, result_tracks_info, on='track_id', how='left')

# Тестовая выборка
X_test = test_df.drop('index', axis=1)

# Добавляем необходимые признаки
X_test = pd.merge(X_test, result_audio_features, on='track_id', how='left')
X_test = pd.merge(X_test, result_tracks_info, on='track_id', how='left')


In [30]:
# Заполняем пропуски (Train)
for col in X_train.columns:
    if X_train[col][X_train[col].isna() == True].shape[0] > 0:
        print(col)
        X_test[col] = X_test[col].fillna(X_train[col].mean(skipna=True))
        X_train[col] = X_train[col].fillna(X_train[col].mean(skipna=True))

standard_danceability
standard_energy
key
standard_loudness
mode
standard_speechiness
standard_acousticness
standard_instrumentalness
standard_liveness
standard_valence
standard_tempo
standard_duration_ms
time_signature


In [31]:
for col in X_train.columns:
    if X_test[col][X_test[col].isna() == True].shape[0] > 0:
        print(col)

In [32]:
X_train['key'] = X_train['key'].astype(int)
X_train['mode'] = X_train['mode'].astype(int)
X_train['time_signature'] = X_train['time_signature'].astype(int)

In [33]:
X_test['key'] = X_test['key'].astype(int)
X_test['mode'] = X_test['mode'].astype(int)
X_test['time_signature'] = X_test['time_signature'].astype(int)

In [34]:
X_train['track_popularity'] = X_train['track_popularity'].astype(np.float64)
X_test['track_popularity'] = X_test['track_popularity'].astype(np.float64)

In [35]:
X_train

Unnamed: 0,playlist_id,track_id,standard_danceability,standard_energy,key,standard_loudness,mode,standard_speechiness,standard_acousticness,standard_instrumentalness,...,standard_duration_ms,time_signature,standard_track_duration_ms,track_explicit,track_popularity,track_track_number,type_album,release_date,artists_ids,album_id
0,32931,07vS8obfeZbr8H4MgQfXR7,-0.928681,-1.430626,7,-0.628066,1,-0.426954,1.420031,-0.767439,...,-0.061827,4,-0.058537,0,0.0,2,single,2018-12-05,1r1uxoy19fzMxunt3ONAkG,1y2zWE0vkqUSXZ92lFAO2P
1,35527,1PEqh7awkpuepLBSq8ZwqD,1.015253,-0.285884,5,-0.164738,1,-0.489902,0.565497,0.307902,...,0.088014,4,0.082791,0,73.0,3,album,1996-04-16,2EfG2EoT8GFJrMiilbTVl2,1VzYTrtId9CgUTo7VQBFbL
2,12203,7E8pPgBY84oDaXRcqODavR,1.030973,1.188406,9,0.810327,0,-0.480299,-0.892269,-0.766083,...,-0.516044,4,-0.486925,0,0.0,1,single,2014-07-07,3cnAJv9gydgm52KFIsdvO8,1cbvOYy11NyKZfwXGCJCVA
3,21850,0Atml4huw4Fgyk6YSHiK4M,0.281693,-0.178347,7,0.538323,1,-0.481366,-0.837430,-0.767439,...,-0.082565,4,-0.078088,0,0.0,11,album,2001-01-24,7y5ZQx8XgtDa1r7KAkplOZ,1bTlGTasuKVa8QuHhYn1Ac
4,21391,4WYDmIZrwxBHdBYdvi5oQO,1.104329,-0.306697,0,-0.018363,1,-0.207172,-0.939655,1.462425,...,1.284547,4,1.211277,0,7.0,3,compilation,2014-10-03,7hjgsaJuLvrObwm39oEfLq,6LBktkX9tgfoGeRZBAt9TI
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2773057,4280,1CsJNkcfGXCUQhst9Wo4KL,0.376008,0.917830,9,0.505187,0,-0.492035,-1.101247,-0.722995,...,-0.198410,4,-0.187354,0,5.0,14,compilation,2024-02-23,6CpUfSiqmNciuYLQ1CxH9e,2m1OxoQ0Uk2XXcC19Cyub3
2773058,29901,2VBGLp3N6JxIELDbbIhoSS,0.051146,1.223095,9,0.167631,1,-0.271186,-1.017388,-0.518910,...,-0.007719,4,-0.007499,0,0.0,2,album,2007,1fa0cOhromAZdq2xRA4vv8,6qBZW0ODvRdfJfF8Xb5oY7
2773059,35199,5PEzhJIvmjFCFZcFZRze9S,1.182924,0.456464,1,0.838132,0,3.726503,-0.443439,-0.767439,...,-0.555204,4,-0.523859,0,44.0,3,single,2020-02-14,2NfSBtmWe7oPw1EmetJVso,4zzCQHjlhEEAidzfNRONJL
2773060,29231,0kvWwRs634uP7K50NObw2h,-0.682414,0.938644,4,0.247734,1,-0.207172,-0.862720,-0.126321,...,-0.026767,4,-0.025463,0,2.0,12,album,2017-10-20,0V9fDzVz0TmKcWZLKyjEXx,7iPbUBbGyQlVnFBpf8ahdz


In [36]:
# X_train = X_train.drop('release_date', axis=1)
# X_test = X_test.drop('release_date', axis=1)

# X_train = X_train.drop('album_ids', axis=1)
# X_test = X_test.drop('album_ids', axis=1)


In [37]:
# import gc
# gc.collect()

In [38]:
# del new_list_album

In [39]:
# del new_list

In [40]:
cat_inds = []

for i, col in enumerate(X_train.columns):
    if X_train[col].dtype == np.int64 or X_train[col].dtype == object:
        cat_inds.append(i)

In [41]:
cat_inds

[0, 1, 4, 6, 14, 16, 18, 19, 20, 21, 22]

In [42]:
# X_train.to_csv('/kaggle/working/dataset.csv', index=False)
# X_train = X_train.drop('track_id', axis=1)

In [43]:
from sklearn.model_selection import train_test_split

data_train, data_val, target_train, target_val = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

# Обучение

## CatBoost


In [44]:
from catboost import CatBoostClassifier, Pool
from sklearn.metrics import f1_score

train_dataset = Pool(data=data_train,
                     label=target_train,
                     cat_features=cat_inds)

eval_dataset = Pool(data=data_val,
                    label=target_val,
                    cat_features=cat_inds)

model = CatBoostClassifier(task_type="GPU",
                           devices=[0, 1],
                           num_trees=1000,
                           learning_rate=0.46,
                           eval_metric='F1')
model.fit(train_dataset,
          eval_set=eval_dataset)

# Make predictions on the validation data
preds_proba = model.predict_proba(data_val)
preds = np.argmax(preds_proba, axis=1)

# Calculate F1 score on the training data
f1_val = f1_score(target_val, preds)
print(f"F1 Score on Val Data: {f1_val:.2f}")

# Make predictions on the training data
preds_proba = model.predict_proba(data_train)
preds = np.argmax(preds_proba, axis=1)

# Calculate F1 score on the training data
f1_train = f1_score(target_train, preds)
print(f"F1 Score on Train Data: {f1_train:.2f}")

# Make predictions on the data
preds_proba = model.predict_proba(X_train)
preds = np.argmax(preds_proba, axis=1)

# Calculate F1 score on the data
f1_data = f1_score(y_train, preds)
print(f"F1 Score on X_train: {f1_data:.2f}")

0:	learn: 0.7609379	test: 0.6985457	best: 0.6985457 (0)	total: 490ms	remaining: 8m 9s
1:	learn: 0.8000621	test: 0.7697183	best: 0.7697183 (1)	total: 907ms	remaining: 7m 32s
2:	learn: 0.8041458	test: 0.7673401	best: 0.7697183 (1)	total: 1.21s	remaining: 6m 40s
3:	learn: 0.8161546	test: 0.7812623	best: 0.7812623 (3)	total: 1.5s	remaining: 6m 13s
4:	learn: 0.8201883	test: 0.7768167	best: 0.7812623 (3)	total: 1.83s	remaining: 6m 3s
5:	learn: 0.8285045	test: 0.7930470	best: 0.7930470 (5)	total: 2.11s	remaining: 5m 50s
6:	learn: 0.8320754	test: 0.7990169	best: 0.7990169 (6)	total: 2.37s	remaining: 5m 36s
7:	learn: 0.8350222	test: 0.8069258	best: 0.8069258 (7)	total: 2.6s	remaining: 5m 22s
8:	learn: 0.8408097	test: 0.8141624	best: 0.8141624 (8)	total: 2.82s	remaining: 5m 10s
9:	learn: 0.8443778	test: 0.8198651	best: 0.8198651 (9)	total: 3.1s	remaining: 5m 6s
10:	learn: 0.8465854	test: 0.8223406	best: 0.8223406 (10)	total: 3.52s	remaining: 5m 16s
11:	learn: 0.8482352	test: 0.8241838	best: 0.82

In [45]:
# model2 = CatBoostClassifier(num_trees=200,
#                             learning_rate=0.8,
#                             eval_metric='F1')
# model2.fit(train_dataset,
#            eval_set=eval_dataset,
#            init_model=model)

# # Make predictions on the validation data
# preds_proba = model2.predict_proba(data_val)
# preds = np.argmax(preds_proba, axis=1)

# # Calculate F1 score on the training data
# f1_val = f1_score(target_val, preds)
# print(f"F1 Score on Val Data: {f1_val:.2f}")

# # Make predictions on the training data
# preds_proba = model2.predict_proba(data_train)
# preds = np.argmax(preds_proba, axis=1)

# # Calculate F1 score on the training data
# f1_train = f1_score(target_train, preds)
# print(f"F1 Score on Train Data: {f1_train:.2f}")

# # Make predictions on the data
# preds_proba = model2.predict_proba(X_train)
# preds = np.argmax(preds_proba, axis=1)

# # Calculate F1 score on the data
# f1_data = f1_score(y_train, preds)
# print(f"F1 Score on X_train: {f1_data:.2f}")

In [46]:
test_pred = np.argmax(model.predict_proba(X_test), axis=1)
test_pred

array([1, 1, 1, ..., 0, 0, 0])

In [47]:
print(test_pred.sum())

572230


In [48]:
PATH_TO_KAGGLE_SUBMISSION = "/kaggle/working/submission.csv"

submission = pd.DataFrame({"index": test_df["index"], 
                           "exists": test_pred})
submission.to_csv(PATH_TO_KAGGLE_SUBMISSION, index=False)