<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc" style="margin-top: 1em;"><ul class="toc-item"><li><span><a href="#data.py" data-toc-modified-id="data.py-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>data.py</a></span><ul class="toc-item"><li><span><a href="#library" data-toc-modified-id="library-1.1"><span class="toc-item-num">1.1&nbsp;&nbsp;</span>library</a></span></li><li><span><a href="#DataProcessor" data-toc-modified-id="DataProcessor-1.2"><span class="toc-item-num">1.2&nbsp;&nbsp;</span>DataProcessor</a></span></li><li><span><a href="#TrainTestProcessor" data-toc-modified-id="TrainTestProcessor-1.3"><span class="toc-item-num">1.3&nbsp;&nbsp;</span>TrainTestProcessor</a></span></li><li><span><a href="#SongsProcessor" data-toc-modified-id="SongsProcessor-1.4"><span class="toc-item-num">1.4&nbsp;&nbsp;</span>SongsProcessor</a></span></li><li><span><a href="#SongExtraProcessor" data-toc-modified-id="SongExtraProcessor-1.5"><span class="toc-item-num">1.5&nbsp;&nbsp;</span>SongExtraProcessor</a></span></li><li><span><a href="#MembersProcessor" data-toc-modified-id="MembersProcessor-1.6"><span class="toc-item-num">1.6&nbsp;&nbsp;</span>MembersProcessor</a></span></li><li><span><a href="#EngineeringProcessor" data-toc-modified-id="EngineeringProcessor-1.7"><span class="toc-item-num">1.7&nbsp;&nbsp;</span>EngineeringProcessor</a></span></li><li><span><a href="#[obsolete]-SimilarityProcessor" data-toc-modified-id="[obsolete]-SimilarityProcessor-1.8"><span class="toc-item-num">1.8&nbsp;&nbsp;</span><font color="red">[obsolete] SimilarityProcessor</font></a></span></li></ul></li><li><span><a href="#debug-go" data-toc-modified-id="debug-go-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>debug go</a></span><ul class="toc-item"><li><span><a href="#FeatureProducer" data-toc-modified-id="FeatureProducer-2.1"><span class="toc-item-num">2.1&nbsp;&nbsp;</span>FeatureProducer</a></span></li></ul></li><li><span><a href="#main" data-toc-modified-id="main-3"><span 
class="toc-item-num">3&nbsp;&nbsp;</span>main</a></span><ul class="toc-item"><li><span><a href="#init" data-toc-modified-id="init-3.1"><span class="toc-item-num">3.1&nbsp;&nbsp;</span>init</a></span></li><li><span><a href="#load" data-toc-modified-id="load-3.2"><span class="toc-item-num">3.2&nbsp;&nbsp;</span>load</a></span></li><li><span><a href="#preprocess" data-toc-modified-id="preprocess-3.3"><span class="toc-item-num">3.3&nbsp;&nbsp;</span>preprocess</a></span></li><li><span><a href="#feature-engineering" data-toc-modified-id="feature-engineering-3.4"><span class="toc-item-num">3.4&nbsp;&nbsp;</span>feature engineering</a></span></li><li><span><a href="#compute_msno_song_similarity" data-toc-modified-id="compute_msno_song_similarity-3.5"><span class="toc-item-num">3.5&nbsp;&nbsp;</span><font color="red">compute_msno_song_similarity</font></a></span></li></ul></li><li><span><a href="#implicit" data-toc-modified-id="implicit-4"><span class="toc-item-num">4&nbsp;&nbsp;</span>implicit</a></span></li></ul></div>

# data.py

## library

In [1]:
# -*- coding: utf-8 -*-
import numpy as np
import pandas as pd
import pickle
import time
import os
import logging
from abc import abstractmethod

from implicit.als import AlternatingLeastSquares
from joblib import Parallel, delayed
from scipy.sparse import coo_matrix, linalg
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import normalize
from sklearn.cluster import KMeans
from torch.utils.data import Dataset
from sklearn.feature_extraction import DictVectorizer
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.metrics.pairwise import cosine_similarity

# Shared log layout: "<HH:MM:SS> <LEVEL> << <message>"; everything from DEBUG up is emitted.
LOG_FORMAT = '%(asctime)s %(levelname)s << %(message)s'
logging.basicConfig(
    format=LOG_FORMAT,
    datefmt='%H:%M:%S',
    level=logging.DEBUG,
)

## DataProcessor

In [2]:
class DataProcessor(object):
    """Common base of the table processors plus a command-based dispatcher.

    Concrete processors override ``parse``; ``process`` picks the processor that
    matches a command string, runs it and logs how long it took.
    """

    def __init__(self):
        return

    # NOTE: the class does not use ABCMeta, so this decorator is not enforced at
    # instantiation time; the NotImplementedError below is the effective guard.
    @abstractmethod
    def parse(self, df):
        """Transform ``df`` and return the result. Must be overridden."""
        raise NotImplementedError("Please implement method \'parse()\'.")

    @staticmethod
    def process(df, command, ref_df=None):
        """Run the processor selected by ``command`` on ``df``.

        :param df: dataframe to transform.
        :param command: one of 'train', 'test', 'members', 'songs',
            'song_extra_info' or 'engineering'.
        :param ref_df: reference dataframe; required for 'train', 'test' and
            'engineering', ignored by the other commands.
        :return: whatever the selected processor's ``parse`` returns.
        :raises AssertionError: if ``command`` is unknown or a required
            ``ref_df`` is missing.
        """
        start = time.time()

        if command in ('train', 'test'):
            assert ref_df is not None, 'Please pass the reference dataframe'
            res = TrainTestProcessor(ref_df).parse(df)
        elif command == 'members':
            res = MembersProcessor().parse(df)
        elif command == 'songs':
            res = SongsProcessor().parse(df)
        elif command == 'song_extra_info':
            res = SongExtraProcessor().parse(df)
        elif command == 'engineering':
            assert ref_df is not None, 'Please pass the reference dataframe'
            res = EngineeringProcessor(ref_df).parse(df)
        else:
            # Build the message once so the log line and the exception agree
            # (logging.error() returns None and must not be used as the message).
            error = 'command "%s" is invalid.' % command
            logging.error(error)
            raise AssertionError(error)

        logging.info("parse %s_df in %0.2fs" % (command, time.time() - start))

        return res

## TrainTestProcessor

In [3]:
class TrainTestProcessor(DataProcessor):
    """Adds play-source features to the train / test interaction tables.

    At construction time the reference frame is reduced to one row per
    ``source_merged`` value ("system tab | screen name | source type") holding
    the mean and the count of ``target``. ``parse`` joins those statistics onto
    the frame being processed and derives binary ``1h_*`` indicator columns.
    """

    # (column, placeholder used for missing values)
    __SOURCE_FILL_VALUES = (('source_system_tab', 'others'),
                            ('source_screen_name', 'others'),
                            ('source_type', 'nan'))

    def __init__(self, ref_df):
        """Pre-compute per-source replay statistics from ``ref_df``.

        :param ref_df: dataframe with the three ``source_*`` columns and
            ``target``. It is not modified; only a narrow copy is used.
        """
        super(TrainTestProcessor, self).__init__()

        # Copy just the needed columns so the caller's frame is left untouched.
        stats = ref_df[['source_system_tab', 'source_screen_name', 'source_type', 'target']].copy()
        stats = TrainTestProcessor.__add_source_merged(stats)

        stats = stats[['source_merged', 'target']].groupby('source_merged').agg(['mean', 'count'])
        stats.reset_index(inplace=True)
        stats.columns = ['source_merged', 'source_replay_pb', 'source_replay_count']
        self._ref_df = stats

    def parse(self, df):
        """Fill missing source values and add the source-based features.

        The three ``source_*`` columns of ``df`` are filled and ``source_merged``
        is added to ``df`` itself; the returned frame is a new, merged frame.

        :param df: train or test dataframe with the three ``source_*`` columns.
        :return: dataframe with ``source_merged`` and the ``1h_*`` columns added.
        :raises AssertionError: if any missing value remains.
        """
        # fill missing data + build the merged source key
        df = TrainTestProcessor.__add_source_merged(df)

        # feature engineering
        df = df.merge(self._ref_df, on='source_merged', how='left')

        # Sources unseen in the reference frame have NaN here and encode to 0.
        df['1h_source'] = df['source_replay_pb'].apply(TrainTestProcessor.__one_hot_encode_source)

        df['1h_system_tab'] = df['source_system_tab'].apply(TrainTestProcessor.__one_hot_encode_system_tab)
        df['1h_screen_name'] = df['source_screen_name'].apply(TrainTestProcessor.__one_hot_encode_screen_name)
        df['1h_source_type'] = df['source_type'].apply(TrainTestProcessor.__one_hot_encode_source_type)

        # useless feature
        df.drop(['source_replay_pb', 'source_replay_count'], axis=1, inplace=True)

        # `not` (rather than `~`) stays correct even if the reduction yields a
        # plain Python bool, for which `~True` is -2 and therefore truthy.
        assert not df.isnull().any().any(), 'There exists missing data!'

        return df

    @staticmethod
    def __add_source_merged(df):
        """Fill the source columns of ``df`` and add ``source_merged`` (in place)."""
        # Assign the filled column back instead of `fillna(inplace=True)` on a
        # column selection, which is chained assignment and does not update the
        # frame when pandas Copy-on-Write is active.
        for column, placeholder in TrainTestProcessor.__SOURCE_FILL_VALUES:
            df[column] = df[column].fillna(placeholder)

        df['source_merged'] = df['source_system_tab'].map(str) + ' | ' + \
                              df['source_screen_name'].map(str) + ' | ' + \
                              df['source_type'].map(str)
        return df

    @staticmethod
    def __one_hot_encode_system_tab(x):
        """Return 1 for the 'my library' system tab, else 0."""
        return 1 if x == 'my library' else 0

    @staticmethod
    def __one_hot_encode_screen_name(x):
        """Return 1 for the 'Local playlist more' / 'My library' screens, else 0."""
        return 1 if x == 'Local playlist more' or x == 'My library' else 0

    @staticmethod
    def __one_hot_encode_source_type(x):
        """Return 1 for the 'local-library' / 'local-playlist' source types, else 0."""
        return 1 if x == 'local-library' or x == 'local-playlist' else 0

    @staticmethod
    def __one_hot_encode_source(x):
        """Return 1 when the mean target of the source is at least 0.6, else 0."""
        return 1 if x >= 0.6 else 0

## SongsProcessor

In [4]:
class SongsProcessor(DataProcessor):
    """Cleans the songs table and derives artist / credit / language features."""

    # (column, placeholder used for missing values)
    __FILL_VALUES = (('artist_name', 'no_artist'),
                     ('language', 'nan'),
                     ('composer', 'nan'),
                     ('lyricist', 'nan'),
                     ('genre_ids', 'nan'))

    # Separators seen between multiple names; all are normalised to '|'.
    __SEPARATORS = ('/', ';', '\\', ' and ', '&', '+')

    def __init__(self):
        super(SongsProcessor, self).__init__()

    def parse(self, df):
        """Fill missing values in ``df`` and add the song features.

        ``df`` is modified and returned.

        :param df: songs dataframe with ``artist_name``, ``language``,
            ``composer``, ``lyricist``, ``genre_ids`` and ``song_length``.
        :return: the same dataframe with the feature columns added.
        :raises AssertionError: if any missing value remains.
        """
        # fill missing data
        # Assigning back (instead of `fillna(inplace=True)` on a column
        # selection) keeps this working under pandas Copy-on-Write.
        for column, placeholder in SongsProcessor.__FILL_VALUES:
            df[column] = df[column].fillna(placeholder)

        # feature engineering
        df['is_featured'] = df['artist_name'].apply(SongsProcessor.__is_featured).astype(np.int8)

        # >> duplicate
        df['artist_count'] = df['artist_name'].apply(SongsProcessor.__artist_count).astype(np.int8)

        same_artist_composer = (df['artist_name'] == df['composer'])
        df['artist_composer'] = same_artist_composer.astype(np.int8)

        # if artist, lyricist and composer are all three same
        # (artist == composer and artist == lyricist already implies composer == lyricist)
        same_artist_lyricist = (df['artist_name'] == df['lyricist'])
        df['artist_composer_lyricist'] = (same_artist_composer & same_artist_lyricist).astype(np.int8)

        # >> duplicate
        df['song_lang_boolean'] = df['language'].apply(SongsProcessor.__song_lang_boolean).astype(np.int8)

        df['genre_count'] = df['genre_ids'].apply(SongsProcessor.__parse_splitted_category_to_number)
        df['composer_count'] = df['composer'].apply(SongsProcessor.__parse_splitted_category_to_number)
        df['lyricist_count'] = df['lyricist'].apply(SongsProcessor.__parse_splitted_category_to_number)

        df['1h_lang'] = df['language'].apply(SongsProcessor.__one_hot_encode_lang)

        # 239738 is a hard-coded song_length threshold — presumably taken from
        # data exploration; TODO confirm its origin and unit.
        df['1h_song_length'] = df['song_length'].apply(lambda x: 1 if x <= 239738 else 0)

        # `not` (rather than `~`) stays correct even for a plain Python bool.
        assert not df.isnull().any().any(), 'There exists missing data!'

        return df

    @staticmethod
    def __is_featured(x):
        """Return 1 when the artist string contains 'feat', else 0."""
        return 1 if 'feat' in str(x) else 0

    @staticmethod
    def __artist_count(x):
        """Count separator tokens ('and', ',', 'feat', '&') in the artist string.

        Returns 0 for the 'no_artist' placeholder. Note that this is a count of
        separators found by plain substring search, not a count of names.
        """
        return 0 if x == 'no_artist' else x.count('and') + x.count(',') + x.count('feat') + x.count('&')

    @staticmethod
    def __song_lang_boolean(x):
        """Return 1 when the language value renders as 17.0 or 45.0, else 0."""
        # is song language 17 or 45.
        return 1 if '17.0' in str(x) or '45.0' in str(x) else 0

    @staticmethod
    def __parse_splitted_category_to_number(x):
        """Return how many entries a multi-valued field holds (0 when missing).

        A value is missing when it is null or equals the 'nan' placeholder that
        ``parse`` writes before calling this helper.
        """
        if pd.isnull(x) or x == 'nan':
            return 0
        x = str(x)
        # str.replace returns a new string, so the result must be re-assigned;
        # otherwise only pre-existing '|' characters would be counted.
        for separator in SongsProcessor.__SEPARATORS:
            x = x.replace(separator, '|')
        return x.count('|') + 1

    @staticmethod
    def __one_hot_encode_lang(x):
        """Return 1 when the language code is -1, 17 or 45, else 0."""
        return 1 if x in [-1, 17, 45] else 0

## SongExtraProcessor

In [5]:
class SongExtraProcessor(DataProcessor):
    """Derives release-year features from the ISRC code of the song-extra table."""

    def __init__(self):
        super(SongExtraProcessor, self).__init__()

    def parse(self, df):
        """Replace ``name`` / ``isrc`` with ``song_year`` and ``1h_song_year``.

        ``df`` is modified and returned.

        :param df: song-extra dataframe with ``name`` and ``isrc`` columns.
        :return: the same dataframe without ``name`` / ``isrc`` and with the
            two year columns added.
        :raises AssertionError: if any missing value remains.
        """
        df['song_year'] = df['isrc'].apply(SongExtraProcessor.__transform_isrc_to_year)
        df.drop(['name', 'isrc'], axis=1, inplace=True)

        # TODO: country / registration / designation features derived from the
        # ISRC were sketched here but are not implemented.

        # Computed before the fill below, so songs without a year get 0 here
        # even though their song_year is then set to 2017.
        df['1h_song_year'] = df['song_year'].apply(SongExtraProcessor.__one_hot_encode_year)

        # Assigning back (instead of `fillna(inplace=True)` on a column
        # selection) keeps this working under pandas Copy-on-Write.
        df['song_year'] = df['song_year'].fillna(2017)

        # `not` (rather than `~`) stays correct even for a plain Python bool.
        assert not df.isnull().any().any(), 'There exists missing data!'

        return df

    @staticmethod
    def __transform_isrc_to_year(isrc):
        """Return the four-digit year encoded in ``isrc``, or NaN if it is not a string.

        The two characters at positions 5-6 are read as a two-digit year; values
        above 17 are mapped to 19xx, the rest to 20xx.
        """
        if not isinstance(isrc, str):
            return np.nan
        # this year 2017
        suffix = int(isrc[5:7])
        return 1900 + suffix if suffix > 17 else 2000 + suffix

    @staticmethod
    def __one_hot_encode_year(x):
        """Return 1 when the year lies in 2013..2017 inclusive, else 0 (NaN gives 0)."""
        return 1 if 2013 <= float(x) <= 2017 else 0

## MembersProcessor

In [6]:
class MembersProcessor(DataProcessor):
    """Cleans the members table and derives membership-date features."""

    def __init__(self):
        super(MembersProcessor, self).__init__()

    def parse(self, df):
        """Fill missing values in ``df`` and add the member features.

        ``df`` is modified and returned. ``registration_init_time`` and
        ``expiration_date`` must already be datetime columns.

        :param df: members dataframe.
        :return: the same dataframe with the feature columns added and
            ``registration_init_time`` removed.
        :raises AssertionError: if any missing value remains.
        """
        # fill missing data
        # Assigning back (instead of `fillna(inplace=True)` on a column
        # selection) keeps this working under pandas Copy-on-Write.
        df['gender'] = df['gender'].fillna('nan')

        # feature engineering
        df['membership_days'] = df['expiration_date'].subtract(df['registration_init_time']).dt.days.astype(int)

        df['registration_year'] = df['registration_init_time'].dt.year
        df['registration_month'] = df['registration_init_time'].dt.month

        df['expiration_year'] = df['expiration_date'].dt.year
        df['expiration_month'] = df['expiration_date'].dt.month

        # useless feature
        df.drop(['registration_init_time'], axis=1, inplace=True)

        df['bd'] = df['bd'].apply(MembersProcessor.__transform_bd_outliers)
        df['1h_via'] = df['registered_via'].apply(MembersProcessor.__one_hot_encode_via)

        # `not` (rather than `~`) stays correct even for a plain Python bool.
        assert not df.isnull().any().any(), 'There exists missing data!'

        return df

    @staticmethod
    def __transform_bd_outliers(bd):
        """Return ``bd`` unchanged when plausible, else the string 'nan'.

        Values <= 7, >= 120 or more than three standard deviations from the
        hard-coded mean are treated as outliers. Because the placeholder is a
        string, the resulting column mixes numbers and strings.
        """
        # figure is from "exploration"
        if bd >= 120 or bd <= 7:
            return 'nan'
        mean = 28.99737187910644
        std = 9.538470787507382
        return bd if abs(bd - mean) <= 3 * std else 'nan'

    @staticmethod
    def __one_hot_encode_via(x):
        """Return 0 when ``registered_via`` equals 4, else 1."""
        return 0 if x == 4 else 1

## EngineeringProcessor

In [7]:
class EngineeringProcessor(DataProcessor):
    """Adds count features computed over a reference frame.

    The reference frame supplies the statistics (plays per song, distinct
    tracks per artist, distinct languages per artist); ``parse`` joins them
    onto the frame being processed.
    """

    def __init__(self, ref_df):
        """
        :param ref_df: dataframe with ``song_id``, ``artist_name`` and
            ``language`` columns from which the counts are computed.
        """
        super(EngineeringProcessor, self).__init__()
        self._ref_df = ref_df

    def parse(self, df):
        """Return ``df`` with ``play_count``, ``track_count`` and ``cover_lang`` added."""
        df = self.generate_play_count(df)
        df = self.generate_track_count(df)
        df = self.generate_cover_lang(df)

        return df

    def generate_play_count(self, df):
        """Add ``play_count``: number of reference rows for each ``song_id`` (0 if unseen)."""
        count_df = self._ref_df['song_id'].value_counts().reset_index()
        # Renamed positionally, so this works whatever names reset_index() chose.
        count_df.columns = ['song_id', 'play_count']

        df = df.merge(count_df, on='song_id', how='left')
        # Assigning back (instead of `fillna(inplace=True)` on a column
        # selection) keeps this working under pandas Copy-on-Write.
        df['play_count'] = df['play_count'].fillna(0)

        return df

    def generate_track_count(self, df):
        """Add ``track_count``: number of distinct reference songs per artist (0 if unseen)."""
        track_count_df = self._ref_df[['song_id', 'artist_name']].drop_duplicates('song_id')
        track_count_df = track_count_df.groupby('artist_name').agg('count').reset_index()
        track_count_df.columns = ['artist_name', 'track_count']

        # The per-artist table is joined directly. An intermediate aggregation of
        # `target` per artist used to sit in between, but none of its columns
        # reached the result, and every artist in track_count_df also occurs in
        # the reference frame, so the joined values are the same.
        df = df.merge(track_count_df, on='artist_name', how='left')
        df['track_count'] = df['track_count'].fillna(0)

        return df

    def generate_cover_lang(self, df):
        """Add ``cover_lang``: number of distinct languages per artist (0 if unseen)."""
        cover_lang_df = self._ref_df[['artist_name', 'language']].drop_duplicates(['artist_name', 'language'])
        cover_lang_df = cover_lang_df['artist_name'].value_counts().reset_index()
        cover_lang_df.columns = ['artist_name', 'cover_lang']

        df = df.merge(cover_lang_df, on='artist_name', how='left')
        df['cover_lang'] = df['cover_lang'].fillna(0)

        return df

## <font color='red'>[obsolete] SimilarityProcessor</font>

In [8]:
class SimilarityProcessor(DataProcessor):
    """[obsolete] Maps members / songs unseen in training to their most similar known one.

    Members and songs are one-hot / numerically vectorised with a
    ``DictVectorizer``; for every id that appears in the test frame but not in
    the train frame, the known id with the highest cosine similarity is looked
    up. Ids missing from the members / songs tables are mapped to ``'new'``.
    """

    __MEMBERS_FEATURE = ['city', 'bd', 'gender', 'registered_via', 'expiration_date', 'membership_days',
                         'registration_year', 'registration_month',
                         'expiration_year', 'expiration_month']

    __SONGS_FEATURE = ['genre_ids', 'artist_name', 'language', 'composer', 'lyricist', 'song_year']

    def __init__(self, songs_df, members_df):
        """
        :param songs_df: songs dataframe containing ``song_id`` and the columns
            listed in ``__SONGS_FEATURE``.
        :param members_df: members dataframe containing ``msno`` and the columns
            listed in ``__MEMBERS_FEATURE``.
        """
        super(SimilarityProcessor, self).__init__()
        self._songs_df = songs_df
        self._members_df = members_df

    def parse(self, df):
        """Compute the unknown-id mappings.

        :param df: two-element sequence ``[train_df, test_df]``.
        :return: tuple ``(unknown_msno_map, unknown_song_map)`` of dicts.
        """
        train_df = df[0]
        test_df = df[1]

        return self.__compute_msno_song_similarity(train_df, test_df)

    class ColumnSelector(BaseEstimator, TransformerMixin):
        """Pipeline step that turns selected dataframe columns into a list of row dicts."""

        def __init__(self, columns):
            self.columns = columns

        def fit(self, X, y=None):
            return self

        def transform(self, X):
            # 'records' must be spelled out; the abbreviated 'record' is
            # rejected by pandas 2.x.
            return X[self.columns].to_dict(orient='records')

    def __compute_msno_song_similarity(self, train, test):
        """Return ``(unknown_msno_map, unknown_song_map)`` for ids in ``test`` but not in ``train``.

        Object-typed columns of ``train`` (and the same-named columns of
        ``test``) are converted to ``category`` in place.
        """
        for col in train.columns:
            if train[col].dtype == object:
                train[col] = train[col].astype('category')
                test[col] = test[col].astype('category')
        # pipeline
        member_pipeline = Pipeline([
                ('extract', SimilarityProcessor.ColumnSelector(SimilarityProcessor.__MEMBERS_FEATURE)),
                ('dicVect', DictVectorizer())])
        song_pipeline = Pipeline([
                ('extract', SimilarityProcessor.ColumnSelector(SimilarityProcessor.__SONGS_FEATURE)),
                ('dicVect', DictVectorizer())])

        # ? songs = self.songs.merge(self.extra, on='song_id', how='left').fillna('test')
        songs_df = self._songs_df
        members_df = self._members_df.fillna('test')
        # id -> row position in the vectorised matrices
        msno_x = {v: i for i, v in enumerate(members_df.msno)}
        song_x = {v: i for i, v in enumerate(songs_df.song_id)}

        # transform members_df
        start = time.time()
        msno_m = member_pipeline.fit_transform(members_df)
        logging.debug("transform members_df in %0.2fs" % (time.time() - start))

        # transform songs_df
        start = time.time()
        song_m = song_pipeline.fit_transform(songs_df)
        logging.debug("transform songs_df in %0.2fs" % (time.time() - start))

        known_msno = set(train.msno.unique())
        unknown_msno = list(set(test.msno.unique()) - known_msno)
        total_msno = float(len(unknown_msno))

        known_song = set(train.song_id.unique())
        unknown_song = list(set(test.song_id.unique()) - known_song)
        total_song = float(len(unknown_song))

        unknown_msno_map, unknown_song_map = {}, {}

        start = time.time()
        # boolean masks: is the member / song at this row present in train?
        known_msno_list = members_df.msno.apply(lambda x: x in known_msno)
        known_song_list = songs_df.song_id.apply(lambda x: x in known_song)
        logging.debug("establish known list in %0.2fs" % (time.time() - start))

        # ? Parallel(n_jobs=6)(delayed(self._get_unknown_map)
        #                    (i, members.msno, known_msno_list, True) for i in unknown_msno)

        start = time.time()
        n = 0
        for i in unknown_msno:
            if i in msno_x:
                df = SimilarityProcessor.__get_rank(msno_m, msno_x[i], members_df.msno, known_msno_list)
                # NOTE(review): raises IndexError if no known member exists.
                unknown_msno_map[i] = df.iloc[0]['id']
            else:
                unknown_msno_map[i] = 'new'
            n += 1
            # report after every 100 processed ids
            if n % 100 == 0:
                print('msno: %f %%' % ((n/total_msno) * 100))

        n = 0
        for i in unknown_song:
            if i in song_x:
                df = SimilarityProcessor.__get_rank(song_m, song_x[i], songs_df.song_id, known_song_list)
                # NOTE(review): raises IndexError if no known song exists.
                unknown_song_map[i] = df.iloc[0]['id']
            else:
                unknown_song_map[i] = 'new'
            n += 1
            # report after every 100 processed ids
            if n % 100 == 0:
                print('song: %f %%' % ((n/total_song) * 100))

        logging.debug("transform all unknown data in %0.2fs" % (time.time() - start))
        return unknown_msno_map, unknown_song_map

    @staticmethod
    def __get_rank(model, w, id_list, known_list):
        """Rank the known ids by cosine similarity to row ``w`` of ``model``.

        :param model: vectorised feature matrix, one row per id.
        :param w: row position of the query id.
        :param id_list: ids aligned with the rows of ``model``.
        :param known_list: boolean mask aligned with ``id_list``.
        :return: dataframe of the known ids, most similar first.
        """
        result = cosine_similarity(model, model[w].toarray().reshape(1, -1)).reshape(1, -1)[0]
        r = pd.DataFrame({'id': id_list, 'similarity': result, 'known': known_list})
        return r[r.known].sort_values(by='similarity', ascending=False).reset_index(drop=True)

# debug go

## FeatureProducer

In [9]:
class FeatureProducer(object):
    """Drives the pipeline: load raw CSVs, pre-process, merge, engineer features.

    Call order: ``load_raw`` -> ``pre_process`` -> ``feature_engineering``.
    Each stage records a flag in ``_state`` and the following stage refuses to
    run until the flag of its prerequisite is set.
    """

    __SONGS_FILE_NAME = 'songs.csv'
    __SONG_EXTRA_FILE_NAME = 'song_extra_info.csv'
    __MEMBERS_FILE_NAME = 'members.csv'
    __TRAIN_FILE_NAME = 'train.csv'
    __TEST_FILE_NAME = 'test.csv'

    # bit flags accumulated in self._state
    __INITIALIZATION_READY = (1 << 0)
    __LOAD_READY = (1 << 1)
    __PREPROCESS_READY = (1 << 2)
    __ENGINEERING_READY = (1 << 3)
    __SIMILARITY_MAPPING_READY = (1 << 4)

    def __init__(self, root='./data'):
        """
        :param root: directory holding the raw CSV files; a leading ``~`` is expanded.
        :raises AssertionError: if the directory does not exist.
        """
        # Expand "~" before the existence check, otherwise "~/data" never passes.
        root = os.path.expanduser(root)
        assert os.path.exists(root), '%s not exists!' % root
        self._root = root

        self._songs_df = None
        self._song_extra_info_df = None
        self._members_df = None
        self._train_df = None
        self._test_df = None
        self._comb_df = None
        self._unknown_msno_map = None
        self._unknown_song_map = None
        self._state = FeatureProducer.__INITIALIZATION_READY

    def __require_state(self, flag, message):
        """Log ``message`` and raise AssertionError unless ``flag`` is set in the state."""
        if not (self._state & flag) > 0:
            # logging.error() returns None, so the message is logged and raised
            # separately rather than being used as the assert message.
            logging.error(message)
            raise AssertionError(message)

    def load_raw(self):
        """
        Load all raw data under the directory specified.
        Call this function right after initialization.

        :return:
        """

        self.__require_state(FeatureProducer.__INITIALIZATION_READY, "Please reconstruct new class")

        start = time.time()

        # load train & test set
        self._train_df = pd.read_csv(os.path.join(self._root, self.__TRAIN_FILE_NAME))
        self._test_df = pd.read_csv(os.path.join(self._root, self.__TEST_FILE_NAME))

        # load song & member set
        self._songs_df = pd.read_csv(os.path.join(self._root, self.__SONGS_FILE_NAME))
        self._song_extra_info_df = pd.read_csv(os.path.join(self._root, self.__SONG_EXTRA_FILE_NAME))
        self._members_df = pd.read_csv(os.path.join(self._root, self.__MEMBERS_FILE_NAME),
                                       parse_dates=['registration_init_time', 'expiration_date'])

        self._state |= FeatureProducer.__LOAD_READY
        logging.info("load raw data in %0.2fs" % (time.time() - start))

    def pre_process(self):
        """
        Pre-process all dataframes and merge them into "train_df" and "test_df".
        Call this function after calling "load_raw"

        :return:
        """

        self.__require_state(FeatureProducer.__LOAD_READY, "Please load raw data first")

        # pre-process all data-frame
        # The test set uses the already processed train set as its reference.
        self._train_df = DataProcessor.process(self._train_df, 'train', self._train_df)
        self._test_df = DataProcessor.process(self._test_df, 'test', self._train_df)
        self._members_df = DataProcessor.process(self._members_df, 'members')
        self._songs_df = DataProcessor.process(self._songs_df, "songs")
        self._song_extra_info_df = DataProcessor.process(self._song_extra_info_df, "song_extra_info")

        # merge all data-frame
        self._songs_df = self._songs_df.merge(self._song_extra_info_df, on='song_id', how='left')

        self._train_df = self._train_df.merge(self._songs_df, on='song_id', how='left')
        self._test_df = self._test_df.merge(self._songs_df, on='song_id', how='left')

        self._train_df = self._train_df.merge(self._members_df, on='msno', how='left')
        self._test_df = self._test_df.merge(self._members_df, on='msno', how='left')

        # DataFrame.append was removed in pandas 2.0; concat is the equivalent.
        self._comb_df = pd.concat([self._train_df, self._test_df])

        self._state |= FeatureProducer.__PREPROCESS_READY

    def feature_engineering(self):
        """
        Do the advanced feature engineering.
        Call this function after calling "pre_process"

        :return:
        """

        self.__require_state(FeatureProducer.__PREPROCESS_READY, "Please pre-process raw data first")

        # Train statistics come from the train set only; test statistics come
        # from the combined train + test frame built in pre_process.
        self._train_df = DataProcessor.process(self._train_df, 'engineering', self._train_df)
        self._test_df = DataProcessor.process(self._test_df, 'engineering', self._comb_df)

        self._state |= FeatureProducer.__ENGINEERING_READY

    def compute_msno_song_similarity(self):
        """
        Compute, for every member / song that appears in the test set but not
        in the train set, the most similar known member / song and store the
        two mappings in "_unknown_msno_map" and "_unknown_song_map".
        Call this function after calling "load_raw"

        NOTE(review): SimilarityProcessor selects columns such as 'song_year'
        and 'membership_days' that are only created by "pre_process", while the
        guard below only requires "load_raw" - confirm the intended ordering.

        :return:
        """

        self.__require_state(FeatureProducer.__LOAD_READY, "Please load raw data first")

        self._unknown_msno_map, self._unknown_song_map = \
            SimilarityProcessor(self._songs_df, self._members_df).parse([self._train_df, self._test_df])

        self._state |= FeatureProducer.__SIMILARITY_MAPPING_READY

    @property
    def train_df(self):
        return self._train_df

    @property
    def test_df(self):
        return self._test_df

    @property
    def members_df(self):
        return self._members_df

    @property
    def songs_df(self):
        return self._songs_df

    @property
    def song_extra_info_df(self):
        return self._song_extra_info_df

    @property
    def comb_df(self):
        return self._comb_df

# main

## init

In [10]:
# Point the producer at the raw-data directory (path is relative to the working directory).
fp = FeatureProducer(root='../data')

## load

In [11]:
# Read the train, test, songs, song-extra and members CSV files into memory.
fp.load_raw()

18:29:27 INFO << load raw data in 19.62s


## preprocess

In [12]:
# Clean every table, then merge song and member attributes into the train / test frames.
fp.pre_process()

18:29:57 INFO << parse train_df in 30.18s
18:30:12 INFO << parse test_df in 14.84s
18:30:12 INFO << parse members_df in 0.28s
18:30:32 INFO << parse songs_df in 19.79s
18:30:35 INFO << parse song_extra_info_df in 2.97s


## feature engineering

In [13]:
# Add the play_count, track_count and cover_lang columns to the train and test frames.
fp.feature_engineering()

18:31:29 INFO << parse engineering_df in 26.97s
18:31:49 INFO << parse engineering_df in 19.49s


## <font color='red'>compute_msno_song_similarity</font>

In [14]:
# fp.compute_msno_song_similarity()

# implicit

In [15]:
# Show an explicit preview instead of handing the whole training frame to the renderer.
fp.train_df.head(10)

Unnamed: 0,msno,song_id,source_system_tab,source_screen_name,source_type,target,source_merged,1h_source,1h_system_tab,1h_screen_name,...,expiration_date,membership_days,registration_year,registration_month,expiration_year,expiration_month,1h_via,play_count,track_count,cover_lang
0,FGtllVqz18RPiwJj/edr2gV78zirAiY/9SmYvia+kCg=,BBzumQNXUHKdEBOB7mAJuzok+IJA1c2Ryg/yzTF6tik=,explore,Explore,online-playlist,1,explore | Explore | online-playlist,0,0,0,...,2017-10-05,2103,2012,1,2017,10,1,215,84.0,1.0
1,Xumu+NIjS6QYVxDS4/t3SawvJ7viT9hPKXmf0RtLNx8=,bhp/MpSNoqoxOIB+/l8WPqu6jldth4DIpCm3ayXnJqM=,my library,Local playlist more,local-playlist,1,my library | Local playlist more | local-playlist,1,1,1,...,2017-09-11,2301,2011,5,2017,9,1,1,17080.0,10.0
2,Xumu+NIjS6QYVxDS4/t3SawvJ7viT9hPKXmf0RtLNx8=,JNWfrrC7zNN7BdMpsISKa4Mw+xVJYNnxXh3/Epw7QgY=,my library,Local playlist more,local-playlist,1,my library | Local playlist more | local-playlist,1,1,1,...,2017-09-11,2301,2011,5,2017,9,1,4,61.0,2.0
3,Xumu+NIjS6QYVxDS4/t3SawvJ7viT9hPKXmf0RtLNx8=,2A87tzfnJTSWqD7gIZHisolhe4DMdzkbd6LzO1KHjNs=,my library,Local playlist more,local-playlist,1,my library | Local playlist more | local-playlist,1,1,1,...,2017-09-11,2301,2011,5,2017,9,1,1,1.0,1.0
4,FGtllVqz18RPiwJj/edr2gV78zirAiY/9SmYvia+kCg=,3qm6XTZ6MOCU11x8FIVbAGH5l5uMkT3/ZalWG1oo2Gc=,explore,Explore,online-playlist,1,explore | Explore | online-playlist,0,0,0,...,2017-10-05,2103,2012,1,2017,10,1,412,6.0,1.0
5,FGtllVqz18RPiwJj/edr2gV78zirAiY/9SmYvia+kCg=,3Hg5kugV1S0wzEVLAEfqjIV5UHzb7bCrdBRQlGygLvU=,explore,Explore,online-playlist,1,explore | Explore | online-playlist,0,0,0,...,2017-10-05,2103,2012,1,2017,10,1,1108,15.0,1.0
6,Xumu+NIjS6QYVxDS4/t3SawvJ7viT9hPKXmf0RtLNx8=,VkILU0H1h3NMmk9MQrXouNudGk5n8Ls5cqRRuBxeTh4=,my library,Local playlist more,local-playlist,1,my library | Local playlist more | local-playlist,1,1,1,...,2017-09-11,2301,2011,5,2017,9,1,3869,43.0,1.0
7,FGtllVqz18RPiwJj/edr2gV78zirAiY/9SmYvia+kCg=,bPIvRTzfHxH5LgHrStll+tYwSQNVV8PySgA3M1PfTgc=,explore,Explore,online-playlist,1,explore | Explore | online-playlist,0,0,0,...,2017-10-05,2103,2012,1,2017,10,1,287,33.0,1.0
8,uHqAtShXTRXju5GE8ri3ITsVFepPf8jUoCF7ffNOuqE=,/bU6IRSK+YNlNbaTkxo7bhsb2EDLPrnksdX3ggcZNhI=,my library,Local playlist more,local-library,1,my library | Local playlist more | local-library,1,1,1,...,2018-03-04,2309,2011,11,2018,3,1,31,155.0,2.0
9,uHqAtShXTRXju5GE8ri3ITsVFepPf8jUoCF7ffNOuqE=,EbI7xoNxI+3QSsiHxL13zBdgHIJOwa3srHd7cDcnJ0g=,my library,Local playlist more,local-library,1,my library | Local playlist more | local-library,1,1,1,...,2018-03-04,2309,2011,11,2018,3,1,738,155.0,2.0
