### Importing Modules

In [None]:
"""
Importing modules
"""
import numpy as np
from sklearn import preprocessing
import datetime
import re
import math
import random
import pickle as pkl
from IPython import get_ipython
#get_ipython().run_line_magic('pylab inline', 'config InlineBackend.figure_formats = ['retina']'

import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
#sns.set()

from collections import defaultdict

### Reduce Memory Usage

In [None]:
def reduce_mem_usage(props):
    """Downcast the numeric columns of ``props`` to the smallest fitting dtype.

    Every bool, integer and float column is inspected; all other columns
    (strings, datetimes, timedeltas, categoricals, ...) are left untouched.

    * Missing values in a numeric column are replaced by ``column min - 1``
      so that the column can be stored as an integer.
    * A column whose values are all whole numbers is cast to the narrowest
      unsigned (when its minimum is >= 0) or signed integer type that holds
      its range.  The minimum used here accounts for the fill value, so a
      filled ``-1`` never ends up in an unsigned column.
    * Any other numeric column is cast to ``float32``.

    The frame is modified in place and also returned.  Progress and the
    before/after memory footprint are printed.

    Parameters
    ----------
    props : pandas.DataFrame
        Frame whose columns should be shrunk.

    Returns
    -------
    pandas.DataFrame
        The same frame, with downcast columns.
    """
    start_mem_usg = props.memory_usage().sum() / 1024**2
    print("Memory usage of properties dataframe is :",start_mem_usg," MB")
    for col in props.columns:
        # dtype kinds: b=bool, i=signed int, u=unsigned int, f=float.
        if props[col].dtype.kind not in "biuf":
            continue

        # Print current column type
        print("******************************")
        print("Column: ",col)
        print("dtype before: ",props[col].dtype)

        mx = props[col].max()
        mn = props[col].min()

        # Integer does not support NA, therefore, NA needs to be filled.
        # Assign the result back instead of filling a temporary in place.
        if props[col].isna().any():
            fill_value = mn - 1
            props[col] = props[col].fillna(fill_value)
            # The fill value is the new minimum; the sign decision below
            # must see it.
            mn = fill_value

        values = props[col]
        if values.dtype.kind in "biu":
            is_int = True
        else:
            # Compare element-wise: summing the differences would let
            # fractional parts of opposite sign cancel out.  Columns that
            # still hold NaN (all-NaN input) or inf stay floating point.
            is_int = bool(np.isfinite(values).all()) and bool(
                (values == values.astype(np.int64)).all()
            )

        if is_int:
            # Make Integer/unsigned Integer datatypes
            if mn >= 0:
                candidates = (np.uint8, np.uint16, np.uint32, np.uint64)
            else:
                candidates = (np.int8, np.int16, np.int32, np.int64)
            for candidate in candidates:
                bounds = np.iinfo(candidate)
                if bounds.min <= mn and mx <= bounds.max:
                    props[col] = props[col].astype(candidate)
                    break
        else:
            # Make float datatypes 32 bit
            props[col] = props[col].astype(np.float32)

        # Print new column type
        print("dtype after: ",props[col].dtype)
        print("******************************")

    # Print final result
    print("___MEMORY USAGE AFTER COMPLETION:___")
    mem_usg = props.memory_usage().sum() / 1024**2
    print("Memory usage is: ",mem_usg," MB")
    print("This is ",100*mem_usg/start_mem_usg,"% of the initial size")
    return props

### Preparing Test Dataframe

In [None]:
# Load the raw members table from its absolute path on the project machine.
df_members = pd.read_csv('/home/ubuntu/Project_3/kkbox-music-recommendation-challenge/members.csv')

In [None]:
# Downcast the numeric columns of the members frame.
# NOTE: reduce_mem_usage also replaces NaNs in numeric columns with (column min - 1).
df_members = reduce_mem_usage(df_members)

In [None]:
"""
Parameter df_members: Members dataframe
Convert the registration and expiration dates to datetime Series.
Add new columns with the duration of the user's membership.
Zeros in the 'bd' column (age) are replaced in a later cell with the mean age of users who share the same registration method and genre.

Returns: Members DataFrame
"""
# Parse the integer YYYYMMDD columns into real datetimes and drop the raw ones.
df_members['Registration'] = pd.to_datetime(df_members['registration_init_time'], format = '%Y%m%d')
df_members['Expiration Date'] = pd.to_datetime(df_members['expiration_date'], format = '%Y%m%d')
df_members.drop(columns = ['registration_init_time','expiration_date'], inplace = True)

# Mark unknown gender with -1.  Assign the result back: an in-place replace on
# a selected column may act on a temporary, and np.NaN no longer exists in
# NumPy 2.0.
df_members['gender'] = df_members['gender'].fillna(-1)

#Compute registered timeframe
df_members['Registered Timeframe (days)'] = (
    df_members['Expiration Date'] - df_members['Registration']
).dt.days

# "Today" is taken to be the most recent registration date in the data.
timestamp = df_members['Registration'].max()
df_members['Registration_to_today'] = (timestamp - df_members['Registration']).dt.days

# Active time is the membership length, capped at the time elapsed between
# registration and "today".
# NOTE(review): the previous chained double assignment left this column equal
# to 'Registered Timeframe (days)' for every row; the cap below is the
# presumed intent - confirm.
df_members['Active Timeframe'] = df_members['Registered Timeframe (days)'].where(
    df_members['Registered Timeframe (days)'] < df_members['Registration_to_today'],
    df_members['Registration_to_today'],
)

df_members = reduce_mem_usage(df_members)
# Cache the prepared frame; the context manager closes the file handle.
with open("members.pkl", "wb") as file:
    pkl.dump(df_members, file)
del df_members

In [None]:
# Reload the cached members frame; the context manager closes the file handle
# that a bare open() would leave dangling.
with open("members.pkl", "rb") as file:
    df_members = pkl.load(file)
# Raw training interactions and song metadata.
df_train = pd.read_csv('/home/ubuntu/Project_3/kkbox-music-recommendation-challenge/train.csv')
df_songs = pd.read_csv('/home/ubuntu/Project_3/kkbox-music-recommendation-challenge/songs.csv')

In [None]:
# Reduce memory usage: downcast the numeric columns of both frames before they
# are merged with the members frame in the next cell.
# NOTE: reduce_mem_usage also replaces NaNs in numeric columns with (column min - 1).
df_train = reduce_mem_usage(df_train)
df_songs = reduce_mem_usage(df_songs)

In [None]:
# One row per (msno, song_id) pair from the training data, enriched with the
# member's profile and the song's genre ids.
df_members = df_train[['msno', 'song_id']].merge(df_members, on = 'msno')
del df_train
df_members = df_members.merge(df_songs[['song_id', 'genre_ids']], on = 'song_id')
del df_songs

# Mean age of rows with a non-zero age, per (registration method, genre).
group_keys = ['registered_via', 'genre_ids']
age_mean = (
    df_members[df_members.bd != 0]
    .groupby(group_keys)['bd']
    .mean()
    .rename('bd_group_mean')
)

# Look up each row's group mean.  join() keeps the row index of df_members,
# so the result lines up with the 'bd' column.
group_age = df_members[group_keys].join(age_mean, on = group_keys)['bd_group_mean']

# Replace zero ages in EVERY group that has a mean.  The previous loop only
# wrote the values of the last group because its inner loop was dedented.
# Rows whose group has no non-zero age keep bd == 0.
missing_age = (df_members['bd'] == 0) & group_age.notna()
df_members['bd'] = df_members['bd'].mask(missing_age, group_age)

# Cache the merged frame; the context manager closes the file handle.
with open("members.pkl", "wb") as file:
    pkl.dump(df_members, file)
del df_members

In [None]:
# Reload the cached frame: df_members is deleted right after it is pickled, so
# it must be read back before it can be used again.
with open('members.pkl', 'rb') as file:
    df_members = pkl.load(file)
# Reduced copy without the expiration date.
# NOTE: this copy is not written back to members.pkl.
df_members1 = reduce_mem_usage(df_members.drop(columns = 'Expiration Date'))

In [None]:
# Bare expression: shows the members frame as notebook cell output.
df_members

In [None]:
#Take out song id, isrc

In [None]:
#Label encoding for msno and song_id
#df_members = pkl.load(open('members.pkl','rb'))

#le = preprocessing.LabelEncoder()
#le.fit(df_members['msno'])
#df_members['msno']=le.transform(df_members['msno'])

#le = preprocessing.LabelEncoder()
#le.fit(df_members['song_id'])
#df_members['song_id'] = le.transform(df_members['song_id'])

#df_members = reduce_mem_usage(df_members)
#pkl.dump(df_members, open( "members.pkl", "wb" ) )

In [None]:
#Computing the time the user has been active in the music streaming service

# Work on an explicit copy so the column assignments below do not target a
# slice of the original frame.
df_members = df_members[['msno', 'song_id','Registration', 'Registered Timeframe (days)']].copy()

# "Today" is taken to be the most recent registration date in the data.
timestamp = df_members['Registration'].max()
df_members['Registration_to_today'] = (timestamp - df_members['Registration']).dt.days

# Active time is the membership length, capped at the time elapsed between
# registration and "today".
# NOTE(review): the previous chained double assignment left this column equal
# to 'Registered Timeframe (days)' for every row; the cap below is the
# presumed intent - confirm.
df_members['Active Timeframe'] = df_members['Registered Timeframe (days)'].where(
    df_members['Registered Timeframe (days)'] < df_members['Registration_to_today'],
    df_members['Registration_to_today'],
)

df_members = reduce_mem_usage(df_members)
with open('msno_active_timeframe_eda.pkl', "wb") as file:
    pkl.dump(df_members[['msno', 'song_id','Registration', 'Registration_to_today', 'Active Timeframe']], file)

In [None]:
# Read the cached members frame; the context manager closes the file handle
# that a bare open() would leave dangling.
with open("members.pkl", "rb") as file:
    df_members = pkl.load(file)
# Export only the (member, song) pairs; the row index is written as well.
df_members1 = df_members[['msno', 'song_id']]
df_members1.to_csv("members_msno")
del df_members, df_members1

In [None]:
"""
Add timestamp to each index
"""
with open("msno_active_timeframe_eda.pkl", "rb") as file:
    df_members = pkl.load(file)

# Spread each member's rows evenly over the active timeframe:
# days between two consecutive rows = active days / number of rows of that member.
rows_per_member = df_members.groupby(['msno'])['msno'].transform('count')
df_members['days_between_songs'] = (df_members['Active Timeframe'] / rows_per_member).round(0)
print('Days between songs have been added')

# 0-based position of every row among the rows of the same member, in frame
# order.  This replaces a per-row Python loop that re-filtered the whole frame
# on every iteration and wrote through chained indexing.
row_order = df_members.groupby('msno').cumcount()

# The k-th row of a member is stamped k * days_between_songs after registration.
df_members['Timestamp'] = df_members['Registration'] + pd.to_timedelta(
    df_members['days_between_songs'] * row_order, unit = 'D'
)

with open('msno_timestamp.pkl', "wb") as file:
    pkl.dump(df_members[['msno', 'song_id', 'Timestamp']], file)