
# Data preprocessing

## ICM

### Load data

In [1]:
import numpy as np
import pandas as pd

In [2]:
%cd ..

/Users/fulco/Desktop/recommender-systems-challenge


In [3]:
data_path = "Data/"

In [4]:
def load_icm(icm_file, feature='Feature'):
    df_original = pd.read_csv(filepath_or_buffer=data_path + icm_file, sep=',', header=0,
                              dtype={'ItemID': np.int32, 'Feature': np.int32, 'Data': np.int32})
    df_original.columns = ['ItemID', feature, 'Data']
    return df_original

In [5]:
def load_all_icms():
    ICM_channel = load_icm("data_ICM_channel.csv", feature='Channel')
    ICM_event = load_icm("data_ICM_event.csv", feature='Event')
    ICM_genre = load_icm("data_ICM_genre.csv", feature='Genre')
    ICM_subgenre = load_icm("data_ICM_subgenre.csv", feature='Subgenre')

    return ICM_channel, ICM_event, ICM_genre, ICM_subgenre

In [6]:
ICM_channel, ICM_event, ICM_genre, ICM_subgenre = load_all_icms()

### Example of merging with subsets of two ICMs

In [7]:
s = ICM_subgenre.head(10)
g = ICM_genre.head(10)

In [8]:
s.drop(columns=['Data'],inplace=True)
g.drop(columns=['Data'],inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(


In [9]:
s

Unnamed: 0,ItemID,Subgenre
0,0,70
1,1,27
2,2,40
3,3,50
4,4,62
5,5,3
6,6,99
7,7,102
8,8,62
9,9,61


In [10]:
g

Unnamed: 0,ItemID,Genre
0,0,3
1,1,3
2,2,6
3,3,0
4,3,1
5,4,4
6,5,3
7,6,7
8,7,6
9,8,4


In [11]:
pd.merge(s, g, how="outer", on='ItemID')

Unnamed: 0,ItemID,Subgenre,Genre
0,0,70,3.0
1,1,27,3.0
2,2,40,6.0
3,3,50,0.0
4,3,50,1.0
5,4,62,4.0
6,5,3,3.0
7,6,99,7.0
8,7,102,6.0
9,8,62,4.0


We have proved that it works, now let's apply it to all ICMs

### Merge all ICMs

In [12]:
data_frames = [ICM_channel, ICM_event, ICM_genre, ICM_subgenre]

In [13]:
for df in data_frames:
    df.drop(columns=['Data'],inplace=True)

In [14]:
from functools import reduce

df_merged = reduce(lambda  left,right: pd.merge(left,right,on=['ItemID'],
                                            how='outer'), data_frames)

In [16]:
df_merged['Data'] = 1

In [17]:
df_merged

Unnamed: 0,ItemID,Channel,Event,Genre,Subgenre,Data
0,0,23.0,121781.0,3.0,70.0,1
1,1,30.0,150102.0,3.0,27.0,1
2,1,30.0,349614.0,3.0,27.0,1
3,2,38.0,9174.0,6.0,40.0,1
4,2,38.0,20833.0,6.0,40.0,1
...,...,...,...,...,...,...
676322,18020,,202802.0,0.0,98.0,1
676323,18020,,241883.0,0.0,98.0,1
676324,18020,,285318.0,0.0,98.0,1
676325,7388,,,4.0,94.0,1


In [18]:
df_merged.to_csv(data_path + 'ICM_merged.csv')