In [1]:
import os
import random
import pickle

import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
import scipy.sparse as sp



In [2]:
# Input / output locations (relative to the notebook).
raw_dir = '../data/raw/'
pp_dir = '../data/interim/'

# Minimum interaction count used by the n-core filter below.
core = 20

# Fractions of the interaction set assigned to train / validation / test.
train_p, val_p, test_p = 0.7, 0.1, 0.2

# Seed passed to the pandas sampling calls in this notebook.
random_state = 420

In [3]:
# Descriptive (non-genre) fields, in the order they are passed to read_csv.
movie_info_columns = ['movie title', 'release date', 'video release date', 'IMDb URL']

# Genre fields; the sample output below shows them holding 0/1 values.
genre_columns = [
    'unknown', 'Action', 'Adventure', 'Animation', "Children's", 'Comedy',
    'Crime', 'Documentary', 'Drama', 'Fantasy', 'Film-Noir', 'Horror',
    'Musical', 'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western',
]

column_names = movie_info_columns + genre_columns
# Load the movie catalogue (pipe-separated, no header row).
#
# Each row starts with an id field that `column_names` does not list. Previously
# that field was picked up only because pandas promotes surplus leading fields
# to the index when `names` is too short, and `reset_index()` then turned it
# into a column called 'index'. Naming the field explicitly yields the same
# frame without relying on that implicit behaviour.
item_df = pd.read_csv(
    os.path.join(raw_dir, "ml-100k/u.item"),
    sep="|",
    encoding="ISO-8859-15",
    names=["index", *column_names],
).drop(columns=['video release date', 'IMDb URL'])
item_df.head()

Unnamed: 0,index,movie title,release date,unknown,Action,Adventure,Animation,Children's,Comedy,Crime,...,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,1,Toy Story (1995),01-Jan-1995,0,0,0,1,1,1,0,...,0,0,0,0,0,0,0,0,0,0
1,2,GoldenEye (1995),01-Jan-1995,0,1,1,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
2,3,Four Rooms (1995),01-Jan-1995,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3,4,Get Shorty (1995),01-Jan-1995,0,1,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
4,5,Copycat (1995),01-Jan-1995,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,1,0,0


In [4]:
# The three split fractions must cover the whole interaction set. Compare with
# a tolerance: exact `==` on a float sum can fail from rounding alone for
# perfectly valid fractions.
assert np.isclose(train_p + val_p + test_p, 1.0), (
    f"train/val/test fractions must sum to 1, got {train_p + val_p + test_p}"
)

## Load raw interactions

In [5]:
# Disabled: loader for three pre-split interaction CSVs. Nothing in this
# notebook reads these frames; interactions are loaded from ml-100k/u.data in
# a later cell and split there. NOTE(review): presumably left over from an
# earlier pipeline - confirm before deleting.
# int_train = pd.read_csv(os.path.join(raw_dir, 'interactions_train.csv'))
# int_validation = pd.read_csv(os.path.join(raw_dir, 'interactions_validation.csv'))
# int_test = pd.read_csv(os.path.join(raw_dir, 'interactions_test.csv'))

In [6]:
# Disabled: size check for the pre-split frames, which are no longer loaded.
# len(int_train), len(int_validation), len(int_test)

In [7]:
# Read every user-item rating from the tab-separated ratings file. The file
# has no header row, so the column labels are supplied here.
interaction_column_names = ['user_id', 'item_id', 'rating', 'timestamp']
interactions_path = raw_dir + "ml-100k/u.data"

int_all = pd.read_csv(interactions_path, sep="\t", names=interaction_column_names)
int_all.head()

Unnamed: 0,user_id,item_id,rating,timestamp
0,196,242,3,881250949
1,186,302,3,891717742
2,22,377,1,878887116
3,244,51,2,880606923
4,166,346,1,886397596


## Sample n-core interaction set

In [8]:
# Number of interactions recorded for each rating value.
int_all.groupby('rating')['rating'].count()

rating
1     6110
2    11370
3    27145
4    34174
5    21201
Name: rating, dtype: int64

In [9]:
# Build the n-core interaction set.
#   1. Among interactions rated 3 or higher, count interactions per user
#      (`num_i`) and per item (`num_u`).
#   2. Keep an interaction only if its user and its item both reach `core`
#      in those counts AND the interaction itself is rated strictly above 3.
# Note: the counts use `>= 3` while the kept rows use `> 3`.
rated_3_plus = int_all[int_all.rating >= 3]
per_user_counts = rated_3_plus.groupby('user_id')['item_id'].count().rename('num_i')
per_item_counts = rated_3_plus.groupby('item_id')['user_id'].count().rename('num_u')

int_core = (
    int_all
    .merge(per_user_counts, how='left', on='user_id')
    .merge(per_item_counts, how='left', on='item_id')
)
# .copy() makes the filtered result an independent frame, so the column
# assignments done later (id re-encoding) do not raise SettingWithCopyWarning.
int_core = int_core[
    (int_core.num_i >= core) & (int_core.num_u >= core) & (int_core.rating > 3)
].copy()
int_core.shape[0]

51963

In [10]:
# Distinct users and distinct items remaining in the `core`-core interaction set.
int_core.loc[:, ['user_id', 'item_id']].nunique()

user_id    845
item_id    811
dtype: int64

In [11]:
# Preview the filtered interactions; the helper count columns are still attached.
int_core.head(n=5)

Unnamed: 0,user_id,item_id,rating,timestamp,num_i,num_u
5,298,474,4,884182806,125,188.0
7,253,465,5,891628467,90,73.0
11,286,1014,5,879781125,243,72.0
12,200,222,5,876042340,204,328.0
16,122,387,5,879270459,56,52.0


## Remove unused films from database 

In [12]:
# Keep only the catalogue rows whose id appears in the core interaction set.
is_used_item = item_df['index'].isin(int_core['item_id'])
item_df = item_df.loc[is_used_item]
item_df.head()

Unnamed: 0,index,movie title,release date,unknown,Action,Adventure,Animation,Children's,Comedy,Crime,...,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,1,Toy Story (1995),01-Jan-1995,0,0,0,1,1,1,0,...,0,0,0,0,0,0,0,0,0,0
1,2,GoldenEye (1995),01-Jan-1995,0,1,1,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
2,3,Four Rooms (1995),01-Jan-1995,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3,4,Get Shorty (1995),01-Jan-1995,0,1,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
4,5,Copycat (1995),01-Jan-1995,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,1,0,0


## Pivot and create user-movie matrix

In [13]:
# Reshape the long interaction list into a wide table: one row per user, one
# column per item, the rating as the cell value. Pairs without an interaction
# come out of the pivot as NaN and are replaced by 0.
ratings_wide = int_core.pivot(index='user_id', columns='item_id', values='rating')
user_to_movie_df = ratings_wide.fillna(0)

user_to_movie_df.head()

item_id,1,2,3,4,5,6,7,8,9,10,...,1210,1217,1218,1220,1221,1226,1228,1240,1244,1267
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,5.0,0.0,4.0,0.0,0.0,5.0,4.0,0.0,5.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [14]:
# Store the wide table in CSR form; only the non-zero ratings are kept.
dense_ratings = user_to_movie_df.to_numpy()
user_to_movie_sparse_df = sp.csr_matrix(dense_ratings)
user_to_movie_sparse_df

<845x811 sparse matrix of type '<class 'numpy.float64'>'
	with 51963 stored elements in Compressed Sparse Row format>

In [15]:
# Persist the sparse user x item matrix to the interim data directory.
sparse_pickle_path = os.path.join(pp_dir, f'user2movie-{core}-core.pickle')
with open(sparse_pickle_path, 'wb') as out_handle:
    pickle.dump(user_to_movie_sparse_df, out_handle)

## Split into train/validation/test sets

In [16]:
# Pick one interaction as a spot-check record: its item is looked up before and
# after the id re-encoding below to confirm the mapping is preserved.
# Seeded with `random_state` so the same record is drawn on every
# Restart & Run All.
random_record = int_core.sample(n=1, random_state=random_state)
random_record

Unnamed: 0,user_id,item_id,rating,timestamp,num_i,num_u
6205,94,644,5,886008390,345,33.0


In [17]:
# Item id of the spot-check record, as it appears before re-encoding.
random_record['item_id'].iloc[0]

644

In [18]:
# Catalogue row for the spot-check item, matched on its original id.
spot_check_item_id = random_record['item_id'].values[0]
item_df.loc[item_df['index'] == spot_check_item_id]

Unnamed: 0,index,movie title,release date,unknown,Action,Adventure,Animation,Children's,Comedy,Crime,...,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
643,644,"Thin Blue Line, The (1988)",01-Jan-1988,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [19]:
# Re-encode user and item ids as contiguous integers starting at 0.
le_user = LabelEncoder()
le_item = LabelEncoder()

# The item encoder is fitted on the catalogue ids and then reused for the
# interactions, so both frames share one item numbering. `transform` raises if
# an interaction refers to an item id the encoder has not seen.
#
# `assign` builds new frames instead of writing columns into `item_df` /
# `int_core`, which are filtered slices of earlier frames; writing into them
# in place raises SettingWithCopyWarning.
item_df = item_df.assign(index=le_item.fit_transform(item_df['index']))
int_core = (
    int_core
    .assign(
        user_id=le_user.fit_transform(int_core['user_id']),
        item_id=le_item.transform(int_core['item_id']),
    )
    # The helper count columns were only needed for the n-core filter.
    .drop(['num_i', 'num_u'], axis=1)
    .reset_index(drop=True)
)

In [20]:
# Map the spot-check record's original item id through the fitted item encoder.
encoded_item_ids = le_item.transform(random_record['item_id'].values)
transfromed_reandom_id = encoded_item_ids[0]
transfromed_reandom_id

540

In [21]:
# Catalogue row for the spot-check item, matched on its re-encoded id.
item_df.loc[item_df['index'] == transfromed_reandom_id]

Unnamed: 0,index,movie title,release date,unknown,Action,Adventure,Animation,Children's,Comedy,Crime,...,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
643,540,"Thin Blue Line, The (1988)",01-Jan-1988,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


Yay, same movie: the re-encoded id still points to the same title as the original id.

In [22]:
# Save the filtered, re-encoded catalogue. The file name has no placeholders,
# so a plain string literal replaces the f-string.
item_df.to_csv(os.path.join(pp_dir, 'item-library.csv'))

In [23]:
# Preview the interactions after id re-encoding and helper-column removal.
int_core.head(n=5)

Unnamed: 0,user_id,item_id,rating,timestamp
0,264,408,4,884182806
1,224,399,5,891628467
2,253,749,5,879781125
3,178,201,5,876042340
4,108,341,5,879270459


In [24]:
# Perform the train/validation/test split.
# Shuffle every interaction once (seeded), then cut the shuffled frame at the
# cumulative split fractions. Plain `iloc` slices replace `np.split`, which on
# a DataFrame goes through the deprecated `DataFrame.swapaxes` and emits a
# FutureWarning; the resulting partitions are the same.
shuffled = int_core.sample(frac=1, replace=False, random_state=random_state)
n_total = len(shuffled)
train_end = int(train_p * n_total)
val_end = int((train_p + val_p) * n_total)

train_df = shuffled.iloc[:train_end]
val_df = shuffled.iloc[train_end:val_end]
test_df = shuffled.iloc[val_end:]
train_df.shape[0], val_df.shape[0], test_df.shape[0]

  return bound(*args, **kwds)


(36374, 5196, 10393)

In [25]:
# Preview the training split; the index still carries the pre-shuffle row labels.
train_df.head(n=5)

Unnamed: 0,user_id,item_id,rating,timestamp
40158,354,513,4,875745631
20601,531,158,5,882956761
23791,514,157,4,885844395
41313,779,693,5,888478882
15089,237,227,4,875742437


In [26]:
# Write each split to its own CSV in the interim data directory.
split_outputs = {
    f'train-{core}-core.csv': train_df,
    f'val-{core}-core.csv': val_df,
    f'test-{core}-core.csv': test_df,
}
for split_filename, split_frame in split_outputs.items():
    split_frame.to_csv(os.path.join(pp_dir, split_filename))