In [1]:
import pandas as pd
from collections import deque
from pandas.api.types import CategoricalDtype
from scipy.sparse import csr_matrix
from sklearn.model_selection import train_test_split
import numpy as np
from tqdm import tqdm
from joblib import load, dump

import os
import gc

In [2]:
np.random.seed(42)

In [3]:
PATH = '../data/netflix/'

In [5]:
if not os.path.isfile(os.path.join(PATH, 'ratings.csv')):
    data = open(os.path.join(PATH, 'ratings.csv'), mode='w')

    row = list()
    files = ['combined_data_1.txt', 'combined_data_2.txt', 'combined_data_3.txt', 'combined_data_4.txt']
    for file in files:
        print('reading ratings from {}...'.format(file))
        with open(os.path.join(PATH + file)) as f:
            for line in f:
                del row[:]
                line = line.strip()
                if line.endswith(':'):
                    # all are rating
                    movid_id = line.replace(':', '')
                else:
                    row = [x for x in line.split(',')]
                    row.insert(0, movid_id)
                    data.write(','.join(row))
                    data.write('\n')
        print('Done.\n')
    data.close()

reading ratings from combined_data_1.txt...
Done.

reading ratings from combined_data_2.txt...
Done.

reading ratings from combined_data_3.txt...
Done.

reading ratings from combined_data_4.txt...
Done.



In [6]:
ratings = pd.read_csv(os.path.join(PATH, 'ratings.csv'), header=None)

In [7]:
ratings.columns = ['userId', 'movieId', 'rating', 'timestamp']
ratings.head(5)

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1488844,3,2005-09-06
1,1,822109,5,2005-05-13
2,1,885013,4,2005-10-19
3,1,30878,4,2005-12-26
4,1,823519,3,2004-05-03


Indices must be coverted to the series.

In [7]:
movie_id_mapper = dict(zip(np.unique(ratings['movieId']), range(len(ratings['movieId']))))
user_id_mapper = dict(zip(np.unique(ratings['userId']), range(len(ratings['userId']))))

In [8]:
ratings['movieId'] = ratings['movieId'].apply(lambda x: movie_id_mapper[x]) # convert movie indices to sequential order

In [9]:
ratings['userId'] = ratings['userId'].apply(lambda x: user_id_mapper[x]) # convert movie indices to sequential order

Later we will need to compare timestamps in order to select the most recent interaction.

In [10]:
ratings['timestamp'] = pd.to_datetime(ratings['timestamp'], format='%Y-%m-%d')

In [11]:
unique_user_ids = list(user_id_mapper.values())
train_user_ids, test_user_ids = train_test_split(unique_user_ids, test_size=0.2, random_state=42, shuffle=True)

In [12]:
train_data = ratings[ratings['userId'].isin(train_user_ids)]
test_data = ratings[ratings['userId'].isin(test_user_ids)]

In [14]:
train_data.to_csv(PATH + 'train_ratings.csv', index=False)
test_data.to_csv(PATH + 'test_ratings.csv', index=False)

In [None]:
test_data = []

for user_id in tqdm(test_user_ids, position=0):
    user_ratings = ratings[ratings['userId'] == user_id]
    recent_index = user_ratings['timestamp'].idxmax()
    train_ratings = user_ratings[~user_ratings.index.isin([recent_index])]
    test_rating = user_ratings.loc[recent_index]
    control_item = test_rating['movieId']
    train_data.append(train_ratings)
    test_data.append((user_id, control_item))

In [None]:
len(test_data)

In [None]:
if not os.path.isfile(PATH + 'train_data.pickle'):
    dump(train_data, PATH + 'train_data.pickle')
    dump(test_data, PATH + 'test_data.pickle')
else:
    train_data = load(PATH + 'train_data.pickle')
    test_data = load(PATH + 'test_data.pickle')

In [None]:
train_df = pd.concat(train_data)

In [None]:
train_df

In [None]:
test_df = pd.DataFrame(test_data, columns=['userId', 'control_item'])

In [None]:
test_df

In [None]:
sparse_matrix = csr_matrix(
    (train_df['rating'], (train_df['userId'], train_df['movieId'])),
    (len(user_id_mapper.keys()), len(movie_id_mapper.keys()))
)

In [None]:
sparse_matrix

In [None]:
train_data, test_data = train_test_split(sparse_matrix, test_size=0.2, shuffle=True, random_state=42)

In [27]:
netflix_titles = pd.read_csv(PATH + 'netflix_titles.csv')

In [28]:
netflix_titles

Unnamed: 0,show_id,type,title,director,cast,country,date_added,release_year,rating,duration,listed_in,description
0,s1,TV Show,3%,,"João Miguel, Bianca Comparato, Michel Gomes, R...",Brazil,"August 14, 2020",2020,TV-MA,4 Seasons,"International TV Shows, TV Dramas, TV Sci-Fi &...",In a future where the elite inhabit an island ...
1,s2,Movie,7:19,Jorge Michel Grau,"Demián Bichir, Héctor Bonilla, Oscar Serrano, ...",Mexico,"December 23, 2016",2016,TV-MA,93 min,"Dramas, International Movies",After a devastating earthquake hits Mexico Cit...
2,s3,Movie,23:59,Gilbert Chan,"Tedd Chan, Stella Chung, Henley Hii, Lawrence ...",Singapore,"December 20, 2018",2011,R,78 min,"Horror Movies, International Movies","When an army recruit is found dead, his fellow..."
3,s4,Movie,9,Shane Acker,"Elijah Wood, John C. Reilly, Jennifer Connelly...",United States,"November 16, 2017",2009,PG-13,80 min,"Action & Adventure, Independent Movies, Sci-Fi...","In a postapocalyptic world, rag-doll robots hi..."
4,s5,Movie,21,Robert Luketic,"Jim Sturgess, Kevin Spacey, Kate Bosworth, Aar...",United States,"January 1, 2020",2008,PG-13,123 min,Dramas,A brilliant group of students become card-coun...
...,...,...,...,...,...,...,...,...,...,...,...,...
7782,s7783,Movie,Zozo,Josef Fares,"Imad Creidi, Antoinette Turk, Elias Gergi, Car...","Sweden, Czech Republic, United Kingdom, Denmar...","October 19, 2020",2005,TV-MA,99 min,"Dramas, International Movies",When Lebanon's Civil War deprives Zozo of his ...
7783,s7784,Movie,Zubaan,Mozez Singh,"Vicky Kaushal, Sarah-Jane Dias, Raaghav Chanan...",India,"March 2, 2019",2015,TV-14,111 min,"Dramas, International Movies, Music & Musicals",A scrappy but poor boy worms his way into a ty...
7784,s7785,Movie,Zulu Man in Japan,,Nasty C,,"September 25, 2020",2019,TV-MA,44 min,"Documentaries, International Movies, Music & M...","In this documentary, South African rapper Nast..."
7785,s7786,TV Show,Zumbo's Just Desserts,,"Adriano Zumbo, Rachel Khoo",Australia,"October 31, 2020",2019,TV-PG,1 Season,"International TV Shows, Reality TV",Dessert wizard Adriano Zumbo looks for the nex...


In [14]:
movies_metadata = pd.read_csv('../data/movies_metadata.csv', low_memory=False)
movies_metadata.rename(columns={'original_title': 'movie_name'}, inplace=True)

In [19]:
movie_titles = pd.read_csv(PATH + 'movie_titles.csv', encoding='ISO-8859-1', header=None, names=['movie_id', 'year', 'movie_name'], dtype={'movie_id': np.int64, 'year': np.float64, 'movie_name': np.str})
movie_titles

Unnamed: 0,movie_id,year,movie_name
0,1,2003.0,Dinosaur Planet
1,2,2004.0,Isle of Man TT 2004 Review
2,3,1997.0,Character
3,4,1994.0,Paula Abdul's Get Up & Dance
4,5,2004.0,The Rise and Fall of ECW
...,...,...,...
17765,17766,2002.0,Where the Wild Things Are and Other Maurice Se...
17766,17767,2004.0,Fidel Castro: American Experience
17767,17768,2000.0,Epoch
17768,17769,2003.0,The Company


In [28]:
len(movies_metadata['movie_name']), len(movie_titles['movie_name'])

(45466, 17770)

In [25]:
intersection = np.isin(movies_metadata['movie_name'], movie_titles['movie_name'])

In [27]:
intersection.sum()

8488

In [48]:
np.unique(intersection_movies).shape

(7309,)

In [38]:
intersection_movies = movies_metadata[intersection]['movie_name']

In [39]:
imdb = pd.read_csv('../data/imdb_movies.csv', low_memory=False)

In [31]:
imdb_intersection = np.isin(imdb['original_title'], movie_titles['movie_name'])

In [32]:
len(imdb['original_title']), len(movie_titles['movie_name'])

(85855, 17770)

In [40]:
imdb_intersection_movies = imdb[imdb_intersection]['original_title']

In [41]:
union = np.union1d(imdb_intersection_movies, intersection_movies)

In [53]:
union

array(["'Round Midnight", '...And Justice for All', '10', ..., 'Zorro',
       'Zubeidaa', 'Zulu'], dtype=object)