### Setup

In [2]:
import boto3
import sys
import io
import re
import csv
import os
import pandas as pd
import numpy as np
from botocore import UNSIGNED
from botocore.client import Config
if sys.version_info[0] < 3: 
    from StringIO import StringIO # Python 2.x
else:
    from io import StringIO # Python 3.x

# Get our client with bucket region. Set config to an unsigned signiture (req for public users)
resource = boto3.client('s3', config=Config(signature_version=UNSIGNED), region_name='us-west-1')
# Get the bucket name
bucket_name = 'moviereview.data'

In [3]:
# Get list of objects in bucket
resource.list_objects(Bucket=bucket_name).get("Contents")

[{'Key': 'data/',
  'LastModified': datetime.datetime(2022, 11, 16, 0, 31, 54, tzinfo=tzutc()),
  'ETag': '"d41d8cd98f00b204e9800998ecf8427e"',
  'Size': 0,
  'StorageClass': 'STANDARD'},
 {'Key': 'data/NetflixJoinedNLP.csv',
  'LastModified': datetime.datetime(2022, 11, 17, 8, 59, 32, tzinfo=tzutc()),
  'ETag': '"6568956b9c7797a27fb4a32a29806e2e"',
  'Size': 1379204,
  'StorageClass': 'STANDARD'},
 {'Key': 'data/doc_vecs.npy',
  'LastModified': datetime.datetime(2022, 11, 16, 0, 32, 58, tzinfo=tzutc()),
  'ETag': '"1a9578ad55bd18a26e2e84c4e4d3f885-4"',
  'Size': 54568064,
  'StorageClass': 'STANDARD'},
 {'Key': 'data/otitle_vecs.npy',
  'LastModified': datetime.datetime(2022, 11, 16, 0, 32, 58, tzinfo=tzutc()),
  'ETag': '"7a4898679335ebd6f63baee13a069d62-329"',
  'Size': 5642950784,
  'StorageClass': 'STANDARD'},
 {'Key': 'data/ptitle_vecs.npy',
  'LastModified': datetime.datetime(2022, 11, 16, 0, 32, 58, tzinfo=tzutc()),
  'ETag': '"d5e1213e1a16991f9293599c00ebdb14-329"',
  'Size': 

### IMDB Dataset

In [3]:
object_key = 'imdb/title.basics.tsv'
csv_obj_imdb = resource.get_object(Bucket=bucket_name, Key=object_key)
# csv_obj_imdb is a json with the data in 'Body'
body = csv_obj_imdb['Body']
csv_read_imdb = body.read()

In [4]:
# Custom df conversion
patrn = "^tt[0-9]*\t(movie|short|tvSeries|tvShort|tvMovie|tvSpecial|video|tvMiniSeries)\t"
data = csv_read_imdb.decode("ISO-8859-1").splitlines() # Gets list of each line in data
rows = []
header = data[0]
for line in data[1:]:
    # If regex pattern matches, add to the string
    if re.search(patrn, line):
        # There is a `t` at the begining of each line
        rows += [line[1:].split('\t')]

imdb = pd.DataFrame(rows, columns=header[1:].split('\t'))

imdb.dropna(subset="startYear", inplace=True)
imdb["startYear"] = imdb.startYear.apply(lambda x: x if x.isnumeric() else np.nan)
imdb["endYear"] = imdb.endYear.apply(lambda x: x if x.isnumeric() else np.nan)
imdb['endYear'] = imdb['endYear'].astype("Int64")
imdb['startYear'] = imdb['startYear'].astype("Int64")

In [5]:
imdb.head()

Unnamed: 0,const,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
0,t0000001,short,Carmencita,Carmencita,0,1894,,1,"Documentary,Short"
1,t0000002,short,Le clown et ses chiens,Le clown et ses chiens,0,1892,,5,"Animation,Short"
2,t0000003,short,Pauvre Pierrot,Pauvre Pierrot,0,1892,,4,"Animation,Comedy,Romance"
3,t0000004,short,Un bon bock,Un bon bock,0,1892,,12,"Animation,Short"
4,t0000005,short,Blacksmith Scene,Blacksmith Scene,0,1893,,1,"Comedy,Short"


### Netflix data

In [4]:
object_key = "netflix/movie_titles.csv"
csv_obj_ntfx = resource.get_object(Bucket=bucket_name, Key=object_key)
# csv_obj_ntfx is a json with the data in 'Body'
body = csv_obj_ntfx['Body']
csv_read_ntfx = body.read()
ntfx_bytesio = io.BytesIO(csv_read_ntfx)

In [5]:
# IMPORT NETFLIX MOVIE NAMES
def manual_sep(old_split):
    new_split = old_split[0:2] + [",".join(old_split[2:])]
    return new_split

ntfx = pd.read_csv(ntfx_bytesio,
                   encoding = "ISO-8859-1",
                   header = None,
                   names = ['Movie_Id', 'Year', 'Name'],
                   on_bad_lines=manual_sep,
                   engine='python')
ntfx.dropna(subset='Year', inplace=True)
ntfx['Year'] = ntfx['Year'].astype("Int64")
print("Netflix Movie Names:")
print(f'{ntfx.shape = }')
print(ntfx.head().to_string())

Netflix Movie Names:
ntfx.shape = (17763, 3)
   Movie_Id  Year                          Name
0         1  2003               Dinosaur Planet
1         2  2004    Isle of Man TT 2004 Review
2         3  1997                     Character
3         4  1994  Paula Abdul's Get Up & Dance
4         5  2004      The Rise and Fall of ECW


In [None]:
# IMPORT NETFLIX MOVIE RATINGS
netflix_movie_ratings = [f'netflix/combined_data_{i}.txt' for i in range(1, 5)]
stream = StringIO()
movie_number = "1"
for object_key in netflix_movie_ratings:
    print(f'getting file {object_key}')
    csv_obj_ntfx = resource.get_object(Bucket=bucket_name, Key=object_key)
    body = csv_obj_ntfx['Body']
    csv_read_ntfx = body.read()
    data = csv_read_ntfx.decode("ISO-8859-1").splitlines()
    patrn = "[0-9]:"
    for line in data:
        if re.search(patrn, line):
            movie_num = line.replace(":\n", "")
        else:
            stream.write(movie_num+","+line)
    del(csv_read_ntfx)
    del(data)
stream.seek(0)
print("reading done")
ratings = pd.read_csv(stream,
                      encoding = "ISO-8859-1",
                      names = ['Movie_Id', 'CustomerID', 'Rating', 'Date'],
                      engine='c')
stream.close()
del(stream)
print(f'{ratings.shape = }')
print(ratings.head().to_string())

getting file netflix/combined_data_1.txt
getting file netflix/combined_data_2.txt
getting file netflix/combined_data_3.txt
getting file netflix/combined_data_4.txt
reading done


In [None]:
ratings.head()

### TMDB data

### Vectorization data

In [14]:
# Look at "Connecting Netflix & IMDB Data.ipynb" notebook for
# more info on what these vectors mean. WARNING: Need ~10GB
# need ~10GB of free memory to load in all 3 vectors.
netflix_vecs_object_key = "data/doc_vecs.npy"
imdb_otitle_vecs_object_key = "data/otitle_vecs.npy"
imdb_ptitle_vecs_object_key = "data/ptitle_vecs.npy"
# netflix title vectors (~50MB):
netflix_vecs_npy_obj = resource.get_object(Bucket=bucket_name, Key=netflix_vecs_object_key)
body = netflix_vecs_npy_obj['Body']
body_read = body.read()
netflix_vecs = np.load(file=io.BytesIO(body_read))
# original-name IMDB title vectors (~5GB):
otitle_vecs_npy_obj = resource.get_object(Bucket=bucket_name, Key=imdb_otitle_vecs_object_key)
body = otitle_vecs_npy_obj['Body']
body_read = body.read()
otitle_vecs = np.load(file=io.BytesIO(body_read))
# primary-name IMDB title vectors (~5GB):
ptitle_vecs_npy_obj = resource.get_object(Bucket=bucket_name, Key=imdb_ptitle_vecs_object_key)
body = ptitle_vecs_npy_obj['Body']
body_read = body.read()
ptitle_vecs = np.load(file=io.BytesIO(body_read))

### Connected Dataset

In [6]:
# nlp dataset
object_key = "data/NetflixJoinedNLP.csv"
csv_obj_conn = resource.get_object(Bucket=bucket_name, Key=object_key)
# csv_obj_ntfx is a json with the data in 'Body'
body = csv_obj_conn['Body']
csv_read_conn = body.read()
conn_bytesio = io.BytesIO(csv_read_conn)
conn = pd.read_csv(conn_bytesio)
conn

Unnamed: 0.1,Unnamed: 0,Movie_Id,Year,Name,idx,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
0,0,1,2003,Dinosaur Planet,tt0389605,tt0389605,tvMiniSeries,Dinosaur Planet,Dinosaur Planet,0,2003,,48,"Animation,Documentary,Family"
1,1,3,1997,Character,tt0119448,tt0119448,movie,Character,Karakter,0,1997,,122,"Crime,Drama,Mystery"
2,2,6,1997,Sick,tt0120126,tt0120126,movie,Sick,"Sick: The Life & Death of Bob Flanagan, Superm...",0,1997,,90,Documentary
3,3,7,1992,8 Man,tt0182668,tt0182668,movie,8 Man,Eitoman - Subete no sabishii yoru no tame ni,0,1992,,83,"Action,Sci-Fi"
4,4,8,2004,What the #$*! Do We Know!?,tt0399877,tt0399877,movie,What the #$*! Do We (K)now!?,What the #$*! Do We (K)now!?,0,2004,,109,"Comedy,Documentary,Drama"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11177,11177,17763,1978,Interiors,tt0077742,tt0077742,movie,Interiors,Interiors,0,1978,,92,Drama
11178,11178,17764,1998,Shakespeare in Love,tt0138097,tt0138097,movie,Shakespeare in Love,Shakespeare in Love,0,1998,,123,"Comedy,Drama,History"
11179,11179,17768,2000,Epoch,tt0233657,tt0233657,tvMovie,Epoch,Epoch,0,2001,,96,"Sci-Fi,Thriller"
11180,11180,17769,2003,The Company,tt0335013,tt0335013,movie,The Company,The Company,0,2003,,112,"Drama,Music,Romance"
