### Setup

In [1]:
import boto3
import sys
import io
import re
import csv
import os
import pandas as pd
import numpy as np
from botocore import UNSIGNED
from botocore.client import Config
if sys.version_info[0] < 3: 
    from StringIO import StringIO # Python 2.x
else:
    from io import StringIO # Python 3.x

# Name of the bucket that holds the datasets used below
bucket_name = 'moviereview.data'
# Requests are sent unsigned (required for public users, who have no credentials to sign with)
unsigned_config = Config(signature_version=UNSIGNED)
# Despite its name, `resource` is a low-level S3 *client*, created for the bucket's region
resource = boto3.client('s3', region_name='us-west-1', config=unsigned_config)

In [2]:
# Ask S3 for the bucket listing, then display the per-object metadata records
bucket_listing = resource.list_objects(Bucket=bucket_name)
bucket_listing.get("Contents")

[{'Key': 'data/',
  'LastModified': datetime.datetime(2022, 11, 16, 0, 31, 54, tzinfo=tzutc()),
  'ETag': '"d41d8cd98f00b204e9800998ecf8427e"',
  'Size': 0,
  'StorageClass': 'STANDARD'},
 {'Key': 'data/doc_vecs.npy',
  'LastModified': datetime.datetime(2022, 11, 16, 0, 32, 58, tzinfo=tzutc()),
  'ETag': '"1a9578ad55bd18a26e2e84c4e4d3f885-4"',
  'Size': 54568064,
  'StorageClass': 'STANDARD'},
 {'Key': 'data/otitle_vecs.npy',
  'LastModified': datetime.datetime(2022, 11, 16, 0, 32, 58, tzinfo=tzutc()),
  'ETag': '"7a4898679335ebd6f63baee13a069d62-329"',
  'Size': 5642950784,
  'StorageClass': 'STANDARD'},
 {'Key': 'data/ptitle_vecs.npy',
  'LastModified': datetime.datetime(2022, 11, 16, 0, 32, 58, tzinfo=tzutc()),
  'ETag': '"d5e1213e1a16991f9293599c00ebdb14-329"',
  'Size': 5642950784,
  'StorageClass': 'STANDARD'},
 {'Key': 'imdb/name.basics.tsv',
  'LastModified': datetime.datetime(2022, 9, 23, 20, 52, 9, tzinfo=tzutc()),
  'ETag': '"386b3a6a4e2041681adab177d3a3e6fe-42"',
  'Size': 

### IMDB Dataset

In [3]:
# Download the IMDb title table from the bucket
object_key = 'imdb/title.basics.tsv'
csv_obj_imdb = resource.get_object(Key=object_key, Bucket=bucket_name)
# The response is dict-like; the file contents are exposed as a stream under 'Body'
body = csv_obj_imdb['Body']
# Pull the entire stream into memory as raw bytes
csv_read_imdb = body.read()
# If a file-like object is needed instead of raw bytes, wrap them:
#   binary buffer: io.BytesIO(csv_read_imdb)
#   text buffer:   StringIO(csv_read_imdb.decode("ISO-8859-1"))

In [4]:
# Build the IMDb DataFrame by hand: keep only the rows whose title type is one of
# the kinds listed in the pattern, and split every kept row on tabs.
patrn = "^tt[0-9]*\t(movie|short|tvSeries|tvShort|tvMovie|tvSpecial|video|tvMiniSeries)\t"
wanted_row = re.compile(patrn)
data = csv_read_imdb.decode("ISO-8859-1").splitlines()  # one entry per line of the file
header = data[0]
# NOTE(review): the first character of the header and of every kept row is sliced
# off here. The pattern above requires kept rows to start with `tt`, so the first
# field is left with a single leading `t`, and the first column name loses its
# first letter as well. Confirm the shortened ids / column name are intended;
# the behavior is deliberately left unchanged.
rows = [line[1:].split('\t') for line in data[1:] if wanted_row.search(line)]

imdb = pd.DataFrame(rows, columns=header[1:].split('\t'))


def _numeric_or_nan(value):
    """Return `value` untouched when it is an all-numeric string, otherwise NaN."""
    return value if value.isnumeric() else np.nan


# Rows with a missing startYear are removed first, so the numeric check below
# is only applied to actual values in that column.
imdb.dropna(subset="startYear", inplace=True)
# Non-numeric year entries become missing values; nullable Int64 keeps the
# remaining years as integers alongside those gaps.
for year_col in ("startYear", "endYear"):
    imdb[year_col] = imdb[year_col].apply(_numeric_or_nan).astype("Int64")

In [5]:
# Preview the first five rows of the cleaned IMDb table
imdb.head(n=5)

Unnamed: 0,const,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
0,t0000001,short,Carmencita,Carmencita,0,1894,,1,"Documentary,Short"
1,t0000002,short,Le clown et ses chiens,Le clown et ses chiens,0,1892,,5,"Animation,Short"
2,t0000003,short,Pauvre Pierrot,Pauvre Pierrot,0,1892,,4,"Animation,Comedy,Romance"
3,t0000004,short,Un bon bock,Un bon bock,0,1892,,12,"Animation,Short"
4,t0000005,short,Blacksmith Scene,Blacksmith Scene,0,1893,,1,"Comedy,Short"


### Netflix data

In [6]:
# Download the Netflix title list from the bucket
object_key = "netflix/movie_titles.csv"
csv_obj_ntfx = resource.get_object(Key=object_key, Bucket=bucket_name)
# The response is dict-like; the file contents are exposed as a stream under 'Body'
body = csv_obj_ntfx['Body']
# Read the whole stream, then wrap the raw bytes in an in-memory binary buffer
# that the CSV parser in the next cell reads from
csv_read_ntfx = body.read()
ntfx_bytesio = io.BytesIO(csv_read_ntfx)

In [7]:
def manual_sep(old_split):
    """Fold the surplus fields of an over-split row back into its last field.

    Args:
        old_split: list of string fields produced by splitting one line.

    Returns:
        A new list holding the first two fields unchanged, followed by one
        field made of all remaining pieces joined back together with commas.
    """
    leading_fields = old_split[:2]
    rejoined_tail = ",".join(old_split[2:])
    return leading_fields + [rejoined_tail]

# Parse the Netflix title list. The file has no header row, so the three column
# names are supplied here. Lines that split into more than three fields
# (presumably titles that themselves contain commas) are handed to `manual_sep`,
# which folds the extra pieces back into the last field; a callable
# `on_bad_lines` is only supported by the python parsing engine.
#
# Rewind first: the buffer is created in an earlier cell, and a previous run of
# this cell leaves it positioned at end-of-stream, so re-running without the
# rewind would fail with an empty-data error.
ntfx_bytesio.seek(0)
ntfx = pd.read_csv(ntfx_bytesio,
                   encoding = "ISO-8859-1",
                   header = None,
                   names = ['Movie_Id', 'Year', 'Name'],
                   on_bad_lines=manual_sep,
                   engine='python')

In [8]:
# Preview the first five rows of the Netflix title table
ntfx.head(n=5)

Unnamed: 0,Movie_Id,Year,Name
0,1,2003.0,Dinosaur Planet
1,2,2004.0,Isle of Man TT 2004 Review
2,3,1997.0,Character
3,4,1994.0,Paula Abdul's Get Up & Dance
4,5,2004.0,The Rise and Fall of ECW


### TMDB data

### Vectorization data

In [14]:
# Look at the "Connecting Netflix & IMDB Data.ipynb" notebook for more info on
# what these vectors mean.
# WARNING: the two IMDB arrays are ~5GB each, so ~10GB of free memory is needed
# just to hold the loaded arrays, plus headroom for the raw download of the file
# that is currently being loaded.
netflix_vecs_object_key = "data/doc_vecs.npy"
imdb_otitle_vecs_object_key = "data/otitle_vecs.npy"
imdb_ptitle_vecs_object_key = "data/ptitle_vecs.npy"


def _load_npy_from_bucket(object_key):
    """Download one `.npy` object from the bucket and return it as a NumPy array.

    Uses the notebook-level `resource` client and `bucket_name`.

    The raw download is held only in a local variable, so that buffer can be
    released as soon as this function returns instead of lingering in a
    notebook-level name for the rest of the session.

    Args:
        object_key: key of the `.npy` object inside the bucket.

    Returns:
        The array stored in that object, as returned by `np.load`.
    """
    raw_bytes = resource.get_object(Bucket=bucket_name, Key=object_key)['Body'].read()
    return np.load(file=io.BytesIO(raw_bytes))


# netflix title vectors (~50MB):
netflix_vecs = _load_npy_from_bucket(netflix_vecs_object_key)
# original-name IMDB title vectors (~5GB):
otitle_vecs = _load_npy_from_bucket(imdb_otitle_vecs_object_key)
# primary-name IMDB title vectors (~5GB):
ptitle_vecs = _load_npy_from_bucket(imdb_ptitle_vecs_object_key)