### Setup

In [7]:
import boto3
import sys
import io
import re
import csv
import pandas as pd
import numpy as np
from botocore import UNSIGNED
from botocore.client import Config
if sys.version_info[0] < 3: 
    from StringIO import StringIO # Python 2.x
else:
    from io import StringIO # Python 3.x

# Get our client with bucket region. Set config to an unsigned signiture (req for public users)
resource = boto3.client('s3', config=Config(signature_version=UNSIGNED), region_name='us-west-1')
# Get the bucket name
bucket_name = 'moviereview.data'

In [2]:
# Get list of objects in bucket
resource.list_objects(Bucket=bucket_name).get("Contents")

[{'Key': 'imdb/name.basics.tsv',
  'LastModified': datetime.datetime(2022, 9, 23, 20, 52, 9, tzinfo=tzutc()),
  'ETag': '"386b3a6a4e2041681adab177d3a3e6fe-42"',
  'Size': 712778336,
  'StorageClass': 'STANDARD'},
 {'Key': 'imdb/name.basics.tsv.gz',
  'LastModified': datetime.datetime(2022, 9, 23, 20, 52, 9, tzinfo=tzutc()),
  'ETag': '"2a86674377ff4a6b87e385648015d97e-14"',
  'Size': 232985457,
  'StorageClass': 'STANDARD'},
 {'Key': 'imdb/title.akas.tsv',
  'LastModified': datetime.datetime(2022, 9, 23, 20, 52, 9, tzinfo=tzutc()),
  'ETag': '"2e06bfff7437fed7934d5f2ba152eceb-97"',
  'Size': 1649681155,
  'StorageClass': 'STANDARD'},
 {'Key': 'imdb/title.akas.tsv.gz',
  'LastModified': datetime.datetime(2022, 9, 23, 20, 52, 9, tzinfo=tzutc()),
  'ETag': '"cf7d1828c331be26585f22d656f8d866-17"',
  'Size': 281026780,
  'StorageClass': 'STANDARD'},
 {'Key': 'imdb/title.basics.tsv',
  'LastModified': datetime.datetime(2022, 9, 23, 20, 52, 9, tzinfo=tzutc()),
  'ETag': '"15b8595c466b46227bb1

### IMDB Dataset

In [3]:
object_key = 'imdb/title.basics.tsv'
csv_obj_imdb = resource.get_object(Bucket=bucket_name, Key=object_key)
# csv_obj_imdb is a json with the data in 'Body'
body = csv_obj_imdb['Body']
csv_read_imdb = body.read()
# Read into buffer IO
# imdb_bytesio = io.BytesIO(csv_read_imdb)
# Read into string IO
# imdb_stringio = StringIO(csv_read_imdb.decode("ISO-8859-1"))

In [40]:
# Custom df conversion
patrn = "^tt[0-9]*\t(movie|short|tvSeries|tvShort|tvMovie|tvSpecial|video|tvMiniSeries)\t"
data = csv_read_imdb.decode("ISO-8859-1").splitlines() # Gets list of each line in data
rows = []
header = data[0]
for line in data[1:]:
    # If regex pattern matches, add to the string
    if re.search(patrn, line):
        # There is a `t` at the begining of each line
        rows += [line[1:].split('\t')]

imdb = pd.DataFrame(rows, columns=header[1:].split('\t'))

imdb = imdb[imdb.startYear.dropna(subset='startYear', inplace=False).apply(lambda x: x.isnumeric())]
imdb["endYear"] = imdb.endYear.apply(lambda x: x if x.isnumeric() else np.nan)
imdb['endYear'] = imdb['endYear'].astype("Int64")
imdb['startYear'] = imdb['startYear'].astype("Int64")

TypeError: dropna() got an unexpected keyword argument 'subset'

In [None]:
imdb.head()

### Netflix data

In [4]:
object_key = "netflix/movie_titles.csv"
csv_obj_ntfx = resource.get_object(Bucket=bucket_name, Key=object_key)
# csv_obj_ntfx is a json with the data in 'Body'
body = csv_obj_ntfx['Body']
csv_read_ntfx = body.read()
ntfx_bytesio = io.BytesIO(csv_read_ntfx)

In [5]:
def manual_sep(old_split):
    new_split = old_split[0:2] + [",".join(old_split[2:])]
    return new_split

ntfx = pd.read_csv(ntfx_bytesio,
                   encoding = "ISO-8859-1",
                   header = None,
                   names = ['Movie_Id', 'Year', 'Name'],
                   on_bad_lines=manual_sep,
                   engine='python')

In [6]:
ntfx.head()

Unnamed: 0,Movie_Id,Year,Name
0,1,2003.0,Dinosaur Planet
1,2,2004.0,Isle of Man TT 2004 Review
2,3,1997.0,Character
3,4,1994.0,Paula Abdul's Get Up & Dance
4,5,2004.0,The Rise and Fall of ECW
