### Libraries

In [5]:
import pandas as pd
from pandas import json_normalize

import requests 
import json
import os
from dotenv import load_dotenv 

import time

import re

from bs4 import BeautifulSoup as bs
from fuzzywuzzy import fuzz



### Setting api key


In [6]:
load_dotenv()

True

In [7]:
key = os.getenv("api-key")

In [8]:
url = f'https://imdb-api.com/API/MostPopularMovies/{key}'

In [9]:
res = requests.get(url)

In [10]:
res

<Response [200]>

In [11]:
res.json()

{'items': [{'id': 'tt1630029',
   'rank': '1',
   'rankUpDown': '+1',
   'title': 'Avatar: The Way of Water',
   'fullTitle': 'Avatar: The Way of Water (2022)',
   'year': '2022',
   'image': 'https://m.media-amazon.com/images/M/MV5BYjhiNjBlODctY2ZiOC00YjVlLWFlNzAtNTVhNzM1YjI1NzMxXkEyXkFqcGdeQXVyMjQxNTE1MDA@._V1_Ratio0.7015_AL_.jpg',
   'crew': 'James Cameron (dir.), Sam Worthington, Zoe Saldana',
   'imDbRating': '7.8',
   'imDbRatingCount': '282229'},
  {'id': 'tt9764362',
   'rank': '2',
   'rankUpDown': '-1',
   'title': 'The Menu',
   'fullTitle': 'The Menu (2022)',
   'year': '2022',
   'image': 'https://m.media-amazon.com/images/M/MV5BMzdjNjI5MmYtODhiNS00NTcyLWEzZmUtYzVmODM5YzExNDE3XkEyXkFqcGdeQXVyMTAyMjQ3NzQ1._V1_Ratio0.6716_AL_.jpg',
   'crew': 'Mark Mylod (dir.), Ralph Fiennes, Anya Taylor-Joy',
   'imDbRating': '7.2',
   'imDbRatingCount': '213760'},
  {'id': 'tt10640346',
   'rank': '3',
   'rankUpDown': '+1',
   'title': 'Babylon',
   'fullTitle': 'Babylon (2022)',
   'yea

##### *Note:* This is an unoficial IMDB api and as such provides limited options. Web Scrapping of the Netflix page in IMDB offered more possibilities data extraction.   

### Web Scrapping - IMDB Netflix List

In [12]:
url_imdb = 'https://www.imdb.com/search/title/?companies=co0144901'
headers = {'Accept-Language': 'en-US,en;q=0.5'}
html = requests.get(url_imdb, headers=headers)

In [13]:
soup = bs(html.content, 'html.parser')

In [14]:
titles = soup.find_all("div", attrs = {"class":"psf-widget"})

In [15]:
title = soup.select("div h3 a")
title_ = [i.getText().strip() for i in title]
title_

['You People',
 'Lockwood & Co',
 'Wednesday',
 "That '90s Show",
 'The Walking Dead',
 'Ginny & Georgia',
 'Happy Valley',
 'Glass Onion',
 'Breaking Bad',
 'All Quiet on the Western Front',
 'The Pale Blue Eye',
 'Murder Mystery 2',
 'Outer Banks',
 'Better Call Saul',
 "That '70s Show",
 'The Snow Girl',
 "Grey's Anatomy",
 'Vikings: Valhalla',
 'Stranger Things',
 'Bullet Train',
 'Fauda',
 'Cunk on Earth',
 'New Amsterdam',
 "Narvik: Hitler's First Defeat",
 'Teen Wolf',
 'Supernatural',
 'Kaleidoscope',
 'NCIS',
 'Vikings',
 'Peaky Blinders',
 'The Crown',
 'Friends',
 'You',
 'The Blacklist',
 'The Super Mario Bros. Movie',
 'Modern Family',
 'Women at War',
 'Attack on Titan',
 'Alice in Borderland',
 'Blonde',
 'Suits',
 'Seinfeld',
 "Guillermo del Toro's Pinocchio",
 'The Recruit',
 'Rick and Morty',
 'American Horror Story',
 'Top Gun',
 'Black Mirror',
 'Shameless',
 'Devotion']

In [16]:
title[0].get('href')

'/title/tt14826022/'

In [17]:
genre = soup.select("span.genre")
genre_ = [i.getText().strip() for i in genre]
genre_

['Comedy, Romance',
 'Action, Adventure, Drama',
 'Comedy, Crime, Fantasy',
 'Comedy, Drama, Romance',
 'Drama, Horror, Thriller',
 'Comedy, Drama',
 'Crime, Drama, Thriller',
 'Comedy, Crime, Drama',
 'Crime, Drama, Thriller',
 'Action, Drama, War',
 'Crime, Horror, Mystery',
 'Action, Comedy, Crime',
 'Action, Crime, Drama',
 'Crime, Drama',
 'Comedy, Drama, Romance',
 'Crime, Drama, Mystery',
 'Drama, Romance',
 'Action, Adventure, Drama',
 'Drama, Fantasy, Horror',
 'Action, Comedy, Thriller',
 'Action, Drama, Thriller',
 'Documentary, Comedy',
 'Drama',
 'Drama, History, War',
 'Action, Drama, Fantasy',
 'Drama, Fantasy, Horror',
 'Action, Crime, Drama',
 'Action, Crime, Drama',
 'Action, Adventure, Drama',
 'Crime, Drama',
 'Biography, Drama, History',
 'Comedy, Romance',
 'Crime, Drama, Romance',
 'Crime, Drama, Mystery',
 'Animation, Adventure, Comedy',
 'Comedy, Drama, Romance',
 'Drama, History',
 'Animation, Action, Adventure',
 'Action, Drama, Mystery',
 'Biography, Drama, 

In [18]:
title = soup.find_all('h3', {'class':"lister-item-header"})[0]
title

<h3 class="lister-item-header">
<span class="lister-item-index unbold text-primary">1.</span>
<a href="/title/tt14826022/">You People</a>
<span class="lister-item-year text-muted unbold">(2023)</span>
</h3>

In [19]:
genres = soup.find_all('p', {'class':"genre"})
genres

[]

In [20]:
def scrape_imdb(n):
   
    title = []
    genre = []
    rating = []
    
    try:
        for i in range(1, n+1):
            url = f'https://www.imdb.com/search/title/?companies=co0144901&start={i}&ref_=adv_nxt'
            headers = {'Accept-Language': 'en-US,en;q=0.5'}

            response = requests.get(url, headers=headers)
            soup = bs(response.text, 'html.parser')

            # find all movie titles in the page
            titles = [a.text for a in soup.find_all('h3', {'class':"lister-item-header"})][0]

            # find genre 
            genres = [a.text for a in soup.find_all('span', {'class':"genre"})][0]

            # find type
            ratings = [a.text for a in soup.find_all('div', {'class':"inline-block ratings-imdb-rating"})][0]

            # add the titles to the results list
            title.append(titles)
            genre.append(genres)
            rating.append(ratings)
            
            #ensure they are all the same length
           # min_len = min(len(titles), len(genres), len(ratings))
           # titles = titles[:min_len]
          #  genres = genres[:min_len]
          #  ratings = ratings[:min_len]

        # create a DataFrame from the results
        df_imdb = pd.DataFrame({'title': title, 'genre': genre, 'rating': rating})
        
        
        return df_imdb
    
    except Exception as e:
        return print(f"Error scraping page {i}: {e}")




In [21]:
df_imdb = scrape_imdb(1000)

KeyboardInterrupt: 

In [None]:
df_imdb.describe()

Unnamed: 0,title,genre,rating
count,1000,1000,1000
unique,1000,203,57
top,\n1.\nThat '90s Show\n(2023– )\n,"\nCrime, Drama, Mystery",\n\n7.5\n
freq,1,56,41


In [None]:
df_imdb["Title_imdb"] = df_imdb["title"].str.extract(r'\n\d+\..*\n([^\n\(\)]+)\n')

In [None]:
df_imdb["genre"] = df_imdb["genre"].str.strip('\n')

In [None]:
df_imdb["rating"] = df_imdb["rating"].str.strip('\n')

In [None]:
df_imdb["rating"].dtype

dtype('O')

In [None]:
df_imdb["rating"] = df_imdb["rating"].astype(float)

In [None]:
df_imdb.sample(5)

Unnamed: 0,title,genre,rating,Title_imdb
864,\n865.\nLove in the Villa\n(2022)\n,"Comedy, Romance",5.4,Love in the Villa
110,\n111.\nSherlock\n(2010–2017)\n,"Crime, Drama, Mystery",9.1,Sherlock
627,\n628.\nThe Unforgivable\n(2021)\n,"Crime, Drama",7.1,The Unforgivable
752,\n753.\nBlack Crab\n(2022)\n,"Action, Adventure, Drama",5.7,Black Crab
469,\n470.\nI Used to Be Famous\n(2022)\n,"Comedy, Drama, Music",6.6,I Used to Be Famous


In [None]:
df_imdb.drop(['title'], axis=1, inplace=True)

In [None]:
df_imdb = df_imdb.reindex(columns=['Title_imdb', 'genre', 'rating'])

In [None]:
df_imdb.sample()

Unnamed: 0,Title_imdb,genre,rating
298,Your Place or Mine,"Comedy, Romance",7.1


In [None]:
df_imdb['Title_imdb'].describe()

count                            996
unique                           986
top       Avatar: The Last Airbender
freq                               2
Name: Title_imdb, dtype: object

In [None]:
df_imdb.shape

(1000, 3)

### CSV File - My Netflix viewing list 

In [None]:
with open("data/netflix_viewing-history.csv") as f:
    print(f)

<_io.TextIOWrapper name='data/netflix_viewing-history.csv' mode='r' encoding='cp1252'>


In [25]:
df_net = pd.read_csv("data/netflix_viewing-history.csv")
df_net.describe()

Unnamed: 0,Title,Date
count,6177,6177
unique,5857,1177
top,: Episode 1,27/11/2022
freq,7,45


In [None]:
df_net.describe()

Unnamed: 0,Title,Date
count,6177,6177
unique,5857,1177
top,: Episode 1,27/11/2022
freq,7,45


In [None]:
df_net.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6177 entries, 0 to 6176
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Title   6177 non-null   object
 1   Date    6177 non-null   object
dtypes: object(2)
memory usage: 96.6+ KB


In [None]:
df_net['Date'] = pd.to_datetime(df_net['Date'], dayfirst=True)

In [None]:
df_net["Title_net"] = df_net["Title"].str.extract(r"^([^:]+)")

In [None]:
df_net.sample(5)

Unnamed: 0,Title,Date,Title_net
1670,Mo: Season 1: Tombstone,2022-04-09,Mo
3110,Kim's Convenience: Season 2: Sneak Attack,2021-06-28,Kim's Convenience
4346,How I Met Your Mother: Season 2: Aldrin Justice,2020-09-22,How I Met Your Mother
747,I Think You Should Leave with Tim Robinson: Se...,2019-05-16,I Think You Should Leave with Tim Robinson
4164,Pan,2020-11-15,Pan


In [None]:
df_net['Date'].dtype

dtype('<M8[ns]')

In [None]:
df_net['Title_net'].describe()

count          6177
unique          832
top       Star Trek
freq            488
Name: Title_net, dtype: object

In [None]:
df_net.drop_duplicates(subset="Title_net", keep='first', inplace=True)

In [None]:
df_net['Title_net'].describe()

count               832
unique              832
top       The Godfather
freq                  1
Name: Title_net, dtype: object

In [None]:
df_net.sample(5)

Unnamed: 0,Title,Date,Title_net
5276,Dark Shadows,2019-10-23,Dark Shadows
2235,Playing with Fire,2022-09-01,Playing with Fire
161,Amy Schumer Growing,2022-03-30,Amy Schumer Growing
2607,Adventure Beast: Nature Hates Jerks,2021-10-31,Adventure Beast
5878,The Lonely Island Presents: The Unauthorized B...,2019-05-25,The Lonely Island Presents


In [None]:
df_net.drop(['Title'], axis=1, inplace=True)

### Merge IMDB and Netflix Lists 

In [None]:
def merge_dfs(df_imdb, df_net, on='Title2'):
    if on in df_imdb.columns and on in df_net.columns:
        df_net = df_net.rename(columns={ on:f'{on}_right'})
        #on = f'{on}_right'
    df_merged = pd.merge(df_imdb, df_net, on=on)
    return df_merged


In [None]:
df_net.describe()

  df_net.describe()


Unnamed: 0,Date,Title_net
count,832,832
unique,544,832
top,2020-11-14 00:00:00,The Godfather
freq,7,1
first,2017-07-30 00:00:00,
last,2023-12-01 00:00:00,


In [None]:
df_imdb['Title_imdb'].describe()

count                            996
unique                           986
top       Avatar: The Last Airbender
freq                               2
Name: Title_imdb, dtype: object

In [None]:
merge_dfs = pd.merge(df_net, df_imdb, left_on='Title_net', right_on='Title_imdb', how='left')

In [None]:
merge_dfs.sample()

Unnamed: 0,Date,Title_net,Title_imdb,genre,rating
349,2022-10-29,Happy Death Day 2U,,,


In [None]:
df_imdb['Title_imdb'].unique

<bound method Series.unique of 0        That '90s Show
1             Wednesday
2            You People
3       Ginny & Georgia
4      The Walking Dead
             ...       
995              Mirage
996            Mudbound
997        The Takeover
998     Despicable Me 2
999                 NaN
Name: Title_imdb, Length: 1000, dtype: object>

In [None]:
df_net['Title_net'].unique

<bound method Series.unique of 0                     The Godfather
1                            Trolls
2                      VINLAND SAGA
3              Arrested Development
4                     Puss in Boots
                   ...             
6161    Tidying Up with Marie Kondo
6173                     Safe House
6174                     Win It All
6175                    The Cobbler
6176     The Fundamentals of Caring
Name: Title_net, Length: 832, dtype: object>

In [None]:
merge_dfs.sample(10)

Unnamed: 0,Date,Title_net,Title_imdb,genre,rating
590,2021-08-01,Equinox,,,
714,2020-02-23,Game Night,,,
215,2020-05-23,The Adjustment Bureau,,,
382,2022-07-25,The Best of Netflix Is a Joke,,,
289,2018-04-11,Day of the Dead,,,
562,2021-03-13,ARQ,,,
566,2021-06-03,Nerve,,,
605,2020-12-20,Christmas Made to Order,,,
411,2022-03-26,Black Crab,Black Crab,"Action, Adventure, Drama",5.7
225,2020-04-28,The House Bunny,,,


In [None]:
merged = merge_dfs.drop_duplicates(subset = "Title_imdb")

In [None]:
merged

Unnamed: 0,Date,Title_net,Title_imdb,genre,rating
0,2023-01-29,The Godfather,,,
3,2023-01-29,Arrested Development,Arrested Development,Comedy,8.7
7,2023-01-28,Wednesday,Wednesday,"Comedy, Crime, Fantasy",8.2
9,2023-01-14,The Pale Blue Eye,The Pale Blue Eye,"Crime, Horror, Mystery",6.6
10,2023-01-08,1899,1899,"Drama, Mystery",7.4
...,...,...,...,...,...
809,2019-03-22,The Dirt,The Dirt,"Biography, Comedy, Drama",7.0
810,2019-03-18,Triple Frontier,Triple Frontier,"Action, Thriller",6.4
816,2019-04-03,Serenity,Serenity,"Drama, Mystery, Thriller",5.4
820,2019-02-27,Velvet Buzzsaw,Velvet Buzzsaw,"Horror, Mystery, Thriller",5.7


In [None]:
merged['Title_imdb'].describe()

count                      243
unique                     243
top       Arrested Development
freq                         1
Name: Title_imdb, dtype: object

In [None]:
merged['Title_imdb'].isna().sum()

1

In [None]:
merged.to_csv('data/clean-and-merged_movies.csv')
