In [1]:
import pandas as pd
import numpy as np

### IMDb dataset

In [2]:
imdb_title_basics = pd.read_csv('Data/imdb/title.basics.tsv', sep='\t')
imdb_title_ratings = pd.read_csv('Data/imdb/title.ratings.tsv', sep='\t')
imdb_df = pd.merge(left=imdb_title_basics, right=imdb_title_ratings, how='inner', on='tconst')

#imdb_df = imdb_df[imdb_df['titleType'] == 'movie'].reset_index(drop=True)

# Standardize nan-values
values_to_replace = ['{}', '[]', '', 'NA', 'N/A', '-', 'nan', '\\N']
replace_map = {value: np.nan for value in values_to_replace}
imdb_df = imdb_df.replace(replace_map)

imdb_df = imdb_df[~imdb_df['genres'].isna()].reset_index(drop=True)
imdb_df['genres'] = imdb_df['genres'].apply(lambda x: str(x).split(',') if not pd.isna(x) else np.nan)


imdb_df = imdb_df[['tconst', 'titleType', 'primaryTitle', 'isAdult', 'startYear', 'runtimeMinutes', 'genres', 'averageRating', 'numVotes']]
imdb_df.columns = ['imdb_id', 'title_type', 'movie_name', 'is_adult', 'release_year', 'runtime_minutes', 'genres', 'avg_rating', 'num_votes']

imdb_df

  imdb_title_basics = pd.read_csv('Data/imdb/title.basics.tsv', sep='\t')


Unnamed: 0,imdb_id,title_type,movie_name,is_adult,release_year,runtime_minutes,genres,avg_rating,num_votes
0,tt0000001,short,Carmencita,0,1894,1,"[Documentary, Short]",5.7,2007
1,tt0000002,short,Le clown et ses chiens,0,1892,5,"[Animation, Short]",5.8,270
2,tt0000003,short,Pauvre Pierrot,0,1892,4,"[Animation, Comedy, Romance]",6.5,1921
3,tt0000004,short,Un bon bock,0,1892,12,"[Animation, Short]",5.5,178
4,tt0000005,short,Blacksmith Scene,0,1893,1,"[Comedy, Short]",6.2,2698
...,...,...,...,...,...,...,...,...,...
1357493,tt9916730,movie,6 Gunn,0,2017,116,[Drama],7.6,11
1357494,tt9916766,tvEpisode,Episode #10.15,0,2019,43,"[Family, Game-Show, Reality-TV]",7.0,22
1357495,tt9916778,tvEpisode,Escape,0,2019,,"[Crime, Drama, Mystery]",7.2,36
1357496,tt9916840,tvEpisode,Horrid Henry's Comic Caper,0,2014,11,"[Adventure, Animation, Comedy]",8.8,6


In [10]:
scrape_df = imdb_df[imdb_df['num_votes'] > 50000]

scrape_df = scrape_df.sort_values(by='avg_rating', ascending=False).iloc[0:1000].reset_index(drop=True)

scrape_df

Unnamed: 0,imdb_id,title_type,movie_name,is_adult,release_year,runtime_minutes,genres,avg_rating,num_votes
0,tt2301451,tvEpisode,Ozymandias,0,2013,47,"[Crime, Drama, Thriller]",10.0,209208
1,tt2178784,tvEpisode,The Rains of Castamere,0,2013,51,"[Action, Adventure, Drama]",9.9,115054
2,tt2301455,tvEpisode,Felina,0,2013,55,"[Crime, Drama, Thriller]",9.9,135694
3,tt4283088,tvEpisode,Battle of the Bastards,0,2016,60,"[Action, Adventure, Drama]",9.9,221047
4,tt1683088,tvEpisode,Face Off,0,2011,50,"[Crime, Drama, Thriller]",9.9,72264
...,...,...,...,...,...,...,...,...,...
995,tt5580540,tvSeries,Santa Clarita Diet,0,2017,30,"[Comedy, Horror]",7.8,74327
996,tt0058331,movie,Mary Poppins,0,1964,139,"[Comedy, Family, Fantasy]",7.8,182373
997,tt12262116,movie,Thirteen Lives,0,2022,147,"[Action, Adventure, Biography]",7.8,64939
998,tt3281548,movie,Little Women,0,2019,135,"[Drama, Romance]",7.8,232912


In [11]:
scrape_list = scrape_df['imdb_id'].tolist()

### Revenue Scraper

In [3]:
import requests
from bs4 import BeautifulSoup
from urllib.parse import quote


In [4]:
headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
           'Content-Type': 'text/html; charset=UTF-8'}

parser = 'html.parser'

In [21]:
def request_page(imdb_id):
    url = f'https://www.boxofficemojo.com/title/{imdb_id}/'

    print(url)
    
    try:
        response = requests.get(url, headers=headers, timeout=10)
        
        response.raise_for_status()
    except:
        return None, None
    
    soup = BeautifulSoup(response.text, parser)
    
    return url, soup
    

In [6]:
def extract_performance(soup):
    release_info_div = soup.find('div', class_='mojo-performance-summary-table')
    
    currency = None
    
    if release_info_div:
        release_info = {}
        release_spans = release_info_div.find_all('span', class_='a-size-small')
            
        for span in release_spans:
            try:
                # Extract release type (e.g., "Domestic", "International", "Worldwide")
                release_type = span.get_text(strip=True)
                if release_type.find(' ') != -1:
                    release_type = release_type.split(' ')[0]
                    
                # Extract revenue
                source_span = span.find_next('span', class_='a-size-medium a-text-bold')
                next_span = source_span.find_next('span')

                if next_span.get('class')[0] == 'percent zero':
                    release_info[release_type] = None
                elif next_span.get('class')[0] == 'money':            
                    revenue_text = next_span.get_text(strip=True) if next_span else None
                        
                    # Extract currency
                    tmp_currency = revenue_text[0] if revenue_text else None
                        
                    if not currency:
                        currency = tmp_currency
                        
                    # Convert revenue, percentage to int, float
                    revenue = int(revenue_text[1:].replace(',', '')) if revenue_text else None
                        
                    # Add performance information to dictionary
                    release_info[release_type] = revenue
            except:
                continue
        
        return currency, release_info
    
    return None


def extract_budget(soup):
    output = {}
    
    values = ['Domestic Distributor', 'Domestic Opening', 'Budget']
    
    currencies = []
    
    # Initialize currency variable
    currency = None
    
    for keyword in values:
        try:
            tmp_elements = soup.find_all('span', text=keyword)
        
            for element in tmp_elements:
                if keyword == 'Domestic Distributor':
                    next_span = element.find_next('span')                    
                    value_text = next_span.get_text(strip=True) if next_span else None
                    index = value_text.find('See full')
                    value_text = value_text[:index].strip()
                    
                    output[keyword] = value_text
                    
                else:
                    value_span = element.find_next('span', class_='money')
                    value_text = value_span.get_text(strip=True) if value_span else None

                    # Extract currency from the first character of value_text
                    tmp_currency = value_text[0] if value_text else None
                    currencies.append(tmp_currency)

                    # Convert value to an integer (assuming it's a currency value)
                    value = int(value_text[1:].replace(',', '')) if value_text else None

                    # Print the extracted values for debugging
                    output[keyword] = value
        except:
            continue
    
    output = output if output != {} else None
    
    if currency is None and currencies:
        currency = currencies[0]
    
    return currency, output

def get_releases(soup):
    try:
        releases_table = soup.find('table', class_='a-bordered a-horizontal-stripes a-size-base-plus')
        
        rows = len(releases_table.find_all('tr')) - 1
        
        return rows
    except:
        return None
    
    

In [22]:
def scrape_page(imdb_id):
    url, response = request_page(imdb_id)
    
    if not response:
        return None
    
    performance_currency, performance = extract_performance(response)
    stats_currency, budget = extract_budget(response)
    releases = get_releases(response)
    
    currencies = []
    if performance_currency != None:
        currencies.append(performance_currency)
    if stats_currency != None:
        currencies.append(stats_currency)
        
    currency = currencies[0] if len(set(currencies)) == 1 else None
    
    performance_worldwide = performance.get('Worldwide', None) if performance else None
    performance_domestic = performance.get('Domestic', None) if performance else None
    performance_international = performance.get('International', None) if performance else None
    
    movie_budget = budget.get('Budget', None) if budget else None
    
    roi = float((performance_worldwide - movie_budget) / movie_budget) if performance_worldwide and movie_budget else None
    percentage_domestic = float(performance_domestic / performance_worldwide) if performance_domestic and performance_worldwide else None
    percentage_international = float(performance_international / performance_worldwide) if performance_international and performance_worldwide else None
    
    output = {
        'imdb_id': imdb_id,
        'source_url': url,
        'currency': currency,
        'domestic_distributor': budget.get('Domestic Distributor', None) if budget else None,
        'domestic_opening': budget.get('Domestic Opening', None) if budget else None,
        'budget': movie_budget,
        'releases': releases,
        'performance_domestic': performance_domestic,
        'performance_international': performance_international,
        'performance_worldwide': performance_worldwide,
        'metric_roi': roi,
        'percentage_domestic': percentage_domestic,
        'percentage_international': percentage_international,
    }

    return output

In [12]:
test = scrape_page(scrape_list[1])

test

https://www.boxofficemojo.com/title/tt2178784/


  tmp_elements = soup.find_all('span', text=keyword)


{'imdb_id': 'tt2178784',
 'source_url': 'https://www.boxofficemojo.com/title/tt2178784/',
 'currency': None,
 'domestic_distributor': None,
 'domestic_opening': None,
 'budget': None,
 'releases': None,
 'performance_domestic': None,
 'performance_international': None,
 'performance_worldwide': None,
 'metric_roi': None,
 'percentage_domestic': None,
 'percentage_international': None}

### Multithreaded Execution

In [14]:
from concurrent.futures import ThreadPoolExecutor, as_completed

In [23]:
results = []
records = scrape_list
failed = 0

with ThreadPoolExecutor(max_workers=14) as executor:
    future_to_record = {executor.submit(scrape_page, record): record for record in records}

    for future in as_completed(future_to_record):
        result = future.result()
        if result is not None:
            results.append(result)
        else:
            failed += 1

# Creating a new DataFrame from the processed data
performance_df = pd.DataFrame(results)

performance_df

https://www.boxofficemojo.com/title/tt0228333/
https://www.boxofficemojo.com/title/tt0594845/
https://www.boxofficemojo.com/title/tt0083949/
https://www.boxofficemojo.com/title/tt0097499/
https://www.boxofficemojo.com/title/tt0061637/
https://www.boxofficemojo.com/title/tt0255819/
https://www.boxofficemojo.com/title/tt0367546/
https://www.boxofficemojo.com/title/tt0255668/
https://www.boxofficemojo.com/title/tt0366182/
https://www.boxofficemojo.com/title/tt0178022/
https://www.boxofficemojo.com/title/tt0099054/
https://www.boxofficemojo.com/title/tt0153301/
https://www.boxofficemojo.com/title/tt0166158/
https://www.boxofficemojo.com/title/tt0080801/


  tmp_elements = soup.find_all('span', text=keyword)


https://www.boxofficemojo.com/title/tt0405393/
https://www.boxofficemojo.com/title/tt0097670/
https://www.boxofficemojo.com/title/tt0097790/
https://www.boxofficemojo.com/title/tt0479879/
https://www.boxofficemojo.com/title/tt0074653/
https://www.boxofficemojo.com/title/tt0073705/
https://www.boxofficemojo.com/title/tt0320792/https://www.boxofficemojo.com/title/tt0063787/

https://www.boxofficemojo.com/title/tt0488380/
https://www.boxofficemojo.com/title/tt0062755/
https://www.boxofficemojo.com/title/tt0116056/
https://www.boxofficemojo.com/title/tt0347167/
https://www.boxofficemojo.com/title/tt1263736/
https://www.boxofficemojo.com/title/tt1691424/
https://www.boxofficemojo.com/title/tt0214042/
https://www.boxofficemojo.com/title/tt0479697/
https://www.boxofficemojo.com/title/tt0450258/
https://www.boxofficemojo.com/title/tt0486219/
https://www.boxofficemojo.com/title/tt1516552/
https://www.boxofficemojo.com/title/tt0063642/
https://www.boxofficemojo.com/title/tt0090366/
https://www.b

Unnamed: 0,imdb_id,source_url,currency,domestic_distributor,domestic_opening,budget,releases,performance_domestic,performance_international,performance_worldwide,metric_roi,percentage_domestic,percentage_international
0,tt0255668,https://www.boxofficemojo.com/title/tt0255668/,,,,,,,,,,,
1,tt0166158,https://www.boxofficemojo.com/title/tt0166158/,$,,,,1.0,,6718.0,6718.0,,,1.000000
2,tt0255819,https://www.boxofficemojo.com/title/tt0255819/,$,Sony Pictures Entertainment (SPE),8606403.0,16000000.0,1.0,28734552.0,647097.0,29381649.0,0.836353,0.977976,0.022024
3,tt0228333,https://www.boxofficemojo.com/title/tt0228333/,$,Screen Gems,3804452.0,28000000.0,1.0,8709640.0,5301192.0,14010832.0,-0.499613,0.621636,0.378364
4,tt0080801,https://www.boxofficemojo.com/title/tt0080801/,$,Twentieth Century Fox,,,1.0,30031783.0,,30031783.0,,1.000000,
...,...,...,...,...,...,...,...,...,...,...,...,...,...
9563,tt0372937,https://www.boxofficemojo.com/title/tt0372937/,$,,,,1.0,,70277.0,70277.0,,,1.000000
9564,tt1473380,https://www.boxofficemojo.com/title/tt1473380/,,,,,,,,,,,
9565,tt0109922,https://www.boxofficemojo.com/title/tt0109922/,,,,,,,,,,,
9566,tt1606259,https://www.boxofficemojo.com/title/tt1606259/,$,Vitagraph Films,2647.0,,1.0,2647.0,22008.0,24655.0,,0.107362,0.892638


Unnamed: 0,imdb_id,title_type,movie_name,is_adult,release_year,runtime_minutes,genres,avg_rating,num_votes,source_url,...,domestic_distributor,domestic_opening,budget,releases,performance_domestic,performance_international,performance_worldwide,metric_roi,percentage_domestic,percentage_international
0,tt0000001,short,Carmencita,0,1894,1,"[Documentary, Short]",5.7,2007,,...,,,,,,,,,,
1,tt0000002,short,Le clown et ses chiens,0,1892,5,"[Animation, Short]",5.8,270,,...,,,,,,,,,,
2,tt0000003,short,Pauvre Pierrot,0,1892,4,"[Animation, Comedy, Romance]",6.5,1921,,...,,,,,,,,,,
3,tt0000004,short,Un bon bock,0,1892,12,"[Animation, Short]",5.5,178,,...,,,,,,,,,,
4,tt0000005,short,Blacksmith Scene,0,1893,1,"[Comedy, Short]",6.2,2698,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1357259,tt9916730,movie,6 Gunn,0,2017,116,[Drama],7.6,11,,...,,,,,,,,,,
1357260,tt9916766,tvEpisode,Episode #10.15,0,2019,43,"[Family, Game-Show, Reality-TV]",7.0,22,,...,,,,,,,,,,
1357261,tt9916778,tvEpisode,Escape,0,2019,,"[Crime, Drama, Mystery]",7.2,36,,...,,,,,,,,,,
1357262,tt9916840,tvEpisode,Horrid Henry's Comic Caper,0,2014,11,"[Adventure, Animation, Comedy]",8.8,6,,...,,,,,,,,,,


In [24]:
failed

10256

In [177]:
performance_df.to_excel('scraper_test.xlsx')

### Merge on Dataset

In [15]:
cmu_dataset = pd.read_csv('backup.csv')
cmu_dataset

cmu_dataset

Unnamed: 0.1,Unnamed: 0,movie_wikipedia_id,movie_freebase_id,movie_name,release_year,revenue,runtime,languages,countries,genres,plot_summary,language,word_count,char_count,avg_word_length,sentence_count,lexical_diversity,sentiment_polarity
0,0,975900,/m/03vyhn,Ghosts of Mars,2001,14010832.0,98.0,"{""/m/02h40lc"": ""English Language""}","{""/m/09c7w0"": ""United States of America""}","{""/m/01jfsb"": ""Thriller"", ""/m/06n90"": ""Science...","Set in the second half of the 22nd century, th...",en,357,2181,0.163686,15,0.627451,-0.085095
1,1,9363483,/m/0285_cd,White Of The Eye,1987,,110.0,"{""/m/02h40lc"": ""English Language""}","{""/m/07ssc"": ""United Kingdom""}","{""/m/01jfsb"": ""Thriller"", ""/m/0glj9q"": ""Erotic...",A series of murders of rich young women throug...,en,590,3301,0.178734,36,0.542373,0.035867
2,2,261236,/m/01mrr1,A Woman in Flames,1983,,106.0,"{""/m/04306rv"": ""German Language""}","{""/m/0345h"": ""Germany""}","{""/m/07s9rl0"": ""Drama""}","Eva, an upper class housewife, becomes frustra...",en,426,2339,0.182129,24,0.582160,0.133259
3,3,18998739,/m/04jcqvw,The Sorcerer's Apprentice,2002,,86.0,"{""/m/02h40lc"": ""English Language""}","{""/m/0hzlz"": ""South Africa""}","{""/m/0hqxf"": ""Family Film"", ""/m/01hmnh"": ""Fant...","Every hundred years, the evil Morgana returns...",en,163,870,0.187356,7,0.631902,0.040568
4,4,6631279,/m/0gffwj,Little city,1997,,93.0,"{""/m/02h40lc"": ""English Language""}","{""/m/09c7w0"": ""United States of America""}","{""/m/06cvj"": ""Romantic comedy"", ""/m/0hj3n0w"": ...","Adam, a San Francisco-based artist who works a...",en,225,1234,0.182334,9,0.626667,0.165202
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
25565,25565,26482675,/m/0bbwngb,Eşrefpaşalılar,2010,1847671.0,,,,"{""/m/05p553"": ""Comedy film"", ""/m/07s9rl0"": ""Dr...","The film is about two friends, Tayyar , a mafi...",en,108,601,0.179700,4,0.731481,0.258333
25566,25566,35228177,/m/0j7hxnt,Mermaids: The Body Found,2011,,120.0,"{""/m/02h40lc"": ""English Language""}","{""/m/09c7w0"": ""United States of America""}","{""/m/07s9rl0"": ""Drama""}",Two former National Oceanic Atmospheric Admini...,en,104,664,0.156627,5,0.759615,0.108333
25567,25567,34980460,/m/0g4pl34,Knuckle,2011,,96.0,"{""/m/02h40lc"": ""English Language""}","{""/m/03rt9"": ""Ireland"", ""/m/07ssc"": ""United Ki...","{""/m/03bxz7"": ""Biographical film"", ""/m/07s9rl0...",{{No plot}} This film follows 12 years in the ...,en,64,368,0.173913,3,0.781250,0.010000
25568,25568,913762,/m/03pcrp,The Super Dimension Fortress Macross II: Lover...,1992,,150.0,"{""/m/03_9r"": ""Japanese Language""}","{""/m/03_3d"": ""Japan""}","{""/m/06n90"": ""Science Fiction"", ""/m/0gw5n2f"": ...","The story takes place in the year 2092,The Sup...",en,199,1237,0.160873,8,0.713568,0.194003


In [16]:
cmu_dataset['release_year'] = cmu_dataset['release_year'].astype(int)
imdb_df = imdb_df[~imdb_df['release_year'].isna()]
imdb_df['release_year'] = imdb_df['release_year'].astype(int)

merge_df = pd.merge(left=cmu_dataset, right=imdb_df, how='left', on=['movie_name', 'release_year']).reset_index(drop=True)

merge_df = merge_df[~merge_df['imdb_id'].isna()]

merge_df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  imdb_df['release_year'] = imdb_df['release_year'].astype(int)


Unnamed: 0.1,Unnamed: 0,movie_wikipedia_id,movie_freebase_id,movie_name,release_year,revenue,runtime,languages,countries,genres_x,...,sentence_count,lexical_diversity,sentiment_polarity,imdb_id,title_type,is_adult,runtime_minutes,genres_y,avg_rating,num_votes
0,0,975900,/m/03vyhn,Ghosts of Mars,2001,14010832.0,98.0,"{""/m/02h40lc"": ""English Language""}","{""/m/09c7w0"": ""United States of America""}","{""/m/01jfsb"": ""Thriller"", ""/m/06n90"": ""Science...",...,15,0.627451,-0.085095,tt0228333,movie,0,98,"[Action, Horror, Sci-Fi]",4.9,57028.0
1,0,975900,/m/03vyhn,Ghosts of Mars,2001,14010832.0,98.0,"{""/m/02h40lc"": ""English Language""}","{""/m/09c7w0"": ""United States of America""}","{""/m/01jfsb"": ""Thriller"", ""/m/06n90"": ""Science...",...,15,0.627451,-0.085095,tt0594845,tvEpisode,0,,"[Documentary, Short]",7.6,20.0
3,2,261236,/m/01mrr1,A Woman in Flames,1983,,106.0,"{""/m/04306rv"": ""German Language""}","{""/m/0345h"": ""Germany""}","{""/m/07s9rl0"": ""Drama""}",...,24,0.582160,0.133259,tt0083949,movie,0,106,[Drama],5.9,623.0
6,5,171005,/m/016ywb,Henry V,1989,10161099.0,137.0,"{""/m/02h40lc"": ""English Language""}","{""/m/07ssc"": ""United Kingdom""}","{""/m/04xvh5"": ""Costume drama"", ""/m/082gq"": ""Wa...",...,3,0.880597,-0.006250,tt0097499,movie,0,137,"[Biography, Drama, History]",7.5,31270.0
8,7,32456683,/m/0gyryjt,Die Fahne von Kriwoj Rog,1967,,108.0,"{""/m/04306rv"": ""German Language""}","{""/m/03f2w"": ""German Democratic Republic""}",,...,5,0.689076,0.071429,tt0061637,movie,0,108,[Drama],7.6,23.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
27066,25562,664006,/m/030xw6,Guilty as Sin,1993,22886222.0,107.0,,"{""/m/09c7w0"": ""United States of America""}","{""/m/01jfsb"": ""Thriller"", ""/m/0glj9q"": ""Erotic...",...,37,0.586792,0.009097,tt0107057,movie,0,107,"[Crime, Drama, Thriller]",5.7,5760.0
27067,25563,3868432,/m/0b44p5,Into the Mirror,2003,,113.0,"{""/m/02hwhyv"": ""Korean Language""}","{""/m/06qd3"": ""South Korea""}","{""/m/03npn"": ""Horror""}",...,7,0.696552,-0.011667,tt0372937,movie,0,113,"[Action, Fantasy, Horror]",6.4,3585.0
27068,25564,15394941,/m/03m6zh4,Gopi Kishan,1994,,,"{""/m/03k50"": ""Hindi Language""}","{""/m/03rk0"": ""India""}","{""/m/07s9rl0"": ""Drama"", ""/m/02kdv5l"": ""Action""...",...,55,0.467033,0.027336,tt0109922,movie,0,161,"[Action, Comedy, Drama]",5.8,972.0
27070,25566,35228177,/m/0j7hxnt,Mermaids: The Body Found,2011,,120.0,"{""/m/02h40lc"": ""English Language""}","{""/m/09c7w0"": ""United States of America""}","{""/m/07s9rl0"": ""Drama""}",...,5,0.759615,0.108333,tt1816585,tvMovie,0,82,[Sci-Fi],4.6,1711.0


In [17]:
scrape_list = list(merge_df['imdb_id'].unique())

len(scrape_list)

19824

# Scrapeeeee!

In [18]:
results = []
records = scrape_list
failed = 0

with ThreadPoolExecutor(max_workers=12) as executor:
    future_to_record = {executor.submit(scrape_page, record): record for record in scrape_list}

    for future in as_completed(future_to_record):
        result = future.result()
        if result is not None:
            results.append(result)
        else:
            failed += 1

# Creating a new DataFrame from the processed data
performance_df = pd.DataFrame(results)
performance_df

https://www.boxofficemojo.com/title/tt0228333/
https://www.boxofficemojo.com/title/tt0594845/
https://www.boxofficemojo.com/title/tt0083949/
https://www.boxofficemojo.com/title/tt0097499/
https://www.boxofficemojo.com/title/tt0061637/
https://www.boxofficemojo.com/title/tt0255819/
https://www.boxofficemojo.com/title/tt0367546/
https://www.boxofficemojo.com/title/tt0255668/
https://www.boxofficemojo.com/title/tt0366182/
https://www.boxofficemojo.com/title/tt0178022/
https://www.boxofficemojo.com/title/tt0099054/
https://www.boxofficemojo.com/title/tt0153301/


  tmp_elements = soup.find_all('span', text=keyword)


https://www.boxofficemojo.com/title/tt0166158/
https://www.boxofficemojo.com/title/tt0080801/
https://www.boxofficemojo.com/title/tt0405393/
https://www.boxofficemojo.com/title/tt0097670/
https://www.boxofficemojo.com/title/tt0097790/
https://www.boxofficemojo.com/title/tt0479879/
https://www.boxofficemojo.com/title/tt0074653/
https://www.boxofficemojo.com/title/tt0073705/
https://www.boxofficemojo.com/title/tt0320792/
https://www.boxofficemojo.com/title/tt0063787/
https://www.boxofficemojo.com/title/tt0488380/
https://www.boxofficemojo.com/title/tt0062755/
https://www.boxofficemojo.com/title/tt0116056/
https://www.boxofficemojo.com/title/tt0347167/
https://www.boxofficemojo.com/title/tt1263736/
https://www.boxofficemojo.com/title/tt1691424/
https://www.boxofficemojo.com/title/tt0214042/
https://www.boxofficemojo.com/title/tt0479697/
https://www.boxofficemojo.com/title/tt0450258/
https://www.boxofficemojo.com/title/tt0486219/
https://www.boxofficemojo.com/title/tt1516552/
https://www.b

TypeError: cannot unpack non-iterable NoneType object

In [20]:
len(results)

2328

In [26]:
extended_df = pd.merge(left=merge_df, right=performance_df, how='left', on='imdb_id')

extended_df

Unnamed: 0.1,Unnamed: 0,movie_wikipedia_id,movie_freebase_id,movie_name,release_year,revenue,runtime,languages,countries,genres_x,...,domestic_distributor,domestic_opening,budget,releases,performance_domestic,performance_international,performance_worldwide,metric_roi,percentage_domestic,percentage_international
0,0,975900,/m/03vyhn,Ghosts of Mars,2001,14010832.0,98.0,"{""/m/02h40lc"": ""English Language""}","{""/m/09c7w0"": ""United States of America""}","{""/m/01jfsb"": ""Thriller"", ""/m/06n90"": ""Science...",...,Screen Gems,3804452.0,28000000.0,1.0,8709640.0,5301192.0,14010832.0,-0.499613,0.621636,0.378364
1,0,975900,/m/03vyhn,Ghosts of Mars,2001,14010832.0,98.0,"{""/m/02h40lc"": ""English Language""}","{""/m/09c7w0"": ""United States of America""}","{""/m/01jfsb"": ""Thriller"", ""/m/06n90"": ""Science...",...,,,,,,,,,,
2,2,261236,/m/01mrr1,A Woman in Flames,1983,,106.0,"{""/m/04306rv"": ""German Language""}","{""/m/0345h"": ""Germany""}","{""/m/07s9rl0"": ""Drama""}",...,,,,,,,,,,
3,5,171005,/m/016ywb,Henry V,1989,10161099.0,137.0,"{""/m/02h40lc"": ""English Language""}","{""/m/07ssc"": ""United Kingdom""}","{""/m/04xvh5"": ""Costume drama"", ""/m/082gq"": ""Wa...",...,The Samuel Goldwyn Company,64933.0,9000000.0,1.0,10161099.0,,10161099.0,0.129011,1.000000,
4,7,32456683,/m/0gyryjt,Die Fahne von Kriwoj Rog,1967,,108.0,"{""/m/04306rv"": ""German Language""}","{""/m/03f2w"": ""German Democratic Republic""}",,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19849,25562,664006,/m/030xw6,Guilty as Sin,1993,22886222.0,107.0,,"{""/m/09c7w0"": ""United States of America""}","{""/m/01jfsb"": ""Thriller"", ""/m/0glj9q"": ""Erotic...",...,Walt Disney Studios Motion Pictures,5713708.0,,1.0,22866222.0,,22866222.0,,1.000000,
19850,25563,3868432,/m/0b44p5,Into the Mirror,2003,,113.0,"{""/m/02hwhyv"": ""Korean Language""}","{""/m/06qd3"": ""South Korea""}","{""/m/03npn"": ""Horror""}",...,,,,1.0,,70277.0,70277.0,,,1.000000
19851,25564,15394941,/m/03m6zh4,Gopi Kishan,1994,,,"{""/m/03k50"": ""Hindi Language""}","{""/m/03rk0"": ""India""}","{""/m/07s9rl0"": ""Drama"", ""/m/02kdv5l"": ""Action""...",...,,,,,,,,,,
19852,25566,35228177,/m/0j7hxnt,Mermaids: The Body Found,2011,,120.0,"{""/m/02h40lc"": ""English Language""}","{""/m/09c7w0"": ""United States of America""}","{""/m/07s9rl0"": ""Drama""}",...,,,,,,,,,,


In [28]:
extended_df.to_csv('processed_dataset.csv')