In [53]:
import pandas as pd
import numpy as np
import os

# Import data

In [54]:
#read a txt file and convert it to a dataframe
def read_txt(path):
    df = pd.read_csv(path, sep='\t', header=None)
    return df

In [55]:
plots = pd.read_csv('MovieSummaries/plot_summaries.txt',header=None, sep="\t")
movies = pd.read_csv('MovieSummaries/movie.metadata.tsv',header=None, sep="\t")
characters = pd.read_csv('MovieSummaries/character.metadata.tsv',header=None, sep="\t")
names = pd.read_csv('MovieSummaries/name.clusters.txt',header=None, sep="\t")
tvtropes = pd.read_csv('MovieSummaries/tvtropes.clusters.txt',header=None, sep="\t")

In [56]:
plots.columns = ['WikiID', 'Plot']

1. Wikipedia movie ID
2. Freebase movie ID
3. Movie name
4. Movie release date
5. Movie box office revenue
6. Movie runtime
7. Movie languages (Freebase ID:name tuples)
8. Movie countries (Freebase ID:name tuples)
9. Movie genres (Freebase ID:name tuples)

In [57]:
movies.columns = ['WikiID', 'FreebaseID', 'Name', 'ReleaseDate', 'Revenue', 'Runtime', 'Languages', 'Countries', 'Genres']

In [220]:
movie_with_date = movies[-movies["ReleaseDate"].isna()].copy(deep=True)
dates = movie_with_date["ReleaseDate"]
date_years = dates.astype(str).str.extract(r'(\d{4})')

In [143]:
na_ids = np.where(movies["ReleaseDate"].isna())[0]
no_date_filter = movies["ReleaseDate"].astype(str).str.extract('(\d{4})').isna()

In [101]:
no_date_filter = movies["ReleaseDate"].astype(str).str.extract('(\d{4})').isna()
na_ids = np.where(movies["ReleaseDate"].astype(str).str.extract('(\d{4})').isna().values)[0]
no_date_movies = movies.iloc[na_ids]

In [129]:
no_date_movies["Revenue"].isna().value_counts()
no_date_movies.iloc[np.where(no_date_movies["Revenue"].isna()==False)]["WikiID"]

462       7582987
1043      4724834
1047     34055313
1888     25496318
5007     33382022
           ...   
77890    34502262
78775    27785844
78816     4421925
79278    33934476
79413     9170459
Name: WikiID, Length: 73, dtype: int64

In [107]:
not True

False

In [85]:
movie_without_date

Unnamed: 0,WikiID,FreebaseID,Name,ReleaseDate,Revenue,Runtime,Languages,Countries,Genres
0,,,,,,,,,
1,,,,,,,,,
2,,,,,,,,,
3,,,,,,,,,
4,,,,,,,,,
...,...,...,...,...,...,...,...,...,...
81736,,,,,,,,,
81737,,,,,,,,,
81738,,,,,,,,,
81739,,,,,,,,,


In [58]:
#movies['ReleaseDate'] = pd.to_datetime(movies['ReleaseDate'], errors='ignore')
movies["ReleaseDate"].astype(str).str.extract('(\d{4})').astype(int)
movies["ReleaseDate"]

ValueError: cannot convert float NaN to integer

Characters data:
1. Wikipedia movie ID
2. Freebase movie ID
3. Movie release date
4. Character name
5. Actor date of birth
6. Actor gender
7. Actor height (in meters)
8. Actor ethnicity (Freebase ID)
9. Actor name
10. Actor age at movie release
11. Freebase character/actor map ID
12. Freebase character ID
13. Freebase actor ID

In [7]:
characters.columns = ['WikiID', 'FreebaseID', 'ReleaseDate', 'CharacterName', 'DateOfBirth', 'Gender', 'Height', 'Ethnicity', 'Name', 'AgeAtRealease', 'FreebaseCharacterActorMapID', 'FreebaseCharacterID', 'FreebaseActorID']

# Scraping

## Scraping Imbd

In [8]:
import imdb_scraper as imdb

In [9]:
# example of how to use the scraper
myscraper = imdb.ImdbScraper()
myscraper.get_imdb_infos("Pirate of the Caribbean: The Curse of the Black Pearl")

{'global_revenue': 654264015.0,
 'budget': 140000000.0,
 'gross_domestic': 305413918.0,
 'opening_weekend': 46630690.0,
 'rating_score': 8.1,
 'number_of_ratings': 1200000.0,
 'watched_rank': 365.0,
 'producer': 'Gore Verbinski'}

In [10]:
n           = movies.shape[0]
n_computer  = 6
size        = n//n_computer

# create a uniform partition of indices from 0 to n-1 for n_computer computers
indices         = [i for i in range(n)]
partitions      = [indices[i*size:(i+1)*size] for i in range(n_computer)]
partitions[-1]  = partitions[-1] + indices[n_computer*size:] 

In [11]:
# show first and last element of each partition
partition_intervals = [(partitions[i][0], partitions[i][-1]) for i in range(n_computer)]
print(f"number of elements: {n}")
print(f"partition size: {size}\n")
for i in range(len(partition_intervals)):
    print(f"partition {i}: {partition_intervals[i]}")

number of elements: 81741
partition size: 13623

partition 0: (0, 13622)
partition 1: (13623, 27245)
partition 2: (27246, 40868)
partition 3: (40869, 54491)
partition 4: (54492, 68114)
partition 5: (68115, 81740)


In [12]:
"""
Select your index here:
    - Anthony   0 & 1
    - Anton     2
    - Aymeric   3
    - Eric      4
    - Yara      5
"""
index = 0

index_range = partitions[index]

In [13]:
#movie_names = ["The Avengers", "The Godfather", "Vice", "The Dark Knight", "OuiOui"]

# Adding date to have more accurate search results
dataset_names = (movies["Name"] + " " + movies["ReleaseDate"].astype(str)).values
movie_years = movies["ReleaseDate"].astype(str).values

# create an empty DataFrame for the first row
header = pd.DataFrame(columns=["WikiID", "Name", "Date", "global_revenue", "budget", "gross_domestic", "opening_weekend", "rating_score", "number_of_ratings", "watched_rank", "producer"])

csv_file = 'MovieSummaries/imdb_scraped_data_' + str(index) + '.csv'

myscraper = imdb.ImdbScraper()

# if the csv_file doesn't exist, make a header
if not os.path.isfile(csv_file):
    # write the DataFrame to a CSV file
    header.to_csv(csv_file, index=False)

for i in index_range:
    name = dataset_names[i]
    wikiID = movies["WikiID"].values[i]

    movie_infos = myscraper.get_imdb_infos(name)
    row = [wikiID, movies["Name"].values[i], movies["ReleaseDate"].values[i], *list(movie_infos.values())]

    # create a new DataFrame for the row
    row_df = pd.DataFrame([row], columns=header.columns)

    # replace None values with the string 'None'
    row_df = row_df.fillna('None')

    # append the row DataFrame to the CSV file
    row_df.to_csv(csv_file, mode='a', header=False, index=False)

    #print(f"{name}  : {movie_infos}")

KeyboardInterrupt: 

<br><br>

---


## TMBD API

In [12]:
import tmdb_scraper

In [13]:
# example
my_tmdb_scraper = tmdb_scraper.tmdb_scraper()
my_tmdb_scraper.get_tmdb_infos("Star Wars : Episode IV - A New Hope")

{'tmdb_id': 11,
 'movie_name': 'Star Wars : Episode IV - A New Hope',
 'release_date': '1977-05-25',
 'revenue': 775398007,
 'budget': 11000000,
 'rating': 8.204,
 'vote_count': 19278,
 'popularity': 86.431,
 'runtime': 121,
 'production': None}

In [12]:
dataset_names   = movies["Name"].values

# create an empty DataFrame for the first row
header = pd.DataFrame(columns=["tmdb_id", "movie_name", "release_date", "revenue", "budget", "rating", "vote_count", "popularity", "runtime", "production"])

csv_file = 'MovieSummaries/tmbd_scraped_data.csv'

# if the csv_file doesn't exist yet, make a header
if not os.path.isfile(csv_file):
    # write the DataFrame to a CSV file
    header.to_csv(csv_file, index=False)

index_range = np.arange(0, 10)

for i in index_range:
    name = dataset_names[i]

    movie_infos = my_tmdb_scraper.get_tmdb_infos(name)
    row = [*list(movie_infos.values())]

    # create a new DataFrame for the row
    row_df = pd.DataFrame([row], columns=header.columns)

    # replace None values with the string 'None'
    row_df = row_df.fillna('None')

    # append the row DataFrame to the CSV file
    row_df.to_csv(csv_file, mode='a', header=False, index=False)

    #print(f"{name}  : {movie_infos}")

# Testing zone

## Wikipedia API

In [243]:
import wikipediaapi
import requests

In [250]:
resp = requests.get("https://en.wikipedia.org/w/api.php?action=query&pageids=975900&format=json")

In [251]:
resp.json()

{'batchcomplete': '',
 'query': {'pages': {'975900': {'pageid': 975900,
    'ns': 0,
    'title': 'Ghosts of Mars'}}}}

## TMDB

In [10]:
import requests
import tmdbsimple as tmdb

In [11]:
movie_name = "My Friend Ganesha"

In [12]:
tmdb.REQUESTS_SESSION = requests.Session()
search      = tmdb.Search()

search.movie(query=movie_name)
tmdb_id     = search.results[0]['id']
result      = tmdb.Movies(tmdb_id).info()

revenue     = result['revenue']
budget      = result['budget']
rating      = result['vote_average']
vote_count  = result['vote_count']
popularity  = result['popularity']
runtime     = result['runtime']
production  = result['production_companies'][0]['name']
release_date = result['release_date']

IndexError: list index out of range

## IMDB

In [225]:
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from bs4 import BeautifulSoup
import re

In [226]:
movie_name = dataset_names[0]
movie_name

'Ghosts of Mars 2001-08-24'

In [227]:
chrome_options = webdriver.ChromeOptions()
driver = webdriver.Chrome(options=chrome_options)

# access the webpage
driver.get("https://www.imdb.com/")

# find the search bar & search button
search_bar = driver.find_element("xpath", '//*[@id="suggestion-search"]')
search_button = driver.find_element("xpath", '//*[@id="suggestion-search-button"]')

# search for the movie
search_bar.send_keys(movie_name)
search_button.click()


In [228]:
html = driver.page_source
soup = BeautifulSoup(html, 'html.parser')

In [230]:
dates = soup.find_all("span", {"class": "ipc-metadata-list-summary-item__li"})

In [235]:
dates

[<span aria-disabled="false" class="ipc-metadata-list-summary-item__li">2001</span>,
 <span aria-disabled="false" class="ipc-metadata-list-summary-item__li">Natasha Henstridge, Ice Cube</span>,
 <span aria-disabled="false" class="ipc-metadata-list-summary-item__li">2001</span>,
 <span aria-disabled="false" class="ipc-metadata-list-summary-item__li">Vidéo</span>,
 <span aria-disabled="false" class="ipc-metadata-list-summary-item__li">John Carpenter</span>,
 <span aria-disabled="false" class="ipc-metadata-list-summary-item__li">2001</span>,
 <span aria-disabled="false" class="ipc-metadata-list-summary-item__li">Vidéo</span>,
 <span aria-disabled="false" class="ipc-metadata-list-summary-item__li">Anthrax, Tiago Becker</span>,
 <span aria-disabled="false" class="ipc-metadata-list-summary-item__li">2001</span>,
 <span aria-disabled="false" class="ipc-metadata-list-summary-item__li">Vidéo</span>,
 <span aria-disabled="false" class="ipc-metadata-list-summary-item__li">Danielle Burgio, John Ca

In [238]:
movies.iloc[10]

WikiID                                                    175026
FreebaseID                                             /m/017n1p
Name                                               Sarah and Son
ReleaseDate                                                 1930
Revenue                                                      NaN
Runtime                                                     86.0
Languages                     {"/m/02h40lc": "English Language"}
Countries              {"/m/09c7w0": "United States of America"}
Genres         {"/m/07s9rl0": "Drama", "/m/01g6gs": "Black-an...
Name: 10, dtype: object

In [None]:

# find the first result
first_result = driver.find_element("xpath", '//*[@id="__next"]/main/div[2]/div[3]/section/div/div[1]/section[2]/div[2]/ul/li[1]')
first_result.click()

#parse the resulting webpage
html = driver.page_source
soup = BeautifulSoup(html, 'html.parser')
box_office = soup.find('div', {'data-testid': "title-boxoffice-section"})
try:
    brut_li                 = box_office.find('li', {'data-testid' : "title-boxoffice-cumulativeworldwidegross"})
    global_revenue_raw      = brut_li.find('span', {'class' : "ipc-metadata-list-item__list-content-item"}).text
    global_revenue          = int(re.findall(r'\d+', global_revenue_raw.replace('\u202f', ''))[0])
except:
    global_revenue          = None

try:
    budget_li               = box_office.find('li', {'data-testid' : "title-boxoffice-budget"})
    budget_raw              = budget_li.find('span', {'class' : "ipc-metadata-list-item__list-content-item"}).text
    budget                  = int(re.findall(r'\d+', budget_raw.replace('\u202f', ''))[0])
except:
    budget = None

try:
    gross_domestic_li       = box_office.find('li', {'data-testid' : "title-boxoffice-grossdomestic"})
    gross_domestic_raw      = gross_domestic_li.find('span', {'class' : "ipc-metadata-list-item__list-content-item"}).text
    gross_domestic          = int(re.findall(r'\d+', gross_domestic_raw.replace('\u202f', ''))[0])
except:
    gross_domestic          = None

try:
    opening_weekend_li      = box_office.find('li', {'data-testid' : "title-boxoffice-openingweekenddomestic"})
    opening_weekend_raw     = opening_weekend_li.find('span', {'class' : "ipc-metadata-list-item__list-content-item"}).text
    opening_weekend         = int(re.findall(r'\d+', opening_weekend_raw.replace('\u202f', ''))[0])
except:
    opening_weekend         = None

try:
    rating_score_div        = soup.find('div', {'data-testid' : "hero-rating-bar__aggregate-rating__score"})
    rating_score_raw        = rating_score_div.text
    rating_score            = float(re.findall(r'\d{1}.\d{1}', rating_score_raw.replace(',', '.').replace('\u202f', ''))[0])
except:
    rating_score            = None

try:
    number_of_ratings_div   = soup.find('div', {'data-testid' : "hero-rating-bar__aggregate-rating"})
    number_of_ratings_raw   = number_of_ratings_div.find('div', {'class' : "sc-bde20123-3 bjjENQ"}).text
    number_of_ratings       = float(re.findall(r'\d+.?\d?', number_of_ratings_raw.replace(',', '.').replace('\u202f', ''))[0])

    rating_unit             = number_of_ratings_raw[-1]
    if (rating_unit=='M'):
        number_of_ratings   = 1000000*number_of_ratings
    elif (rating_unit=='k'):
        number_of_ratings   = 1000*number_of_ratings
except:
    number_of_ratings       = None

try:
    watched_rank_div        = soup.find('div', {'data-testid' : "hero-rating-bar__popularity"})
    watched_rank_raw        = watched_rank_div.find('div', {'data-testid' : "hero-rating-bar__popularity__score"}).text
    watched_rank            = int(re.findall(r'\d+', watched_rank_raw.replace('\u202f', ''))[0])
except:
    watched_rank            = None

try:
    cast_section            = soup.find('section', {"data-testid" : "title-cast"})
    cast_ul                 = cast_section.find('ul', {"class" : "ipc-metadata-list ipc-metadata-list--dividers-all sc-bfec09a1-8 iiDmgX ipc-metadata-list--base"})
    cast_raw                = cast_ul.find('a', {'class' : 'ipc-metadata-list-item__list-content-item ipc-metadata-list-item__list-content-item--link'}).text
    producer               = cast_raw
except:
    producer                = None

In [None]:
producer