In [None]:
import pandas as pd
import numpy as np
import os

# Importing the datasets

In [None]:
#read a txt file and convert it to a dataframe
def read_txt(path):
    """Load a tab-separated, header-less text file into a DataFrame.

    `path` is handed straight to `pd.read_csv`; the resulting columns keep
    pandas' default integer labels because no header row is read.
    """
    return pd.read_csv(path, header=None, sep='\t')

In [None]:
# Load the five tab-separated source files; none of them is read with a header row,
# so every frame starts out with integer column labels.
plots = pd.read_csv(
    'MovieSummaries/plot_summaries.txt', sep="\t", header=None
)
movies = pd.read_csv(
    'MovieSummaries/movies_metadata.tsv', sep="\t", header=None
)
characters = pd.read_csv(
    'MovieSummaries/characters_metadata.tsv', sep="\t", header=None
)
names = pd.read_csv(
    'MovieSummaries/names_clusters.txt', sep="\t", header=None
)
tvtropes = pd.read_csv(
    'MovieSummaries/tvtropes_clusters.txt', sep="\t", header=None
)

In [None]:
# Label the two columns of the plot summaries table (it was read without a header row).
plots.columns = ['WikiID', 'Plot']

`movies` data

1. Wikipedia movie ID
2. Freebase movie ID
3. Movie name
4. Movie release date
5. Movie box office revenue
6. Movie runtime
7. Movie languages (Freebase ID:name tuples)
8. Movie countries (Freebase ID:name tuples)
9. Movie genres (Freebase ID:name tuples)

In [None]:
# Column labels for the movie metadata table, in the order the fields appear in the file.
movies.columns = [
    'WikiID',
    'FreebaseID',
    'Name',
    'ReleaseDate',
    'Revenue',
    'Runtime',
    'Languages',
    'Countries',
    'Genres',
]

In [None]:
# Extract the release year from the release date feature.
# Rows without a release date are dropped first; `.notna()` builds the keep-mask
# directly instead of negating `.isna()` with unary minus, which NumPy rejects on
# boolean arrays and which only works on a Series because pandas special-cases it.
movie_with_date = movies[movies["ReleaseDate"].notna()].copy(deep=True)
dates = movie_with_date["ReleaseDate"]
# The first run of four digits in the date string is taken as the year; the result
# is a one-column DataFrame (column 0) aligned on the index of `movie_with_date`.
date_years = dates.astype(str).str.extract(r'(\d{4})')

`characters` data:
1. Wikipedia movie ID
2. Freebase movie ID
3. Movie release date
4. Character name
5. Actor date of birth
6. Actor gender
7. Actor height (in meters)
8. Actor ethnicity (Freebase ID)
9. Actor name
10. Actor age at movie release
11. Freebase character/actor map ID
12. Freebase character ID
13. Freebase actor ID

In [None]:
# Column labels for the character metadata table, in the order the fields appear in the file.
characters.columns = [
    'WikiID',
    'FreebaseID',
    'ReleaseDate',
    'CharacterName',
    'DateOfBirth',
    'Gender',
    'Height',
    'Ethnicity',
    'Name',
    'AgeAtRealease',
    'FreebaseCharacterActorMapID',
    'FreebaseCharacterID',
    'FreebaseActorID',
]

# Scraping

## Scraping IMDB

In [None]:
import imdb_scraper as imdb

### Example of how to use the scraper

<b><span style="color:red">Don't close the browser when it is in use </span></b>

In [None]:
# initialize the scraper object
myscraper = imdb.ImdbScraper()

try:
    # select a movie wikipedia id
    movie_id = movies.iloc[74623]["WikiID"]

    # scrape the movie infos
    scraped_data = myscraper.get_imdb_infos(movie_id)
finally:
    # close the browser, even when the lookup above raised
    myscraper.close()

scraped_data

### Let the scraping begin!

Computing `n_computer` partitions to parallelize the IMDb scraping across multiple computers

In [None]:
# Split the row positions 0 .. n-1 of `movies` into n_computer contiguous chunks.
n = movies.shape[0]
n_computer = 6
size = n // n_computer

indices = list(range(n))
partitions = [indices[k * size:(k + 1) * size] for k in range(n_computer)]
# The integer division can leave up to n_computer - 1 trailing positions unassigned;
# they are attached to the last chunk.
partitions[-1] = partitions[-1] + indices[n_computer * size:]

In [None]:
# Summarise every partition by the first and last row position it contains.
partition_intervals = []
for i in range(n_computer):
    chunk = partitions[i]
    partition_intervals.append((chunk[0], chunk[-1]))

print(f"number of elements: {n}")
print(f"partitions' size: {size}\n")
for i in range(n_computer):
    print(f"partition {i}: {partition_intervals[i]}")

Each user should set their own index below to select the partition of the dataset assigned to them.

In [None]:
"""
Select your index here:
    - Anthony   0 & 1
    - Anton     2
    - Aymeric   3
    - Eric      4
    - Yara      5
"""
# Number of the partition scraped from this machine; it is the value later passed
# to scrape_partition(index), which reads partitions[index].
index = 0

Since IMDb has protection against bots, using `requests` yields a `forbidden` error. Using `selenium` to simulate a real human operator avoids this problem and allows scraping, though more slowly.

In [None]:
def scrape_partition(index):
    """Scrape IMDb data for one partition of `movies` and append it to a CSV file.

    The row positions to process come from the module-level `partitions[index]`.
    Results are appended one row at a time to
    'MovieSummaries/imdb_scraped_data_<index>.csv'. If that file already exists,
    the number of data rows it holds is used as the offset into the partition,
    so an interrupted run resumes where it stopped.

    Parameters
    ----------
    index : int
        Position of the partition to scrape inside the global `partitions` list;
        also used as the suffix of the output file name.

    Notes
    -----
    The scraper is only started when there is something left to scrape, and it
    is closed in a `finally` clause so the browser is released even when
    scraping one of the movies raises.
    """
    columns = ["WikiID", "Name", "global_revenue", "budget", "gross_domestic", "opening_weekend",
               "rating_score", "number_of_ratings", "watched_rank", "producer", "release_year"]

    # csv file name
    csv_file = 'MovieSummaries/imdb_scraped_data_' + str(index) + '.csv'

    # if the csv_file doesn't exist, create it and write the header;
    # otherwise resume after the rows that were already written
    if not os.path.isfile(csv_file):
        pd.DataFrame(columns=columns).to_csv(csv_file, index=False)
        starting_index = 0
    else:
        starting_index = pd.read_csv(csv_file).shape[0]

    index_range = partitions[index][starting_index:]
    if not index_range:
        # partition already complete: no need to start a browser
        return

    # initialize the scraper object
    myscraper = imdb.ImdbScraper()
    try:
        for i in index_range:
            movie = movies.iloc[i]
            movie_id = movie["WikiID"]

            movie_infos = myscraper.get_imdb_infos(movie_id)
            row = [movie_id, movie["Name"], *movie_infos.values()]

            # one-row frame, with None values replaced by the string 'None'
            row_df = pd.DataFrame([row], columns=columns).fillna('None')

            # append immediately so progress survives an interruption
            row_df.to_csv(csv_file, mode='a', header=False, index=False)
    finally:
        myscraper.close()

In [None]:
# Scrape the partition chosen through `index`; rows are appended to that partition's CSV file.
scrape_partition(index)

## Parallelized version

This part shows an implementation of the scraper on multiple threads on a single computer.

In [None]:
import threading

In [None]:
# Split the row positions 0 .. n-1 of `movies` into n_threads contiguous chunks, one per thread.
# NOTE(review): this rebinds the globals `n`, `size`, `indices` and `partitions`.
# scrape_partition reads `partitions` and names its CSV file from the index alone,
# so files started under a different split would be resumed against other row
# ranges. Confirm the two modes are never mixed on the same files.
n = movies.shape[0]
n_threads = 8
size = n // n_threads

indices = list(range(n))
partitions = [indices[k * size:(k + 1) * size] for k in range(n_threads)]
# Positions left over by the integer division are attached to the last chunk.
partitions[-1] = partitions[-1] + indices[n_threads * size:]

In [None]:
# Summarise every thread's partition by the first and last row position it contains.
partition_intervals = []
for i in range(n_threads):
    chunk = partitions[i]
    partition_intervals.append((chunk[0], chunk[-1]))

print(f"number of elements: {n}")
print(f"partitions' size: {size}\n")
for i in range(n_threads):
    print(f"partition {i}: {partition_intervals[i]}")

In [None]:
# One worker thread per partition, each running scrape_partition on its own index.
threads = [
    threading.Thread(target=scrape_partition, args=(i,))
    for i in range(n_threads)
]

# Start every worker before waiting on any of them.
for worker in threads:
    worker.start()

# Block until all workers have finished.
for worker in threads:
    worker.join()

## Merging the scraped partitions

In [None]:
# Load the merged dataset if it has already been built; otherwise build it from
# the per-partition files and cache it at `csv_file`.
csv_file = r"./MovieSummaries/imdb_scraped_dataset.csv"
n_computer  = 6
try:
    partition_dataset = pd.read_csv(csv_file)
except (FileNotFoundError, pd.errors.EmptyDataError):
    # Only a missing or empty cache triggers a rebuild; any other error
    # (including a keyboard interrupt) is no longer swallowed.
    # NOTE(review): partitions are read from 'partitions/', whereas
    # scrape_partition writes under 'MovieSummaries/'. Presumably the files are
    # gathered there by hand; confirm.
    partition_file = r'partitions/imdb_scraped_data_'
    partition_frames = [
        pd.read_csv(partition_file + str(i) + '.csv') for i in range(n_computer)
    ]

    # Concatenate once, fill missing values once and write the result once,
    # instead of repeating all three steps for every partition. The index is
    # renumbered so it matches what re-reading the cached file would give.
    partition_dataset = pd.concat(partition_frames, axis=0, ignore_index=True).fillna('None')
    partition_dataset.to_csv(csv_file, index=False)