In [6]:
import pandas as pd
import numpy as np
import os

# Importing the datasets

In [7]:
# Folder holding the raw movie-summaries files read by the loading cell
# (plots, movie/character metadata, name and trope clusters).
# Both paths keep a trailing '/' because file names are appended by string concatenation.
DATA_FOLDER = 'data/MovieSummaries/'
# Folder where the scraped IMDB partition files and the merged dataset are stored.
ADDITIONAL_FOLDER = 'data/AdditionalData/'

In [8]:
def read_txt(path):
    """
    Load a tab-separated text file that has no header row into a DataFrame.

    Input:
    path : location of the file, forwarded unchanged to `pd.read_csv`.

    Output:
    The parsed DataFrame; since no header row is read, its columns carry
    the default integer labels.
    """
    return pd.read_csv(path, header=None, sep='\t')

In [9]:
# load the five raw tables; every file is tab-separated and has no header row,
# which is exactly what the `read_txt` helper handles
plots      = read_txt(DATA_FOLDER + 'plot_summaries.txt')
movies     = read_txt(DATA_FOLDER + 'movie.metadata.tsv')
characters = read_txt(DATA_FOLDER + 'character.metadata.tsv')
names      = read_txt(DATA_FOLDER + 'name.clusters.txt')
tvtropes   = read_txt(DATA_FOLDER + 'tvtropes.clusters.txt')

In [10]:
# label the two columns of the plot summaries table
plots.columns = [
    'wikipedia_movie_id',
    'plot',
]

`movies` data

1. Wikipedia movie ID
2. Freebase movie ID
3. Movie name
4. Movie release date
5. Movie box office revenue
6. Movie runtime
7. Movie languages (Freebase ID:name tuples)
8. Movie countries (Freebase ID:name tuples)
9. Movie genres (Freebase ID:name tuples)

In [11]:
# label the nine columns of the movie metadata table, in file order
movies.columns = [
    'wikipedia_movie_id',
    'freebase_movie_id',
    'name',
    'release_date',
    'revenue',
    'runtime',
    'languages',
    'countries',
    'genres',
]

`characters` data:
1. Wikipedia movie ID
2. Freebase movie ID
3. Movie release date
4. Character name
5. Actor date of birth
6. Actor gender
7. Actor height (in meters)
8. Actor ethnicity (Freebase ID)
9. Actor name
10. Actor age at movie release
11. Freebase character/actor map ID
12. Freebase character ID
13. Freebase actor ID

In [12]:
# label the thirteen columns of the character metadata table, in file order
# NOTE(review): 'freebase_caracter_id' is spelled without the 'h'; it is kept
# as-is because other cells may select the column by this exact name.
characters.columns = [
    'wikipedia_movie_id',
    'freebase_movie_id',
    'release_date',
    'character_name',
    'date_of_birth',
    'gender',
    'height',
    'ethnicity',
    'name',
    'age_at_release',
    'freebase_character_map_id',
    'freebase_caracter_id',
    'freebase_actor_id',
]

# Scraping

## Scraping IMDB

In [13]:
import imdb_scraper as imdb

### Example on how to use the scraper

<b><span style="color:red">Don't close the browser when it is in use </span></b>

In [14]:
# initialize the scraper object (it drives a browser that must stay open while in use)
myscraper = imdb.ImdbScraper()

try:
    # select a movie wikipedia id (row position 74623 is simply the example used here)
    movie_id = movies.iloc[74623]["wikipedia_movie_id"]

    # scrape the movie infos
    scraped_data = myscraper.get_imdb_infos(movie_id)
finally:
    # close the browser even if the lookup or the scraping raised,
    # so a failed run does not leave a browser behind
    myscraper.close()

scraped_data

{'global_revenue': None,
 'budget': 2846000.0,
 'gross_domestic': None,
 'opening_weekend': None,
 'rating_score': 6.7,
 'number_of_ratings': 1700.0,
 'watched_rank': None,
 'producer': 'Robert Wise',
 'release_year': None}

### Let the scraping begin!

Compute `n_computer` partitions so that the IMDb scraping can be split across multiple computers.

In [15]:
# number of machines sharing the work, number of movies, and rows per machine
n_computer = 6
n = len(movies)
size = n // n_computer

# split the row positions 0..n-1 into n_computer contiguous chunks of `size`
# positions; the last chunk runs to the end so it also takes the leftover
# n % n_computer positions
indices = list(range(n))
partitions = [
    indices[k * size : (n if k == n_computer - 1 else (k + 1) * size)]
    for k in range(n_computer)
]

In [16]:
# (first, last) row position of every partition, then a short printed summary
partition_intervals = [(partitions[k][0], partitions[k][-1]) for k in range(n_computer)]

print(f"number of elements: {n}")
print(f"partitions' size: {size}\n")
for part_id, bounds in enumerate(partition_intervals):
    print(f"partition {part_id}: {bounds}")

number of elements: 81741
partitions' size: 13623

partition 0: (0, 13622)
partition 1: (13623, 27245)
partition 2: (27246, 40868)
partition 3: (40869, 54491)
partition 4: (54492, 68114)
partition 5: (68115, 81740)


Each user should set their own index below to select the partition assigned to them.

In [17]:
"""
Select your index here:
    - Anthony   0 & 1
    - Anton     2
    - Aymeric   3
    - Eric      4
    - Yara      5
"""
# position in `partitions` of the chunk to scrape on this machine;
# this value is what gets passed to scrape_partition
index = 0

Since IMDb has protection against bots, using `requests` yields a `forbidden` error. Using `selenium` to simulate a real human operator avoids this problem and makes scraping possible, though more slowly.

In [18]:
def scrape_partition(index):
    """
    Scrape the IMDB data for the movies in the partition with the given index.
    The scraped dataset is saved in a separate csv file for each partition
    (ADDITIONAL_FOLDER + 'imdb_partitions/imdb_scraped_data_<index>.csv').

    Rows are appended to the file one movie at a time. When the file already
    exists, the number of rows it contains is used as the offset to resume
    from, so an interrupted run can be continued by calling the function
    again with the same index.

    Input:
    index(int) : The index of the partition to scrape (position in the
                 global `partitions` list).

    Output:
    None
    """
    # exist_ok=True tolerates an already existing folder (also when several
    # threads reach this line together) without hiding any other OS error
    os.makedirs(ADDITIONAL_FOLDER + 'imdb_partitions', exist_ok=True)

    # create a header dataframe: it fixes the column layout of the csv file
    header = pd.DataFrame(columns=["wikipedia_movie_id", "name", "global_revenue", "budget", "gross_domestic", "opening_weekend",
                                   "rating_score", "number_of_ratings", "watched_rank", "producer", "release_year"])

    # csv file name
    csv_file = ADDITIONAL_FOLDER + 'imdb_partitions/imdb_scraped_data_' + str(index) + '.csv'

    # if the csv_file doesn't exist, create it and write the header;
    # otherwise resume after the rows that were already saved
    if not os.path.isfile(csv_file):
        header.to_csv(csv_file, index=False)
        starting_index = 0
    else:
        starting_index = pd.read_csv(csv_file).shape[0]

    index_range = partitions[index][starting_index:]
    if not index_range:
        # nothing left to scrape for this partition: do not open a browser at all
        return

    # initialize the scraper object only once we know there is work to do
    myscraper = imdb.ImdbScraper()
    try:
        for i in index_range:
            movie_id = movies.iloc[i]["wikipedia_movie_id"]

            movie_infos = myscraper.get_imdb_infos(movie_id)
            # the scraped values are taken in the dict's own order and must
            # line up with the columns that follow "name" in the header
            row = [movie_id, movies["name"].values[i], *movie_infos.values()]

            # create a new DataFrame for the row
            row_df = pd.DataFrame([row], columns=header.columns)

            # replace None values with the string 'None'
            row_df = row_df.fillna('None')

            # append the row DataFrame to the CSV file right away, so progress
            # survives an interruption
            row_df.to_csv(csv_file, mode='a', header=False, index=False)
    finally:
        # always release the browser, even when scraping a movie raised
        myscraper.close()

In [19]:
# run the scraper on the partition selected by `index`;
# running it again continues after the rows already saved in that partition's csv file
scrape_partition(index)

## Multi-threaded version

This part shows an implementation of the scraper running in parallel on multiple threads on a single computer. The number of threads `n_threads` can be increased if the user's computer has access to more threads.

In [20]:
import threading

In [21]:
# number of worker threads, number of movies, and rows per thread
n_threads = 6
n = movies.shape[0]
size = n // n_threads

# create a uniform partition of indices from 0 to n-1 for n_threads threads
indices = list(range(n))
partitions = [indices[k * size:(k + 1) * size] for k in range(n_threads)]
# the leftover n % n_threads indices go to the last partition
partitions[-1].extend(indices[n_threads * size:])

In [22]:
# (first, last) row position of every thread's partition, then a short printed summary
partition_intervals = [(partitions[k][0], partitions[k][-1]) for k in range(n_threads)]

print(f"number of elements: {n}")
print(f"partitions' size: {size}\n")
for part_id, bounds in enumerate(partition_intervals):
    print(f"partition {part_id}: {bounds}")

number of elements: 81741
partitions' size: 13623

partition 0: (0, 13622)
partition 1: (13623, 27245)
partition 2: (27246, 40868)
partition 3: (40869, 54491)
partition 4: (54492, 68114)
partition 5: (68115, 81740)


In [23]:
# one worker thread per partition, each running scrape_partition on its own index
threads = [
    threading.Thread(target=scrape_partition, args=(part_id,))
    for part_id in range(n_threads)
]

# launch every worker first ...
for thread in threads:
    thread.start()

# ... then wait until all of them have finished
for thread in threads:
    thread.join()

## Merging the scraped partitions

In [24]:
# Merged dataset file. ADDITIONAL_FOLDER already ends with '/', so no extra
# separator is added in front of the file name.
csv_file = ADDITIONAL_FOLDER + "imdb_scraped_dataset.csv"
n_computer  = 6
try:
    # reuse the merged file when it has already been built
    # NOTE(review): in this branch the data is returned exactly as read; missing
    # values are replaced by 'None' only in the rebuild branch below - confirm
    # that later cells handle both cases.
    partition_dataset = pd.read_csv(csv_file)
except FileNotFoundError:
    # only a missing merged file triggers a rebuild; any other read error
    # (corrupted file, interruption, ...) is left to surface
    partition_file = ADDITIONAL_FOLDER + 'imdb_partitions/imdb_scraped_data_'

    # read every partition, stack them once, then fill the missing values
    partition_dataset = pd.concat(
        [pd.read_csv(partition_file + str(i) + '.csv') for i in range(n_computer)],
        axis=0,
    ).fillna('None')

    # write the merged file a single time, after all partitions are combined
    partition_dataset.to_csv(csv_file, index=False)