# Lecture 8: Assessing Gender Gap Off-Screen in Serial Production (Part 2)
Date: December 1, 2023
Duration: 3 hours

## Outline


### Data scraping, cleaning and preprocessing (1 hour)
- Scraping the new IMDb page
- Handling missing values, outliers

In [None]:
import time
import streamlit as st
import pandas as pd
from PIL import Image
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import NoSuchElementException, TimeoutException
from selenium.webdriver.chrome.options import Options
from bs4 import BeautifulSoup

In [None]:
# Locations of the ChromeDriver executable (needed when it is not on PATH)
# and of the Chrome binary it should launch.
chrome_binary_path = "./chrome-linux64/chrome"
chrome_driver_path = "./chromedriver"

# Browser configuration: the custom Chrome binary plus the command-line
# switches listed below.
chrome_options = Options()
chrome_options.binary_location = chrome_binary_path
for _switch in (
    "--headless",  # Run Chrome in headless mode
    "--no-sandbox",  # Bypass OS security model
    "--disable-dev-shm-usage",  # Overcome limited resource problems
    "--window-size=1920,1080",  # Set window size
    "user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.150 Safari/537.36",
):
    chrome_options.add_argument(_switch)
del _switch  # do not leave the loop variable behind as a module-level name

# Service object wrapping the ChromeDriver executable
service = Service(chrome_driver_path)

# The shared WebDriver instance used by the scraping code
driver = webdriver.Chrome(service=service, options=chrome_options)

In [None]:
class Generator:
    """Iterable wrapper that remembers the return value of a generator.

    Iterating over the wrapper yields every item produced by the wrapped
    generator. Once the wrapped generator is exhausted, the value it
    returned is stored in ``value`` (which is ``None`` until then).
    """

    def __init__(self, gen):
        self.value = None
        self.gen = gen

    def __iter__(self):
        # ``yield from`` evaluates to the wrapped generator's return value.
        returned = yield from self.gen
        self.value = returned

In [None]:
# Helper function to load an image for the sidebar
def load_image(image_path):
    """Open the image stored at ``image_path`` and return it fully loaded.

    ``Image.open`` is lazy: it may only identify the file and defer reading
    the pixel data until the image is first used. Because the file handle is
    closed as soon as the ``with`` block exits, the data is loaded explicitly
    while the file is still open so the returned image stays usable.

    Args:
        image_path: Path of the image file to read.

    Returns:
        The ``PIL.Image.Image`` object with its pixel data already loaded.
    """
    with open(image_path, 'rb') as file:
        img = Image.open(file)
        # Force decoding now; afterwards the image no longer needs the file.
        img.load()
    return img

In [None]:
def scrape_imdb(url):
    """Scrape the IMDb title codes listed on an IMDb search-results page.

    This is a generator. It loads ``url`` in the module-level Selenium
    ``driver``, reads the total number of results from the results counter,
    then clicks the "load more" button repeatedly. After each click it yields
    the fraction of results loaded so far. When loading stops (everything is
    loaded, or an error interrupts the loop) the page is parsed and the
    unique title codes are returned as the generator's return value.

    The browser is always shut down when the generator finishes, fails or is
    closed early, so no Chrome process is left behind on error paths.

    Args:
        url: Address of the IMDb search-results page to scrape.

    Yields:
        float: ``loaded_results / total_results`` after each "load more" click.

    Returns:
        list: The unique IMDb title codes (``tt...``) found in the page links,
        in no particular order.

    Raises:
        SystemExit: If the results counter cannot be found in time.
        ValueError: If the total number of results cannot be parsed.
    """
    import re  # function-scope import: ``re`` is not imported at file level

    counter_locator = (By.CSS_SELECTOR, '.sc-54d06b29-3')
    # Matches the code both in relative ("/title/tt123/") and absolute links.
    title_code_pattern = re.compile(r'title/(tt\d+)')

    try:
        # Load the IMDb page
        driver.get(url)

        # Initialize WebDriverWait
        wait = WebDriverWait(driver, 5)

        # Wait for the initial page to load
        time.sleep(4)

        # Extract the total number of results from the first line of the counter
        try:
            total_results_element = wait.until(EC.presence_of_element_located(counter_locator))
            first_line = total_results_element.text.split('\n')[0]
            # Drop thousands separators before converting to an integer
            total_results_number_str = first_line.split()[-1].replace('.', '').replace(',', '')
        except NoSuchElementException:
            print("Total number of results not found.")
            raise SystemExit
        except TimeoutException:
            print("Timeout reached extracting the total number of results.")
            raise SystemExit

        if not total_results_number_str.isdigit():
            raise ValueError("Unable to extract total number of results as an integer")
        total_results = int(total_results_number_str)

        loaded_results = 0
        # Click the "50 more" button until all results are loaded
        while loaded_results < total_results:
            try:
                load_more_button = wait.until(
                    EC.presence_of_element_located((By.CSS_SELECTOR, '.ipc-see-more__text')))
                driver.execute_script("arguments[0].scrollIntoView(true);", load_more_button)

                # Use JavaScript to click the button
                driver.execute_script("arguments[0].click();", load_more_button)

                time.sleep(3)  # Wait for the page to load more results
                results_text = driver.find_element(*counter_locator).text
                first_line = results_text.split('\n')[0]
                # The upper bound of the displayed range is the number loaded so far
                loaded_results_str = first_line.split('-')[-1].split()[0].replace('.', '').replace(',', '')
                if not loaded_results_str.isdigit():
                    raise ValueError("Unable to extract loaded results number as an integer")
                loaded_results = int(loaded_results_str)
                # Report progress as a fraction of the total
                yield loaded_results / total_results
            except Exception as e:
                # Best effort: stop loading and parse whatever is on the page
                print("Error:", e)
                break

        # Parse the HTML of the fully expanded page
        soup = BeautifulSoup(driver.page_source, 'html.parser')

        # Use a set to store IMDb codes and ensure uniqueness
        imdb_codes_set = set()
        for tag in soup.find_all('a', href=True):
            match = title_code_pattern.search(tag.get('href'))
            if match:
                imdb_codes_set.add(match.group(1))

        return list(imdb_codes_set)
    finally:
        # Close the Selenium browser on success, on error and on early close
        driver.quit()

In [None]:
def main():
    """Build the Streamlit page of the IMDb scraper and run a scrape on demand.

    The sidebar shows the logo and the usage instructions. The main area asks
    for an IMDb URL and, once "Start Scraping" is pressed with a non-empty
    URL, runs ``scrape_imdb`` while updating a progress bar and finally
    offers the scraped codes as a CSV download.
    """
    # Sidebar with logo and instructions
    logo = load_image('logo_IMDb_scraper.webp')
    st.sidebar.image(logo, use_column_width=True)
    st.sidebar.info("""
            Welcome to the IMDb scraping tool. To extract titles and related data from IMDb using an advanced search URL, follow these steps:

            1. Perform an advanced search on IMDb (IMDb Advanced Search) with desired criteria like genre, year, rating, etc.
            2. Copy the URL from the browser's address bar after viewing the results.
            3. Paste the copied URL into the designated field in the web app.
            4. Click the "Start Scraping" button to initiate data extraction from the listed titles.
            5. Upon completion, the results can be downloaded in CSV.

            Note: The search URL can be directly manipulated by using '!' to exclude parameters (e.g., 'genres=!drama') and including categories like 'country_of_origin' or 'primary_language' to personalize search parameters not available in IMDb's standard interface. Respect IMDb's terms of service and legal restrictions on web scraping.
        """)

    # Main app interface: title, URL field and a placeholder for messages
    st.title("IMDb Web Scraper")
    url = st.text_input("Enter the IMDb URL to scrape:")
    status_container = st.empty()

    # Nothing to do until the button is pressed with a URL filled in
    if not st.button("Start Scraping") or not url:
        return

    progress_bar = st.progress(0)
    # The wrapper keeps the generator's return value (the scraped codes)
    scraper = Generator(scrape_imdb(url))
    for fraction_done in scraper:
        progress_bar.progress(fraction_done)

    scraped_data = scraper.value
    if not scraped_data:
        status_container.error("No data scraped.")
        return

    status_container.success("Scraping Completed!")
    results_df = pd.DataFrame({'IMDB_Code': scraped_data})
    csv_bytes = results_df.to_csv(index=False).encode('utf-8')
    st.download_button(
        label="Download Data as CSV",
        data=csv_bytes,
        file_name="scraped_data.csv",
        mime="text/csv",
    )

# Run main() only when this code is executed as a script, not when imported.
if __name__ == "__main__":
    main()

### Data analysis techniques (1 hour)
- Descriptive statistics, hypothesis testing

In [1]:
import pandas as pd
import PySimpleGUI as sg
import dns
import copy
import json
import dask.dataframe as dd
from dask.diagnostics import ProgressBar
# Create a dask ProgressBar and register it so dask computations report progress.
pbar = ProgressBar()
pbar.register()
import pymongo
import multiprocessing
import warnings
# Silence every warning for the rest of the session.
warnings.filterwarnings("ignore")
import imdb
from imdb import IMDb, IMDbError
# Shared IMDb access object used by the functions below
# (ia.get_movie, ia.get_movie_infoset).
ia = IMDb()
# Number of CPUs; used below as the number of dask partitions.
CPU_COUNT = multiprocessing.cpu_count()

In [2]:
def read_table():
    """Ask the user to pick the CSV dataset and return the chosen path.

    Returns:
        The filename selected in the file-chooser popup.

    Raises:
        SystemExit: If no filename is supplied.
    """
    chosen_path = sg.popup_get_file(
        'Dataset to read',
        file_types=(("CSV Files", "*.csv"),),
    )

    if chosen_path:
        return chosen_path

    # Nothing selected: tell the user and stop.
    sg.popup("No filename supplied, exit")
    raise SystemExit("Cancelling: no filename supplied")

In [3]:
def get_infoset():
    """Let the user choose additional IMDb infosets to download.

    A window lists every infoset reported by ``ia.get_movie_infoset()``
    except 'main', 'news' and 'soundtrack'. The selection is returned only
    when the user confirms with SAVE; pressing CANCEL or closing the window
    yields no additional infosets instead of crashing or silently keeping
    the selection. A popup then summarises the infosets that will be used.

    Returns:
        list: The names of the additional infosets chosen (possibly empty).
        'main' is not included; the caller adds it.
    """
    unwanted_infosets = {'main', 'news', 'soundtrack'}
    infolist = [ele for ele in ia.get_movie_infoset() if ele not in unwanted_infosets]

    # Define layout
    layout = [
        [sg.Text('The default infoset is main \n You can add more infosets', size=(30, 2), justification='left')],
        [sg.Listbox(infolist, default_values='', select_mode='extended', key='info', size=(30, 8))],
        [sg.Button('SAVE'), sg.Button('CANCEL')],
    ]

    # Define window and read the values entered by the user
    win = sg.Window('Additional infosets', layout)
    event, values = win.read()
    win.close()

    # ``values`` may be None when the window is closed without pressing a
    # button, so it must not be indexed unconditionally.
    if event == 'SAVE' and values:
        chosen = list(values['info'])
    else:
        chosen = []

    # Display the resulting infosets in a popup ('main' is always included)
    sg.popup('Chosen infosets:', ', '.join(['main', *chosen]))
    return chosen

In [4]:
def get_database():
    """Ask for the MongoDB connection string, database and collection names.

    The input fields carry no explicit keys, so the returned values are
    addressed by auto-numbered keys in the order the fields appear.

    Returns:
        The values object returned by ``window.read()``.
    """
    label_size = (15, 1)

    rows = [[sg.Text('Please enter Database and Collection names')]]
    for caption in ('Connection string', 'Database', 'collection'):
        rows.append([sg.Text(caption, size=label_size), sg.InputText()])
    rows.append([sg.Submit(), sg.Cancel()])

    window = sg.Window('Database entry window', rows)
    _event, values = window.read()
    window.close()
    return values

In [5]:
#reads the list of titles from the file and deletes the first two letters of the code
def get_data(filename):
    """Read the title codes from a CSV file and strip their two-letter prefix.

    Only the first column of the file is read; it is exposed as the ``_id``
    column of the returned DataFrame with the first two characters of every
    value removed.

    Args:
        filename: Path of the CSV file holding the title codes.

    Returns:
        pandas.DataFrame: A single-column (``_id``) frame of stripped codes.

    Raises:
        SystemExit: If the file cannot be read or processed; the original
            error is attached as the cause.
    """
    # NOTE(review): because ``names`` is given, the first row of the file is
    # read as data, not as a header - confirm the input has no header row.
    try:
        titles = pd.read_csv(filename, usecols=[0], names=['_id'])
        titles['_id'] = titles['_id'].str.slice_replace(start=0, stop=2, repl='')
    except Exception as err:
        # ``except Exception`` (not a bare except) so that Ctrl-C and
        # interpreter shutdown are not reported as a bad dataset.
        sg.popup("The dataset is incorrect, exit")
        raise SystemExit("Cancelling: The dataset is incorrect") from err
    return titles

In [6]:
def identify(DataObj):
    """Build the ``{'_id': ...}`` document fragment for an IMDb object.

    The identifier is the object's ID prefixed with 'nm' for a Person, 'tt'
    for a Movie and 'co' for a Company. Any other object gets no prefix.

    Args:
        DataObj: An object exposing ``getID()``.

    Returns:
        dict: A one-entry dictionary ``{'_id': prefix + str(ID)}``.
    """
    prefix_by_class = (
        (imdb.Person.Person, 'nm'),
        (imdb.Movie.Movie, 'tt'),
        (imdb.Company.Company, 'co'),
    )
    # First matching class wins; unknown types fall back to an empty prefix.
    # TODO: add explicit handling for unsupported object types.
    tag = next(
        (prefix for cls, prefix in prefix_by_class if isinstance(DataObj, cls)),
        '',
    )
    return {'_id': tag + str(DataObj.getID())}

In [7]:
def convert(DataObj):
    """Convert an IMDb object (or a plain dict) into a nested plain document.

    Every key of ``DataObj`` is copied into a new dictionary according to the
    exact type of its value:

    * ``dict``: converted recursively.
    * ``list``: the ``_id`` of ``DataObj`` itself is added to the document
      (when ``DataObj`` is an IMDb object), then
        - an empty list is skipped;
        - a list whose first item is a Person/Movie/Company becomes the list
          of the items' ``data`` dictionaries, each extended with its ``_id``;
        - a single non-IMDb item is stored unwrapped;
        - a list of ``str``/``bytes`` is stored as is;
        - any other list is left out of the document.
    * Person/Movie/Company: its ``data`` (extended with its ``_id``) is
      converted recursively.
    * anything else: stored as is.

    Note that the ``data`` dictionaries of nested IMDb objects are updated in
    place with their ``_id``.

    Args:
        DataObj: An IMDb Person/Movie/Company object or a plain ``dict``.

    Returns:
        dict: The converted document.
    """
    document = {}

    classes = (
        imdb.Person.Person,
        imdb.Movie.Movie,
        imdb.Company.Company)

    for key in DataObj.keys():
        value = DataObj[key]
        value_type = type(value)

        if value_type is dict:
            document[key] = convert(value)

        elif value_type is list:
            # Plain dicts reach this point through the recursive calls and
            # have no getID(); their '_id' (if any) is an ordinary key that
            # is copied like every other entry.
            if hasattr(DataObj, 'getID'):
                document.update(identify(DataObj))

            if not value:
                continue

            sample_type = type(value[0])

            if sample_type in classes:
                entries = [item.data for item in value]
                # Pair each object with its own data dict directly instead of
                # searching the list by equality for every element.
                for item, data in zip(value, entries):
                    data.update(identify(item))
                document[key] = entries

            elif len(value) == 1:
                # A single plain value is stored without the enclosing list.
                document[key] = value[0]

            elif sample_type in (str, bytes):
                document[key] = value

            # Lists of any other kind are intentionally not copied.

        elif value_type in classes:
            value.data.update(identify(value))
            document[key] = convert(value.data)

        else:
            document[key] = value

    return document

In [8]:
def append_error_message(error_message):
    """Add ``error_message`` as a new line at the end of ``errors.txt``.

    The file is created when missing. A newline is written before the
    message only when the file already has content, so the file never starts
    with an empty line.
    """
    # 'a+' allows reading the file while every write still goes to the end.
    with open('errors.txt', "a+") as log_file:
        # Rewind to check whether anything has been written before.
        log_file.seek(0)
        if log_file.read(100):
            log_file.write("\n")
        log_file.write(error_message)

In [9]:
# Downloads the IMDb record for the given title identifier (title)
# and returns it serialized as a JSON string (None if IMDb reports an error)
def get_main(title, infoset):
    """Download one title from IMDb and return it as a JSON string.

    Args:
        title: The title identifier passed to ``ia.get_movie``.
        infoset: The infosets to request (passed as ``info``).

    Returns:
        str | None: The converted record serialized with ``json.dumps``, or
        ``None`` when IMDb raises an ``IMDbError`` (the error text is then
        appended to the error log).
    """
    # NOTE: an earlier revision retried without the 'episodes' infoset when a
    # KeyError was raised; that fallback is currently disabled.
    try:
        mv = ia.get_movie(title, info=infoset)
    except IMDbError as err:
        append_error_message(str(err))
        return None

    return json.dumps(convert(mv))

In [10]:
# Applies get_main to each title identifier contained in the dataframe.
# It works in parallel by taking advantage of the available cores.
def dask_impl(df, infoset):
    """Run ``get_main`` on every row of ``df`` through dask.

    The frame is split into ``CPU_COUNT`` partitions and ``get_main`` is
    applied row by row to the ``_id`` column.

    Args:
        df: pandas DataFrame with an ``_id`` column of title identifiers.
        infoset: The infosets forwarded to ``get_main``.

    Returns:
        The computed result of the row-wise apply.
    """
    def fetch(row):
        return get_main(row._id, infoset)

    partitioned = dd.from_pandas(df, npartitions=CPU_COUNT)
    # NOTE(review): meta is the bare type ``int`` although get_main returns a
    # JSON string or None - confirm this is the intended metadata.
    pending = partitioned.apply(fetch, axis=1, meta=int)
    return pending.compute()

In [11]:
#Non-parallel function alternative to the previous one
def apply_impl(df, infoset):
    """Run ``get_main`` on every row of ``df`` sequentially with pandas.

    Args:
        df: pandas DataFrame with an ``_id`` column of title identifiers.
        infoset: The infosets forwarded to ``get_main``.

    Returns:
        The result of the row-wise ``DataFrame.apply``.
    """
    def fetch(row):
        return get_main(row._id, infoset)

    return df.apply(fetch, axis=1)

In [12]:
def connect(values, coll=None):
    """Open a MongoDB connection and return the requested collection.

    Args:
        values: Indexable holding the connection string at ``0``, the
            database name at ``1`` and the collection name at ``2``.
        coll: Unused.

    Returns:
        The collection object ``client[database][collection]``.
    """
    mongo_client = pymongo.MongoClient(values[0])
    database_name, collection_name = str(values[1]), str(values[2])
    return mongo_client[database_name][collection_name]

In [13]:
def to_mongo(mov, values):
    """Insert one JSON-encoded document into the configured collection.

    Args:
        mov: The document as a JSON string.
        values: Connection settings forwarded to ``connect``.

    Raises:
        Whatever ``json.loads`` or ``insert_one`` raise; the connection is
        closed in either case.
    """
    collection = connect(values)
    try:
        collection.insert_one(json.loads(mov))
    finally:
        # connect() opens a new MongoClient on every call; close it so one
        # connection pool is not leaked per inserted document.
        collection.database.client.close()

In [14]:
def app(df, values):
    """Store every element of ``df`` in MongoDB through dask.

    ``df`` is split into ``CPU_COUNT`` partitions and ``to_mongo`` is applied
    to each element.

    Args:
        df: pandas object holding the JSON documents to insert.
        values: Connection settings forwarded to ``to_mongo``.

    Raises:
        SystemExit: If anything fails while storing the documents; the
            original error is attached as the cause.
    """
    try:
        partitioned = dd.from_pandas(df, npartitions=CPU_COUNT)
        partitioned.apply(to_mongo, args=(values,), meta=(int)).compute()
    except Exception as err:
        # ``except Exception`` (not a bare except) so that Ctrl-C and
        # interpreter shutdown are not reported as a connection problem.
        sg.popup("Something wrong with the connection, exit")
        raise SystemExit("Cancelling: Something wrong with the connection") from err

In [15]:
def main():
    """Run the whole download-and-store workflow through the GUI.

    Steps: choose the dataset, read the title codes, ask for the MongoDB
    settings and the extra infosets, download every title while a progress
    window is shown, drop the titles that could not be downloaded and store
    the rest in MongoDB.
    """
    sg.theme('Material1')      # Add some color to the window

    titles = get_data(read_table())
    values = get_database()

    # 'main' always comes first, followed by the user's additional choices
    infoset = get_infoset()
    infoset.insert(0, 'main')

    progress_layout = [
        [sg.Text('Below you can see the download progress:')],
        [sg.Output(size=(60, 3), key='-OUTPUT-')],
    ]
    window = sg.Window('Window Title', progress_layout, finalize=True)

    documents = dask_impl(titles, infoset)
    window.close()

    # Titles whose download failed are returned as None; leave them out
    documents.dropna(inplace=True)
    app(documents, values)
    sg.popup("Operation completed successfully")

In [18]:
# Start the download-and-store workflow defined in the cell above.
main()

### Data visualization (30 minutes)
- Tools and techniques

In [None]:
def aggregation(field):
    """Flatten one crew field of the titles collection into the crew collection.

    For every document where ``field`` exists and is not an empty list, the
    pipeline keeps ``field`` and ``year``, unwinds ``field`` into one document
    per entry, adds ``code`` (the entry's ``_id``), ``role`` (the field name)
    and ``year`` to each entry, promotes the entry to the document root,
    drops its ``_id`` and merges the result into the crew collection.

    The connection string, database, collection and crew-collection names are
    placeholders that must be filled in before running. The client is closed
    when the aggregation finishes, so repeated calls do not leak connections.

    Args:
        field: Name of the document field to flatten (e.g. 'director').
    """
    name = f'{field}'
    path = f'${field}'

    pipeline = [
        {'$match': {name: {'$exists': True, '$ne': []}}},
        {'$project': {'_id': 0, name: 1, 'year': 1}},
        {'$unwind': {'path': path}},
        {'$addFields': {
            f"{name}.code": f'{path}._id',
            f"{name}.role": name,
            f"{name}.year": '$year',
        }},
        {'$replaceRoot': {'newRoot': path}},
        {'$project': {'_id': 0}},
        # Put here the name of the crew collection that will be saved to MongoDB
        {'$merge': {'into': '<Crew Collection Name>'}},
    ]

    # Put here the MongoDB connection string with your own user name and password
    with pymongo.MongoClient('<Atlas connection string>') as client:
        # Put here the database and collection names generated by IMDb2Mongo
        client['<Database>']['<Collection>'].aggregate(pipeline)

In [None]:
def main():
    """Run ``aggregation`` once for every crew field listed below."""
    # The field names should be reviewed and any new ones added.
    fields = (
        'art department', 'art direction', 'assistant director',
        'camera and electrical department', 'cast', 'casting department',
        'casting director', 'cinematographer', 'composer',
        'costume department', 'costume designer', 'creator', 'director',
        'editor', 'editorial department', 'location management', 'make up',
        'miscellaneous crew', 'music department', 'producer',
        'production design', 'production manager', 'script department',
        'set decoration', 'sound crew', 'special effects', 'stunt performer',
        'visual effects', 'writer',
    )
    for crew_field in fields:
        aggregation(crew_field)

In [None]:
# Run the aggregation for every crew field.
main()

## Genderize dataset

In [None]:
# Load the crew table and the gender table from their CSV files.
# NOTE(review): the gender file presumably maps names to a gender - confirm its columns.
crew = pd.read_csv("Crew_significant03112022.csv", sep=',')
gender = pd.read_csv("names_gendered_rev_ita.14.09.2022.csv", sep=',')

In [None]:
# Rename the 'code' column to 'nconst', in place.
crew.rename(columns = {'code':'nconst'}, inplace = True)

In [None]:
# Left join: every crew row is kept. No `on=` is given, so pandas joins on
# all the columns the two tables have in common.
crew_gender = crew.merge(gender, how='left')

In [None]:
# Preview the first rows of the merged table.
crew_gender.head()

In [None]:
# Remove the 'name' column from the merged table, in place.
crew_gender.drop('name', axis=1, inplace=True)

In [None]:
# Preview the first rows again after dropping the column.
crew_gender.head()

In [None]:
# Save the merged table to CSV without the index column.
crew_gender.to_csv(r"crew_significant_gender.03.11.2022.csv", index=False)

### Q&A and discussion (30 minutes)