## Web scraping to enrich our CSV ##

In [None]:
import pandas as pd  
import numpy as np
import re
import pickle
from tqdm import tqdm


from selenium import webdriver     # I import everything I need to do web scrapping with selenium
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.chrome.options import Options
from selenium.common.exceptions import TimeoutException



# Ignoring the warnings.
import warnings
warnings.filterwarnings('ignore')

In [None]:
# I activate all these options for doing the web scrapping

opciones= Options()
opciones.add_experimental_option('excludeSwitches', ['enable-automation'])
#para ocultarme como robot
opciones.add_experimental_option('useAutomationExtension', False)
opciones.add_argument('--start-maximized') 
opciones.add_argument('user.data-dir=selenium')
opciones.add_argument('--incognito

In [194]:
movies = pd.read_csv('/Users/davidusuariocasa/Desktop/Projects/ETL_project/CSVs/Oscars_cleaned.csv') # I open the csv with the CSV I want to iterate from 

## Loop to get each film's 'score' and 'duration' from IMDB ##


In [279]:
driver = webdriver.Firefox()

driver.get("https://www.imdb.com/") # This is the web page we get the info from
driver.implicitly_wait(2)

for index, (value, year) in enumerate(zip(movies['film'], movies['year_film'])): # We iterate through each film (row), and we want to write its title and year
    try:
        
        driver.find_element("css selector", "#suggestion-search").send_keys(f'{value} {str(year)}') # In IMDB's search box, we write each film and its year
        time.sleep(2)

        try:
            
            driver.find_element("css selector", "#react-autowhatever-navSuggestionSearch--item-0 > a:nth-child(1) > div:nth-child(2)").click() # Try to click on the first option that our seach shows
        except:
            movies.loc[index, 'score'] = np.nan # If our movie doesn't have an IMDB profile, we fill our column 'score' and 'duration' with null values, and clear our search box
            movies.loc[index, 'duration'] = np.nan
            driver.find_element("css selector", "#suggestion-search").clear()

            continue  # Continue to the next iteration

        try:
            duration = driver.find_element('xpath', '/html/body/div[2]/main/div/section[1]/section/div[3]/section/section/div[2]/div[1]/ul/li[3]').text
            movies.loc[index, 'duration'] = duration # If the film has a duration written, take it and assign it to our 'duration' column
        except:
            movies.loc[index, 'duration'] = np.nan # If not, assign a null value to the 'duration' column
            continue  # Continue to the next iteration

        try:
            score = driver.find_element('xpath', '/html/body/div[2]/main/div/section[1]/section/div[3]/section/section/div[2]/div[2]/div/div[1]/a/span/div/div[2]/div[1]').text
            movies.loc[index, 'score'] = score # If the film has a score written, take it and assign it to our 'score' column
        except:
            movies.loc[index, 'score'] = np.nan # If not, assign a null value to the 'score' column

    except TimeoutException:
        continue  # Go to the next iteration, if it shows an error of Timeout Exception 

driver.quit() # At the end of the for loop, close the Firefox navegator 



## First iteration ##

In [212]:
'''First time we iterated through our 'movie' df, an error appeared at index 260, so we take the first 260 
indexes (because they took the data that we wanted), and store them in a new variable called 'movies_sub1''''

movies_sub1 = movies.iloc[:260] # This is our first subdf, with the data we wanted before the error


## Second iteration ## 

In [221]:
movies_sub2 = movies.iloc[260:] # Our second subdf is composed by the unscrapped rows that didn't work on our first time

In [222]:
movies_sub2 = movies_sub2.reset_index() # We reset the indexes and drop the column 'index'
movies_sub2 = movies_sub2.drop('index', axis=1)
movies_sub2

Unnamed: 0,year_film,year_ceremony,ceremony,category,name,film,duration,score
0,1943,1944,16,SOUND RECORDING,"RKO Radio Studio Sound Department, Stephen Dun...",This Land Is Mine,,
1,1943,1944,16,SPECIAL EFFECTS,Photographic Effects by Fred Sersen; Sound Eff...,Crash Dive,,
2,1943,1944,16,WRITING (Original Motion Picture Story),William Saroyan,The Human Comedy,,
3,1943,1944,16,WRITING (Original Screenplay),Norman Krasna,Princess O'Rourke,,
4,1943,1944,16,WRITING (Screenplay),"Julius J. Epstein, Philip G. Epstein, Howard Koch",Casablanca,,
...,...,...,...,...,...,...,...,...
1884,2022,2023,95,SOUND,"Mark Weingarten, James H. Mather, Al Nelson, C...",Top Gun: Maverick,,
1885,2022,2023,95,VISUAL EFFECTS,"Joe Letteri, Richard Baneham, Eric Saindon and...",Avatar: The Way of Water,,
1886,2022,2023,95,WRITING (Adapted Screenplay),Screenplay by Sarah Polley,Women Talking,,
1887,2022,2023,95,WRITING (Original Screenplay),Written by Daniel Kwan & Daniel Scheinert,Everything Everywhere All at Once,,


In [None]:
driver = webdriver.Firefox() 

# Now, we repeat the same iteration as before, but with our subdataframe number 2

driver.get("https://www.imdb.com/") # This is the web page we get the info from
driver.implicitly_wait(2)

for index, (value, year) in enumerate(zip(movies_sub2['film'], movies_sub2['year_film'])): # We iterate through each film (row), and we want to write its title and year
    try:
        
        driver.find_element("css selector", "#suggestion-search").send_keys(f'{value} {str(year)}') # In IMDB's search box, we write each film and its year
        time.sleep(2)

        try:
            
            driver.find_element("css selector", "#react-autowhatever-navSuggestionSearch--item-0 > a:nth-child(1) > div:nth-child(2)").click() # Try to click on the first option that our seach shows
        except:
            movies_sub2.loc[index, 'score'] = np.nan # If our movie doesn't have an IMDB profile, we fill our column 'score' and 'duration' with null values, and clear our search box
            movies_sub2.loc[index, 'duration'] = np.nan
            driver.find_element("css selector", "#suggestion-search").clear()

            continue  # Continue to the next iteration

        try:
            duration = driver.find_element('xpath', '/html/body/div[2]/main/div/section[1]/section/div[3]/section/section/div[2]/div[1]/ul/li[3]').text
            movies_sub2.loc[index, 'duration'] = duration # If the film has a duration written, take it and assign it to our 'duration' column
        except:
            movies_sub2.loc[index, 'duration'] = np.nan # If not, assign a null value to the 'duration' column
            continue  # Continue to the next iteration

        try:
            score = driver.find_element('xpath', '/html/body/div[2]/main/div/section[1]/section/div[3]/section/section/div[2]/div[2]/div/div[1]/a/span/div/div[2]/div[1]').text
            movies_sub2.loc[index, 'score'] = score # If the film has a score written, take it and assign it to our 'score' column
        except:
            movies_sub2.loc[index, 'score'] = np.nan # If not, assign a null value to the 'score' column

    except TimeoutException:
        continue  # Go to the next iteration, if it shows an error of Timeout Exception 

driver.quit() # At the end of the for loop, close the Firefox navegator 



In [234]:
movies_sub2 = movies_sub2.iloc[:419]

'''It worked until our index 419 from movies_sub2, so we rewrite the variable
with the rows that worked. This is our second subdataframe'''

## Third iteration ##

In [262]:
movies_sub3 = movies.iloc[679:]
movies_sub3 = movies_sub3.reset_index()
movies_sub3 = movies_sub3.drop('index', axis=1)
movies_sub3

'''Same as before. Our third dataframe is composed by the unscrapped rows that didn't work in our second iteration.
We reset the indexes and drop the 'index' column'''

Unnamed: 0,year_film,year_ceremony,ceremony,category,name,film,duration,score
0,1960,1961,33,MUSIC (Scoring of a Musical Picture),"Morris Stoloff, Harry Sukman",Song without End (The Story of Franz Liszt),,
1,1960,1961,33,MUSIC (Song),Music and Lyrics by Manos Hadjidakis,Never on Sunday,,
2,1960,1961,33,BEST MOTION PICTURE,"Billy Wilder, Producer",The Apartment,,
3,1960,1961,33,SHORT SUBJECT (Cartoon),"William L. Snyder, Producer",Munro,,
4,1960,1961,33,SHORT SUBJECT (Live Action),"Ezra R. Baker, Producer",Day of the Painter,,
...,...,...,...,...,...,...,...,...
1465,2022,2023,95,SOUND,"Mark Weingarten, James H. Mather, Al Nelson, C...",Top Gun: Maverick,,
1466,2022,2023,95,VISUAL EFFECTS,"Joe Letteri, Richard Baneham, Eric Saindon and...",Avatar: The Way of Water,,
1467,2022,2023,95,WRITING (Adapted Screenplay),Screenplay by Sarah Polley,Women Talking,,
1468,2022,2023,95,WRITING (Original Screenplay),Written by Daniel Kwan & Daniel Scheinert,Everything Everywhere All at Once,,


In [None]:
driver = webdriver.Firefox() 

# Now, we repeat the same iteration as before, but with our subdataframe number 3

driver.get("https://www.imdb.com/") # This is the web page we get the info from
driver.implicitly_wait(2)

for index, (value, year) in enumerate(zip(movies_sub3['film'], movies_sub3['year_film'])): # We iterate through each film (row), and we want to write its title and year
    try:
        
        driver.find_element("css selector", "#suggestion-search").send_keys(f'{value} {str(year)}') # In IMDB's search box, we write each film and its year
        time.sleep(2)

        try:
            
            driver.find_element("css selector", "#react-autowhatever-navSuggestionSearch--item-0 > a:nth-child(1) > div:nth-child(2)").click() # Try to click on the first option that our seach shows
        except:
            movies_sub3.loc[index, 'score'] = np.nan # If our movie doesn't have an IMDB profile, we fill our column 'score' and 'duration' with null values, and clear our search box
            movies_sub3.loc[index, 'duration'] = np.nan
            driver.find_element("css selector", "#suggestion-search").clear()

            continue  # Continue to the next iteration

        try:
            duration = driver.find_element('xpath', '/html/body/div[2]/main/div/section[1]/section/div[3]/section/section/div[2]/div[1]/ul/li[3]').text
            movies_sub3.loc[index, 'duration'] = duration # If the film has a duration written, take it and assign it to our 'duration' column
        except:
            movies_sub3.loc[index, 'duration'] = np.nan # If not, assign a null value to the 'duration' column
            continue  # Continue to the next iteration

        try:
            score = driver.find_element('xpath', '/html/body/div[2]/main/div/section[1]/section/div[3]/section/section/div[2]/div[2]/div/div[1]/a/span/div/div[2]/div[1]').text
            movies_sub3.loc[index, 'score'] = score # If the film has a score written, take it and assign it to our 'score' column
        except:
            movies_sub3.loc[index, 'score'] = np.nan # If not, assign a null value to the 'score' column

    except TimeoutException:
        continue  # Go to the next iteration, if it shows an error of Timeout Exception 

driver.quit() # At the end of the for loop, close the Firefox navegator 



In [273]:
movies_sub3 = movies_sub3.iloc[:719] 

'''Our iteration stopped working at index 719, so we rewrite movies_sub3 
with the rows that worked and store it. This is our third subdataframe'''

In [295]:
movies_sub3 # This is our third subdataframe

Unnamed: 0,year_film,year_ceremony,ceremony,category,name,film,duration,score
0,1960,1961,33,MUSIC (Scoring of a Musical Picture),"Morris Stoloff, Harry Sukman",Song without End (The Story of Franz Liszt),,
1,1960,1961,33,MUSIC (Song),Music and Lyrics by Manos Hadjidakis,Never on Sunday,1h 31min,"7,3\n/10"
2,1960,1961,33,BEST MOTION PICTURE,"Billy Wilder, Producer",The Apartment,2h 5min,"8,3\n/10"
3,1960,1961,33,SHORT SUBJECT (Cartoon),"William L. Snyder, Producer",Munro,1h 37min,"5,5\n/10"
4,1960,1961,33,SHORT SUBJECT (Live Action),"Ezra R. Baker, Producer",Day of the Painter,,
...,...,...,...,...,...,...,...,...
714,1991,1992,64,COSTUME DESIGN,Albert Wolsky,Bugsy,2h 16min,"6,8\n/10"
715,1991,1992,64,DIRECTING,Jonathan Demme,The Silence of the Lambs,1h 58min,"8,6\n/10"
716,1991,1992,64,DOCUMENTARY (Feature),"Allie Light and Irving Saraf, Producers",In the Shadow of the Stars,1h 33min,"6,5\n/10"
717,1991,1992,64,DOCUMENTARY (Short Subject),"Debra Chasnoff, Producer","Deadly Deception: General Electric, Nuclear We...",,


## Fourth iteration ## 

In [298]:
movies_sub4 = movies.iloc[1398:]
movies_sub4 = movies_sub4.reset_index()
movies_sub4 = movies_sub4.drop('index', axis=1)
movies_sub4

'''We take the unscrapped rows that didn't work on the third iteration, and store it in our fourth subdataframe
We reset indexes and drop the 'index' column'''

"We take the unscrapped rows that didn't work on the third iteration, and store it in our fourth subdataframe\nWe reset indexes and drop the 'index' column"

In [None]:
driver = webdriver.Firefox()

# Now, we repeat the same iteration as before, but with our subdataframe number 4


driver.get("https://www.imdb.com/") # This is the web page we get the info from
driver.implicitly_wait(2)

for index, (value, year) in enumerate(zip(movies_sub4['film'], movies_sub4['year_film'])): # We iterate through each film (row), and we want to write its title and year
    try:
        
        driver.find_element("css selector", "#suggestion-search").send_keys(f'{value} {str(year)}') # In IMDB's search box, we write each film and its year
        time.sleep(2)

        try:
            
            driver.find_element("css selector", "#react-autowhatever-navSuggestionSearch--item-0 > a:nth-child(1) > div:nth-child(2)").click() # Try to click on the first option that our seach shows
        except:
            movies_sub4.loc[index, 'score'] = np.nan # If our movie doesn't have an IMDB profile, we fill our column 'score' and 'duration' with null values, and clear our search box
            movies_sub4.loc[index, 'duration'] = np.nan
            driver.find_element("css selector", "#suggestion-search").clear()

            continue  # Continue to the next iteration

        try:
            duration = driver.find_element('xpath', '/html/body/div[2]/main/div/section[1]/section/div[3]/section/section/div[2]/div[1]/ul/li[3]').text
            movies_sub4.loc[index, 'duration'] = duration # If the film has a duration written, take it and assign it to our 'duration' column
        except:
            movies_sub4.loc[index, 'duration'] = np.nan # If not, assign a null value to the 'duration' column
            continue  # Continue to the next iteration

        try:
            score = driver.find_element('xpath', '/html/body/div[2]/main/div/section[1]/section/div[3]/section/section/div[2]/div[2]/div/div[1]/a/span/div/div[2]/div[1]').text
            movies_sub4.loc[index, 'score'] = score # If the film has a score written, take it and assign it to our 'score' column
        except:
            movies_sub4.loc[index, 'score'] = np.nan # If not, assign a null value to the 'score' column

    except TimeoutException:
        continue  # Go to the next iteration, if it shows an error of Timeout Exception 

driver.quit() # At the end of the for loop, close the Firefox navegator 



In [305]:
movies_sub4 = movies_sub4[:748] 
movies_sub4

'''Our iteration stopped working at index 748, so we rewrite movies_sub4 
with the rows that worked and store it. This is our fourth subdataframe'''

Unnamed: 0,year_film,year_ceremony,ceremony,category,name,film,duration,score
0,1927,1928,1,ACTOR,Emil Jannings,The Last Command,1h 28min,"8,0\n/10"
1,1927,1928,1,ACTRESS,Janet Gaynor,7th Heaven,,
2,1927,1928,1,ART DIRECTION,William Cameron Menzies,The Dove;,1h 30min,"6,5\n/10"
3,1927,1928,1,CINEMATOGRAPHY,Charles Rosher,Sunrise,1h 34min,"8,1\n/10"
4,1927,1928,1,CINEMATOGRAPHY,Karl Struss,Sunrise,1h 34min,"8,1\n/10"
...,...,...,...,...,...,...,...,...
2141,2022,2023,95,PRODUCTION DESIGN,Production Design: Christian M. Goldbeck; Set ...,All Quiet on the Western Front,2h 28min,"7,8\n/10"
2142,2022,2023,95,SHORT FILM (Animated),Charlie Mackesy and Matthew Freud,"The Boy, the Mole, the Fox and the Horse",32min,"7,8\n/10"
2143,2022,2023,95,SHORT FILM (Live Action),Tom Berkeley and Ross White,An Irish Goodbye,,
2144,2022,2023,95,SOUND,"Mark Weingarten, James H. Mather, Al Nelson, C...",Top Gun: Maverick,2h 10min,"8,3\n/10"


## Fifth and last iteration ##

In [309]:
movies_sub5 = movies.iloc[2146:]
movies_sub5 = movies_sub5.reset_index()
movies_sub5 = movies_sub5.drop('index', axis=1)
movies_sub5 # These are the remaining rows to iterate

Unnamed: 0,year_film,year_ceremony,ceremony,category,name,film,duration,score
0,2022,2023,95,WRITING (Adapted Screenplay),Screenplay by Sarah Polley,Women Talking,,
1,2022,2023,95,WRITING (Original Screenplay),Written by Daniel Kwan & Daniel Scheinert,Everything Everywhere All at Once,,
2,2022,2023,95,AWARD OF COMMENDATION,To Ryan Laney for his innovative adaptation an...,Welcome to Chechnya,,


In [310]:
driver = webdriver.Firefox()

# Now, we repeat the same iteration as before, but with our subdataframe number 5


driver.get("https://www.imdb.com/") # This is the web page we get the info from
driver.implicitly_wait(2)

for index, (value, year) in enumerate(zip(movies_sub5['film'], movies_sub5['year_film'])): # We iterate through each film (row), and we want to write its title and year
    try:
        
        driver.find_element("css selector", "#suggestion-search").send_keys(f'{value} {str(year)}') # In IMDB's search box, we write each film and its year
        time.sleep(2)

        try:
            
            driver.find_element("css selector", "#react-autowhatever-navSuggestionSearch--item-0 > a:nth-child(1) > div:nth-child(2)").click() # Try to click on the first option that our seach shows
        except:
            movies_sub5.loc[index, 'score'] = np.nan # If our movie doesn't have an IMDB profile, we fill our column 'score' and 'duration' with null values, and clear our search box
            movies_sub5.loc[index, 'duration'] = np.nan
            driver.find_element("css selector", "#suggestion-search").clear()

            continue  # Continue to the next iteration

        try:
            duration = driver.find_element('xpath', '/html/body/div[2]/main/div/section[1]/section/div[3]/section/section/div[2]/div[1]/ul/li[3]').text
            movies_sub5.loc[index, 'duration'] = duration # If the film has a duration written, take it and assign it to our 'duration' column
        except:
            movies_sub5.loc[index, 'duration'] = np.nan # If not, assign a null value to the 'duration' column
            continue  # Continue to the next iteration

        try:
            score = driver.find_element('xpath', '/html/body/div[2]/main/div/section[1]/section/div[3]/section/section/div[2]/div[2]/div/div[1]/a/span/div/div[2]/div[1]').text
            movies_sub5.loc[index, 'score'] = score # If the film has a score written, take it and assign it to our 'score' column
        except:
            movies_sub5.loc[index, 'score'] = np.nan # If not, assign a null value to the 'score' column

    except TimeoutException:
        continue  # Go to the next iteration, if it shows an error of Timeout Exception 

driver.quit() # At the end of the for loop, close the Firefox navegator 



In [312]:
movies_sub5 # This is our fifth and last subdataframe

Unnamed: 0,year_film,year_ceremony,ceremony,category,name,film,duration,score
0,2022,2023,95,WRITING (Adapted Screenplay),Screenplay by Sarah Polley,Women Talking,1h 44min,"6,9\n/10"
1,2022,2023,95,WRITING (Original Screenplay),Written by Daniel Kwan & Daniel Scheinert,Everything Everywhere All at Once,2h 19min,"7,8\n/10"
2,2022,2023,95,AWARD OF COMMENDATION,To Ryan Laney for his innovative adaptation an...,Welcome to Chechnya,1h 47min,"7,9\n/10"


 ## Concatenation of all subdataframes ## 

In [None]:
df_final = pd.concat([movies_sub1, movies_sub2, movies_sub3, movies_sub4, movies_sub5], ignore_index=True)

'''We concatenate all subdataframes and store them in a variable called df_final'''


In [318]:
df_final

Unnamed: 0,year_film,year_ceremony,ceremony,category,name,film,duration,score
0,1927,1928,1,ACTOR,Emil Jannings,The Last Command,1h 28min,"8,0\n/10"
1,1927,1928,1,ACTRESS,Janet Gaynor,7th Heaven,,
2,1927,1928,1,ART DIRECTION,William Cameron Menzies,The Dove;,1h 30min,"6,5\n/10"
3,1927,1928,1,CINEMATOGRAPHY,Charles Rosher,Sunrise,1h 34min,"8,1\n/10"
4,1927,1928,1,CINEMATOGRAPHY,Karl Struss,Sunrise,1h 34min,"8,1\n/10"
...,...,...,...,...,...,...,...,...
2144,2022,2023,95,SOUND,"Mark Weingarten, James H. Mather, Al Nelson, C...",Top Gun: Maverick,2h 10min,"8,3\n/10"
2145,2022,2023,95,VISUAL EFFECTS,"Joe Letteri, Richard Baneham, Eric Saindon and...",Avatar: The Way of Water,3h 12min,"7,6\n/10"
2146,2022,2023,95,WRITING (Adapted Screenplay),Screenplay by Sarah Polley,Women Talking,1h 44min,"6,9\n/10"
2147,2022,2023,95,WRITING (Original Screenplay),Written by Daniel Kwan & Daniel Scheinert,Everything Everywhere All at Once,2h 19min,"7,8\n/10"


In [320]:
df_final.to_csv('scrapped_csv', index=False) # We save our final df as a CSV
