In [7]:
import pandas as pd
import os
import ast
import wikipediaapi
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
import time

In [29]:
PATH_IN = 'dataset'

fname = os.path.join(PATH_IN, 'movie.metadata.tsv')
movie = pd.read_csv(fname, delimiter='\t', header=None)
movie.columns = ['Wikipedia movie ID', 'Freebase movie ID', 'Movie name', 'Movie release date', 'Movie box office revenue', 'Movie runtime', 'Movie languages (Freebase ID:name tuples)', 'Movie countries (Freebase ID:name tuples)', 'Movie genres (Freebase ID:name tuples)']

fname = os.path.join(PATH_IN, 'character.metadata.tsv')
character = pd.read_csv(fname, delimiter='\t', header=None)
character.columns = ['Wikipedia movie ID', 'Freebase movie ID', 'Movie release date', 'Character name', 'Actor date of birth', ' Actor gender', 'Actor height (in meters)', 'Actor ethnicity (Freebase ID)', 'Actor name',
                     'Actor age at movie release', 'Freebase character/actor map ID', 'Freebase character ID', 'Freebase actor ID']

fname = os.path.join(PATH_IN, 'plot_summaries.txt')
plot_summaries = pd.read_csv(fname, delimiter='\t', header=None)
plot_summaries.columns = ['Wikipedia movie ID', 'Summary']


fname = os.path.join(PATH_IN, 'tvtropes.clusters.txt')
tvtropes = pd.read_csv(fname, delimiter='\t', header=None, names = ['Trope','StringDict'])


tvtropes['Dictionnary'] = tvtropes['StringDict'].apply(ast.literal_eval)
df = pd.json_normalize(tvtropes['Dictionnary'])
tvtropes['Character Name'] = df['char']
tvtropes['Movie name'] = df['movie']
tvtropes['Freebase movie ID'] = df['id']
tvtropes['Actor name'] = df['actor']
tvtropes = tvtropes.drop(tvtropes.columns[1], axis=1)
tvtropes = tvtropes.drop(tvtropes.columns[1], axis=1)



fname = os.path.join(PATH_IN, 'name.clusters.txt')
name_clusters = pd.read_csv(fname, delimiter='\t', header=None, names = ['Character Name','ID'])

In [85]:
movie["Movie release year"] = movie["Movie release date"].apply(lambda x: x if len(str(x)) == 4 else pd.to_datetime(x, errors='coerce').year)
movie["Movie release year"] = movie["Movie release year"].fillna(0).astype(int)
movie["Movie release year"]


0        2001
1        2000
2        1988
3        1987
4        1983
         ... 
81736    2011
81737    2011
81738    1972
81739    1992
81740    2002
Name: Movie release year, Length: 81741, dtype: int32

In [8]:
movie['Movie name'].value_counts()[movie['Movie name'].value_counts() >= 2]

Movie name
Alice in Wonderland      17
Macbeth                  16
Les Misérables           13
A Christmas Carol        13
Hero                     12
                         ..
Perfect Day               2
Straw Dogs                2
Falling in Love Again     2
Aradhana                  2
Double Vision             2
Name: count, Length: 4387, dtype: int64

In [9]:
movie[movie['Movie name'] == 'Alice in Wonderland']

Unnamed: 0,Wikipedia movie ID,Freebase movie ID,Movie name,Movie release date,Movie box office revenue,Movie runtime,Movie languages (Freebase ID:name tuples),Movie countries (Freebase ID:name tuples),Movie genres (Freebase ID:name tuples)
4113,2828945,/m/085bgh,Alice in Wonderland,1903-05,,10.0,{},"{""/m/07ssc"": ""United Kingdom""}","{""/m/02hmvc"": ""Short Film"", ""/m/06ppq"": ""Silen..."
5852,62180,/m/0gtkg,Alice in Wonderland,1933-12-22,,76.0,"{""/m/02h40lc"": ""English Language""}","{""/m/09c7w0"": ""United States of America""}","{""/m/01hmnh"": ""Fantasy"", ""/m/01g6gs"": ""Black-a..."
9719,14482638,/m/04jpg2p,Alice in Wonderland,2010-03-05,1024300000.0,108.0,"{""/m/02h40lc"": ""English Language""}","{""/m/09c7w0"": ""United States of America"", ""/m/...","{""/m/01zhp"": ""Computer Animation"", ""/m/0hqxf"":..."
10537,9522001,/m/02phpvf,Alice in Wonderland,2006,,,"{""/m/0999q"": ""Malayalam Language""}","{""/m/03rk0"": ""India""}","{""/m/02l7c8"": ""Romance Film"", ""/m/07s9rl0"": ""D..."
17101,13895620,/m/03cmhm1,Alice in Wonderland,,,,{},"{""/m/09c7w0"": ""United States of America""}","{""/m/04rlf"": ""Music"", ""/m/0hqxf"": ""Family Film..."
18184,22966826,/m/063_py3,Alice in Wonderland,1950,,83.0,{},"{""/m/09c7w0"": ""United States of America""}","{""/m/07s9rl0"": ""Drama"", ""/m/0hqxf"": ""Family Fi..."
28939,912670,/m/03p86z,Alice in Wonderland,1966,,72.0,"{""/m/02h40lc"": ""English Language""}","{""/m/07ssc"": ""United Kingdom""}","{""/m/0hqxf"": ""Family Film"", ""/m/01hmnh"": ""Fant..."
41969,2212695,/m/06wbq4,Alice in Wonderland,1976-12-10,90000000.0,81.0,"{""/m/02h40lc"": ""English Language""}","{""/m/09c7w0"": ""United States of America""}","{""/m/04t36"": ""Musical"", ""/m/01z4y"": ""Comedy"", ..."
47640,28948624,/m/0dgqblh,Alice in Wonderland,1983,,90.0,"{""/m/02h40lc"": ""English Language""}",{},"{""/m/0hqxf"": ""Family Film"", ""/m/03k9fj"": ""Adve..."
51402,2828905,/m/085bd1,Alice in Wonderland,1999-02-28,,150.0,"{""/m/02h40lc"": ""English Language""}","{""/m/09c7w0"": ""United States of America"", ""/m/...","{""/m/03k9fj"": ""Adventure"", ""/m/015w9s"": ""Telev..."


In [10]:
movie['Movie release date'].unique()

array(['2001-08-24', '2000-02-16', '1988', ..., '1927-11-22',
       '1972-09-22', '1992-05-21'], dtype=object)

In [12]:
plot_summaries

Unnamed: 0,Wikipedia movie ID,Summary
0,23890098,"Shlykov, a hard-working taxi driver and Lyosha..."
1,31186339,The nation of Panem consists of a wealthy Capi...
2,20663735,Poovalli Induchoodan is sentenced for six yea...
3,2231378,"The Lemon Drop Kid , a New York City swindler,..."
4,595909,Seventh-day Adventist Church pastor Michael Ch...
...,...,...
42298,34808485,"The story is about Reema , a young Muslim scho..."
42299,1096473,"In 1928 Hollywood, director Leo Andreyev look..."
42300,35102018,American Luthier focuses on Randy Parsons’ tra...
42301,8628195,"Abdur Rehman Khan , a middle-aged dry fruit se..."


In [13]:
tvtropes

Unnamed: 0,char,id,actor,Character Name,Movie name,Freebase movie ID,Actor name
0,Professor Philip Brainard,/m/0jy9q0,Robin Williams,Professor Philip Brainard,Flubber,/m/0jy9q0,Robin Williams
1,Professor Keenbean,/m/02vchl3,Michael McShane,Professor Keenbean,Richie Rich,/m/02vchl3,Michael McShane
2,Dr. Reinhardt Lane,/m/0k6fkc,Ian McKellen,Dr. Reinhardt Lane,The Shadow,/m/0k6fkc,Ian McKellen
3,Dr. Harold Medford,/m/0k6_br,Edmund Gwenn,Dr. Harold Medford,Them!,/m/0k6_br,Edmund Gwenn
4,Daniel Jackson,/m/0k3rhh,James Spader,Daniel Jackson,Stargate,/m/0k3rhh,James Spader
...,...,...,...,...,...,...,...
496,Morgan Earp,/m/0k776f,Bill Paxton,Morgan Earp,Tombstone,/m/0k776f,Bill Paxton
497,Colorado Ryan,/m/0k2kqg,Ricky Nelson,Colorado Ryan,Rio Bravo,/m/0k2kqg,Ricky Nelson
498,Tom Sawyer,/m/0k5nsh,Shane West,Tom Sawyer,The League of Extraordinary Gentlemen,/m/0k5nsh,Shane West
499,William H. 'Billy the Kid' Bonney,/m/03lrjk0,Emilio Estevez,William H. 'Billy the Kid' Bonney,Young Guns II,/m/03lrjk0,Emilio Estevez


Generate the data frame containing the winning oscar films

In [None]:


movie_names = []

#For windows users 
service = Service('C:\webdrivers\chromedriver.exe')
options = webdriver.ChromeOptions()
driver = webdriver.Chrome(service = service, options = options)

# for mac users
# driver = webdriver.Chrome()

url = "https://en.wikipedia.org/wiki/Academy_Award_for_Best_Cinematography"
driver.get(url)


time.sleep(5)


tr_tags = driver.find_elements(By.TAG_NAME, 'tr')

for tr_tag in tr_tags:
    style = tr_tag.get_attribute('style')
    
    
    if 'rgb(250, 235, 134)' in style:  
      
        td_tags = tr_tag.find_elements(By.TAG_NAME, 'td')
        
        if len(td_tags) > 1:
            name = td_tags[0].text.strip()
            movie_names.append(name)
            movie_info = td_tags[1].text.strip()


driver.quit()
movies_oscar_winning = pd.DataFrame(movie_names, columns=['Movie name'])

In [23]:
movies_oscar_winning

Unnamed: 0,Movie name
0,Sunrise: A Song of Two Humans
1,White Shadows in the South Seas
2,With Byrd at the South Pole
3,Tabu: A Story of the South Seas
4,Shanghai Express
...,...
120,1917
121,Mank
122,Dune
123,All Quiet on the Western Front


Generate the data frame containing the actors that won an oscar

In [8]:
oscar_winners = pd.DataFrame()
names_list = []
# Set up the WebDriver
# For Windows users 
#service = Service('C:\webdrivers\chromedriver.exe')
#options = webdriver.ChromeOptions()
#driver = webdriver.Chrome(service = service, options = options)  # Make sure to have ChromeDriver installed

# For Mac users
driver = webdriver.Chrome()

# Specify the target URL
url = "https://en.wikipedia.org/wiki/Academy_Award_for_Best_Actor"
driver.get(url)

# Adding a wait to ensure all content is loaded
time.sleep(5)

# Find all 'tr' tags in the table to extract year, movie, and symbol information
tr_tags = driver.find_elements(By.TAG_NAME, 'tr')

# Iterate over each 'tr' tag to find the required information
for tr_tag in tr_tags:
    # Extract 'td' elements from the row
    td_tags = tr_tag.find_elements(By.TAG_NAME, 'td')

    if len(td_tags) > 0:
        # Extract the text content and background color from each 'td' tag
        for td_tag in td_tags:
            text = td_tag.text.strip()
            style = td_tag.get_attribute('style')

            # Check for specific symbols in the text and categorize them
            if '‡' in text:
                cleaned_text = text.replace('‡', '').strip()
                first_two_words = ' '.join(cleaned_text.split()[:2])
                names_list.append(first_two_words)
            #Refused oscar
            elif '§' in text:
                cleaned_text = text.replace('§', '').strip()
            
            #Posthume oscar
            elif '†' in text:
                cleaned_text = text.replace('†', '').strip()
               

# Close the browser
driver.quit()
character['oscar'] = [actor in  names_list for actor in character['Actor name']]

In [10]:
character[character['oscar'] == True]

Unnamed: 0,Wikipedia movie ID,Freebase movie ID,Movie release date,Character name,Actor date of birth,Actor gender,Actor height (in meters),Actor ethnicity (Freebase ID),Actor name,Actor age at movie release,Freebase character/actor map ID,Freebase character ID,Freebase actor ID,oscar
64,175026,/m/017n1p,1930,,1897-08-31,M,1.780,,Fredric March,,/m/03l6s7d,,/m/0h1_w,True
260,1369204,/m/04x8zs,1939,,1895-09-22,M,,/m/041rx,Paul Muni,,/m/09hyc43,,/m/0c92y,True
280,7447003,/m/0kv7sg,1987-05-22,,1916-04-05,M,1.905,/m/07bch9,Gregory Peck,71.0,/m/02tb8hr,,/m/0k9j_,True
286,167857,/m/016fyc,1994-10,Roger 'Verbal' Kint,1959-07-26,M,1.790,,Kevin Spacey,,/m/0k6t6y,/m/02xhr4s,/m/048lv,True
455,164387,/m/015wmg,1979-06-29,Arthur Kirkland,1940-04-25,M,1.700,/m/0xnvg,Al Pacino,39.0,/m/0jshhb,/m/02nw965,/m/0bj9k,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
450300,7078738,/m/0h34pj,1968,Ned Merrill,1913-11-02,M,1.880,,Burt Lancaster,54.0,/m/02vb5yp,/m/0h2_2qy,/m/01g42,True
450327,1406622,/m/04_0sq,2003,,1951-07-06,M,1.830,,Geoffrey Rush,51.0,/m/0jz6sx,,/m/0170pk,True
450438,11823946,/m/02rtqvb,1996-10-25,Feste,1943-12-31,M,1.730,/m/0dryh9k,Ben Kingsley,52.0,/m/02vcv1c,/m/0267w5f,/m/016k6x,True
450550,6456053,/m/0g605h,1964-12-31,Lord Charles Frinton,1908-03-05,M,1.854,,Rex Harrison,56.0,/m/02vd7tt,/m/0hnw0mk,/m/0p9qb,True


In [None]:
#Creating a table with n reviews of each movie

#We can then adapt it for the movies which got an award
movie_titles = movies_oscar_winning['Movie name'].unique()

movie_reviews = pd.DataFrame(columns=['Movie name', 'Review'])

# For Windows users 
service = Service('C:\webdrivers\chromedriver.exe')
options = webdriver.ChromeOptions()
driver = webdriver.Chrome(service = service, options = options)

#For mac users
# driver = webdriver.Chrome()

nb_reviews_per_movie = 1

print("movie_titles", movie_titles)

movie_titles_dict = {}

for movie_title in movie_titles:
    url = f"https://www.rottentomatoes.com/search?search={movie_title.replace(' ', '_').lower()}"
    driver.get(url)
    time.sleep(5)
    
    movie_element = driver.find_element(By.TAG_NAME, 'movieTitle')
    movie_element.click()
    time.sleep(5)
    #review_elements = driver.find_elements(By.CLASS_NAME, 'review')
    #reviews = [review.text for review in review_elements[:nb_reviews_per_movie]]
    
    movie_titles_dict[movie_title] = movie_element.text

driver.quit()


movie_titles ['Sunrise: A Song of Two Humans' 'White Shadows in the South Seas'
 'With Byrd at the South Pole' 'Tabu: A Story of the South Seas'
 'Shanghai Express' 'A Farewell to Arms' 'Cleopatra'
 "A Midsummer Night's Dream" 'Anthony Adverse' 'The Garden of Allah'
 'The Good Earth' 'A Star Is Born' 'The Great Waltz' 'Sweethearts'
 'Wuthering Heights' 'Gone with the Wind' 'Rebecca' 'The Thief of Bagdad'
 'How Green Was My Valley' 'Blood and Sand' 'Mrs. Miniver'
 'The Black Swan' 'The Song of Bernadette' 'Phantom of the Opera' 'Laura'
 'Wilson' 'The Picture of Dorian Gray' 'Leave Her to Heaven'
 'Anna and the King of Siam' 'The Yearling' 'Great Expectations'
 'Black Narcissus' 'The Naked City' 'Joan of Arc' 'Battleground'
 'She Wore a Yellow Ribbon' 'The Third Man' "King Solomon's Mines"
 'A Place in the Sun' 'An American in Paris' 'The Bad and the Beautiful'
 'The Quiet Man' 'From Here to Eternity' 'Shane' 'On the Waterfront'
 'Three Coins in the Fountain' 'The Rose Tattoo' 'To Catch 

NoSuchElementException: Message: no such element: Unable to locate element: {"method":"css selector","selector":".movieTitle"}
  (Session info: chrome=130.0.6723.92); For documentation on this error, please visit: https://www.selenium.dev/documentation/webdriver/troubleshooting/errors#no-such-element-exception
Stacktrace:
	GetHandleVerifier [0x00007FF769C63AF5+28005]
	(No symbol) [0x00007FF769BC83F0]
	(No symbol) [0x00007FF769A6580A]
	(No symbol) [0x00007FF769AB5A3E]
	(No symbol) [0x00007FF769AB5D2C]
	(No symbol) [0x00007FF769AFEA97]
	(No symbol) [0x00007FF769ADBA7F]
	(No symbol) [0x00007FF769AFB8B3]
	(No symbol) [0x00007FF769ADB7E3]
	(No symbol) [0x00007FF769AA75C8]
	(No symbol) [0x00007FF769AA8731]
	GetHandleVerifier [0x00007FF769F5646D+3118813]
	GetHandleVerifier [0x00007FF769FA6CC0+3448624]
	GetHandleVerifier [0x00007FF769F9CF3D+3408301]
	GetHandleVerifier [0x00007FF769D2A44B+841403]
	(No symbol) [0x00007FF769BD344F]
	(No symbol) [0x00007FF769BCF4C4]
	(No symbol) [0x00007FF769BCF65D]
	(No symbol) [0x00007FF769BBEBB9]
	BaseThreadInitThunk [0x00007FFB7953257D+29]
	RtlUserThreadStart [0x00007FFB7B82AF08+40]
