In [16]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np

def get_data(url, actor, timeout=30):
    """Download one actor page and pull out its four rating values.

    Parameters
    ----------
    url : str
        Address of the page to fetch.
    actor : str
        Label stored as the first field of the returned row.
    timeout : float, optional
        Seconds to wait on the server before giving up. Without a timeout a
        single stalled connection blocks the whole scraping loop.

    Returns
    -------
    list of list of str, or None
        ``[[actor, v0, v1, v2, v3]]`` where each ``v`` is the text of one
        ``<div class="value">`` element with its last character removed.
        ``None`` when the request fails or times out, when the status code
        is not 200, or when the page does not hold exactly four such divs.
    """
    try:
        response = requests.get(url, timeout=timeout)
    except requests.RequestException as exc:
        # Report the failure instead of hanging or aborting the whole run;
        # callers already treat None as "nothing usable for this URL".
        print("Request failed for " + url + ": " + str(exc))
        return None

    if response.status_code != 200:
        return None

    # Parse the HTML content of the page
    soup = BeautifulSoup(response.text, 'html.parser')
    region_divs = soup.find_all('div', class_='value')
    if len(region_divs) != 4:
        return None

    # The last character of each value is dropped -- presumably a trailing
    # '%' sign (TODO confirm against the live page markup).
    return [[actor] + [div.text[:-1] for div in region_divs]]

In [46]:
# Load the character metadata (tab-separated, no header row); the actor name
# sits in column 8. Every row is tagged with the number of rows sharing its
# actor, and the table is ordered from most to least frequent actor.
metadata_char = (
    pd.read_csv('character.metadata.tsv', sep='\t', header=None)
    .assign(count=lambda frame: frame.groupby(8)[8].transform('count'))
    .sort_values(by='count', ascending=False)
)
# Spot check: how many rows are credited to Charley Boorman.
is_boorman = metadata_char[8] == 'Charley Boorman'
print(int(is_boorman.sum()))

11


In [17]:
# Builds the scrape queue: actor names ordered by number of appearances,
# rewritten into their URL form, minus the actors that were already handled.


def _to_url_name(name):
    """Rewrite one actor name into the form used in page URLs.

    Spaces and hyphens become underscores, dots and apostrophes are dropped
    (e.g. 'John Doe' -> 'John_Doe'). Anything that is not a string (such as
    the NaN of a missing name) is mapped to the placeholder "r".
    """
    if not isinstance(name, str):
        return "r"
    # Commas and parentheses are deliberately left untouched.
    for old, new in ((' ', '_'), ('.', ''), ("'", ''), ('-', '_')):
        name = name.replace(old, new)
    return name


# Actor names live in column 8 of the header-less TSV; rank rows by how often
# their actor occurs, most frequent first.
metadata_char = (
    pd.read_csv('character.metadata.tsv', sep='\t', header=None)
    .assign(count=lambda frame: frame.groupby(8)[8].transform('count'))
    .sort_values(by='count', ascending=False)
)
# Sanity check: number of rows credited to Mel Blanc.
print(int((metadata_char[8] == 'Mel Blanc').sum()))

# Distinct actors in order of first appearance in the ranked table, i.e. the
# ordered (per occurrence) list of actors to scrape.
actors = metadata_char[8].unique()
print(actors)

# Full list of actors in URL form.
actors = np.array([_to_url_name(name) for name in actors])

# Actors already scraped and found are stored in actors.csv; drop them from
# the queue to avoid duplicates.
df = pd.read_csv('actors.csv', sep=',')
actors = actors[np.isin(actors, df['Actor'], invert=True)]

# Recreate the original (file-order) actor list, which only had spaces
# replaced, and drop its first 3700 entries from the queue: 3700 is the
# number of actors already scraped from that original list.
metadata_char = pd.read_csv('character.metadata.tsv', sep='\t', header=None)
original_list = [
    name.replace(' ', '_') if isinstance(name, str) else "r"
    for name in metadata_char[8].unique()
]
actors = actors[np.isin(actors, original_list[:3700], invert=True)]

# Note: the first 6800 entries of the queue were already scraped (the last
# one, Marsha_Thomason, was not saved); nothing is removed for that here.

print(actors[:10])

# TODO: remove the already scraped actors from the good list once it is used


791
['Mel Blanc' 'Mithun Chakraborty' 'Oliver Hardy' ... 'Michael Mastro'
 'Lisbeth Holm Larsen' nan]
['Brahmanandam' 'Harold_Lloyd' 'Stan_Laurel' 'Lon_Chaney,_Sr' 'Om_Puri'
 'Aruna_Irani' 'Akshay_Kumar' 'Sridevi_Kapoor' 'Naseeruddin_Shah'
 'Govinda']


In [18]:
# Report how many actors are still waiting in the scrape queue.
print(f"Actors yet to scrap: {len(actors)}")

Actors yet to scrap: 130577


In [19]:
def get_data_list(actors,df):
    """Scrape a batch of actors and append every hit to ``df``.

    For each actor the plain page URL is tried first; when that yields
    nothing, the same URL with an ``-Actor`` suffix is tried. Each actor name
    is printed as it is processed, followed by "Found it!" on success.

    Parameters
    ----------
    actors : iterable of str
        Actor names already in URL form.
    df : pandas.DataFrame
        Ratings collected so far; it is not modified.

    Returns
    -------
    pandas.DataFrame
        ``df`` followed by one row per actor that was found (columns Actor,
        Fame, Liked, Disliked, Neutral). ``df`` itself is returned when no
        actor of the batch was found.
    """
    base_url = 'https://today.yougov.com/topics/entertainment/explore/actor/'
    columns = ['Actor','Fame','Liked', 'Disliked','Neutral']

    # Collect the per-actor frames and concatenate once at the end: calling
    # pd.concat inside the loop re-copies the growing table for every hit.
    found = []
    for actor in actors:
        print(actor)
        data = get_data(base_url + actor, actor)
        if data is None:
            # Some pages only exist under the "<name>-Actor" address.
            data = get_data(base_url + actor + '-Actor', actor)
        if data is not None:
            print("Found it!")
            found.append(pd.DataFrame(data, columns=columns))

    if not found:
        return df
    return pd.concat([df, *found])

Size 134k, current 4800

In [45]:
# Scrape ten consecutive slices of the queue, beginning at offset `start`,
# and write the accumulated table to disk after every slice so that an
# interruption loses at most one slice of work.
start = 6800
block = 50
for i in range(10):
    lo = start + block * i
    hi = lo + block
    df = get_data_list(actors[lo:hi], df)
    print(f"Saved up to: {hi}")
    print(df.shape)
    # save the dataframe into a csv file
    df.to_csv('actors.csv', index=False)
    print("Saved!")

Marsha_Thomason
Margie_Hines
Minoru_Chiaki
Hans_Christian_Blech
Lawrence_Chou
Krysten_Ritter
Harvey_Stephens
Jennifer_Carpenter
Barbara_Parkins
Yunus_Parvez
Rodolfo_Acosta
Mark_Moses
Cüneyt_Arkın
Jessie_Matthews
Michael_Tse
Helmut_Dantine
David_Lythgoe
Omar_Benson_Miller
Adriana_Asti
Bob_Holt
Justin_Chon
Nobuko_Miyamoto
Kyoka_Suzuki
Gloria_Diaz
Ami_Dolenz
B_Reeves_Eason
Mark_Feuerstein
Bindu_Madhavi
Andrea_Marcovicci
Lim_Chang_jung
Sonika_Gill
Eddy_Ko
Rick_Hoffman
William_S_Burroughs
Liane_Balaban
Audrey_Wasilewski
Deborah_Van_Valkenburgh
Devin_Ratray
Yeom_Jeong_ah
Gerard_Parkes
Jacques_Nolot
Vonetta_McGee
Alf_Kjellin
Sonia_Sahani
Marie_Gillain
Charley_Boorman


KeyboardInterrupt: 

In [44]:
# Print the accumulated ratings table, then its (rows, columns) shape.
for shown in (df, df.shape):
    print(shown)

              Actor Fame Liked Disliked Neutral
0          Ice_Cube   93    57       11      25
1     Jason_Statham   78    59        4      14
2         Pam_Grier   62    39        4      19
3       John_Hawkes   50    29        5      16
4        Seth_Green   78    45        7      26
..              ...  ...   ...      ...     ...
0              Mr_T   92    64        6      22
0   Sherman_Hemsley   76    55        6      15
0       Bobby_Darin   69    46        4      19
0       Joey_Fatone   67    36        8      23
0      Mark_Rylance   37    23        2      12

[1355 rows x 5 columns]
(1355, 5)


Last saved data: actors 0–3700 of the list.

In [104]:
# Persist the collected ratings to actors.csv, leaving out the index column.
df.to_csv(path_or_buf='actors.csv', index=False)

In [10]:
# Manual check of the scraper on a single actor: try the plain page address
# first and fall back to the "-Actor" variant when it yields nothing.
actor = 'Morgan_Freeman'
# Same column labels as the frames built by get_data_list and stored in
# actors.csv ('Disliked', not 'Unliked'), so df_temp can be concatenated
# with df without creating a stray column.
columns = ['Actor', 'Fame', 'Liked', 'Disliked', 'Neutral']

url = 'https://today.yougov.com/topics/entertainment/explore/actor/Morgan_Freeman'
data = get_data(url,actor)
if not data:
    # Marker showing that the fallback address was needed.
    print('ici')
    url = 'https://today.yougov.com/topics/entertainment/explore/actor/Morgan_Freeman-Actor'
    data = get_data(url,actor)
# When both lookups fail, data is None and this yields an empty frame.
df_temp = pd.DataFrame(data, columns=columns)

print(data)
# Not appended to the main table on purpose:
#df = pd.concat([df, df_temp])

ici
[['Morgan_Freeman', '98', '86', '3', '9']]


In [30]:
url = 'https://today.yougov.com/topics/entertainment/explore/actor/Maria_Garcia'
r = requests.get(url)
%time 
print(r.status_code)

# 6s for non existing actor code 200
# 2.3s for existing actor code 200
#1.5s for non existing actor code 200 (cached?)




CPU times: user 2 µs, sys: 0 ns, total: 2 µs
Wall time: 5.25 µs
200
