In [155]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
from currency_converter import CurrencyConverter
from currency_converter import RateNotFoundError
from datetime import date

warnings.filterwarnings('ignore')
imdb = pd.read_csv('./data/IMDB_Movies_2000_2020.csv')

In [156]:
imdb = imdb[imdb["budget"].notna()]
imdb = imdb[imdb["usa_gross_income"].notna()]
imdb = imdb[imdb["worlwide_gross_income"].notna()]
imdb.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3684 entries, 0 to 5475
Data columns (total 24 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   imdb_title_id          3684 non-null   object 
 1   title                  3684 non-null   object 
 2   original_title         3684 non-null   object 
 3   year                   3684 non-null   int64  
 4   date_published         3684 non-null   object 
 5   genre                  3684 non-null   object 
 6   duration               3684 non-null   int64  
 7   country                3684 non-null   object 
 8   language_1             3684 non-null   object 
 9   language_2             1574 non-null   object 
 10  language_3             685 non-null    object 
 11  director               3684 non-null   object 
 12  writer                 3683 non-null   object 
 13  actors                 3684 non-null   object 
 14  actors_1               3684 non-null   object 
 15  acto

In [157]:
# convert currency for a certain col
def currency_conv(col_name, df):
    c = CurrencyConverter(decimal=True)
    for ind in df.index:
        if(type(df.at[ind, col_name]) == str):
            if (df.at[ind, col_name].isnumeric() == False):
                temp_str = df.at[ind, col_name]
                #print(temp_str)
                currency = temp_str[0:3]
                #print("currency:", currency)
                amount = int(temp_str[3:].strip())
                try:
                    converted = c.convert(amount, currency.strip(), 'USD', date=date(2022, 3, 31))
                    #print(converted)
                except ValueError:
                    df = df.drop(index=ind)
                except RateNotFoundError:
                    df = df.drop(index=ind)
                else:
                    df.at[ind, col_name] = float(converted)
            
    #df.sort_index()
    return df

In [158]:
imdb = currency_conv("budget", imdb)
imdb = currency_conv("usa_gross_income", imdb)
imdb = currency_conv("worlwide_gross_income", imdb)

In [159]:
print(imdb.columns)

Index(['imdb_title_id', 'title', 'original_title', 'year', 'date_published',
       'genre', 'duration', 'country', 'language_1', 'language_2',
       'language_3', 'director', 'writer', 'actors', 'actors_1', 'actors_f2',
       'description', 'desc35', 'avg_vote', 'votes', 'budget',
       'usa_gross_income', 'worlwide_gross_income', 'reviews_from_users'],
      dtype='object')


In [160]:
imdb.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3684 entries, 0 to 5475
Data columns (total 24 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   imdb_title_id          3684 non-null   object 
 1   title                  3684 non-null   object 
 2   original_title         3684 non-null   object 
 3   year                   3684 non-null   int64  
 4   date_published         3684 non-null   object 
 5   genre                  3684 non-null   object 
 6   duration               3684 non-null   int64  
 7   country                3684 non-null   object 
 8   language_1             3684 non-null   object 
 9   language_2             1574 non-null   object 
 10  language_3             685 non-null    object 
 11  director               3684 non-null   object 
 12  writer                 3683 non-null   object 
 13  actors                 3684 non-null   object 
 14  actors_1               3684 non-null   object 
 15  acto

In [164]:
#imdb["budget"] = pd.to_numeric(imdb["budget"])

In [162]:
imdb["usa_gross_income"] = pd.to_numeric(imdb["usa_gross_income"])

In [163]:
imdb["worlwide_gross_income"] = pd.to_numeric(imdb["worlwide_gross_income"])

## Profits for Each Genre

In [28]:
array_genre = []
for str in imdb.genre:
    genres = str.split(',')
    for genre in genres:
        array_genre.append(genre.strip())
        
genre_list, frequency = np.unique(array_genre, return_counts=True)

In [29]:
genre_budget = []
genre_usa_income = []
genre_global_income = []
genre_usa_profit = []
genre_global_profit = []

for genre_name in genre_list:
    sum_budget = imdb["budget"][imdb["genre"].str.contains(genre_name)].sum()
    sum_usa_income = imdb["usa_gross_income"][imdb["genre"].str.contains(genre_name)].sum()
    sum_global_income = imdb["worlwide_gross_income"][imdb["genre"].str.contains(genre_name)].sum()
    sum_usa_profit = sum_usa_income - sum_budget
    sum_global_profit = sum_global_income - sum_budget
    
    genre_budget.append(sum_budget)
    genre_usa_income.append(sum_usa_income)
    genre_global_income.append(sum_global_income)
    genre_usa_profit.append(sum_usa_profit)
    genre_global_profit.append(sum_global_profit)
    
df_genre = {"genre": genre_list,
            "num": frequency,
            "all_budget": genre_budget,
            "avg_budget": genre_budget/frequency,
            "all_usa_income": genre_global_income,
            "avg_usa_income": genre_global_income/frequency,
            "all_global_income": genre_global_income,
            "avg_global_income": genre_global_income/frequency,
            "all_usa_profit": genre_usa_profit,
            "avg_usa_profit": genre_usa_profit/frequency,
            "all_global_profit": genre_global_profit,
            "avg_global_profit": genre_global_profit/frequency
            }
df_genre = pd.DataFrame(df_genre)

In [30]:
df_genre

Unnamed: 0,genre,num,all_budget,avg_budget,all_usa_income,avg_usa_income,all_global_income,avg_global_income,all_usa_profit,avg_usa_profit,all_global_profit,avg_global_profit
0,Action,1052,72294870000.0,68721360.0,209480432812,199125900.0,209480432812,199125900.0,8510990000.0,8090294.0,137185600000.0,130404500.0
1,Adventure,782,71629810000.0,91598230.0,240432699372,307458700.0,240432699372,307458700.0,16821700000.0,21511120.0,168802900000.0,215860500.0
2,Animation,228,19269180000.0,84513950.0,70516176338,309281500.0,70516176338,309281500.0,7340647000.0,32195820.0,51247000000.0,224767500.0
3,Biography,303,8006258000.0,26423300.0,20750333642,68482950.0,20750333642,68482950.0,1728507000.0,5704642.0,12744080000.0,42059650.0
4,Comedy,1395,52220240000.0,37433860.0,155103005998,111185000.0,155103005998,111185000.0,20433920000.0,14647970.0,102882800000.0,73751090.0
5,Crime,684,20854510000.0,30489060.0,46151947427,67473610.0,46151947427,67473610.0,1491699000.0,2180847.0,25297430000.0,36984550.0
6,Drama,1936,53023050000.0,27387940.0,138467806142,71522630.0,138467806142,71522630.0,8586289000.0,4435067.0,85444750000.0,44134690.0
7,Family,227,15153770000.0,66756680.0,46611967857,205339100.0,46611967857,205339100.0,4474093000.0,19709660.0,31458200000.0,138582400.0
8,Fantasy,329,21514550000.0,65393760.0,66551536395,202284300.0,66551536395,202284300.0,3516603000.0,10688760.0,45036990000.0,136890500.0
9,History,126,4403455000.0,34948060.0,9392674795,74545040.0,9392674795,74545040.0,-271594900.0,-2155515.0,4989220000.0,39596980.0


In [177]:
genres = df_genre.genre
best_films = []
for genre in genres:
    needed_titles = imdb[["title", "worlwide_gross_income"]][imdb["genre"].str.contains(genre)]
    needed_title = needed_titles["title"][needed_titles["worlwide_gross_income"]==max(needed_titles["worlwide_gross_income"])]
    needed_title = needed_title.values[0]
    best_films.append(needed_title)
    
df_genre["best_film"] = best_films

In [178]:
df_genre

Unnamed: 0,genre,num,all_budget,avg_budget,all_usa_income,avg_usa_income,all_global_income,avg_global_income,all_usa_profit,avg_usa_profit,all_global_profit,avg_global_profit,best_film,poster
0,Action,1052,72294870000.0,68721360.0,209480432812,199125900.0,209480432812,199125900.0,8510990000.0,8090294.0,137185600000.0,130404500.0,Avengers: Endgame,http://static.rbcasting.com/FANTASTIC-4-I-FANT...
1,Adventure,782,71629810000.0,91598230.0,240432699372,307458700.0,240432699372,307458700.0,16821700000.0,21511120.0,168802900000.0,215860500.0,Avengers: Endgame,http://image.tmdb.org/t/p/original/u19pRkgBvVk...
2,Animation,228,19269180000.0,84513950.0,70516176338,309281500.0,70516176338,309281500.0,7340647000.0,32195820.0,51247000000.0,224767500.0,Il Re Leone,http://image.tmdb.org/t/p/original/u19pRkgBvVk...
3,Biography,303,8006258000.0,26423300.0,20750333642,68482950.0,20750333642,68482950.0,1728507000.0,5704642.0,12744080000.0,42059650.0,Bohemian Rhapsody,http://i.imgur.com/PZ6JESM.jpg
4,Comedy,1395,52220240000.0,37433860.0,155103005998,111185000.0,155103005998,111185000.0,20433920000.0,14647970.0,102882800000.0,73751090.0,Frozen II - Il segreto di Arendelle,https://celebmafia.com/wp-content/uploads/2016...
5,Crime,684,20854510000.0,30489060.0,46151947427,67473610.0,46151947427,67473610.0,1491699000.0,2180847.0,25297430000.0,36984550.0,Fast & Furious 8,https://www.cineraglio.it/wp-content/uploads/2...
6,Drama,1936,53023050000.0,27387940.0,138467806142,71522630.0,138467806142,71522630.0,8586289000.0,4435067.0,85444750000.0,44134690.0,Avengers: Endgame,https://media.senscritique.com/media/000020056...
7,Family,227,15153770000.0,66756680.0,46611967857,205339100.0,46611967857,205339100.0,4474093000.0,19709660.0,31458200000.0,138582400.0,La bella e la bestia,http://image.tmdb.org/t/p/original/hs8i1Vpwrx6...
8,Fantasy,329,21514550000.0,65393760.0,66551536395,202284300.0,66551536395,202284300.0,3516603000.0,10688760.0,45036990000.0,136890500.0,Avatar,https://celebmafia.com/wp-content/uploads/2016...
9,History,126,4403455000.0,34948060.0,9392674795,74545040.0,9392674795,74545040.0,-271594900.0,-2155515.0,4989220000.0,39596980.0,Dunkirk,https://image.tmdb.org/t/p/original/aLx9UXH9fK...


## Profits For Each Country

In [31]:
array_country = []
for str in imdb.country:
    countrys = str.split(',')
    for country in countrys:
        array_country.append(country.strip())
        
country_list, frequency = np.unique(array_country, return_counts=True)

In [32]:
country_budget = []
country_usa_income = []
country_global_income = []
country_usa_profit = []
country_global_profit = []

for country_name in country_list:
    sum_budget = imdb["budget"][imdb["country"].str.contains(country_name)].sum()
    sum_usa_income = imdb["usa_gross_income"][imdb["country"].str.contains(country_name)].sum()
    sum_global_income = imdb["worlwide_gross_income"][imdb["country"].str.contains(country_name)].sum()
    sum_usa_profit = sum_usa_income - sum_budget
    sum_global_profit = sum_global_income - sum_budget
    
    country_budget.append(sum_budget)
    country_usa_income.append(sum_usa_income)
    country_global_income.append(sum_global_income)
    country_usa_profit.append(sum_usa_profit)
    country_global_profit.append(sum_global_profit)
    
df_country = {"country": country_list, 
              "num": frequency,
              "all_budget": country_budget,
              "avg_budget": country_budget/frequency,
              "all_usa_income": country_global_income,
              "avg_usa_income": country_global_income/frequency,
              "all_global_income": country_global_income,
              "avg_global_income": country_global_income/frequency,
              "all_usa_profit": country_usa_profit,
              "avg_usa_profit": country_usa_profit/frequency,
              "all_global_profit": country_global_profit,
              "avg_global_profit": country_global_profit/frequency
              }
df_country = pd.DataFrame(df_country)

In [33]:
df_country

Unnamed: 0,country,num,all_budget,avg_budget,all_usa_income,avg_usa_income,all_global_income,avg_global_income,all_usa_profit,avg_usa_profit,all_global_profit,avg_global_profit
0,Afghanistan,1,2.000000e+07,2.000000e+07,75011029,7.501103e+07,75011029,7.501103e+07,-4.199922e+06,-4.199922e+06,5.501103e+07,5.501103e+07
1,Algeria,1,1.609645e+07,1.609645e+07,22963701,2.296370e+07,22963701,2.296370e+07,-1.577575e+07,-1.577575e+07,6.867251e+06,6.867251e+06
2,Angola,1,5.000000e+05,5.000000e+05,101729,1.017290e+05,101729,1.017290e+05,-3.996300e+05,-3.996300e+05,-3.982710e+05,-3.982710e+05
3,Argentina,17,1.622324e+08,9.543082e+06,391307270,2.301807e+07,391307270,2.301807e+07,-8.159453e+07,-4.799678e+06,2.290749e+08,1.347499e+07
4,Australia,125,7.064687e+09,5.651750e+07,19705530424,1.576442e+08,19705530424,1.576442e+08,6.385102e+08,5.108081e+06,1.264084e+10,1.011267e+08
...,...,...,...,...,...,...,...,...,...,...,...,...
83,USA,3150,1.457051e+11,4.625559e+07,437428911220,1.388663e+08,437428911220,1.388663e+08,4.255919e+10,1.351085e+07,2.917238e+11,9.261073e+07
84,Ukraine,1,3.000000e+07,3.000000e+07,108979549,1.089795e+08,108979549,1.089795e+08,1.715062e+06,1.715062e+06,7.897955e+07,7.897955e+07
85,United Arab Emirates,28,1.625700e+09,5.806071e+07,4843282289,1.729744e+08,4843282289,1.729744e+08,5.575554e+07,1.991269e+06,3.217582e+09,1.149137e+08
86,Venezuela,1,1.400000e+06,1.400000e+06,3217176,3.217176e+06,3217176,3.217176e+06,-7.075100e+04,-7.075100e+04,1.817176e+06,1.817176e+06


In [122]:
needed_titles

407    The Quiet American
Name: title, dtype: object

In [189]:
df_country_code = pd.read_csv('df_country_code.csv')

In [190]:
countrys = df_country.country
best_films = []
for country in countrys:
    needed_titles = imdb[["title", "worlwide_gross_income"]][imdb["country"].str.contains(country)]
    needed_title = needed_titles["title"][needed_titles["worlwide_gross_income"]==max(needed_titles["worlwide_gross_income"])]
    needed_title = needed_title.values[0]
    best_films.append(needed_title)
    
df_country_code["best_film"] = best_films

In [191]:
#df_country_code = df_country_code.drop(columns=['Unnamed: 0', 'Unnamed: 0.1'])
df_country_code

Unnamed: 0,country,num,all_budget,avg_budget,all_usa_income,avg_usa_income,all_global_income,avg_global_income,all_usa_profit,avg_usa_profit,all_global_profit,avg_global_profit,code,best_film,poster
0,Afghanistan,1,2.000000e+07,2.000000e+07,75011029,7.501103e+07,75011029,7.501103e+07,-4.199922e+06,-4.199922e+06,5.501103e+07,5.501103e+07,AFG,Il cacciatore di aquiloni,https://aforismi.meglio.it/img/film/il-cacciat...
1,Algeria,1,1.609645e+07,1.609645e+07,22963701,2.296370e+07,22963701,2.296370e+07,-1.577575e+07,-1.577575e+07,6.867251e+06,6.867251e+06,DZA,Days of Glory,https://cdn.shopify.com/s/files/1/0747/3829/pr...
2,Angola,1,5.000000e+05,5.000000e+05,101729,1.017290e+05,101729,1.017290e+05,-3.996300e+05,-3.996300e+05,-3.982710e+05,-3.982710e+05,AGO,Rubber,https://i.pinimg.com/736x/56/9a/3c/569a3cb545e...
3,Argentina,17,1.622324e+08,9.543082e+06,391307270,2.301807e+07,391307270,2.301807e+07,-8.159453e+07,-4.799678e+06,2.290749e+08,1.347499e+07,ARG,Focus - Niente è come sembra,https://i.jeded.com/i/dancer-in-the-dark.11408...
4,Australia,125,7.064687e+09,5.651750e+07,19705530424,1.576442e+08,19705530424,1.576442e+08,6.385102e+08,5.108081e+06,1.264084e+10,1.011267e+08,AUS,Aquaman,https://fbwebsitedefaultstorage.blob.core.wind...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
83,USA,3150,1.457051e+11,4.625559e+07,437428911220,1.388663e+08,437428911220,1.388663e+08,4.255919e+10,1.351085e+07,2.917238e+11,9.261073e+07,USA,Avengers: Endgame,https://celebmafia.com/wp-content/uploads/2016...
84,Ukraine,1,3.000000e+07,3.000000e+07,108979549,1.089795e+08,108979549,1.089795e+08,1.715062e+06,1.715062e+06,7.897955e+07,7.897955e+07,UKR,Transporter 3,http://www.geourdufilm.com/wp-content/uploads/...
85,United Arab Emirates,28,1.625700e+09,5.806071e+07,4843282289,1.729744e+08,4843282289,1.729744e+08,5.575554e+07,1.991269e+06,3.217582e+09,1.149137e+08,ARE,Fast & Furious 7,https://image.tmdb.org/t/p/original/fyJw1kPiY3...
86,Venezuela,1,1.400000e+06,1.400000e+06,3217176,3.217176e+06,3217176,3.217176e+06,-7.075100e+04,-7.075100e+04,1.817176e+06,1.817176e+06,VEN,El abrazo de la serpiente,http://www.trigon-film.org/en/shop/Posters_One...


## Profits For Each Actor

In [34]:
# split actors' name
def split_names(col):
    array = []
    imdb_sub = imdb.dropna(subset = [col])
    for ind in imdb_sub.index:
        names = imdb_sub.at[ind, col].split(',')
        for name in names:
            array.append(name.strip())
            
    return np.unique(array, return_counts=True)

In [35]:
actor_list, actor_freq = split_names('actors')
actor_list

array(['50 Cent', 'A. Cheron Hall', 'A. Delon Ellis Jr.', ...,
       'Ørjan Gamst', 'Þorleifur Einarsson', 'Þorsteinn Gunnar Bjarnason'],
      dtype='<U37')

In [36]:
actor_budget = []
actor_usa_income = []
actor_global_income = []
actor_usa_profit = []
actor_global_profit = []

for actor_name in actor_list:
    sum_budget = imdb["budget"][imdb["actors"].str.contains(actor_name)].sum()
    sum_usa_income = imdb["usa_gross_income"][imdb["actors"].str.contains(actor_name)].sum()
    sum_global_income = imdb["worlwide_gross_income"][imdb["actors"].str.contains(actor_name)].sum()
    sum_usa_profit = sum_usa_income - sum_budget
    sum_global_profit = sum_global_income - sum_budget
    
    actor_budget.append(sum_budget)
    actor_usa_income.append(sum_usa_income)
    actor_global_income.append(sum_global_income)
    actor_usa_profit.append(sum_usa_profit)
    actor_global_profit.append(sum_global_profit)
    
df_actor = {"actor": actor_list,
            "num": actor_freq,
            "all_budget": actor_budget,
            "avg_budget": actor_budget/actor_freq,
            "all_usa_income": actor_global_income,
            "avg_usa_income": actor_global_income/actor_freq,
            "all_global_income": actor_global_income,
            "avg_global_income": actor_global_income/actor_freq,
            "all_usa_profit": actor_usa_profit,
            "avg_usa_profit": actor_usa_profit/actor_freq,
            "all_global_profit": actor_global_profit,
            "avg_global_profit": actor_global_profit/actor_freq
            }
df_actor = pd.DataFrame(df_actor)

In [37]:
df_actor

Unnamed: 0,actor,num,all_budget,avg_budget,all_usa_income,avg_usa_income,all_global_income,avg_global_income,all_usa_profit,avg_usa_profit,all_global_profit,avg_global_profit
0,50 Cent,7,2.270000e+08,3.242857e+07,439019372,6.271705e+07,439019372,6.271705e+07,-3.319207e+07,-4.741724e+06,2.120194e+08,3.028848e+07
1,A. Cheron Hall,1,2.500000e+07,2.500000e+07,30893885,3.089388e+07,30893885,3.089388e+07,-5.471398e+06,-5.471398e+06,5.893885e+06,5.893885e+06
2,A. Delon Ellis Jr.,1,3.200000e+07,3.200000e+07,44102389,4.410239e+07,44102389,4.410239e+07,8.222729e+06,8.222729e+06,1.210239e+07,1.210239e+07
3,A. Jay Radcliff,1,6.000000e+07,6.000000e+07,204594016,2.045940e+08,204594016,2.045940e+08,2.057401e+07,2.057401e+07,1.445940e+08,1.445940e+08
4,A. Russell Andrews,2,7.300000e+07,3.650000e+07,81591954,4.079598e+07,81591954,4.079598e+07,-1.873638e+07,-9.368190e+06,8.591954e+06,4.295977e+06
...,...,...,...,...,...,...,...,...,...,...,...,...
27347,Óscar Lara,1,3.774340e+06,3.774340e+06,78638987,7.863899e+07,78638987,7.863899e+07,3.386944e+06,3.386944e+06,7.486465e+07,7.486465e+07
27348,Óscar Zafra,1,5.600000e+06,5.600000e+06,18853164,1.885316e+07,18853164,1.885316e+07,-5.572234e+06,-5.572234e+06,1.325316e+07,1.325316e+07
27349,Ørjan Gamst,2,4.800978e+06,2.400489e+06,3354274,1.677137e+06,3354274,1.677137e+06,-4.716763e+06,-2.358382e+06,-1.446704e+06,-7.233521e+05
27350,Þorleifur Einarsson,1,1.942675e+06,1.942675e+06,1826583,1.826583e+06,1826583,1.826583e+06,-1.793425e+06,-1.793425e+06,-1.160920e+05,-1.160920e+05


In [38]:
df_actor = df_actor.drop(df_actor[df_actor.num < 3].index)
df_actor

Unnamed: 0,actor,num,all_budget,avg_budget,all_usa_income,avg_usa_income,all_global_income,avg_global_income,all_usa_profit,avg_usa_profit,all_global_profit,avg_global_profit
0,50 Cent,7,2.270000e+08,3.242857e+07,439019372,6.271705e+07,439019372,6.271705e+07,-3.319207e+07,-4.741724e+06,2.120194e+08,3.028848e+07
5,A.C. Peterson,5,2.080000e+08,4.160000e+07,204800090,4.096002e+07,204800090,4.096002e+07,-1.140146e+08,-2.280293e+07,-3.199910e+06,-6.399820e+05
14,AJ Bowen,5,6.750000e+06,1.350000e+06,28202493,5.640499e+06,28202493,5.640499e+06,1.215820e+07,2.431639e+06,2.145249e+07,4.290499e+06
21,Aamir Khan,8,5.917283e+07,7.396604e+06,474140146,5.926752e+07,474140146,5.926752e+07,-2.990473e+07,-3.738092e+06,4.149673e+08,5.187091e+07
39,Aaron Eckhart,23,1.075950e+09,4.678043e+07,2619989122,1.139126e+08,2619989122,1.139126e+08,2.200234e+08,9.566233e+06,1.544039e+09,6.713214e+07
...,...,...,...,...,...,...,...,...,...,...,...,...
27310,Zoë Bell,3,2.310000e+08,7.700000e+07,467353170,1.557844e+08,467353170,1.557844e+08,-6.273745e+07,-2.091248e+07,2.363532e+08,7.878439e+07
27312,Zoë Kravitz,15,1.276000e+09,8.506667e+07,3308124357,2.205416e+08,3308124357,2.205416e+08,7.212042e+07,4.808028e+06,2.032124e+09,1.354750e+08
27319,Zuleikha Robinson,3,1.395000e+08,4.650000e+07,149955125,4.998504e+07,149955125,4.998504e+07,-5.486172e+07,-1.828724e+07,1.045512e+07,3.485042e+06
27343,Ólafur Darri Ólafsson,3,3.600000e+08,1.200000e+08,872439794,2.908133e+08,872439794,2.908133e+08,-1.317048e+08,-4.390161e+07,5.124398e+08,1.708133e+08


In [39]:
df_genre.to_csv('df_genre.csv')

In [40]:
df_country.to_csv('df_country.csv')

In [41]:
df_actor.to_csv('df_actor.csv')

In [50]:
df_actor_sorted = df_actor.sort_values(by="avg_global_profit", ascending=False)

In [51]:
df_actor_sorted

Unnamed: 0,actor,num,all_budget,avg_budget,all_usa_income,avg_usa_income,all_global_income,avg_global_income,all_usa_profit,avg_usa_profit,all_global_profit,avg_global_profit
6799,Dileep Rao,3,4.270000e+08,1.423333e+08,3751066729,1.250356e+09,3751066729,1.250356e+09,6.681844e+08,2.227281e+08,3.324067e+09,1.108022e+09
8216,Eve,4,2.894123e+09,7.235308e+08,6623289041,1.655822e+09,6623289041,1.655822e+09,7.698612e+07,1.924653e+07,3.729166e+09,9.322915e+08
13074,Joonas Suotamo,3,7.950000e+08,2.650000e+08,3535293091,1.178431e+09,3535293091,1.178431e+09,8.706323e+08,2.902108e+08,2.740293e+09,9.134310e+08
20850,Pierre Coffin,4,2.990000e+08,7.475000e+07,3708122528,9.270306e+08,3708122528,9.270306e+08,9.212494e+08,2.303124e+08,3.409123e+09,8.522806e+08
5580,Daisy Ridley,5,9.420000e+08,1.884000e+08,5178964715,1.035793e+09,5178964715,1.035793e+09,1.348126e+09,2.696252e+08,4.236965e+09,8.473929e+08
...,...,...,...,...,...,...,...,...,...,...,...,...
1223,Amrita Acharia,3,1.231010e+08,4.103366e+07,33165159,1.105505e+07,33165159,1.105505e+07,-1.050525e+08,-3.501748e+07,-8.993582e+07,-2.997861e+07
15969,Lou Diamond Phillips,3,1.910000e+08,6.366667e+07,93942763,3.131425e+07,93942763,3.131425e+07,-1.336011e+08,-4.453371e+07,-9.705724e+07,-3.235241e+07
1628,Angela Sarafyan,3,1.410000e+08,4.700000e+07,35457825,1.181928e+07,35457825,1.181928e+07,-1.233617e+08,-4.112058e+07,-1.055422e+08,-3.518072e+07
22432,Rose McGowan,4,2.560000e+08,6.400000e+07,109020098,2.725502e+07,109020098,2.725502e+07,-1.918608e+08,-4.796519e+07,-1.469799e+08,-3.674498e+07


In [53]:
df_actor_simple = df_actor_sorted[:500]
df_actor_simple

Unnamed: 0,actor,num,all_budget,avg_budget,all_usa_income,avg_usa_income,all_global_income,avg_global_income,all_usa_profit,avg_usa_profit,all_global_profit,avg_global_profit
6799,Dileep Rao,3,4.270000e+08,1.423333e+08,3751066729,1.250356e+09,3751066729,1.250356e+09,6.681844e+08,2.227281e+08,3.324067e+09,1.108022e+09
8216,Eve,4,2.894123e+09,7.235308e+08,6623289041,1.655822e+09,6623289041,1.655822e+09,7.698612e+07,1.924653e+07,3.729166e+09,9.322915e+08
13074,Joonas Suotamo,3,7.950000e+08,2.650000e+08,3535293091,1.178431e+09,3535293091,1.178431e+09,8.706323e+08,2.902108e+08,2.740293e+09,9.134310e+08
20850,Pierre Coffin,4,2.990000e+08,7.475000e+07,3708122528,9.270306e+08,3708122528,9.270306e+08,9.212494e+08,2.303124e+08,3.409123e+09,8.522806e+08
5580,Daisy Ridley,5,9.420000e+08,1.884000e+08,5178964715,1.035793e+09,5178964715,1.035793e+09,1.348126e+09,2.696252e+08,4.236965e+09,8.473929e+08
...,...,...,...,...,...,...,...,...,...,...,...,...
4161,Cate Blanchett,32,2.174400e+09,6.795000e+07,8495900728,2.654969e+08,8495900728,2.654969e+08,9.456306e+08,2.955096e+07,6.321501e+09,1.975469e+08
10853,Jada Pinkett Smith,10,6.270000e+08,6.270000e+07,2600826915,2.600827e+08,2600826915,2.600827e+08,3.994747e+08,3.994747e+07,1.973827e+09,1.973827e+08
25631,Tom Gallop,4,2.950000e+08,7.375000e+07,1083845745,2.709614e+08,1083845745,2.709614e+08,2.395483e+08,5.988706e+07,7.888457e+08,1.972114e+08
6710,Dexter Darden,4,1.690000e+08,4.225000e+07,957674896,2.394187e+08,957674896,2.394187e+08,8.135305e+07,2.033826e+07,7.886749e+08,1.971687e+08


In [90]:
df_actor_simple = df_actor.drop(df_actor[df_actor.avg_global_profit <= 2e+08].index)
df_actor_simple

Unnamed: 0,actor,num,all_budget,avg_budget,all_usa_income,avg_usa_income,all_global_income,avg_global_income,all_usa_profit,avg_usa_profit,all_global_profit,avg_global_profit
76,Aaron Taylor-Johnson,8,6.020000e+08,7.525000e+07,2374263594,2.967829e+08,2374263594,2.967829e+08,2.929402e+08,3.661752e+07,1.772264e+09,2.215329e+08
102,Abby Ryder Fortson,4,3.115000e+08,7.787500e+07,1234152669,3.085382e+08,1234152669,3.085382e+08,1.245093e+08,3.112734e+07,9.226527e+08,2.306632e+08
175,Adam Brown,4,8.980000e+08,2.245000e+08,3788733757,9.471834e+08,3788733757,9.471834e+08,1.510692e+08,3.776730e+07,2.890734e+09,7.226834e+08
189,Adam Driver,13,1.020672e+09,7.851321e+07,4763175634,3.663981e+08,4763175634,3.663981e+08,1.200400e+09,9.233847e+07,3.742504e+09,2.878849e+08
310,Adrian Rawlins,6,3.621010e+08,6.035017e+07,1975461762,3.292436e+08,1975461762,3.292436e+08,2.792756e+08,4.654592e+07,1.613361e+09,2.688935e+08
...,...,...,...,...,...,...,...,...,...,...,...,...
26935,Yahya Abdul-Mateen II,4,3.330000e+08,8.325000e+07,2016613620,5.041534e+08,2016613620,5.041534e+08,4.095467e+08,1.023867e+08,1.683614e+09,4.209034e+08
27180,Zachary Levi,5,6.450000e+08,1.290000e+08,2187879494,4.375759e+08,2187879494,4.375759e+08,1.923363e+08,3.846726e+07,1.542879e+09,3.085759e+08
27225,Zazie Beetz,3,2.850000e+08,9.500000e+07,2081645650,6.938819e+08,2081645650,6.938819e+08,4.087432e+08,1.362477e+08,1.796646e+09,5.988819e+08
27237,Zendaya,4,4.990000e+08,1.247500e+08,2661221426,6.653054e+08,2661221426,6.653054e+08,4.833135e+08,1.208284e+08,2.162221e+09,5.405554e+08


In [91]:
df_actor_simple.to_csv('df_actor_simple.csv')

In [17]:
df_ctry = pd.read_csv('./data/country.csv')

In [18]:
df_ctry.rename(columns = {'name':'country'}, inplace = True)

In [19]:
df_ctry

Unnamed: 0,country,code,num,Unnamed: 3
0,Antigua and Barbuda,ATG,0,
1,Algeria,DZA,1,
2,Azerbaijan,AZE,0,
3,Albania,ALB,0,
4,Armenia,ARM,0,
...,...,...,...,...
241,Saint Barthelemy,BLM,0,
242,Guernsey,GGY,0,
243,Jersey,JEY,0,
244,South Georgia South Sandwich Islands,SGS,0,


In [20]:
new_df = df_ctry.append(df_country)
new_df = df_country.merge(df_ctry[['country','code']], on='country', how='left')

In [21]:
new_df

Unnamed: 0,country,num,all_budget,avg_budget,all_usa_income,avg_usa_income,all_global_income,avg_global_income,all_usa_profit,avg_usa_profit,all_global_profit,avg_global_profit,code
0,Afghanistan,1,2.000000e+07,2.000000e+07,75011029,7.501103e+07,75011029,7.501103e+07,-4.199922e+06,-4.199922e+06,5.501103e+07,5.501103e+07,AFG
1,Algeria,1,1.609645e+07,1.609645e+07,22963701,2.296370e+07,22963701,2.296370e+07,-1.577575e+07,-1.577575e+07,6.867251e+06,6.867251e+06,DZA
2,Angola,1,5.000000e+05,5.000000e+05,101729,1.017290e+05,101729,1.017290e+05,-3.996300e+05,-3.996300e+05,-3.982710e+05,-3.982710e+05,AGO
3,Argentina,17,1.622324e+08,9.543082e+06,391307270,2.301807e+07,391307270,2.301807e+07,-8.159453e+07,-4.799678e+06,2.290749e+08,1.347499e+07,ARG
4,Australia,125,7.064687e+09,5.651750e+07,19705530424,1.576442e+08,19705530424,1.576442e+08,6.385102e+08,5.108081e+06,1.264084e+10,1.011267e+08,AUS
...,...,...,...,...,...,...,...,...,...,...,...,...,...
83,USA,3150,1.457051e+11,4.625559e+07,437428911220,1.388663e+08,437428911220,1.388663e+08,4.255919e+10,1.351085e+07,2.917238e+11,9.261073e+07,
84,Ukraine,1,3.000000e+07,3.000000e+07,108979549,1.089795e+08,108979549,1.089795e+08,1.715062e+06,1.715062e+06,7.897955e+07,7.897955e+07,UKR
85,United Arab Emirates,28,1.625700e+09,5.806071e+07,4843282289,1.729744e+08,4843282289,1.729744e+08,5.575554e+07,1.991269e+06,3.217582e+09,1.149137e+08,ARE
86,Venezuela,1,1.400000e+06,1.400000e+06,3217176,3.217176e+06,3217176,3.217176e+06,-7.075100e+04,-7.075100e+04,1.817176e+06,1.817176e+06,VEN


In [22]:
new_df.to_csv('df_country_code.csv')

## Get Pictures

In [137]:
from selenium import webdriver
import time
import os
import re
import requests
from PIL import Image
from PIL import UnidentifiedImageError
from io import BytesIO
from selenium.webdriver.common.by import By
from selenium import webdriver
from webdriver_manager.chrome import ChromeDriverManager
#!pip install webdriver-manager

class Crawler_google_images:
    # 初始化
    def __init__(self, keyword, browser):
        self.url = 'https://www.bing.com/images/search?q='+keyword +' film poster'+'&tbm=isch'
        self.keyword = keyword
        self.browser = browser

    # 获得Chrome驱动，并访问url
    def init_browser(self):
        # 访问url
        browser.get(self.url)
        # 最大化窗口，之后需要爬取窗口中所见的所有图片
        browser.maximize_window()
        return browser

    #下载图片
    def get_image_url(self, browser,round=2):
        count = 0 #图片序号
        pos = 0
        for i in range(round):
            pos += 500
            # 向下滑动
            js = 'var q=document.documentElement.scrollTop=%d'%pos
            browser.execute_script(js)
            time.sleep(1)
            # 找到图片
            # html = browser.page_source#也可以抓取当前页面的html文本，然后用beautifulsoup来抓取
            #直接通过tag_name来抓取是最简单的，比较方便

            img_elements = browser.find_elements(By.CLASS_NAME, 'iusc')
            # print(img_elements)
            for img_element in img_elements:
                m = img_element.get_attribute('m')
                # print(m)
                try:
                    img_url = re.findall("murl\":\"https?://[^\s]*.jpg", m)
                    # print(img_url[0][7:])
                    img_url = img_url[0][7:]
                except (TypeError, IndexError):
                    print("No match for img_url, finding next")
                    continue
                try:
                    r = requests.get(img_url)
                    img = Image.open(BytesIO(r.content))
                    width, height = img.size
                    # print("width: ", width , ", height: ", height)
                    if(height>width):
                        return img_url
                except UnidentifiedImageError:
                    print("UnidentifiedImageError, finding next")
                    continue
                

                #防止反爬机制
                time.sleep(0.2)

    def run(self):
        browser = self.init_browser()
        image_url = self.get_image_url(browser,10)#可以修改爬取的页面数，基本10页是100多张图片
        # browser.close()
        return image_url


def fetchImage(browser, keyword):
    craw = Crawler_google_images(keyword, browser)
    return craw.run()

In [139]:
chrome_options = webdriver.ChromeOptions()
chrome_options.add_argument("--disable-infobars")
#browser = webdriver.Chrome(chrome_options=chrome_options)
browser = webdriver.Chrome(ChromeDriverManager().install())
key = 'Days of Glory'
image_url = fetchImage(browser, key)
image_url




[WDM] - Current google-chrome version is 102.0.5005
[WDM] - Get LATEST chromedriver version for 102.0.5005 google-chrome
[WDM] - Driver [/Users/lilyw/.wdm/drivers/chromedriver/mac64/102.0.5005.61/chromedriver] found in cache


UnidentifiedImageError, finding next


'https://cdn.shopify.com/s/files/1/0747/3829/products/HP2565_6434f90b-994d-4c02-96c7-67341b610e81_1024x1024.jpg'

In [179]:
chrome_options = webdriver.ChromeOptions()
chrome_options.add_argument("--disable-infobars")
browser = webdriver.Chrome(ChromeDriverManager().install())

films = df_genre["best_film"]

genre_poster_urls = []
#actor_image_urls = df_actor_simple[df_actor_simple["actor"].apply(lambda x: x in train_user_id)]
for film in films:
    key = film
    try:
        genre_poster_url = fetchImage(browser, key)
    except:
        genre_poster_urls = ''
    genre_poster_urls.append(genre_poster_url)

genre_poster_urls




[WDM] - Current google-chrome version is 102.0.5005
[WDM] - Get LATEST chromedriver version for 102.0.5005 google-chrome
[WDM] - Driver [/Users/lilyw/.wdm/drivers/chromedriver/mac64/102.0.5005.61/chromedriver] found in cache


No match for img_url, finding next
No match for img_url, finding next
No match for img_url, finding next
No match for img_url, finding next
UnidentifiedImageError, finding next
No match for img_url, finding next
No match for img_url, finding next
No match for img_url, finding next
No match for img_url, finding next


['https://cdn.shopify.com/s/files/1/1057/4964/products/avengers-endgame-vintage-movie-poster-original-1-sheet-27x41.jpg',
 'https://cdn.shopify.com/s/files/1/1057/4964/products/avengers-endgame-vintage-movie-poster-original-1-sheet-27x41.jpg',
 'http://image.tmdb.org/t/p/original/xmdlYgWRFKaEHUMyPV6U7p5dInS.jpg',
 'https://assets.smoothradio.com/2018/19/bohemian-rhapsody-poster-1526389920.jpg',
 'http://image.tmdb.org/t/p/original/lLieiMb0bQ0Ys2Vwkc2vHrXTY5d.jpg',
 'https://gfx.videobuster.de/archive/v/cdsYRfGf0G-Rcjswjm-4Uhgcz0lMkawsCUyRqclMkZpbWGZJTJGanBlZyUyRmb8tqpmZvRmYmNlYmNm-7c5ZWVj8Lg0ZC5qcGcmcj137zg/go-fast-poster.jpg',
 'https://cdn.shopify.com/s/files/1/1057/4964/products/avengers-endgame-vintage-movie-poster-original-1-sheet-27x41.jpg',
 'https://www.elettrostar.com/wp-content/uploads/2017/06/La-Bella-e-la-Bestia-2017.jpg',
 'https://image.tmdb.org/t/p/original/jRXYjXNq0Cs2TcJjLkki24MLp7u.jpg',
 'https://cdn.traileraddict.com/content/warner-bros-pictures/dunkirk-poster-7.jpg

In [192]:
df_genre["poster"] = genre_poster_urls
df_genre

Unnamed: 0,genre,num,all_budget,avg_budget,all_usa_income,avg_usa_income,all_global_income,avg_global_income,all_usa_profit,avg_usa_profit,all_global_profit,avg_global_profit,best_film,poster
0,Action,1052,72294870000.0,68721360.0,209480432812,199125900.0,209480432812,199125900.0,8510990000.0,8090294.0,137185600000.0,130404500.0,Avengers: Endgame,https://cdn.shopify.com/s/files/1/1057/4964/pr...
1,Adventure,782,71629810000.0,91598230.0,240432699372,307458700.0,240432699372,307458700.0,16821700000.0,21511120.0,168802900000.0,215860500.0,Avengers: Endgame,https://cdn.shopify.com/s/files/1/1057/4964/pr...
2,Animation,228,19269180000.0,84513950.0,70516176338,309281500.0,70516176338,309281500.0,7340647000.0,32195820.0,51247000000.0,224767500.0,Il Re Leone,http://image.tmdb.org/t/p/original/xmdlYgWRFKa...
3,Biography,303,8006258000.0,26423300.0,20750333642,68482950.0,20750333642,68482950.0,1728507000.0,5704642.0,12744080000.0,42059650.0,Bohemian Rhapsody,https://assets.smoothradio.com/2018/19/bohemia...
4,Comedy,1395,52220240000.0,37433860.0,155103005998,111185000.0,155103005998,111185000.0,20433920000.0,14647970.0,102882800000.0,73751090.0,Frozen II - Il segreto di Arendelle,http://image.tmdb.org/t/p/original/lLieiMb0bQ0...
5,Crime,684,20854510000.0,30489060.0,46151947427,67473610.0,46151947427,67473610.0,1491699000.0,2180847.0,25297430000.0,36984550.0,Fast & Furious 8,https://gfx.videobuster.de/archive/v/cdsYRfGf0...
6,Drama,1936,53023050000.0,27387940.0,138467806142,71522630.0,138467806142,71522630.0,8586289000.0,4435067.0,85444750000.0,44134690.0,Avengers: Endgame,https://cdn.shopify.com/s/files/1/1057/4964/pr...
7,Family,227,15153770000.0,66756680.0,46611967857,205339100.0,46611967857,205339100.0,4474093000.0,19709660.0,31458200000.0,138582400.0,La bella e la bestia,https://www.elettrostar.com/wp-content/uploads...
8,Fantasy,329,21514550000.0,65393760.0,66551536395,202284300.0,66551536395,202284300.0,3516603000.0,10688760.0,45036990000.0,136890500.0,Avatar,https://image.tmdb.org/t/p/original/jRXYjXNq0C...
9,History,126,4403455000.0,34948060.0,9392674795,74545040.0,9392674795,74545040.0,-271594900.0,-2155515.0,4989220000.0,39596980.0,Dunkirk,https://cdn.traileraddict.com/content/warner-b...


In [193]:
df_genre.to_csv('df_genre_new.csv')

In [194]:
chrome_options = webdriver.ChromeOptions()
chrome_options.add_argument("--disable-infobars")
browser = webdriver.Chrome(ChromeDriverManager().install())

films = df_country_code["best_film"]

cty_poster_urls = []
#actor_image_urls = df_actor_simple[df_actor_simple["actor"].apply(lambda x: x in train_user_id)]
for film in films:
    key = film
    try:
        cty_poster_url = fetchImage(browser, key)
    except:
        cty_poster_url = ''
    cty_poster_urls.append(cty_poster_url)

cty_poster_urls




[WDM] - Current google-chrome version is 102.0.5005
[WDM] - Get LATEST chromedriver version for 102.0.5005 google-chrome
[WDM] - Driver [/Users/lilyw/.wdm/drivers/chromedriver/mac64/102.0.5005.61/chromedriver] found in cache


UnidentifiedImageError, finding next
No match for img_url, finding next
No match for img_url, finding next
No match for img_url, finding next
No match for img_url, finding next
No match for img_url, finding next
No match for img_url, finding next
No match for img_url, finding next
No match for img_url, finding next
UnidentifiedImageError, finding next
No match for img_url, finding next
No match for img_url, finding next
No match for img_url, finding next
No match for img_url, finding next
UnidentifiedImageError, finding next
UnidentifiedImageError, finding next
No match for img_url, finding next
No match for img_url, finding next
No match for img_url, finding next
No match for img_url, finding next
UnidentifiedImageError, finding next
No match for img_url, finding next
No match for img_url, finding next
No match for img_url, finding next
No match for img_url, finding next
UnidentifiedImageError, finding next
No match for img_url, finding next
UnidentifiedImageError, finding next
Uniden

['https://aforismi.meglio.it/img/film/il-cacciatore-di-aquiloni.jpg',
 'https://cdn.shopify.com/s/files/1/0747/3829/products/HP2565_6434f90b-994d-4c02-96c7-67341b610e81_1024x1024.jpg',
 'https://i.pinimg.com/736x/56/9a/3c/569a3cb545ee2040e5ff8215e27593c0--tire-movie-movie-tv.jpg',
 'http://www.nerdsrevenge.it/wp-content/uploads/2015/03/Focus-Niente-e-come-sembra.jpg',
 'http://hdqwalls.com/download/aquaman-2018-movie-poster-1z-2160x3840.jpg',
 'https://fr.web.img5.acsta.net/pictures/15/10/06/15/23/066601.jpg',
 'https://picfiles.alphacoders.com/349/349733.jpg',
 'http://images2.fanpop.com/images/photos/8200000/Official-Poster-new-york-i-love-you-8236556-1728-2560.jpg',
 'http://aforismi.meglio.it/img/film/I_puffi.jpg',
 'https://m.media-amazon.com/images/I/51ydmeqR93L.jpg',
 'https://gfx.videobuster.de/archive/v/cdsYRfGf0G-Rcjswjm-4Uhgcz0lMkawsCUyRqclMkZpbWGZJTJGanBlZyUyRmb8tqpmZvRmYmNlYmNm-7c5ZWVj8Lg0ZC5qcGcmcj137zg/go-fast-poster.jpg',
 'https://thmoviehdd.com/wp-content/uploads/2019

In [196]:
df_country_code["poster"] = cty_poster_urls
df_country_code

Unnamed: 0,country,num,all_budget,avg_budget,all_usa_income,avg_usa_income,all_global_income,avg_global_income,all_usa_profit,avg_usa_profit,all_global_profit,avg_global_profit,code,best_film,poster
0,Afghanistan,1,2.000000e+07,2.000000e+07,75011029,7.501103e+07,75011029,7.501103e+07,-4.199922e+06,-4.199922e+06,5.501103e+07,5.501103e+07,AFG,Il cacciatore di aquiloni,https://aforismi.meglio.it/img/film/il-cacciat...
1,Algeria,1,1.609645e+07,1.609645e+07,22963701,2.296370e+07,22963701,2.296370e+07,-1.577575e+07,-1.577575e+07,6.867251e+06,6.867251e+06,DZA,Days of Glory,https://cdn.shopify.com/s/files/1/0747/3829/pr...
2,Angola,1,5.000000e+05,5.000000e+05,101729,1.017290e+05,101729,1.017290e+05,-3.996300e+05,-3.996300e+05,-3.982710e+05,-3.982710e+05,AGO,Rubber,https://i.pinimg.com/736x/56/9a/3c/569a3cb545e...
3,Argentina,17,1.622324e+08,9.543082e+06,391307270,2.301807e+07,391307270,2.301807e+07,-8.159453e+07,-4.799678e+06,2.290749e+08,1.347499e+07,ARG,Focus - Niente è come sembra,http://www.nerdsrevenge.it/wp-content/uploads/...
4,Australia,125,7.064687e+09,5.651750e+07,19705530424,1.576442e+08,19705530424,1.576442e+08,6.385102e+08,5.108081e+06,1.264084e+10,1.011267e+08,AUS,Aquaman,http://hdqwalls.com/download/aquaman-2018-movi...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
83,USA,3150,1.457051e+11,4.625559e+07,437428911220,1.388663e+08,437428911220,1.388663e+08,4.255919e+10,1.351085e+07,2.917238e+11,9.261073e+07,USA,Avengers: Endgame,https://cdn.shopify.com/s/files/1/1057/4964/pr...
84,Ukraine,1,3.000000e+07,3.000000e+07,108979549,1.089795e+08,108979549,1.089795e+08,1.715062e+06,1.715062e+06,7.897955e+07,7.897955e+07,UKR,Transporter 3,http://www.geourdufilm.com/wp-content/uploads/...
85,United Arab Emirates,28,1.625700e+09,5.806071e+07,4843282289,1.729744e+08,4843282289,1.729744e+08,5.575554e+07,1.991269e+06,3.217582e+09,1.149137e+08,ARE,Fast & Furious 7,https://gfx.videobuster.de/archive/v/cdsYRfGf0...
86,Venezuela,1,1.400000e+06,1.400000e+06,3217176,3.217176e+06,3217176,3.217176e+06,-7.075100e+04,-7.075100e+04,1.817176e+06,1.817176e+06,VEN,El abrazo de la serpiente,http://www.trigon-film.org/en/shop/Posters_One...


In [197]:
df_country_code.to_csv('df_country_code.csv')

In [None]:
chrome_options = webdriver.ChromeOptions()
chrome_options.add_argument("--disable-infobars")
#browser = webdriver.Chrome(chrome_options=chrome_options)
browser = webdriver.Chrome(ChromeDriverManager().install())
key = 'Aaron Taylor-Johnson'
image_url = fetchImage(browser, key)
image_url

In [114]:
chrome_options = webdriver.ChromeOptions()
chrome_options.add_argument("--disable-infobars")
browser = webdriver.Chrome(ChromeDriverManager().install())

actor_names = df_actor_simple["actor"]

actor_poster_urls = []
#actor_image_urls = df_actor_simple[df_actor_simple["actor"].apply(lambda x: x in train_user_id)]
for actor_name in actor_names:
    key = actor_name
    try:
        actor_poster_url = fetchImage(browser, key)
    except:
        actor_poster_url = ''
    actor_image_urls.append(actor_poster_url)

actor_poster_urls




[WDM] - Current google-chrome version is 102.0.5005
[WDM] - Get LATEST chromedriver version for 102.0.5005 google-chrome
[WDM] - Driver [/Users/lilyw/.wdm/drivers/chromedriver/mac64/102.0.5005.61/chromedriver] found in cache


No match for img_url, finding next
UnidentifiedImageError, finding next
No match for img_url, finding next
No match for img_url, finding next
No match for img_url, finding next
No match for img_url, finding next
UnidentifiedImageError, finding next
No match for img_url, finding next
No match for img_url, finding next
No match for img_url, finding next
UnidentifiedImageError, finding next
No match for img_url, finding next
UnidentifiedImageError, finding next
UnidentifiedImageError, finding next
UnidentifiedImageError, finding next
No match for img_url, finding next
UnidentifiedImageError, finding next
No match for img_url, finding next
UnidentifiedImageError, finding next
No match for img_url, finding next
No match for img_url, finding next
No match for img_url, finding next
No match for img_url, finding next
No match for img_url, finding next
No match for img_url, finding next
No match for img_url, finding next
UnidentifiedImageError, finding next
UnidentifiedImageError, finding next


No match for img_url, finding next
UnidentifiedImageError, finding next
No match for img_url, finding next
No match for img_url, finding next
No match for img_url, finding next
UnidentifiedImageError, finding next
UnidentifiedImageError, finding next
No match for img_url, finding next
No match for img_url, finding next
No match for img_url, finding next
No match for img_url, finding next
No match for img_url, finding next
UnidentifiedImageError, finding next
No match for img_url, finding next
No match for img_url, finding next
No match for img_url, finding next
UnidentifiedImageError, finding next
No match for img_url, finding next
UnidentifiedImageError, finding next
No match for img_url, finding next
No match for img_url, finding next
No match for img_url, finding next
No match for img_url, finding next
No match for img_url, finding next
No match for img_url, finding next
No match for img_url, finding next
No match for img_url, finding next
No match for img_url, finding next
No match

[]

In [106]:
chrome_options = webdriver.ChromeOptions()
chrome_options.add_argument("--disable-infobars")
browser = webdriver.Chrome(ChromeDriverManager().install())

actor_names = df_actor_simple["actor"]

actor_image_urls = []
#actor_image_urls = df_actor_simple[df_actor_simple["actor"].apply(lambda x: x in train_user_id)]
for actor_name in actor_names:
    key = actor_name + " photo"
    try:
        actor_image_url = fetchImage(browser, key)
    except:
        actor_image_url = ''
    actor_image_urls.append(actor_image_url)

actor_image_urls




[WDM] - Current google-chrome version is 102.0.5005
[WDM] - Get LATEST chromedriver version for 102.0.5005 google-chrome
[WDM] - Driver [/Users/lilyw/.wdm/drivers/chromedriver/mac64/102.0.5005.61/chromedriver] found in cache


No match for img_url, finding next
No match for img_url, finding next
UnidentifiedImageError, finding next
No match for img_url, finding next
No match for img_url, finding next
UnidentifiedImageError, finding next
No match for img_url, finding next
No match for img_url, finding next
No match for img_url, finding next
UnidentifiedImageError, finding next
UnidentifiedImageError, finding next
No match for img_url, finding next
No match for img_url, finding next
No match for img_url, finding next
No match for img_url, finding next
No match for img_url, finding next
No match for img_url, finding next
No match for img_url, finding next
No match for img_url, finding next
No match for img_url, finding next
No match for img_url, finding next
No match for img_url, finding next
No match for img_url, finding next
UnidentifiedImageError, finding next
UnidentifiedImageError, finding next
No match for img_url, finding next
No match for img_url, finding next
No match for img_url, finding next
Unidenti

['https://fr.web.img2.acsta.net/pictures/19/11/26/11/16/1949279.jpg',
 'https://filmvf.info/wp-content/uploads/2018/07/xJcNxy1RV8E0mX2Kr0BMjgzMmfu.jpg',
 'http://www1.pictures.zimbio.com/gi/Premiere+Disney+Jerry+Bruckheimer+Films+Pirates+v-k3TLahISux.jpg',
 'https://celebdonut.com/wp-content/uploads/2019/10/adam-driver-attends-the-report-premiere-during-the-63rd-bfi-london-film-festival-in-london-4.jpg',
 'https://superstarsbio.com/wp-content/uploads/2020/02/Adrian-Rawlins.jpg',
 'https://fr.web.img5.acsta.net/pictures/17/07/13/10/56/166119.jpg',
 'https://i.pinimg.com/736x/12/4c/78/124c78fa38400301679c1b1cd89f836f.jpg',
 'https://i.pinimg.com/originals/9a/6c/13/9a6c13ba1b93e0ffb58b9d6c6bf76191.jpg',
 'http://fr.web.img2.acsta.net/pictures/16/07/19/16/33/125610.jpg',
 'https://fr.web.img4.acsta.net/pictures/15/10/28/15/24/352939.jpg',
 'http://www.cinemapassion.com/photos-personnalites/Alan-Tudyk-photo-4641.jpg',
 'https://fr.web.img3.acsta.net/pictures/19/08/21/21/32/0609816.jpg',
 'h

In [107]:
len(actor_image_urls)

483

In [108]:
df_actor_simple["actor_image"] = actor_image_urls

In [None]:
df_actor_simple["actor_poster"] = actor_poster_urls

In [109]:
df_actor_simple

Unnamed: 0,actor,num,all_budget,avg_budget,all_usa_income,avg_usa_income,all_global_income,avg_global_income,all_usa_profit,avg_usa_profit,all_global_profit,avg_global_profit,actor_image
76,Aaron Taylor-Johnson,8,6.020000e+08,7.525000e+07,2374263594,2.967829e+08,2374263594,2.967829e+08,2.929402e+08,3.661752e+07,1.772264e+09,2.215329e+08,https://fr.web.img2.acsta.net/pictures/19/11/2...
102,Abby Ryder Fortson,4,3.115000e+08,7.787500e+07,1234152669,3.085382e+08,1234152669,3.085382e+08,1.245093e+08,3.112734e+07,9.226527e+08,2.306632e+08,https://filmvf.info/wp-content/uploads/2018/07...
175,Adam Brown,4,8.980000e+08,2.245000e+08,3788733757,9.471834e+08,3788733757,9.471834e+08,1.510692e+08,3.776730e+07,2.890734e+09,7.226834e+08,http://www1.pictures.zimbio.com/gi/Premiere+Di...
189,Adam Driver,13,1.020672e+09,7.851321e+07,4763175634,3.663981e+08,4763175634,3.663981e+08,1.200400e+09,9.233847e+07,3.742504e+09,2.878849e+08,https://celebdonut.com/wp-content/uploads/2019...
310,Adrian Rawlins,6,3.621010e+08,6.035017e+07,1975461762,3.292436e+08,1975461762,3.292436e+08,2.792756e+08,4.654592e+07,1.613361e+09,2.688935e+08,https://superstarsbio.com/wp-content/uploads/2...
...,...,...,...,...,...,...,...,...,...,...,...,...,...
26935,Yahya Abdul-Mateen II,4,3.330000e+08,8.325000e+07,2016613620,5.041534e+08,2016613620,5.041534e+08,4.095467e+08,1.023867e+08,1.683614e+09,4.209034e+08,http://fr.web.img6.acsta.net/pictures/19/10/21...
27180,Zachary Levi,5,6.450000e+08,1.290000e+08,2187879494,4.375759e+08,2187879494,4.375759e+08,1.923363e+08,3.846726e+07,1.542879e+09,3.085759e+08,https://fr.web.img2.acsta.net/medias/nmedia/18...
27225,Zazie Beetz,3,2.850000e+08,9.500000e+07,2081645650,6.938819e+08,2081645650,6.938819e+08,4.087432e+08,1.362477e+08,1.796646e+09,5.988819e+08,https://www.hawtcelebs.com/wp-content/uploads/...
27237,Zendaya,4,4.990000e+08,1.247500e+08,2661221426,6.653054e+08,2661221426,6.653054e+08,4.833135e+08,1.208284e+08,2.162221e+09,5.405554e+08,https://fr.web.img4.acsta.net/pictures/19/12/2...


In [110]:
df_actor_simple.to_csv('df_actor_simple.csv')