Extracting features of 2020 movies from Wikipedia

In [1]:
#importing modules
import pandas as pd
import numpy as np
import requests
import bs4 as bs
import urllib.request

In [2]:
def dataset(link):
  source = urllib.request.urlopen(link).read()
  soup = bs.BeautifulSoup(source,'lxml')

  tables = soup.find_all('table',class_='wikitable sortable')

  df_1 = pd.read_html(str(tables[0]))[0]
  df_2 = pd.read_html(str(tables[1]))[0]
  df_3 = pd.read_html(str(tables[2]))[0]
  df_4 = pd.read_html(str(tables[3]).replace("'1\"\'",'"1"'))[0] # avoided "ValueError: invalid literal for int() with base 10: '1"'
  df = df_1.append(df_2.append(df_3.append(df_4,ignore_index=True),ignore_index=True),ignore_index=True)
  
  final_df_2020 = df[['Title','Cast and crew']]
  return final_df_2020

In [3]:
link1 = "https://en.wikipedia.org/wiki/List_of_American_films_of_2020"
final_df_2020 = dataset(link1)
final_df_2020

Unnamed: 0,Title,Cast and crew
0,The Grudge,Nicolas Pesce (director/screenplay); Andrea Ri...
1,Underwater,"William Eubank (director); Brian Duffield, Ada..."
2,Like a Boss,"Miguel Arteta (director); Sam Pitman, Adam Col..."
3,Three Christs,Jon Avnet (director/screenplay); Eric Nazarian...
4,Inherit the Viper,Anthony Jerjen (director); Andrew Crabtree (sc...
...,...,...
269,We Can Be Heroes,Robert Rodriguez (director/screenplay); Priyan...
270,News of the World,Paul Greengrass (director/screenplay); Luke Da...
271,One Night in Miami...,Regina King (director); Kemp Powers (screenpla...
272,Promising Young Woman,Emerald Fennell (director/screenplay); Carey M...


In [4]:
from tmdbv3api import TMDb, Movie
import json
import requests

tmdb = TMDb()
tmdb_movie = Movie()
tmdb.api_key = '22eb0026986397f50a2bbd92a1c791b9'

In [5]:
#getting genres from tmbd api
def get_genre(x):
  genres = []
  result = tmdb_movie.search(x)
  if not result:
    return np.NaN
  else:
    movie_id = result[0].id
    response = requests.get('https://api.themoviedb.org/3/movie/{}?api_key={}'.format(movie_id,tmdb.api_key))
    data_json = response.json()
    if data_json['genres']:
        for i in range(0,len(data_json['genres'])):
            genres.append(data_json['genres'][i]['name'])
        return (" ".join(genres))
    else:
        return np.NaN
final_df_2020['genres'] = final_df_2020['Title'].map(lambda x: get_genre(str(x)))    
df_2020 = final_df_2020[['Title','Cast and crew','genres']]
df_2020

Unnamed: 0,Title,Cast and crew,genres
0,The Grudge,Nicolas Pesce (director/screenplay); Andrea Ri...,Horror Mystery Thriller
1,Underwater,"William Eubank (director); Brian Duffield, Ada...",Action Horror Science Fiction Thriller
2,Like a Boss,"Miguel Arteta (director); Sam Pitman, Adam Col...",Comedy
3,Three Christs,Jon Avnet (director/screenplay); Eric Nazarian...,Drama
4,Inherit the Viper,Anthony Jerjen (director); Andrew Crabtree (sc...,Crime Thriller Drama
...,...,...,...
269,We Can Be Heroes,Robert Rodriguez (director/screenplay); Priyan...,Action Fantasy Family Comedy
270,News of the World,Paul Greengrass (director/screenplay); Luke Da...,Drama Western Adventure Action
271,One Night in Miami...,Regina King (director); Kemp Powers (screenpla...,Drama
272,Promising Young Woman,Emerald Fennell (director/screenplay); Carey M...,Thriller Crime Drama


In [6]:
#getting director
def get_director(x):
      if " (director)" in x:
          return x.split(" (director)")[0]
      elif " (directors)" in x:
          return x.split(" (directors)")[0]
      else:
          return x.split(" (director/screenplay)")[0]
          
df_2020['director_name'] = df_2020['Cast and crew'].map(lambda x: get_director(x))

In [7]:
#getting actor 1
def get_actor1(x):
    return ((x.split("screenplay); ")[-1]).split(", ")[0])
    
df_2020['actor_1_name'] = df_2020['Cast and crew'].map(lambda x: get_actor1(x))

#getting actor 2
def get_actor2(x):
  if len((x.split("screenplay); ")[-1]).split(", ")) < 2:
    return np.NaN
  else:
    return ((x.split("screenplay); ")[-1]).split(", ")[1])
      
df_2020['actor_2_name'] = df_2020['Cast and crew'].map(lambda x: get_actor2(x))

#getting actor 3
def get_actor3(x):
  if len((x.split("screenplay); ")[-1]).split(", ")) < 3:
    return np.NaN
  else:
    return ((x.split("screenplay); ")[-1]).split(", ")[2])
      
df_2020['actor_3_name'] = df_2020['Cast and crew'].map(lambda x: get_actor3(x))

In [8]:
#considering only the important features
df_2020 = df_2020.rename(columns={'Title':'movie_title'})
df_2020 = df_2020[['director_name','actor_1_name','actor_2_name','actor_3_name','genres','movie_title']]

In [9]:
df_2020.isnull().sum()

director_name     0
actor_1_name      0
actor_2_name      5
actor_3_name     28
genres            3
movie_title       0
dtype: int64

In [10]:
#dropping null values
df_2020 = df_2020.dropna(how='any')

In [11]:
df_2020['movie_title'] = df_2020['movie_title'].str.lower()
df_2020['comb'] = df_2020['actor_1_name'] + ' ' + df_2020['actor_2_name'] + ' '+ df_2020['actor_3_name'] + ' '+ df_2020['director_name'] +' ' + df_2020['genres']
df_2020

Unnamed: 0,director_name,actor_1_name,actor_2_name,actor_3_name,genres,movie_title,comb
0,Nicolas Pesce,Andrea Riseborough,Demián Bichir,John Cho,Horror Mystery Thriller,the grudge,Andrea Riseborough Demián Bichir John Cho Nico...
1,William Eubank,Kristen Stewart,Vincent Cassel,Jessica Henwick,Action Horror Science Fiction Thriller,underwater,Kristen Stewart Vincent Cassel Jessica Henwick...
2,Miguel Arteta,Tiffany Haddish,Rose Byrne,Salma Hayek,Comedy,like a boss,Tiffany Haddish Rose Byrne Salma Hayek Miguel ...
3,Jon Avnet,Richard Gere,Peter Dinklage,Walton Goggins,Drama,three christs,Richard Gere Peter Dinklage Walton Goggins Jon...
4,Anthony Jerjen,Josh Hartnett,Margarita Levieva,Chandler Riggs,Crime Thriller Drama,inherit the viper,Josh Hartnett Margarita Levieva Chandler Riggs...
...,...,...,...,...,...,...,...
268,Pete Docter,Jamie Foxx,Tina Fey,Graham Norton,Animation Family Comedy Fantasy Drama,soul,Jamie Foxx Tina Fey Graham Norton Pete Docter ...
269,Robert Rodriguez,Priyanka Chopra Jonas,Pedro Pascal,YaYa Gosselin,Action Fantasy Family Comedy,we can be heroes,Priyanka Chopra Jonas Pedro Pascal YaYa Gossel...
271,Regina King,Kingsley Ben-Adir,Eli Goree,Aldis Hodge,Drama,one night in miami...,Kingsley Ben-Adir Eli Goree Aldis Hodge Regina...
272,Emerald Fennell,Carey Mulligan,Bo Burnham,Alison Brie,Thriller Crime Drama,promising young woman,Carey Mulligan Bo Burnham Alison Brie Emerald ...


Extracting features of 2021 movies from Wikipedia

In [12]:
link2 = "https://en.wikipedia.org/wiki/List_of_American_films_of_2021"
final_df_2021 = dataset(link2)
final_df_2021

Unnamed: 0,Title,Cast and crew
0,Shadow in the Cloud,Roseanne Liang (director/screenplay); Max Land...
1,The White Tiger,Ramin Bahrani (director/screenplay); Adarsh Go...
2,Locked Down,Doug Liman (director); Steven Knight (screenpl...
3,The Dig,Simon Stone (director); Moira Buffini (screenp...
4,Outside the Wire,"Mikael Håfström (director); Rob Yescombe, Rowa..."
...,...,...
353,The Tragedy of Macbeth,Joel Coen (director/screenplay); Denzel Washin...
354,A Journal for Jordan,Denzel Washington (director); Virgil Williams ...
355,American Underdog,"Erwin brothers (directors); Jon Erwin, David A..."
356,Memoria,Apichatpong Weerasethakul (director/acreenplay...


In [13]:
#getting genres from tmbd api
def get_genre(x):
  genres = []
  result = tmdb_movie.search(x)
  if not result:
    return np.NaN
  else:
    movie_id = result[0].id
    response = requests.get('https://api.themoviedb.org/3/movie/{}?api_key={}'.format(movie_id,tmdb.api_key))
    data_json = response.json()
    if data_json['genres']:
        for i in range(0,len(data_json['genres'])):
            genres.append(data_json['genres'][i]['name'])
        return (" ".join(genres))
    else:
        return np.NaN
final_df_2021['genres'] = final_df_2021['Title'].map(lambda x: get_genre(str(x)))    
df_2021 = final_df_2021[['Title','Cast and crew','genres']]
df_2021

Unnamed: 0,Title,Cast and crew,genres
0,Shadow in the Cloud,Roseanne Liang (director/screenplay); Max Land...,Horror Action War
1,The White Tiger,Ramin Bahrani (director/screenplay); Adarsh Go...,Drama
2,Locked Down,Doug Liman (director); Steven Knight (screenpl...,Comedy Crime Drama
3,The Dig,Simon Stone (director); Moira Buffini (screenp...,Drama History
4,Outside the Wire,"Mikael Håfström (director); Rob Yescombe, Rowa...",Thriller Action Science Fiction
...,...,...,...
353,The Tragedy of Macbeth,Joel Coen (director/screenplay); Denzel Washin...,Drama War
354,A Journal for Jordan,Denzel Washington (director); Virgil Williams ...,Drama Romance
355,American Underdog,"Erwin brothers (directors); Jon Erwin, David A...",Drama
356,Memoria,Apichatpong Weerasethakul (director/acreenplay...,Drama Science Fiction Mystery


In [14]:
df_2021.dropna(inplace=True)

In [15]:
#getting director
def get_director(x):
      if " (director)" in x:
          return x.split(" (director)")[0]
      elif " (directors)" in x:
          return x.split(" (directors)")[0]
      else:
          return x.split(" (director/screenplay)")[0]
          
df_2021['director_name'] = df_2021['Cast and crew'].map(lambda x: get_director(x))

In [16]:
#getting actor 1
def get_actor1(x):
    return ((x.split("screenplay); ")[-1]).split(", ")[0])
    
df_2021['actor_1_name'] = df_2021['Cast and crew'].map(lambda x: get_actor1(x))

#getting actor 2
def get_actor2(x):
  if len((x.split("screenplay); ")[-1]).split(", ")) < 2:
    return np.NaN
  else:
    return ((x.split("screenplay); ")[-1]).split(", ")[1])
      
df_2021['actor_2_name'] = df_2021['Cast and crew'].map(lambda x: get_actor2(x))

#getting actor 3
def get_actor3(x):
  if len((x.split("screenplay); ")[-1]).split(", ")) < 3:
    return np.NaN
  else:
    return ((x.split("screenplay); ")[-1]).split(", ")[2])
      
df_2021['actor_3_name'] = df_2021['Cast and crew'].map(lambda x: get_actor3(x))

In [17]:
#considering only the important features
df_2021 = df_2021.rename(columns={'Title':'movie_title'})
df_2021 = df_2021[['director_name','actor_1_name','actor_2_name','actor_3_name','genres','movie_title']]

In [18]:
df_2021.isnull().sum()

director_name     0
actor_1_name      0
actor_2_name      7
actor_3_name     24
genres            0
movie_title       0
dtype: int64

In [19]:
df_2021=df_2021.dropna(how='any')

In [20]:
df_2021['movie_title'] = df_2021['movie_title'].str.lower()
df_2021['comb'] = df_2021['actor_1_name'] + ' ' + df_2021['actor_2_name'] + ' '+ df_2021['actor_3_name'] + ' '+ df_2021['director_name'] +' ' + df_2021['genres']
df_2021

Unnamed: 0,director_name,actor_1_name,actor_2_name,actor_3_name,genres,movie_title,comb
0,Roseanne Liang,Chloë Grace Moretz,Taylor John Smith,Beulah Koale,Horror Action War,shadow in the cloud,Chloë Grace Moretz Taylor John Smith Beulah Ko...
1,Ramin Bahrani,Adarsh Gourav,Rajkummar Rao,Priyanka Chopra Jonas,Drama,the white tiger,Adarsh Gourav Rajkummar Rao Priyanka Chopra Jo...
2,Doug Liman,Anne Hathaway,Chiwetel Ejiofor,Stephen Merchant,Comedy Crime Drama,locked down,Anne Hathaway Chiwetel Ejiofor Stephen Merchan...
3,Simon Stone,Carey Mulligan,Ralph Fiennes,Lily James,Drama History,the dig,Carey Mulligan Ralph Fiennes Lily James Simon ...
4,Mikael Håfström,Anthony Mackie,Damson Idris,Emily Beecham,Thriller Action Science Fiction,outside the wire,Anthony Mackie Damson Idris Emily Beecham Mika...
...,...,...,...,...,...,...,...
352,Matthew Vaughn,Ralph Fiennes,Gemma Arterton,Rhys Ifans,Action Adventure Thriller War Mystery,the king's man,Ralph Fiennes Gemma Arterton Rhys Ifans Matthe...
353,Joel Coen,Denzel Washington,Frances McDormand,Bertie Carvel,Drama War,the tragedy of macbeth,Denzel Washington Frances McDormand Bertie Car...
354,Denzel Washington,Michael B. Jordan,Chanté Adams,Jalon Christian,Drama Romance,a journal for jordan,Michael B. Jordan Chanté Adams Jalon Christian...
355,Erwin brothers,Zachary Levi,Anna Paquin,Dennis Quaid,Drama,american underdog,Zachary Levi Anna Paquin Dennis Quaid Erwin br...


Extracting features of 2022 movies from Wikipedia

In [21]:
link3 = "https://en.wikipedia.org/wiki/List_of_American_films_of_2022"
final_df_2022 = dataset(link3)
final_df_2022

Unnamed: 0,Title,Cast and crew
0,The 355,Simon Kinberg (director/screenplay); Theresa R...
1,The Legend of La Llorona,Patricia Harris Seeley (director/screenplay); ...
2,The Commando,Asif Akbar (director); Koji Steven Sakai (scre...
3,Scream,"Matt Bettinelli-Olpin, Tyler Gillett (director..."
4,Hotel Transylvania: Transformania,"Jennifer Kluska, Derek Drymon (directors); Amo..."
...,...,...
229,Puss in Boots: The Last Wish,"Joel Crawford (director); Antonio Banderas, Sa..."
230,I Wanna Dance with Somebody,Kasi Lemmons (director); Anthony McCarten (scr...
231,Babylon,Damien Chazelle (director/screenplay); Brad Pi...
232,,


In [22]:
final_df_2022.isnull().sum()

Title            2
Cast and crew    2
dtype: int64

In [23]:
final_df_2022.dropna(inplace=True)

In [24]:
#getting genres from tmbd api
def get_genre(x):
  genres = []
  result = tmdb_movie.search(x)
  if not result:
    return np.NaN
  else:
    movie_id = result[0].id
    response = requests.get('https://api.themoviedb.org/3/movie/{}?api_key={}'.format(movie_id,tmdb.api_key))
    data_json = response.json()
    if data_json['genres']:
        for i in range(0,len(data_json['genres'])):
            genres.append(data_json['genres'][i]['name'])
        return (" ".join(genres))
    else:
        return np.NaN
final_df_2022['genres'] = final_df_2022['Title'].map(lambda x: get_genre(str(x)))    
df_2022 = final_df_2022[['Title','Cast and crew','genres']]
df_2022

Unnamed: 0,Title,Cast and crew,genres
0,The 355,Simon Kinberg (director/screenplay); Theresa R...,Action Thriller
1,The Legend of La Llorona,Patricia Harris Seeley (director/screenplay); ...,Horror Thriller
2,The Commando,Asif Akbar (director); Koji Steven Sakai (scre...,Action Crime Thriller
3,Scream,"Matt Bettinelli-Olpin, Tyler Gillett (director...",Horror Mystery Thriller
4,Hotel Transylvania: Transformania,"Jennifer Kluska, Derek Drymon (directors); Amo...",Animation Family Fantasy Comedy Adventure
...,...,...,...
227,Matilda,Matthew Warchus (director); Dennis Kelly (scre...,Comedy Family Fantasy
228,Shazam! Fury of the Gods,"David F. Sandberg (director); Henry Gayden, Ch...",Comedy Action Fantasy
229,Puss in Boots: The Last Wish,"Joel Crawford (director); Antonio Banderas, Sa...",Animation Adventure Comedy Family Fantasy
230,I Wanna Dance with Somebody,Kasi Lemmons (director); Anthony McCarten (scr...,Drama History


In [25]:
#getting director
def get_director(x):
      if " (director)" in x:
          return x.split(" (director)")[0]
      elif " (directors)" in x:
          return x.split(" (directors)")[0]
      else:
          return x.split(" (director/screenplay)")[0]
          
df_2022['director_name'] = df_2022['Cast and crew'].map(lambda x: get_director(x))

In [26]:
#getting actor 1
def get_actor1(x):
    return ((x.split("screenplay); ")[-1]).split(", ")[0])
    
df_2022['actor_1_name'] = df_2022['Cast and crew'].map(lambda x: get_actor1(x))

#getting actor 2
def get_actor2(x):
  if len((x.split("screenplay); ")[-1]).split(", ")) < 2:
    return np.NaN
  else:
    return ((x.split("screenplay); ")[-1]).split(", ")[1])
      
df_2022['actor_2_name'] = df_2022['Cast and crew'].map(lambda x: get_actor2(x))

#getting actor 3
def get_actor3(x):
  if len((x.split("screenplay); ")[-1]).split(", ")) < 3:
    return np.NaN
  else:
    return ((x.split("screenplay); ")[-1]).split(", ")[2])
      
df_2022['actor_3_name'] = df_2022['Cast and crew'].map(lambda x: get_actor3(x))

In [27]:
#considering only the important features
df_2022 = df_2022.rename(columns={'Title':'movie_title'})
df_2022 = df_2022[['director_name','actor_1_name','actor_2_name','actor_3_name','genres','movie_title']]

In [28]:
df_2022=df_2022.dropna(how='any')

In [29]:
df_2022['movie_title'] = df_2022['movie_title'].str.lower()
df_2022['comb'] = df_2022['actor_1_name'] + ' ' + df_2022['actor_2_name'] + ' '+ df_2022['actor_3_name'] + ' '+ df_2022['director_name'] +' ' + df_2022['genres']
df_2022

Unnamed: 0,director_name,actor_1_name,actor_2_name,actor_3_name,genres,movie_title,comb
0,Simon Kinberg,Jessica Chastain,Lupita Nyong'o,Penélope Cruz,Action Thriller,the 355,Jessica Chastain Lupita Nyong'o Penélope Cruz ...
1,Patricia Harris Seeley,Autumn Reeser,Antonio Cupo,Danny Trejo,Horror Thriller,the legend of la llorona,Autumn Reeser Antonio Cupo Danny Trejo Patrici...
3,"Matt Bettinelli-Olpin, Tyler Gillett",Melissa Barrera,Mason Gooding,Jenna Ortega,Horror Mystery Thriller,scream,Melissa Barrera Mason Gooding Jenna Ortega Mat...
4,"Jennifer Kluska, Derek Drymon",Andy Samberg,Selena Gomez,Kathryn Hahn,Animation Family Fantasy Comedy Adventure,hotel transylvania: transformania,Andy Samberg Selena Gomez Kathryn Hahn Jennife...
5,Luis Prieto,Cameron Monaghan,Frank Grillo,Lilly Krug,Thriller,shattered,Cameron Monaghan Frank Grillo Lilly Krug Luis ...
...,...,...,...,...,...,...,...
227,Matthew Warchus,Alisha Weir,Lashana Lynch,Andrea Riseborough,Comedy Family Fantasy,matilda,Alisha Weir Lashana Lynch Andrea Riseborough M...
228,David F. Sandberg,Zachary Levi,Jack Dylan Grazer,Rachel Zegler,Comedy Action Fantasy,shazam! fury of the gods,Zachary Levi Jack Dylan Grazer Rachel Zegler D...
229,Joel Crawford,Joel Crawford (director); Antonio Banderas,Salma Hayek,Harvey Guillén,Animation Adventure Comedy Family Fantasy,puss in boots: the last wish,Joel Crawford (director); Antonio Banderas Sal...
230,Kasi Lemmons,Naomi Ackie,Ashton Sanders,Stanley Tucci,Drama History,i wanna dance with somebody,Naomi Ackie Ashton Sanders Stanley Tucci Kasi ...


In [30]:
df_20_21=df_2020.append(df_2021,ignore_index=True)

df_20_21_22=df_20_21.append(df_2022,ignore_index=True)

In [31]:
old_df = pd.read_csv('/content/data_16_17_18_19.csv')

In [32]:
cleaned_df = old_df.append(df_20_21_22,ignore_index=True)
cleaned_df

Unnamed: 0,director_name,actor_1_name,actor_2_name,actor_3_name,genres,movie_title,comb
0,James Cameron,CCH Pounder,Joel David Moore,Wes Studi,Action Adventure Fantasy Sci-Fi,avatar,CCH Pounder Joel David Moore Wes Studi James C...
1,Gore Verbinski,Johnny Depp,Orlando Bloom,Jack Davenport,Action Adventure Fantasy,pirates of the caribbean: at world's end,Johnny Depp Orlando Bloom Jack Davenport Gore ...
2,Sam Mendes,Christoph Waltz,Rory Kinnear,Stephanie Sigman,Action Adventure Thriller,spectre,Christoph Waltz Rory Kinnear Stephanie Sigman ...
3,Christopher Nolan,Tom Hardy,Christian Bale,Joseph Gordon-Levitt,Action Thriller,the dark knight rises,Tom Hardy Christian Bale Joseph Gordon-Levitt ...
4,Doug Walker,Doug Walker,Rob Walker,unknown,Documentary,star wars: episode vii - the force awakens ...,Doug Walker Rob Walker unknown Doug Walker Doc...
...,...,...,...,...,...,...,...
6666,Matthew Warchus,Alisha Weir,Lashana Lynch,Andrea Riseborough,Comedy Family Fantasy,matilda,Alisha Weir Lashana Lynch Andrea Riseborough M...
6667,David F. Sandberg,Zachary Levi,Jack Dylan Grazer,Rachel Zegler,Comedy Action Fantasy,shazam! fury of the gods,Zachary Levi Jack Dylan Grazer Rachel Zegler D...
6668,Joel Crawford,Joel Crawford (director); Antonio Banderas,Salma Hayek,Harvey Guillén,Animation Adventure Comedy Family Fantasy,puss in boots: the last wish,Joel Crawford (director); Antonio Banderas Sal...
6669,Kasi Lemmons,Naomi Ackie,Ashton Sanders,Stanley Tucci,Drama History,i wanna dance with somebody,Naomi Ackie Ashton Sanders Stanley Tucci Kasi ...


In [33]:
cleaned_df.to_csv('main_data.csv',index=False)