In [1]:
# mutliple outputs in cell
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

# cell width
from IPython.display import display, HTML
display(HTML("<style>.container { width:90% !important; }</style>"))

### Imports and data folder path

In [3]:
import pandas as pd
import os
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import ast
import json

PATH_IN = './data/'

colors = sns.color_palette("colorblind")
sns.set_palette(colors)
plt.rcParams['axes.prop_cycle'] = plt.cycler(color=colors)

### Plot Summaries (Train data 1)
Plot summaries of 42,306 movies extracted from the November 2, 2012 dump of English-language Wikipedia. Each line contains the Wikipedia movie ID (which indexes into movie.metadata.tsv) followed by the summary.

In [4]:
plot_summaries_fname = os.path.join(PATH_IN, 'plot_summaries.txt')

plot_summaries = pd.read_csv(plot_summaries_fname, sep='\t', names=['Wikipedia movie id', 'plot_summary'])
plot_summaries['Wikipedia movie id'] = pd.to_numeric(plot_summaries['Wikipedia movie id'], downcast='integer') # unsigned for uint, in terms of memory same result
plot_summaries['plot_summary'] = plot_summaries['plot_summary'].astype('string')

plot_summaries.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 42303 entries, 0 to 42302
Data columns (total 2 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   Wikipedia movie id  42303 non-null  int32 
 1   plot_summary        42303 non-null  string
dtypes: int32(1), string(1)
memory usage: 495.9 KB


### Stanford CoreNLP processed XML datasets (Train data 2)
The plot summaries, run through the Stanford CoreNLP pipeline (tagging, parsing, NER and coref). Each filename begins with the Wikipedia movie ID (which indexes into movie.metadata.tsv).

[Paper](https://www.cs.cmu.edu/~dbamman/pubs/pdf/bamman+oconnor+smith.acl13.pdf)

[Dependency glossary](https://downloads.cs.stanford.edu/nlp/software/dependencies_manual.pdf)

In [5]:
tokens_fname = os.path.join(PATH_IN, 'tokens.parquet')
dependencies_fname = os.path.join(PATH_IN, 'dependencies.parquet')
parses_fname = os.path.join(PATH_IN, 'parses.parquet')
coref_fname = os.path.join(PATH_IN, 'coref.parquet')

tokens = pd.read_parquet(tokens_fname)
dependencies = pd.read_parquet(dependencies_fname)
parses = pd.read_parquet(parses_fname)
coref = pd.read_parquet(coref_fname)

tokens.info()
dependencies.info()
parses.info()
coref.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14905203 entries, 0 to 14905202
Data columns (total 9 columns):
 #   Column       Dtype 
---  ------       ----- 
 0   movie_id     int32 
 1   sentence_id  int16 
 2   token_id     int16 
 3   word         string
 4   lemma        string
 5   COB          int16 
 6   COE          int16 
 7   POS          string
 8   NER          string
dtypes: int16(4), int32(1), string(4)
memory usage: 625.4 MB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 34199068 entries, 0 to 34199067
Data columns (total 8 columns):
 #   Column            Dtype 
---  ------            ----- 
 0   movie_id          int32 
 1   sentence_id       int16 
 2   dependency_class  string
 3   dependency_type   string
 4   governor_id       int16 
 5   governor_word     string
 6   dependent_id      int16 
 7   dependent_word    string
dtypes: int16(3), int32(1), string(4)
memory usage: 1.3 GB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 665586 entries, 0 to 665585

### Movies metadata (Train data 3)

Metadata for 81,741 movies, extracted from the Noverber 4, 2012 dump of Freebase. Tab-separated; columns.
1. Wikipedia movie ID
2. Freebase movie ID
3. Movie name
4. Movie release date
5. Movie box office revenue
6. Movie runtime
7. Movie languages (Freebase ID:name tuples)
8. Movie countries (Freebase ID:name tuples)
9. Movie genres (Freebase ID:name tuples)

In [6]:
movie_fname = os.path.join(PATH_IN, 'movie.metadata.tsv')

movie_column_names = [
    "Wikipedia movie ID", 
    "Freebase movie ID", 
    "Movie name", 
    "Movie release date", 
    "Movie box office revenue", 
    "Movie runtime", 
    "Movie languages", 
    "Movie countries", 
    "Movie genres",
]

movies = pd.read_csv(movie_fname,  sep= '\t', names= movie_column_names)
movies.info()

movies.loc[46808, 'Movie countries'] = '{"/m/03rk0": "India"}'
movies.loc[67202, 'Movie countries'] = '{"/m/03rk0": "India"}'
movies.loc[67202, 'Movie languages'] = '{"/m/0999q": "Malayalam Language"}'
movies.loc[72685, 'Movie countries'] = '{"/m/084n_": "Weimar Republic", "/m/0345h": "Germany"}'

movies.loc[1825, "Movie languages"] = '{"/m/04306rv": "German Language"}'
movies.loc[7855, "Movie languages"] = '{"/m/02bjrlw": "Italian Language", "/m/06nm1": "Spanish Language", "/m/064_8sq": "French Language", "/m/04h9h": "Latin Language", "/m/02h40lc": "English Language", "/m/05qqm": "Polish Language", "/m/04306rv": "German Language"}'
movies.loc[20807, "Movie languages"] = '{"/m/0k0sv": "Croatian language", "/m/02bjrlw": "Italian Language", "/m/06b_j": "Russian Language", "/m/06nm1": "Spanish Language", "/m/064_8sq": "French Language", "/m/05zjd": "Portuguese Language", "/m/02h40lc": "English Language", "/m/06zvd": "Slovenian language", "/m/04306rv": "German Language", "/m/02hwhyv": "Korean Language"}'
movies.loc[25679, "Movie languages"] = '{"/m/05qqm": "Polish Language", "/m/0cjk9": "Ukrainian Language", "/m/0880p": "Yiddish Language", "/m/04306rv": "German Language"}'
movies.loc[30562, "Movie languages"] = '{"/m/02h40lc": "English Language", "/m/06b_j": "Russian Language", "/m/04306rv": "German Language"}'
movies.loc[68137, "Movie languages"] = '{"/m/02hwyss": "Turkish Language", "/m/04306rv": "German Language"}'

movies['Movie release Year'] = movies['Movie release date'].str.split('-').str[0].astype('Int64')
movies['Movie release Month'] = movies['Movie release date'].str.split('-').str[1].astype('Int64')
movies['Movie release Day'] = movies['Movie release date'].str.split('-').str[2].astype('Int64')
movies.drop(columns=['Movie release date'], inplace=True)

movies['parsed languages'] = movies['Movie languages'].apply(ast.literal_eval)
movies['language codes'] = movies['parsed languages'].apply(lambda x: ','.join(list(x.keys())))
movies['languages'] = movies['parsed languages'].apply(lambda x: ','.join([val.replace(' Language', '') for val in list(x.values())]))
movies.drop(columns=['Movie languages', 'parsed languages'], inplace=True)

movies['parsed countries'] = movies['Movie countries'].apply(ast.literal_eval)
movies['countries codes'] = movies['parsed countries'].apply(lambda x: ','.join(list(x.keys())))
movies['countries'] = movies['parsed countries'].apply(lambda x: ','.join(list(x.values())))
movies.drop(columns=['Movie countries', 'parsed countries'], inplace=True)

movies['parsed genres'] = movies['Movie genres'].apply(ast.literal_eval)
movies['genres codes'] = movies['parsed genres'].apply(lambda x: ','.join(list(x.keys())))
movies['genres'] = movies['parsed genres'].apply(lambda x: ','.join(list(x.values())))
movies.drop(columns=['Movie genres', 'parsed genres'], inplace=True)


movies_probably_not_useful = movies[['Freebase movie ID', 'language codes', 'countries codes', 'genres codes']]
movies.drop(columns=['Freebase movie ID', 'language codes', 'countries codes', 'genres codes'], inplace=True)

movies.rename(columns={
    'languages': 'Movie languages',
    'countries': 'Movie countries',
    'genres': 'Movie genres'
}, inplace=True)
movies = movies[['Wikipedia movie ID', 'Movie name', 'Movie release Year', 'Movie box office revenue', 'Movie runtime', 'Movie languages', 'Movie countries', 'Movie genres', 
                 'Movie release Month', 'Movie release Day']]

movies['Movie name'] = movies['Movie name'].astype('string')
movies['Movie languages'] = movies['Movie languages'].astype('string')
movies['Movie countries'] = movies['Movie countries'].astype('string')
movies['Movie genres'] = movies['Movie genres'].astype('string')
movies['Wikipedia movie ID'] = pd.to_numeric(movies['Wikipedia movie ID'], downcast='integer')
movies['Movie release Year'] = pd.to_numeric(movies['Movie release Year'], downcast='integer')
movies['Movie release Month'] = pd.to_numeric(movies['Movie release Month'], downcast='integer')
movies['Movie release Day'] = pd.to_numeric(movies['Movie release Day'], downcast='integer')
movies['Movie box office revenue'] = pd.to_numeric(movies['Movie box office revenue'], downcast='float')
movies['Movie runtime'] = pd.to_numeric(movies['Movie runtime'], downcast='float')

movies_probably_not_useful['Freebase movie ID'] = movies_probably_not_useful['Freebase movie ID'].astype('string')
movies_probably_not_useful['language codes'] = movies_probably_not_useful['language codes'].astype('string')
movies_probably_not_useful['countries codes'] = movies_probably_not_useful['countries codes'].astype('string')
movies_probably_not_useful['genres codes'] = movies_probably_not_useful['genres codes'].astype('string')

print("\n")
movies.info()
movies_probably_not_useful.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 81741 entries, 0 to 81740
Data columns (total 9 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Wikipedia movie ID        81741 non-null  int64  
 1   Freebase movie ID         81741 non-null  object 
 2   Movie name                81741 non-null  object 
 3   Movie release date        74839 non-null  object 
 4   Movie box office revenue  8401 non-null   float64
 5   Movie runtime             61291 non-null  float64
 6   Movie languages           81741 non-null  object 
 7   Movie countries           81741 non-null  object 
 8   Movie genres              81741 non-null  object 
dtypes: float64(2), int64(1), object(6)
memory usage: 5.6+ MB


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 81741 entries, 0 to 81740
Data columns (total 10 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0  

### Characters metadata (Train data 4)

Metadata for 450,669 characters aligned to the movies above, extracted from the Noverber 4, 2012 dump of Freebase. Tab-separated; columns:

1. Wikipedia movie ID
2. Freebase movie ID
3. Movie release date
4. Character name
5. Actor date of birth
6. Actor gender
7. Actor height (in meters)
8. Actor ethnicity (Freebase ID)
9. Actor name
10. Actor age at movie release
11. Freebase character/actor map ID
12. Freebase character ID
13. Freebase actor ID

In [7]:
character_fname = os.path.join(PATH_IN, 'character.metadata.tsv')

character_column_names = [
    "Wikipedia movie ID",
    "Freebase movie ID",
    "Movie release date",
    "Character name",
    "Actor DOB",
    "Actor gender",
    "Actor height",
    "Actor ethnicity",
    "Actor name",
    "Actor age at movie release",
    "Freebase character/actor map ID",
    "Freebase character ID",
    "Freebase actor ID",
]
        
characters = pd.read_csv(character_fname, sep= '\t', names= character_column_names)
characters.info()

characters['Movie release Year'] = characters['Movie release date'].str.split('-').str[0].astype('Int64')
characters['Movie release Month'] = characters['Movie release date'].str.split('-').str[1].astype('Int64')
characters['Movie release Day'] = characters['Movie release date'].str.split('-').str[2].astype('Int64')
characters.drop(columns=['Movie release date'], inplace=True)

characters['Actor DOB'] = characters['Actor DOB'].str.split('T', expand=True)[0]
characters['Actor DOB Year'] = characters['Actor DOB'].str.split('-').str[0].astype('Int64')
characters['Actor DOB Month'] = characters['Actor DOB'].str.split('-').str[1].astype('Int64')
characters['Actor DOB Day'] = characters['Actor DOB'].str.split('-').str[2].astype('Int64')
characters.drop(columns=['Actor DOB'], inplace=True)


characters_probably_not_useful = characters[['Freebase movie ID', 'Freebase character/actor map ID', 'Freebase character ID', 'Freebase actor ID']]
characters.drop(columns=['Freebase movie ID', 'Freebase character/actor map ID', 'Freebase character ID', 'Freebase actor ID'], inplace=True)

characters = characters[['Wikipedia movie ID', 'Movie release Year', 'Character name', 'Actor DOB Year', 'Actor gender', 'Actor height', 'Actor ethnicity', 'Actor name', 
                 'Actor age at movie release', 'Movie release Month', 'Movie release Day', 'Actor DOB Month', 'Actor DOB Day']]

characters['Character name'] = characters['Character name'].astype('string')
characters['Actor ethnicity'] = characters['Actor ethnicity'].astype('string')
characters['Actor name'] = characters['Actor name'].astype('string')


characters['Actor gender'] = characters['Actor gender'].astype('category')

characters['Wikipedia movie ID'] = pd.to_numeric(characters['Wikipedia movie ID'], downcast='integer')
characters['Movie release Year'] = pd.to_numeric(characters['Movie release Year'], downcast='integer')
characters['Actor DOB Year'] = pd.to_numeric(characters['Actor DOB Year'], downcast='integer')
characters['Actor height'] = pd.to_numeric(characters['Actor height'], downcast='float')
characters['Actor age at movie release'] = pd.to_numeric(characters['Actor age at movie release'], downcast='integer')
characters['Movie release Month'] = pd.to_numeric(characters['Movie release Month'], downcast='integer')
characters['Movie release Day'] = pd.to_numeric(characters['Movie release Day'], downcast='integer')
characters['Actor DOB Month'] = pd.to_numeric(characters['Actor DOB Month'], downcast='integer')
characters['Actor DOB Day'] = pd.to_numeric(characters['Actor DOB Day'], downcast='integer')

characters_probably_not_useful['Freebase movie ID'] = characters_probably_not_useful['Freebase movie ID'].astype('string')
characters_probably_not_useful['Freebase character/actor map ID'] = characters_probably_not_useful['Freebase character/actor map ID'].astype('string')
characters_probably_not_useful['Freebase character ID'] = characters_probably_not_useful['Freebase character ID'].astype('string')
characters_probably_not_useful['Freebase actor ID'] = characters_probably_not_useful['Freebase actor ID'].astype('string')

print("\n")
characters.info()
characters_probably_not_useful.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 450669 entries, 0 to 450668
Data columns (total 13 columns):
 #   Column                           Non-Null Count   Dtype  
---  ------                           --------------   -----  
 0   Wikipedia movie ID               450669 non-null  int64  
 1   Freebase movie ID                450669 non-null  object 
 2   Movie release date               440674 non-null  object 
 3   Character name                   192794 non-null  object 
 4   Actor DOB                        344524 non-null  object 
 5   Actor gender                     405060 non-null  object 
 6   Actor height                     154824 non-null  float64
 7   Actor ethnicity                  106058 non-null  object 
 8   Actor name                       449441 non-null  object 
 9   Actor age at movie release       292556 non-null  float64
 10  Freebase character/actor map ID  450669 non-null  object 
 11  Freebase character ID            192804 non-null  object 
 12  Fr

### tvtropes clusters (Test data 1)

72 character types drawn from tvtropes.com, along with 501 instances of those types. The ID field indexes into the Freebase character/actor map ID in character.metadata.tsv.

In [8]:
tvtropes_clusters_fname = os.path.join(PATH_IN, 'tvtropes.clusters.txt')

rows = []
with open(tvtropes_clusters_fname, 'r') as file:
    for line in file:
        char_type, json_string = line.strip().split('\t', 1)
        
        char_info = json.loads(json_string)
        
        row = {
            'Character type': char_type,
            'Character name': char_info['char'],
            'Movie name': char_info['movie'],
            'Freebase character/actor map ID': char_info['id'],
            'Actor name': char_info['actor']
        }
        rows.append(row)

tvtropes_clusters = pd.DataFrame(rows)
tvtropes_clusters = tvtropes_clusters.astype('string')

tvtropes_clusters.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 501 entries, 0 to 500
Data columns (total 5 columns):
 #   Column                           Non-Null Count  Dtype 
---  ------                           --------------  ----- 
 0   Character type                   501 non-null    string
 1   Character name                   501 non-null    string
 2   Movie name                       501 non-null    string
 3   Freebase character/actor map ID  501 non-null    string
 4   Actor name                       501 non-null    string
dtypes: string(5)
memory usage: 19.7 KB


### Name clusters (Test data 2)

970 unique character names used in at least two different movies, along with 2,666 instances of those types. The ID field indexes into the Freebase character/actor map ID in character.metadata.tsv.

In [9]:
name_clusters_fname = os.path.join(PATH_IN, 'name.clusters.txt')

names_clusters = pd.read_csv(name_clusters_fname, sep='\t', names=['Character name', 'Freebase character/actor map ID'], dtype='string')

names_clusters.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2666 entries, 0 to 2665
Data columns (total 2 columns):
 #   Column                           Non-Null Count  Dtype 
---  ------                           --------------  ----- 
 0   Character name                   2666 non-null   string
 1   Freebase character/actor map ID  2666 non-null   string
dtypes: string(2)
memory usage: 41.8 KB


# Cleaning the CMU Data

----------------------------------------------------------------------------------

## 1. The Movies Dataset

### 1.1 - Trying to cure the large number of missing values

In [10]:
print('Number of movie entries :',len(movies))
print('Number of unique Wikipedia Movie IDs:', movies['Wikipedia movie ID'].nunique())
print('Number of movies with missing Movie Names :', movies['Movie name'].isna().sum())
print('Number of entries with missing Box office revenue :',movies['Movie box office revenue'].isna().sum())
print('Number of entries with missing runtime :',movies['Movie runtime'].isna().sum())
print('Number of entries with missing Release year:',movies['Movie release Year'].isna().sum())

Number of movie entries : 81741
Number of unique Wikipedia Movie IDs: 81741
Number of movies with missing Movie Names : 0
Number of entries with missing Box office revenue : 73340
Number of entries with missing runtime : 20450
Number of entries with missing Release year: 6902


Large number of missing values, and some in there are probably wrong. Let's assume that Wikipedia must have right info about the films and let's try to use the `pymediawiki` API to retrieve interesting information. We also import the PageError and DisambiguationError exceptions, as we will make us of them to check a few things.

We declare a `MediaWiki` instance that we have will be the main tool to retrieve the information present on the wikipedia pages of the movie. 

In [11]:
#Initializing the utils to do page-search
from mediawiki import MediaWiki
from mediawiki.exceptions import PageError,DisambiguationError
import mwparserfromhell
import re
wikipedia = MediaWiki(user_agent='Movie Data Scraping')

How are we accessing the pages ? Thanks to the `Wikipedia movie ID` column of the `movies` dataset. This wikipedia movie ID is actually the "wgArticleid" (i.e the page identifier) of a MediaWiki page. 

An example is shown with the "Ghost of Mars" film from 2001 just below :

In [12]:
movies.head(2)
gom_wikipage = wikipedia.page(pageid=975900)
print(gom_wikipage)

Unnamed: 0,Wikipedia movie ID,Movie name,Movie release Year,Movie box office revenue,Movie runtime,Movie languages,Movie countries,Movie genres,Movie release Month,Movie release Day
0,975900,Ghosts of Mars,2001,14010832.0,98.0,English,United States of America,"Thriller,Science Fiction,Horror,Adventure,Supe...",8,24
1,3196793,Getting Away with Murder: The JonBenét Ramsey ...,2000,,95.0,English,United States of America,"Mystery,Biographical film,Drama,Crime Drama",2,16


<MediaWikiPage 'Ghosts of Mars'>


This obtained MediaWikiPage instance thus contains information about the movie !

Lots of info to infer to our movies dataset in order to improve the quality of the data. Moreover, near the end in the ==External links== section, we can see that the IMDb title is available and is actually the same as in the IMDb dataset that we're using (you can check in _relationships.ipynb_)

Still have many things to check, but that's a start.



#### Retrieving the IMDb ID
We can get the IMDb id using the fact that, if it is present on the page, it will always be in the External links section (at least in the few dozens of movies checked by hand) Example with the second film of the movie.head() displayed above :

In [12]:
#Getting the WikiMedia page corresponding to our film
page = wikipedia.page(pageid=3196793)
#Using the mwparserfromhell library to retrieve the wikitext for easier handling
text = mwparserfromhell.parse(page.wikitext)
#Using the get_sections function of mwparserfromhell to only get the 'external link' section
external_links_text = text.get_sections(matches='External links')
#Searching for the {IMDb title|id|...} form which contains our id, and retrieving it
imdb_id = re.search(r'\{\{IMDb title\|(\d+)(?:\|.*?)?\}\}', str(external_links_text)).group(1)
print(external_links_text)
print(imdb_id)

['==External links==\n* {{IMDb title|0245916|Getting Away with Murder: The JonBenet Ramsey Story}}\n\n[[Category:2000 documentary films]]\n[[Category:2000 television films]]\n[[Category:2000s American films]]\n[[Category:American biographical films]]\n[[Category:Fox Broadcasting Company original programming]]\n[[Category:Fox network original films]]\n[[Category:Killing of JonBenét Ramsey]]\n[[Category:Television series by Rocket Science Laboratories]]']
0245916


#### Retrieving information already present in the dataset

Let's first tackle the issue about cleaning the dataset. As we know, many of the `Movie Box office revenue` and `Movie runtime` are missing. Using the same principle as above, we will retrieve the runtime and revenue of the film 'Ghosts of Mars'

In [24]:
#Getting the WikiMedia page corresponding to our film
page = wikipedia.page(pageid=975900)
#Using the mwparserfromhell library to retrieve the wikitext for easier handling
text = mwparserfromhell.parse(page.wikitext)
print(text)

{{Short description|2001 film by John Carpenter}}
{{Use mdy dates|date=August 2020}}
{{Infobox film
| name           = Ghosts of Mars
| image          = John Carpenter's Ghosts of Mars.jpg
| alt            = 
| caption        = U.S. theatrical release poster
| director       = [[John Carpenter]]
| producer       = [[Sandy King (producer)|Sandy King]]
| writer         = John Carpenter<br />Larry Sulkis
| starring       = {{Plainlist|
* [[Ice Cube]]
* [[Natasha Henstridge]]<!--per poster block - Cube is first billed, Henstridge second.-->
* [[Jason Statham]]
* [[Pam Grier]]
* [[Clea DuVall]]
* [[Joanna Cassidy]]
}}
| music          = John Carpenter
| cinematography = [[Gary B. Kibbe]]
| editing        = Paul C. Warschilka
| studio         = [[Screen Gems]]<br />Storm King Productions
| distributor    = [[Sony Pictures Releasing]]<ref>{{cite web|title=John Carpenter's Ghosts of Mars (2001)|work=[[AFI Catalog of Feature Films]]|access-date=12 March 2021|url=http://catalog.afi.com/Film/6206

In [13]:
#Getting the WikiMedia page corresponding to our film
page = wikipedia.page(pageid=975900)
#Using the mwparserfromhell library to retrieve the wikitext for easier handling
text = mwparserfromhell.parse(page.wikitext)
#Using the get_sections function of mwparserfromhell to only get the 'external link' section
external_links_text = text.get_sections(matches='External links')
#Searching for the {IMDb title|id|...} form which contains our id, and retrieving it
imdb_id = re.search(r'\{\{IMDb title\|(\d+)(?:\|.*?)?\}\}', str(external_links_text)).group(1)
print(external_links_text)
print(imdb_id)
#print(text)

for section in text.get_sections() :
    
    if section.contains('runtime') & section.contains('Infobox') :

        # Define a regular expression to find the runtime
        runtime_pattern = re.compile(r'\|\s*runtime\s*=\s*([^\n]+)')

        # Search for the runtime value
        match = runtime_pattern.search(str(section)) 
        #Extract the runtime text
        runtime_value = match.group(1)

        #Define a regular expression to extract minute digits from the runtime value
        extracted_digits = re.search(r'\d{1,8}', runtime_value)

        if extracted_digits == None : 
            extracted_digits = np.nan
        else :
            #Extract the runtime value in minutes
            runtime_in_minutes = extracted_digits.group()
            print(runtime_in_minutes)

    #Same, but for the movie revenue    
    if section.contains('gross') & section.contains('Infobox'):

        gross_pattern = re.compile(r'\|\s*gross\s*=\s*([^\n]+)')

        match = gross_pattern.search(str(section)) 

        gross_value = match.group(1)

        extracted_digits = re.search(r'\d{1,8}', gross_value)

        if extracted_digits == None :
            gross_value = np.nan
        else :
            gross_value = extracted_digits.group()
        print(gross_value)

["==External links==\n* {{IMDb title|0228333}}\n* {{Amg title|250566}}\n* {{mojo title|ghostsofmars}}\n* {{rotten-tomatoes|john_carpenters_ghosts_of_mars}}\n* {{Metacritic film}}\n* [https://web.archive.org/web/20040907041359/http://www.theofficialjohncarpenter.com/pages/themovies/gm/gm.html ''Ghosts of Mars''] at [[John Carpenter]]'s official site\n\n{{John Carpenter}}\n{{Authority control}}\n\n[[Category:2000s Western (genre) horror films]]\n[[Category:2001 films]]\n[[Category:2001 horror films]]\n[[Category:2001 science fiction action films]]\n[[Category:2000s ghost films]]\n[[Category:2000s science fiction horror films]]\n[[Category:American action horror films]]\n[[Category:American Western (genre) horror films]]\n[[Category:American ghost films]]\n[[Category:American science fantasy films]]\n[[Category:American science fiction action films]]\n[[Category:American space adventure films]]\n[[Category:American supernatural horror films]]\n[[Category:Films about extraterrestrial life]

Runtime always seems to be in minutes so code should work "anytime", but problematic for the gross revenue. Here it works because it is in millions, but sometimes it is in billions or for some films, the currency revenue can be different and not be in dollars. How do we handle those cases ?

#### Checking the state of IMDb metadata 

In [10]:
imdb_people = pd.read_csv('data/IMDb/name.basics.tsv', sep='\t', na_values=['\\N'])
imdb_info = pd.read_csv('data/IMDb/title.basics.tsv', sep='\t', na_values=['\\N'])
#imdb_principals = pd.read_csv('data/IMDb/title.principals.tsv', sep='\t', na_values=['\\N'])
imdb_ratings = pd.read_csv('data/IMDb/title.ratings.tsv', sep='\t', na_values=['\\N'])

  imdb_info = pd.read_csv('data/IMDb/title.basics.tsv', sep='\t', na_values=['\\N'])


##### imdb_info

As our aim is to be able to enrich our CMU corpus with IMDb info by merging the different datasets, we will run a few things to see what's missing and what's present in the IMDb metadata. First, about movies/shows etc. that are contained in the imdb_info dataset :

In [15]:
print('Number of entries :',len(imdb_info))
print('Number of unique IMDb titles:', imdb_info['tconst'].nunique())

Number of entries : 10218119
Number of unique IMDb titles: 10218119


The info_imdb only contains unique IMDb titles, there doesn't seem to be any repetition based on the tconst attributes.

In [16]:
print('How many Title primary names are missing? -->', imdb_info['primaryTitle'].isna().sum())

print('How many start years are missing in the dataset? -->',imdb_info['startYear'].isna().sum())

print('How many end years are missing in the dataset? -->',imdb_info['endYear'].isna().sum())

print('How many entries lack info about the runtime? -->', imdb_info['runtimeMinutes'].isna().sum())

print('How many entries lack info about the genres? -->', imdb_info['genres'].isna().sum())

How many Title primary names are missing? --> 11
How many start years are missing in the dataset? --> 1369240
How many end years are missing in the dataset? --> 10104758
How many entries lack info about the runtime? --> 7147421
How many entries lack info about the genres? --> 461306


- Nearly no primary names for title lacking that's good, we could very well use them to double check whether the merging of our IMDb ID and the Wikipedia ID was good or not.

- Not that much start years missing, only for 13.4% of the entries. For our CMU dataset, this is the column that would be equivalent to the "Movie release Year" : high chance that we can use that during the merge as well !

- A vast majority of end years missing but this is normal as this is only filled for shows, not movies

- Not that many missing genres, only for ~5% of our dataset. Some chance that this info could be compared with what's in the CMU and maybe decide to use these genres instead of the CMU ones if they're more interesting ?

- 70% of the runtime value missing. That still leaves 3M movies/shows where info is available for our 80k movie/shows of the CMU, which is still a lot. But there's risk that IMDb won't maybe be optimal to retrieve the Runtime value.

- Also, no info about the box office of movies if we wanted to use it to quantify the movie's success in our characters/personas analysis if we want to talk about how successful these are. If we want the revenue, we'll have to go and search it ourselves.

#### imdb_ratings

For success of films and characters/personas combinations or actors careers, we also want to use the imdb ratings. Let's see how they look :

In [17]:
print('Number of entries :',len(imdb_ratings))
print('Number of unique IMDb titles:', imdb_ratings['tconst'].nunique())

Number of entries : 1356511
Number of unique IMDb titles: 1356511


Only unique entries but for only 1.36 million entries out of the 10.2 millions that we have in imdb_info. Hopefully they're all valid :

In [18]:
print('How many ratings are missing? -->', imdb_ratings['averageRating'].isna().sum())

How many ratings are missing? --> 0


Nice, at least we do not have missing values. Problem is that ratings might not be that accurate if they're based on just a very few number of votes. How many number of votes do ratings have approximately ?

In [19]:
mean_votes = imdb_ratings['numVotes'].mean()
median_votes = imdb_ratings['numVotes'].median()
std_dev_votes = imdb_ratings['numVotes'].std()

# Display the results
print(f"Mean number of votes: {mean_votes:.2f}")
print(f"Median number of votes: {median_votes}")
print(f"Standard deviation of votes: {std_dev_votes:.2f}")

Mean number of votes: 1039.22
Median number of votes: 26.0
Standard deviation of votes: 17574.69


No histogram there as the disparity is too big between what's in the huge dataset, it doesn't render any interesting information. Median interesting here : As many films with 26 votes or less than ones that have more than. 

The most famous movies and shows will have a lot of votes while many niche / unknown films or even films made in countries where people are not used to go rate films on IMDb (i.e a lot of people) won't have votes on here.

Difficult to know if the movies in the CMU would get ratings with a sufficient number of votes to really reflect popular opinion on the movie. We'll have to try when merging ! Otherwise, we could very well scrap Critics from other websites, if available on the wikipedia page.

##### IMDb people 

Many infos about this dataframe on the relationship's side are already in Sepehr's relationships.ipynb. Here we will rather just look at actors as that's what we have in the CMU corpus. 

In [20]:
print('Number of entries :',len(imdb_people))
print('Number of unique people in the dataset:', imdb_people['nconst'].nunique())

Number of entries : 12904751
Number of unique people in the dataset: 12904751


In [21]:
actors_actresses_df = imdb_people[imdb_people['primaryProfession'].str.contains('actor|actress', case=False, na=False)]
unique_actors_actresses = actors_actresses_df['nconst'].nunique()
print(f"Number of unique actors and actresses: {unique_actors_actresses}")

Number of unique actors and actresses: 4835753


4M unique actors and actresses in the dataset. Of course, we'd still have to check about how we would do to merge because we have a character's dataset and not an "actors" dataset per se in the CMU.

We would not get any additional information about characters from this dataset particularly. This can still be interesting to have directly the number of movies that are shared by the CMU and IMDb in which an actor has played for instance !

Let's also check the validity of the info in there :

For the characters, maybe it would be more interesting to look at the imdb_principals dataset.

In [22]:
#def get_page_info(article_id):
#    try:
#        page = wikipedia.page(pageid=article_id)
#        return 0
#    except PageError:
#        #print(f"Page with Article ID {article_id} does not exist.")
#        return 1
#    except DisambiguationError:
#        return 2

In [341]:
#all_unique_wikipedia_ids = (movies['Wikipedia movie ID']).to_list()
#
#valid_ids = 0
#false_ids = 0
#ambiguous_ids = 0
#
#for page_id in all_unique_wikipedia_ids :
#    page_info = get_page_info(int(page_id))
#    if page_info == 0 :
#        valid_ids +=1
#    elif page_info == 1 :
#        false_ids+=1
#    elif page_info == 2 : 
#        ambiguous_ids+=1
#
#print('Out of the', len(all_unique_wikipedia_ids), 'unique Wikipedia Movie IDs in the movies dataset,', str(false_ids), 'do not lead to a wikipedia page at all,', str(ambiguous_ids), 'lead to ambiguous pages and finally', str(valid_ids), 'are correct.')

### Matching IMDb & CMU informations :


Let's say we decide to only keep the 74k out of 82k movies where we can retrieve the IMDb ID per the Wiki data crawling method as our dataset. A first thing that we can do is use the mapping produced to get, from IMDb, only the persons that are part of movies in the CMU :

In [11]:
#File containing the IMDb mapping of our CMU movies to the IMDB ids
mapping_02 = pd.read_csv('generated/wp2imdb_02.csv')

In [12]:
imdb_people_exploded = imdb_people.copy()
imdb_people_exploded['knownForTitles'] = imdb_people['knownForTitles'].str.split(',')
imdb_people_exploded = imdb_people_exploded.explode(['knownForTitles'])


merged_ipe = pd.merge(imdb_people_exploded, mapping_02, how='inner', left_on='knownForTitles', right_on='imdb')
unique_persons = merged_ipe.drop_duplicates(subset='nconst', keep='first')

In [13]:
unique_persons

Unnamed: 0,nconst,primaryName,birthYear,deathYear,primaryProfession,knownForTitles,freebase,imdb
0,nm0000001,Fred Astaire,1899.0,1987.0,"soundtrack,actor,miscellaneous",tt0053137,/m/02qmrw4,tt0053137
1,nm0001257,Ava Gardner,1922.0,1990.0,"actress,soundtrack,stunts",tt0053137,/m/02qmrw4,tt0053137
2,nm0006104,Ernest Gold,1921.0,1999.0,"music_department,composer,soundtrack",tt0053137,/m/02qmrw4,tt0053137
3,nm0026634,Donna Anderson,1939.0,,actress,tt0053137,/m/02qmrw4,tt0053137
4,nm0062365,Ken Baumgartner,,,actor,tt0053137,/m/02qmrw4,tt0053137
...,...,...,...,...,...,...,...,...
2229178,nm9965212,Vijay Babu Maganti,,,producer,tt8685998,/m/03wbqmr,tt8685998
2229179,nm9973247,Dhamodharan N.,,,director,tt8702662,/m/02r1d8t,tt8702662
2229181,nm9987091,A.P. Sathyan,,,assistant_director,tt0292023,/m/047ng4x,tt0292023
2229182,nm9987092,G. Ramamoorthy,,,camera_department,tt0292023,/m/047ng4x,tt0292023


From the original 12M~ number of unique persons originally present in the _imdb_people_ dataset, we can extract approximately 1.5M that worked, in all fields of the production, on movies that are part of the CMU corpus, based on the matching between Freebase and IMDb IDS.