# Imports and folder paths

In [1]:
import pandas as pd
import numpy as np
import os
import json
import seaborn as sns
import matplotlib.pyplot as plt
import statsmodels.api as sm

# Folder holding the dataset files loaded in the cells below. The path is
# relative, so it is resolved against the notebook's current working directory.
PATH_FOLDER = "MovieSummaries/"

# TRAIN DATA

## 1) Character Metadata

Metadata for 450,669 characters aligned to the movies above, extracted from the November 4, 2012 dump of Freebase.  

Tab-separated; columns:
1. Wikipedia movie ID 
2. Freebase movie ID
3. Movie release date
4. Character name
5. Actor date of birth
6. Actor gender
7. Actor height (in meters)
8. Actor ethnicity (Freebase ID)
9. Actor name
10. Actor age at movie release
11. Freebase character/actor map ID
12. Freebase character ID
13. Freebase actor ID

---



In [2]:
# Location of the tab-separated character metadata file
character_file_name = os.path.join(PATH_FOLDER, 'character.metadata.tsv')

# Column label -> dtype, listed in the same order as the columns of the file.
# Numeric columns that can be missing are read as float so NaN is representable.
character_column = dict([
    ('Wikipedia movie ID', int),
    ('Freebase movie ID', str),
    ('Movie release date', str),
    ('Character name', str),
    ('Actor date of birth', str),
    ('Actor gender', str),
    ('Actor height (in meters)', float),
    ('Actor ethnicity (Freebase ID)', str),
    ('Actor name', str),
    ('Actor age at movie release', float),
    ('Freebase character/actor map ID', str),
    ('Freebase character ID', str),
    ('Freebase actor ID', str),
])

# The file is read without a header row; labels and dtypes both come from the mapping above
character_metadata = pd.read_csv(
    character_file_name,
    sep="\t",
    header=None,
    names=list(character_column),
    dtype=character_column,
)

# Show a few random rows as a sanity check
character_metadata.sample(5)

Unnamed: 0,Wikipedia movie ID,Freebase movie ID,Movie release date,Character name,Actor date of birth,Actor gender,Actor height (in meters),Actor ethnicity (Freebase ID),Actor name,Actor age at movie release,Freebase character/actor map ID,Freebase character ID,Freebase actor ID
332982,1188399,/m/04fqnn,1943,,1901-05-09,M,,,Fuzzy Knight,41.0,/m/04d5x4c,,/m/02x3c65
132634,2208902,/m/06w1m_,2002-01-13,,,M,,,Joseph Siravo,,/m/0cg8l0c,,/m/0bp6r9
312447,14632340,/m/03grbll,1917,,1892-04-08,F,1.54,,Mary Pickford,,/m/040mgxq,,/m/04rfq
53254,11779733,/m/02rr_zv,2008-09-18,,1976-10-31,F,1.664,/m/04f581,Piper Perabo,31.0,/m/04q4sc4,,/m/02bqxt
351367,34805876,/m/062fmf,1996-08-30,Milo,1953-05-13,M,,,Zlatko Buric,43.0,/m/09hykxy,/m/0hzxfkp,/m/049g7q2


## 2) Movie Metadata

Metadata for 81,741 movies, extracted from the November 4, 2012 dump of Freebase. 

Tab-separated; columns:
1. Wikipedia movie ID
2. Freebase movie ID
3. Movie name
4. Movie release date
5. Movie box office revenue
6. Movie runtime
7. Movie languages (Freebase ID:name tuples)
8. Movie countries (Freebase ID:name tuples)
9. Movie genres (Freebase ID:name tuples)





In [3]:
# Location of the tab-separated movie metadata file
movie_file_name = os.path.join(PATH_FOLDER, 'movie.metadata.tsv')

# Column label -> dtype, listed in the same order as the columns of the file.
movies_column = dict([
    ('Wikipedia movie ID', int),
    ('Freebase movie ID', str),
    ('Movie name', str),
    # Kept as text: the dates mix the YYYY and YYYY-MM-DD formats, which a
    # single datetime dtype cannot parse at read time.
    ('Movie release date', str),
    # float so that missing revenue is stored as NaN. The nullable Int64 dtype
    # would also work, but it displays missing values as <NA>.
    ('Movie box office revenue', float),
    ('Movie runtime', float),
    ('Movie languages (Freebase ID:name tuples)', str),
    ('Movie countries (Freebase ID:name tuples)', str),
    ('Movie genres (Freebase ID:name tuples)', str),
])

# The file is read without a header row; labels and dtypes both come from the mapping above
movie_metadata = pd.read_csv(
    movie_file_name,
    sep="\t",
    header=None,
    names=list(movies_column),
    dtype=movies_column,
)

# TODO: derive an integer "Year" column from the first four characters of
# "Movie release date" once rows with a missing release date are handled.

# Show a few random rows as a sanity check
movie_metadata.sample(5)

Unnamed: 0,Wikipedia movie ID,Freebase movie ID,Movie name,Movie release date,Movie box office revenue,Movie runtime,Movie languages (Freebase ID:name tuples),Movie countries (Freebase ID:name tuples),Movie genres (Freebase ID:name tuples)
78390,21450904,/m/05fbncw,Bug Buster,1998,,93.0,"{""/m/02h40lc"": ""English Language""}","{""/m/09c7w0"": ""United States of America""}","{""/m/0gf28"": ""Parody"", ""/m/06n90"": ""Science Fi..."
62351,30766597,/m/0gfhdf7,Grown-Ups,1980-11-28,,95.0,"{""/m/02h40lc"": ""English Language""}","{""/m/07ssc"": ""United Kingdom""}","{""/m/07s9rl0"": ""Drama""}"
51188,10034391,/m/02p_hp6,I Accuse My Parents,1945,,69.0,"{""/m/02h40lc"": ""English Language""}","{""/m/09c7w0"": ""United States of America""}","{""/m/0lsxr"": ""Crime Fiction"", ""/m/07s9rl0"": ""D..."
27734,2347448,/m/075kvz,Fireback,,,,{},"{""/m/05v8c"": ""Philippines""}","{""/m/0lsxr"": ""Crime Fiction"", ""/m/02kdv5l"": ""A..."
11049,155921,/m/014g2j,Time After Time,1979-08-31,13000000.0,112.0,"{""/m/02h40lc"": ""English Language""}","{""/m/09c7w0"": ""United States of America""}","{""/m/01jfsb"": ""Thriller"", ""/m/06n90"": ""Science..."


## 3) Plot Summaries

Plot summaries of 42,306 movies extracted from the November 2, 2012 dump of English-language Wikipedia. Each line contains the Wikipedia movie ID (which indexes into movie.metadata.tsv) followed by the summary.

In [4]:
# Load the plot summaries: each line holds a Wikipedia movie ID, a tab, then the summary text
summary_file_name = os.path.join(PATH_FOLDER, 'plot_summaries.txt')

# The file has no header row. Passing header=None together with names=... keeps
# the first record as data; without it pandas uses that record as column labels
# and the first movie's summary is silently dropped.
summary_metadata = pd.read_csv(
    summary_file_name,
    sep="\t",
    header=None,
    names=["Movie ID", "Movie Summary"],
)

# Store the summaries with the dedicated pandas string dtype instead of object
summary_metadata["Movie Summary"] = summary_metadata["Movie Summary"].astype("string")

# Show a few random rows as a sanity check
summary_metadata.sample(5)

Unnamed: 0,Movie ID,Movie Summary
35929,7976935,The story begins during a thunderstorm. The ho...
31139,42093,Philip Schuyler Green is a widowed journalis...
2197,2058214,"Childhood friends Lonnie , Dominic , and G ha..."
12112,23907048,The film is set in 1940s Shanghai and Hong Kon...
10726,3755302,"Reformed outlaw Link Jones , travels to Crossc..."


## 4) Stanford CoreNLP Plot Summaries

The plot summaries, run through the Stanford CoreNLP pipeline (tagging, parsing, NER and coref). Each filename begins with the Wikipedia movie ID (which indexes into movie.metadata.tsv).

In [5]:
# to be added

# TEST DATA

## 1) Tvtropes clusters

72 character types drawn from tvtropes.com, along with 501 instances of those types. The ID field indexes into the Freebase character/actor map ID in character.metadata.tsv.


In [6]:
# Location of the tvtropes cluster file
tvtropes_file_name = os.path.join(PATH_FOLDER, 'tvtropes.clusters.txt')

# Each line holds a character type, a tab, then a JSON object describing one
# instance. The file has no header row, so header=None is required: without it
# the first instance is used as column labels and lost.
tvtropes_raw = pd.read_csv(
    tvtropes_file_name,
    sep="\t",
    header=None,
    names=['character', 'data'],
    dtype=str,  # json.loads below needs text input
)

# Parse every JSON payload once and expand its fields into one column per key,
# keeping the row index aligned with the raw frame.
tvtropes_fields = pd.DataFrame(
    [json.loads(payload) for payload in tvtropes_raw['data']],
    index=tvtropes_raw.index,
)

# Character type followed by the expanded JSON fields; built from intermediate
# frames so re-running the cell always produces the same result.
tvtropes_metadata = pd.concat([tvtropes_raw[['character']], tvtropes_fields], axis=1)

# Labels are assigned by position.
# NOTE(review): this assumes the JSON fields arrive in the order character,
# movie, map ID, actor - confirm against the file.
tvtropes_metadata.columns = [
    'Character role',
    'Character name',
    'Movie name',
    'Freebase character/actor map ID',
    'Actor name',
]

# Show a few random rows as a sanity check
tvtropes_metadata.sample(5)


Unnamed: 0,Character role,Character name,Movie name,Freebase character/actor map ID,Actor name
93,charmer,Han Solo,Star Wars Episode IV: A New Hope,/m/0k3r1_,Harrison Ford
124,corrupt_corporate_executive,Jack Bennett,Edge of Darkness,/m/08cwjsb,Danny Huston
231,evil_prince,Nizam,Prince of Persia: The Sands of Time,/m/04m5488,Ben Kingsley
282,granola_person,Bill Django,The Men Who Stare at Goats,/m/06679y8,Jeff Bridges
399,retired_outlaw,William Munny,Unforgiven,/m/0k7l31,Clint Eastwood


## 2) Name clusters

970 unique character names used in at least two different movies, along with 2,666 instances of those types. The ID field indexes into the Freebase character/actor map ID in character.metadata.tsv.


In [7]:
# Location of the name cluster file
name_file_name = os.path.join(PATH_FOLDER, 'name.clusters.txt')

# Each line holds a character name, a tab, then a Freebase character/actor map
# ID. The file has no header row, so header=None is required: without it the
# first instance is used as column labels and lost.
name_metadata = pd.read_csv(
    name_file_name,
    sep="\t",
    header=None,
    names=["character name", "Freebase character/actor map ID"],
)

# Show a few random rows as a sanity check
name_metadata.sample(5)


Unnamed: 0,character name,Freebase character/actor map ID
2215,Obi-Wan Kenobi,/m/0k3qzt
398,Ellen Griswold,/m/02vbk07
1782,Ron Stoppable,/m/0j_gwp
1175,Jack Powell,/m/02tbgyx
2159,Freddy Lounds,/m/03js1wd


# DATA ANALYSIS & PROCESSING

### Analysis of the impact of characters on film success (needs improvements)

In [9]:
# Distinct character names present in the name-cluster table (as a NumPy array)
unique_character_names = name_metadata.loc[:, "character name"].unique()
unique_character_names

array(['Stuart Little', 'John Doe', 'Josh Framm', 'Caspian X',
       'Apostle Peter', 'Van Wilder', 'Max Cady', 'The Emperor of China',
       'Ludo Dekker', 'Veer Pratap Singh', 'John McClane', 'Jack Cates',
       'Shorty Meeks', 'Fievel Mousekewitz', 'Kazuya Mishima',
       'Darth Vader', 'Queen Victoria', 'Billy Fish', 'Ian Hawke',
       'Ginger Fitzgerald', 'Le Chiffre', 'The Professor',
       'Jim Levenstein', 'Dave Robicheaux', "Jimmy 'The Tulip' Tudeski",
       'Pavel Chekov', 'Chow Mo-wan', 'Foghorn Leghorn', 'Walter Hill',
       'Dylan Sanders', 'The Girl', 'Sherlock Holmes', 'Emperor Nero',
       'Sonia Saxena', 'David King', 'Mr. Big', 'The Drifter', 'Molly O',
       'Judas Iscariot', 'Gloria Sullivan', 'Jennifer Parker',
       'Roger Murtaugh', 'Sharpay Evans', 'Charlie Dog', 'Dr. Vijay',
       'Aldous Snow', 'Mr. Smith', 'Ricky Baker', 'The Stranger',
       'Michelle Flaherty', 'Sidney Prescott', 'Kay Adams', 'Ted Striker',
       'Prince Charming', 'Mr. Kesuke

In [10]:
# Keep the character_metadata rows whose character name is one of unique_character_names
famous_character = character_metadata.loc[
    lambda frame: frame["Character name"].isin(unique_character_names)
]
famous_character

Unnamed: 0,Wikipedia movie ID,Freebase movie ID,Movie release date,Character name,Actor date of birth,Actor gender,Actor height (in meters),Actor ethnicity (Freebase ID),Actor name,Actor age at movie release,Freebase character/actor map ID,Freebase character ID,Freebase actor ID
386,5894429,/m/0fc8w8,2007-09-14,Young Jack,1983-07-06,M,1.790,/m/0g8_vp,Gregory Smith,24.0,/m/03jq87t,/m/0h58k54,/m/03_nv3
628,196176,/m/01bwgr,1971-11-03,Amy Sumner,1950-07-26,F,1.650,,Susan George,21.0,/m/02vc216,/m/0ch92bz,/m/02ps7rq
629,196176,/m/01bwgr,1971-11-03,David Sumner,1937-08-08,M,1.660,/m/041rx,Dustin Hoffman,34.0,/m/0k3v65,/m/0ch916z,/m/0bl2g
678,748616,/m/03813g,2003-08-14,The Girl,,F,,,Yeo-jin Ha,,/m/0bvbj8n,/m/0bvbj70,/m/0bvbj6y
710,27463222,/m/0c037x9,2010-09-12,Security Guard,,M,,,Arthur Cartwright,,/m/0gdjxvx,/m/0hgs772,/m/0gdjxt_
...,...,...,...,...,...,...,...,...,...,...,...,...,...
450168,24341999,/m/09p5mwg,2010-10-21,Jill Tuck-Kramer,1963-09-06,F,1.650,,Betsy Russell,47.0,/m/09tc32f,/m/076w6nk,/m/027xdvp
450201,43452,/m/0bx0l,1962-12-10,T. E. Lawrence,1932-08-02,M,1.910,/m/02g7sp,Peter O'Toole,30.0,/m/0j_tdt,/m/02nw8h0,/m/0h0jz
450523,11350779,/m/02r8kxp,1942-09-18,Doctor Watson,1895-02-04,M,1.829,,Nigel Bruce,,/m/02tbfd7,/m/0cgry48,/m/02l99f
450524,11350779,/m/02r8kxp,1942-09-18,Sherlock Holmes,1892-06-13,M,1.870,,Basil Rathbone,,/m/02vd3qk,/m/06rkl,/m/0hwd8


In [11]:
# Attach movie information to every selected character, keep only the columns
# needed for the revenue analysis, and discard rows with no box office revenue.
famous_character_films = (
    famous_character
    .merge(movie_metadata, on="Wikipedia movie ID")
    .loc[:, ["Character name", "Movie box office revenue", "Movie name"]]
    .dropna(subset=["Movie box office revenue"])
)
famous_character_films

Unnamed: 0,Character name,Movie box office revenue,Movie name
1,Amy Sumner,11148828.0,Straw Dogs
2,David Sumner,11148828.0,Straw Dogs
3,The Girl,9524745.0,"Spring, Summer, Fall, Winter... and Spring"
6,Clark Kent,300218018.0,Superman
7,Lex Luthor,300218018.0,Superman
...,...,...,...
4282,Harold Lee,35387212.0,A Very Harold & Kumar 3D Christmas
4283,Santa Claus,35387212.0,A Very Harold & Kumar 3D Christmas
4293,Lawrence Gordon,136100000.0,Saw VII
4294,Mark Hoffman,136100000.0,Saw VII


In [31]:
# Mean box office revenue over all films in which each character name appears
average_box_office_revenue_per_character = (
    famous_character_films
    .groupby("Character name")["Movie box office revenue"]
    .agg("mean")
)
average_box_office_revenue_per_character

Character name
Abigail Chase          4.024385e+08
Abraham Lincoln        5.505148e+07
Abraham Van Helsing    7.346186e+07
Ace Ventura            1.598015e+08
Adolf Hitler           7.364580e+07
                           ...     
Wilma Flintstone       2.005497e+08
Wonder Woman           6.064580e+06
Wyatt Earp             4.077853e+07
Young Jake             1.806224e+08
Young Jenny            8.160394e+07
Name: Movie box office revenue, Length: 764, dtype: float64

In [27]:
# Linear regression set-up: one indicator column per character name, to be
# used as regressors for the movie box office revenue.
dummy_variables = famous_character_films['Character name'].pipe(
    pd.get_dummies, prefix='Character'
)

# Integer (0/1) version of the indicators
dummy_variables_numeric = dummy_variables.astype(int)

# Original columns followed by the integer indicator columns, aligned on the row index
characters_reg = pd.concat(
    [famous_character_films, dummy_variables_numeric],
    axis="columns",
)
characters_reg

Unnamed: 0,Character name,Movie box office revenue,Movie name,Character_Abigail Chase,Character_Abraham Lincoln,Character_Abraham Van Helsing,Character_Ace Ventura,Character_Adolf Hitler,Character_Adrian Pennino,Character_Agent Augustus Eugene Gibbons,...,Character_Wild Bill Hickok,Character_Will Turner,Character_William Shakespeare,Character_William Stryker,Character_Willy Wonka,Character_Wilma Flintstone,Character_Wonder Woman,Character_Wyatt Earp,Character_Young Jake,Character_Young Jenny
1,Amy Sumner,11148828.0,Straw Dogs,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,David Sumner,11148828.0,Straw Dogs,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,The Girl,9524745.0,"Spring, Summer, Fall, Winter... and Spring",0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6,Clark Kent,300218018.0,Superman,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7,Lex Luthor,300218018.0,Superman,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4282,Harold Lee,35387212.0,A Very Harold & Kumar 3D Christmas,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4283,Santa Claus,35387212.0,A Very Harold & Kumar 3D Christmas,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4293,Lawrence Gordon,136100000.0,Saw VII,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4294,Mark Hoffman,136100000.0,Saw VII,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [29]:
# Dependent variable (y): box office revenue of the film each character row belongs to
y = characters_reg['Movie box office revenue']

# Independent variables (X): the character indicator columns.
# Every row has exactly one indicator set, so the full set of indicators sums
# to the intercept column. Keeping all of them together with a constant makes
# the design matrix rank-deficient (the dummy-variable trap) and the individual
# coefficients are not identified. One character is therefore left out as the
# reference level; each remaining coefficient is then the difference in mean
# revenue between that character and the reference character.
reference_character = dummy_variables.columns[0]
X = characters_reg[dummy_variables.columns.drop(reference_character)]

# Add the intercept column; it captures the mean revenue of the reference character
X = sm.add_constant(X)

# Fit the linear regression model
model = sm.OLS(y, X).fit()

print(model.summary())

                               OLS Regression Results                               
Dep. Variable:     Movie box office revenue   R-squared:                       0.793
Model:                                  OLS   Adj. R-squared:                  0.651
Method:                       Least Squares   F-statistic:                     5.612
Date:                      Thu, 16 Nov 2023   Prob (F-statistic):          1.01e-147
Time:                              03:18:21   Log-Likelihood:                -37773.
No. Observations:                      1885   AIC:                         7.707e+04
Df Residuals:                          1121   BIC:                         8.131e+04
Df Model:                               763                                         
Covariance Type:                  nonrobust                                         
                                                           coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------

In [32]:
# Number of highest-coefficient characters to report
N_TOP_CHARACTERS = 20

# Character coefficients only: the intercept is removed by its label rather than
# by position, so a character is never dropped by mistake if the column order
# changes (a missing 'const' raises a KeyError instead of failing silently).
coefficients = model.params.drop('const')
character_names = coefficients.index

# Create a DataFrame to store coefficients and character names
coefficients_df = pd.DataFrame({'Character': character_names, 'Coefficient': coefficients})

# Sort the DataFrame by coefficient values in descending order
coefficients_df = coefficients_df.sort_values(by='Coefficient', ascending=False)

# Select the characters with the largest coefficients
top_characters = coefficients_df.head(N_TOP_CHARACTERS)

# Legacy name kept so existing references keep working; it holds
# N_TOP_CHARACTERS rows, not 10.
top_10_characters = top_characters

# Print the top characters and their coefficients
print(top_10_characters)


                                                                 Character  \
Character_Bellatrix Lestrange                Character_Bellatrix Lestrange   
Character_White Rabbit                              Character_White Rabbit   
Character_Professor Severus Snape        Character_Professor Severus Snape   
Character_Lord Voldemort                          Character_Lord Voldemort   
Character_Harry Potter                              Character_Harry Potter   
Character_Davy Jones                                  Character_Davy Jones   
Character_Cutler Beckett                          Character_Cutler Beckett   
Character_Rubeus Hagrid                            Character_Rubeus Hagrid   
Character_Ginny Weasley                            Character_Ginny Weasley   
Character_Professor Albus Dumbledore  Character_Professor Albus Dumbledore   
Character_Hermione Granger                      Character_Hermione Granger   
Character_Captain America                        Character_Capta