# Imports and folder paths

In [4]:
import pandas as pd
import numpy as np
import os
import json
import seaborn as sns
import matplotlib.pyplot as plt
import statsmodels.api as sm

PATH_FOLDER = "MovieSummaries/"

# TRAIN DATA

## 1) Character Metadata

Metadata for 450,669 characters aligned to the movies above, extracted from the Noverber 4, 2012 dump of Freebase.  

Tab-separated; columns:
1. Wikipedia movie ID 
2. Freebase movie ID
3. Movie release date
4. Character name
5. Actor date of birth
6. Actor gender
7. Actor height (in meters)
8. Actor ethnicity (Freebase ID)
9. Actor name
10. Actor age at movie release
11. Freebase character/actor map ID
12. Freebase character ID
13. Freebase actor ID

---



In [5]:
character_file_name = os.path.join(PATH_FOLDER, 'character.metadata.tsv')
character_column = {
    'Wikipedia movie ID': int,
    'Freebase movie ID': str,
    'Movie release date': str,
    'Character name': str,
    'Actor date of birth': str,
    'Actor gender': str,
    'Actor height (in meters)': float,
    'Actor ethnicity (Freebase ID)': str,
    'Actor name': str,
    'Actor age at movie release': float,
    'Freebase character/actor map ID': str,
    'Freebase character ID': str,
    'Freebase actor ID': str
}
character_metadata = pd.read_csv(character_file_name,  sep="\t", header=None, names=character_column.keys(), dtype=character_column)

character_metadata.sample(5)

Unnamed: 0,Wikipedia movie ID,Freebase movie ID,Movie release date,Character name,Actor date of birth,Actor gender,Actor height (in meters),Actor ethnicity (Freebase ID),Actor name,Actor age at movie release,Freebase character/actor map ID,Freebase character ID,Freebase actor ID
422807,23687630,/m/06zm4kh,1951,Maria,1930-08-06,F,,,Maria Frau,20.0,/m/0klq9ps,/m/0klqmhk,/m/0klq9pw
234538,25670405,/m/09v2dxp,2007,,,,,,R.J. Sharpe,,/m/0gc7_86,,/m/0gc7x55
131265,17259609,/m/0d7r5_,1949-10-14,,1924-03-14,M,,,Bonar Colleano,25.0,/m/0gcpd5n,,/m/0gdt6g
360442,5776416,/m/0f3_wx,1973-03-17,Captain Nemo,1932-04-10,M,1.8,,Omar Sharif,40.0,/m/02tbhfq,/m/014s4v,/m/019_1h
359263,24154973,/m/07kf0d4,2010-02-09,Yuan Lie,1977-05-11,M,,,Andy On,32.0,/m/0bdt30b,/m/0gkm7gb,/m/03byjgh


## 2) Movie Metadata

Metadata for 81,741 movies, extracted from the Noverber 4, 2012 dump of Freebase. 

Tab-separated; columns:
1. Wikipedia movie ID
2. Freebase movie ID
3. Movie name
4. Movie release date
5. Movie box office revenue
6. Movie runtime
7. Movie languages (Freebase ID:name tuples)
8. Movie countries (Freebase ID:name tuples)
9. Movie genres (Freebase ID:name tuples)





In [6]:
movie_file_name = os.path.join(PATH_FOLDER, 'movie.metadata.tsv')

# Define data types for each column
movies_column = {'Wikipedia movie ID': int,
 'Freebase movie ID': str,
 'Movie name': str,
 'Movie release date': str, # can't have datetime because it doesn't support 2 date format YYYY and YYYY-MM-DD simultaneously
 'Movie box office revenue': float, # Use Int64 to allow NaNs ('problem' : they are writen as <NA>)
 'Movie runtime': float,
 'Movie languages (Freebase ID:name tuples)': str,
 'Movie countries (Freebase ID:name tuples)': str,
 'Movie genres (Freebase ID:name tuples)': str}

movie_metadata = pd.read_csv(movie_file_name,  sep="\t", header=None, names=movies_column.keys(), dtype=movies_column)

#movie_metadata = movie_metadata.assign(Year=movie_metadata["Movie release date"].apply(lambda x: str(x)[:4]).astype(int))


movie_metadata.sample(5) 

Unnamed: 0,Wikipedia movie ID,Freebase movie ID,Movie name,Movie release date,Movie box office revenue,Movie runtime,Movie languages (Freebase ID:name tuples),Movie countries (Freebase ID:name tuples),Movie genres (Freebase ID:name tuples)
48140,31278141,/m/0gfn7bd,Shadey,1985,,90.0,"{""/m/02h40lc"": ""English Language""}","{""/m/07ssc"": ""United Kingdom""}","{""/m/05p553"": ""Comedy film""}"
5310,2547451,/m/07m9l9,Animal Factory,2000-10-22,43805.0,95.0,"{""/m/02h40lc"": ""English Language""}","{""/m/09c7w0"": ""United States of America""}","{""/m/0lsxr"": ""Crime Fiction"", ""/m/07s9rl0"": ""D..."
357,207680,/m/01dh6f,Nell,1994-12-14,106683817.0,112.0,"{""/m/02h40lc"": ""English Language""}","{""/m/09c7w0"": ""United States of America""}","{""/m/07s9rl0"": ""Drama""}"
8614,31612500,/m/0gmgwwx,Berry-Strawberry,,,95.0,{},{},"{""/m/05p553"": ""Comedy film""}"
62083,20466623,/m/04zxggc,A Country Doctor,2007,,21.0,"{""/m/03_9r"": ""Japanese Language""}","{""/m/03_3d"": ""Japan""}","{""/m/02hmvc"": ""Short Film"", ""/m/0jxy"": ""Anime""..."


## 3) Plot Summaries

Plot summaries of 42,306 movies extracted from the November 2, 2012 dump of English-language Wikipedia. Each line contains the Wikipedia movie ID (which indexes into movie.metadata.tsv) followed by the summary.

In [7]:
# to be added
summary_file_name = os.path.join(PATH_FOLDER, 'plot_summaries.txt')
summary_metadata = pd.read_csv(summary_file_name,  sep="\t")

summary_metadata.columns = [ "Movie ID",
                              "Movie Summary" ]

summary_metadata["Movie Summary"] = summary_metadata["Movie Summary"].astype("string")

summary_metadata.sample(5)

Unnamed: 0,Movie ID,Movie Summary
4701,14264650,"The Kid from Cleveland tells the story of a ""t..."
25971,18004401,A singer Pappan ([[Rahman is killed but Yamar...
15903,21413341,Junior Asparagus is anxious for the new show p...
31679,1511805,The film consists of four stories in which hum...
35263,8928152,{{Plot}} Ten year-old India Opal Buloni has j...


## 4) Stanford CoreNLP Plot Summaries

The plot summaries, run through the Stanford CoreNLP pipeline (tagging, parsing, NER and coref). Each filename begins with the Wikipedia movie ID (which indexes into movie.metadata.tsv).

In [8]:
# to be added

# TEST DATA

## 1) Tvtropes clusters

72 character types drawn from tvtropes.com, along with 501 instances of those types. The ID field indexes into the Freebase character/actor map ID in character.metadata.tsv.


In [9]:
tvtropes_file_name = os.path.join(PATH_FOLDER, 'tvtropes.clusters.txt')
tvtropes_metadata = pd.read_csv(tvtropes_file_name, sep="\t")

# define columns
tvtropes_metadata.columns = [ 'character',
                             'data' ]

# Convert the 'data' column to strings
tvtropes_metadata['data'] = tvtropes_metadata['data'].apply(lambda x: str(x))

# Extract and convert the JSON data to separate columns
tvtropes_metadata = pd.DataFrame([(character_type, json.loads(data)) for character_type, data
               in zip(tvtropes_metadata['character'], tvtropes_metadata['data'])])

# redefine columns lost from the previous operation
tvtropes_metadata.columns = ['character', 'data']

# Expand the 'data' column into separate columns
tvtropes_metadata = pd.concat([tvtropes_metadata, tvtropes_metadata['data'].apply(pd.Series)], axis=1)

# Drop the original 'data' column
tvtropes_metadata.drop('data', axis=1, inplace=True)

tvtropes_metadata.columns = [ 'Character role',
                               'Character name',
                               'Movie name',
                               'Freebase character/actor map ID', 
                               'Actor name' ]

tvtropes_metadata.sample(5)


Unnamed: 0,Character role,Character name,Movie name,Freebase character/actor map ID,Actor name
476,trickster,Tyler Durden,Fight Club,/m/0jy5tj,Brad Pitt
333,loser_protagonist,Dennis,"Run, Fat Boy, Run",/m/03lvx8n,Simon Pegg
427,slacker,Michele Weinberger,Romy and Michele's High School Reunion,/m/0k2p5j,Lisa Kudrow
144,crazy_jealous_guy,Tony Montana,Scarface,/m/02_8_nx,Al Pacino
344,loveable_rogue,Harold Hill,The Music Man,/m/04hv61w,Matthew Broderick


## 2) Name clusters

970 unique character names used in at least two different movies, along with 2,666 instances of those types. The ID field indexes into the Freebase character/actor map ID in character.metadata.tsv.


In [10]:
name_file_name = os.path.join(PATH_FOLDER, 'name.clusters.txt')
name_metadata = pd.read_csv(name_file_name,  sep="\t")

name_metadata.columns = [ "character name",
                                "Freebase character/actor map ID" ]

name_metadata.sample(5)


Unnamed: 0,character name,Freebase character/actor map ID
2092,Danny Ocean,/m/0k1fh0
726,Eden Sinclair,/m/0gchg1g
1875,Laxmi Devi,/m/0jznk7
2274,Rocky Balboa,/m/01xxqty
1273,Daffy Duck,/m/0hynk74


# DATA ANALYSIS & PROCESSING

### Inflation

In [11]:
# import inflation.xlsx from 'external_dataset' directory, take only 2 first columns begining line 11 (which are columns names)
inflation = pd.read_excel("external_dataset/inflation.xlsx", header=11)

  warn("Workbook contains no default style, apply openpyxl's default")


In [12]:
inflation["2022 multiplier"] = inflation.iloc[-1].Annual / inflation["Annual"]
inflation

Unnamed: 0,Year,Annual,2022 multiplier
0,1913,9.900,29.561111
1,1914,10.000,29.265500
2,1915,10.100,28.975743
3,1916,10.900,26.849083
4,1917,12.800,22.863672
...,...,...,...
105,2018,251.107,1.165459
106,2019,255.657,1.144717
107,2020,258.811,1.130767
108,2021,270.970,1.080027


### Analysis of impact of charcters on film success (needs improvments)

In [13]:
# get unique character names from name_metadata
unique_character_names = name_metadata["character name"].unique()
unique_character_names

array(['Stuart Little', 'John Doe', 'Josh Framm', 'Caspian X',
       'Apostle Peter', 'Van Wilder', 'Max Cady', 'The Emperor of China',
       'Ludo Dekker', 'Veer Pratap Singh', 'John McClane', 'Jack Cates',
       'Shorty Meeks', 'Fievel Mousekewitz', 'Kazuya Mishima',
       'Darth Vader', 'Queen Victoria', 'Billy Fish', 'Ian Hawke',
       'Ginger Fitzgerald', 'Le Chiffre', 'The Professor',
       'Jim Levenstein', 'Dave Robicheaux', "Jimmy 'The Tulip' Tudeski",
       'Pavel Chekov', 'Chow Mo-wan', 'Foghorn Leghorn', 'Walter Hill',
       'Dylan Sanders', 'The Girl', 'Sherlock Holmes', 'Emperor Nero',
       'Sonia Saxena', 'David King', 'Mr. Big', 'The Drifter', 'Molly O',
       'Judas Iscariot', 'Gloria Sullivan', 'Jennifer Parker',
       'Roger Murtaugh', 'Sharpay Evans', 'Charlie Dog', 'Dr. Vijay',
       'Aldous Snow', 'Mr. Smith', 'Ricky Baker', 'The Stranger',
       'Michelle Flaherty', 'Sidney Prescott', 'Kay Adams', 'Ted Striker',
       'Prince Charming', 'Mr. Kesuke

In [14]:
# select from character_metadata the character names that appear in the character_count
famous_character = character_metadata[character_metadata["Character name"].isin(unique_character_names)]
famous_character

Unnamed: 0,Wikipedia movie ID,Freebase movie ID,Movie release date,Character name,Actor date of birth,Actor gender,Actor height (in meters),Actor ethnicity (Freebase ID),Actor name,Actor age at movie release,Freebase character/actor map ID,Freebase character ID,Freebase actor ID
386,5894429,/m/0fc8w8,2007-09-14,Young Jack,1983-07-06,M,1.790,/m/0g8_vp,Gregory Smith,24.0,/m/03jq87t,/m/0h58k54,/m/03_nv3
628,196176,/m/01bwgr,1971-11-03,Amy Sumner,1950-07-26,F,1.650,,Susan George,21.0,/m/02vc216,/m/0ch92bz,/m/02ps7rq
629,196176,/m/01bwgr,1971-11-03,David Sumner,1937-08-08,M,1.660,/m/041rx,Dustin Hoffman,34.0,/m/0k3v65,/m/0ch916z,/m/0bl2g
678,748616,/m/03813g,2003-08-14,The Girl,,F,,,Yeo-jin Ha,,/m/0bvbj8n,/m/0bvbj70,/m/0bvbj6y
710,27463222,/m/0c037x9,2010-09-12,Security Guard,,M,,,Arthur Cartwright,,/m/0gdjxvx,/m/0hgs772,/m/0gdjxt_
...,...,...,...,...,...,...,...,...,...,...,...,...,...
450168,24341999,/m/09p5mwg,2010-10-21,Jill Tuck-Kramer,1963-09-06,F,1.650,,Betsy Russell,47.0,/m/09tc32f,/m/076w6nk,/m/027xdvp
450201,43452,/m/0bx0l,1962-12-10,T. E. Lawrence,1932-08-02,M,1.910,/m/02g7sp,Peter O'Toole,30.0,/m/0j_tdt,/m/02nw8h0,/m/0h0jz
450523,11350779,/m/02r8kxp,1942-09-18,Doctor Watson,1895-02-04,M,1.829,,Nigel Bruce,,/m/02tbfd7,/m/0cgry48,/m/02l99f
450524,11350779,/m/02r8kxp,1942-09-18,Sherlock Holmes,1892-06-13,M,1.870,,Basil Rathbone,,/m/02vd3qk,/m/06rkl,/m/0hwd8


In [15]:
#merge famous_character with movie_metadata
famous_character_films = pd.merge(famous_character, movie_metadata, on="Wikipedia movie ID")
#keep interesting columns
famous_character_films = famous_character_films[["Character name", "Movie box office revenue", "Movie name",]]
# delete lines where Movie box office revenue is NaN
famous_character_films = famous_character_films.dropna(subset=["Movie box office revenue"])
#famous character films with "Character name == 'Young Adam'"
famous_character_films[famous_character_films["Movie name"] == "Young Adam"]

Unnamed: 0,Character name,Movie box office revenue,Movie name
3941,Joe Taylor,2561820.0,Young Adam


In [16]:
#liner regression analysis of all charcter names on their impact on Movie box office revenue
dummy_variables = pd.get_dummies(famous_character_films, columns=['Character name', 'Movie name'], prefix=['Character', 'Movie'])
dummy_variables

Unnamed: 0,Movie box office revenue,Character_Abigail Chase,Character_Abraham Lincoln,Character_Abraham Van Helsing,Character_Ace Ventura,Character_Adolf Hitler,Character_Adrian Pennino,Character_Agent Augustus Eugene Gibbons,Character_Agent Smith,Character_Aidan Keller,...,Movie_XXX,Movie_XXX: State of the Union / XXX: The Next Level,Movie_Yogi Bear,Movie_Yossi & Jagger,Movie_Young Adam,Movie_Young Bess,Movie_Young Frankenstein,Movie_Young Sherlock Holmes,Movie_Zero Effect,Movie_Zoom
1,11148828.0,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2,11148828.0,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
3,9524745.0,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
6,300218018.0,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
7,300218018.0,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4282,35387212.0,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
4283,35387212.0,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
4293,136100000.0,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
4294,136100000.0,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


In [20]:

dummy_variables_numeric = dummy_variables.astype(int)

# Concatenate the dummy variables with the original DataFrame
characters_reg = pd.concat([famous_character_films, dummy_variables_numeric], axis=1)
characters_reg

Unnamed: 0,Character name,Movie box office revenue,Movie name,Movie box office revenue.1,Character_Abigail Chase,Character_Abraham Lincoln,Character_Abraham Van Helsing,Character_Ace Ventura,Character_Adolf Hitler,Character_Adrian Pennino,...,Movie_XXX,Movie_XXX: State of the Union / XXX: The Next Level,Movie_Yogi Bear,Movie_Yossi & Jagger,Movie_Young Adam,Movie_Young Bess,Movie_Young Frankenstein,Movie_Young Sherlock Holmes,Movie_Zero Effect,Movie_Zoom
1,Amy Sumner,11148828.0,Straw Dogs,11148828,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,David Sumner,11148828.0,Straw Dogs,11148828,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,The Girl,9524745.0,"Spring, Summer, Fall, Winter... and Spring",9524745,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6,Clark Kent,300218018.0,Superman,300218018,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7,Lex Luthor,300218018.0,Superman,300218018,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4282,Harold Lee,35387212.0,A Very Harold & Kumar 3D Christmas,35387212,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4283,Santa Claus,35387212.0,A Very Harold & Kumar 3D Christmas,35387212,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4293,Lawrence Gordon,136100000.0,Saw VII,136100000,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4294,Mark Hoffman,136100000.0,Saw VII,136100000,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [21]:
# Define the independent variables (X) and the dependent variable (y)
X = characters_reg[dummy_variables.columns]
y = characters_reg['Movie box office revenue']


# Add a constant to the independent variables (required for the regression model)
X = sm.add_constant(X)

# Fit the linear regression model
model = sm.OLS(y, X).fit()

print(model.summary())

ValueError: shapes (1885,2) and (1885,2) not aligned: 2 (dim 1) != 1885 (dim 0)

In [None]:
# Step 3: Choosing a Regression Model
# Linear Regression with Scikit-Learn
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

# Select Features and Target
X = df_encoded.drop('BoxOfficeRevenue', axis=1)  # Features
y = df_encoded['BoxOfficeRevenue']  # Target

# Step 4: Split Data into Training and Testing Sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Step 5: Linear Regression with Scikit-Learn
model = LinearRegression()
model.fit(X_train, y_train)

# Step 6: Evaluate the Model
predictions = model.predict(X_test)
mse = mean_squared_error(y_test, predictions)
print(f'Mean Squared Error: {mse}')

# Optionally, you can print the coefficients
print('Coefficients:', model.coef_)

In [None]:
# Extract coefficients and corresponding character names
coefficients = model.params[1:]  # Exclude the intercept
character_names = coefficients.index

# Create a DataFrame to store coefficients and character names
coefficients_df = pd.DataFrame({'Character': character_names, 'Coefficient': coefficients})

# Sort the DataFrame by coefficient values in descending order
coefficients_df = coefficients_df.sort_values(by='Coefficient', ascending=False)

# Select the top 10 characters
top_10_characters = coefficients_df.head(20)

# Print or visualize the top 10 characters and their coefficients
print(top_10_characters)


In [None]:
# compute average box office revenue per character
average_box_office_revenue_per_character = famous_character_films.groupby("Character name")["Movie box office revenue"].mean()
average_box_office_revenue_per_character

In [None]:
import pandas as pd

# Assuming 'Languages' is the column containing lists of languages for each film
data = {'Film': ['film1', 'film2', 'film3'],
        'Languages': [['English', 'French'], ['English'], ['German']]}

df = pd.DataFrame(data)

# Step 1: Get the unique set of languages
unique_languages = set(language for languages in df['Languages'] for language in languages)

# Step 2: Create binary columns for each language
for language in unique_languages:
    df[language] = df['Languages'].apply(lambda x: language in x)

# Step 3: Drop the original 'Languages' column
df = df.drop('Languages', axis=1)

# Resulting DataFrame
df
