In [12]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
import sys
# Directory prefix for every dataset file read below; the cells build paths by
# plain string concatenation, so the trailing slash is required.
data_folder = './MovieSummaries/'

import ast


# 1. MOVIES

## 1.1. METADATA

In [None]:
# Load the movie metadata table. The file is read as tab-separated with no
# header row (header=None), so the nine column names are supplied explicitly.
movies_metadata = pd.read_csv(
    data_folder+'movie.metadata.tsv',
    sep='\t',
    header=None,
    names=['wikipedia_movie_id', 'freebase_movie_id', 'movie_name', 'release_date',
           'box_office_revenue', 'movie_runtime', 'languages', 'countries', 'genres'],
)

# DataFrame.info() writes its summary to stdout itself and returns None, so it
# is called directly; wrapping it in print() appends a stray "None" line.
movies_metadata.info()

### 1.1.1. GENRES

In [None]:

# Each entry of the 'genres' column is the string form of a dict literal;
# parse every entry and pool the dict values (the genre names) together.
genre_strings = movies_metadata['genres'].values

# The set comprehension removes duplicates; sorted() then yields an
# alphabetically ordered list.
unique_genres = sorted({
    genre_name
    for genre_str in genre_strings
    for genre_name in ast.literal_eval(genre_str).values()
})

print(unique_genres)
print(len(unique_genres))

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 81741 entries, 0 to 81740
Data columns (total 9 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   wikipedia_movie_id  81741 non-null  int64  
 1   freebase_movie_id   81741 non-null  object 
 2   movie_name          81741 non-null  object 
 3   release_date        74839 non-null  object 
 4   box_office_revenue  8401 non-null   float64
 5   movie_runtime       61291 non-null  float64
 6   languages           81741 non-null  object 
 7   countries           81741 non-null  object 
 8   genres              81741 non-null  object 
dtypes: float64(2), int64(1), object(6)
memory usage: 5.6+ MB
None
['Absurdism', 'Acid western', 'Action', 'Action Comedy', 'Action Thrillers', 'Action/Adventure', 'Addiction Drama', 'Adult', 'Adventure', 'Adventure Comedy', 'Airplanes and airports', 'Albino bias', 'Alien Film', 'Alien invasion', 'Americana', 'Animal Picture', 'Animals', 'Animated Mus

### 1.1.2. LANGUAGES

In [21]:
# Each entry of the 'languages' column is the string form of a dict literal;
# the dict values are the language names collected here.
languages_strings = movies_metadata['languages'].values

unique_languages = set()
for language_dict in map(ast.literal_eval, languages_strings):
    # Only the values are kept; the dict keys are discarded.
    unique_languages |= set(language_dict.values())

# Replace the set with an alphabetically sorted list for stable output.
unique_languages = sorted(unique_languages)

print(unique_languages)
print(len(unique_languages))

['Aboriginal Malay languages', 'Aceh Language', 'Afrikaans Language', 'Akan Language', 'Albanian language', 'Algonquin Language', 'American English', 'American Sign Language', 'Amharic Language', 'Ancient Greek', 'Apache, Western Language', 'Arabic Language', 'Aramaic language', 'Armenian Language', 'Assamese Language', 'Assyrian Neo-Aramaic Language', 'Assyrian language', 'Australian Aboriginal Pidgin English', 'Australian English', 'Awadhi Language', 'Azerbaijani language', 'Bambara language', 'Banyumasan language', 'Belarusian language', 'Bengali Language', 'Bhojpuri Language', 'Bosnian language', 'Brazilian Portuguese', 'Bulgarian Language', 'Burmese Language', 'Cantonese', 'Catalan language', 'Cebuano language', 'Chadian Arabic', 'Chechen Language', 'Chewa language', 'Cheyenne Language', 'Chhattisgarhi Language', 'Chinese language', 'Chinese, Hakka Language', 'Chinese, Jinyu Language', 'Classical Arabic', 'Corsican Language', 'Cree language', 'Croatian language', 'Crow Language', 

### 1.1.3. COUNTRIES

In [22]:
# Each entry of the 'countries' column is the string form of a dict literal;
# the dict values are pooled into one de-duplicated, sorted list.
# NOTE(review): the saved output of this cell contains values that are not
# countries (e.g. 'Crime', 'German Language') -- the raw column likely needs
# cleaning before it is used for analysis.
countries_strings = movies_metadata['countries'].values

parsed_country_dicts = (ast.literal_eval(country_str) for country_str in countries_strings)
unique_countries = sorted({
    country_name
    for country_dict in parsed_country_dicts
    for country_name in country_dict.values()
})

print(unique_countries)
print(len(unique_countries))

['Afghanistan', 'Albania', 'Algeria', 'Argentina', 'Armenia', 'Aruba', 'Australia', 'Austria', 'Azerbaijan', 'Bahamas', 'Bahrain', 'Bangladesh', 'Belgium', 'Bhutan', 'Bolivia', 'Bosnia and Herzegovina', 'Brazil', 'Bulgaria', 'Burkina Faso', 'Burma', 'Cambodia', 'Cameroon', 'Canada', 'Chile', 'China', 'Colombia', 'Congo', 'Costa Rica', 'Crime', 'Croatia', 'Cuba', 'Cyprus', 'Czech Republic', 'Czechoslovakia', 'Democratic Republic of the Congo', 'Denmark', 'Egypt', 'England', 'Estonia', 'Ethiopia', 'Federal Republic of Yugoslavia', 'Finland', 'France', 'Georgia', 'Georgian SSR', 'German Democratic Republic', 'German Language', 'Germany', 'Greece', 'Guinea', 'Guinea-Bissau', 'Haiti', 'Hong Kong', 'Hungary', 'Iceland', 'India', 'Indonesia', 'Iran', 'Iraq', 'Iraqi Kurdistan', 'Ireland', 'Isle of Man', 'Israel', 'Italy', 'Jamaica', 'Japan', 'Jordan', 'Kenya', 'Kingdom of Great Britain', 'Kingdom of Italy', 'Korea', 'Kuwait', 'Lebanon', 'Libya', 'Lithuania', 'Luxembourg', 'Macau', 'Malayalam L

## 1.2. PLOT SUMMARIES

In [15]:
# Read the plot summaries file line by line. Each non-empty line is expected
# to hold a movie id and its summary separated by a tab.
summary_rows = []
with open(data_folder+'plot_summaries.txt', 'r', encoding='utf-8') as file:
    for line in file:
        line = line.strip()
        if not line:
            # Skip blank lines instead of turning them into empty rows.
            continue
        # Split on the FIRST tab only: a tab inside the summary text would
        # otherwise yield more than two fields and make the two-column
        # DataFrame construction below raise.
        summary_rows.append(line.split('\t', 1))

# Convert list of [id, summary] pairs into a DataFrame.
# NOTE: 'Wikipedia_movie_id' is a string column here (dtype object), while the
# ids loaded with pd.read_csv in the metadata tables are int64 -- cast one side
# before joining on it.
plot_summaries = pd.DataFrame(summary_rows, columns=['Wikipedia_movie_id', 'plot_summary'])

# DataFrame.info() prints its summary itself and returns None, so it is not
# wrapped in print() (which would add a stray "None" line to the output).
plot_summaries.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 42306 entries, 0 to 42305
Data columns (total 2 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   Wikipedia_movie_id  42306 non-null  object
 1   plot_summary        42306 non-null  object
dtypes: object(2)
memory usage: 661.2+ KB
None


# 2. CHARACTERS

## 2.1. METADATA

In [16]:
# Load the character metadata table. The file is read as tab-separated with no
# header row (header=None), so the thirteen column names are supplied here.
characters_metadata = pd.read_csv(
    data_folder+'character.metadata.tsv',
    sep='\t',
    header=None,
    names=['Wikipedia_movie_id', 'Freebase_movie_id', 'Release_date', 'Character_name',
           'Date_of_birth', 'Gender', 'Height', 'Ethnicity', 'Actor_name', 'Actor_age',
           'Char_actor_id', 'Freebase_char_id', 'Freebase_actor_id'],
)

# DataFrame.info() writes its summary to stdout itself and returns None, so it
# is called directly; wrapping it in print() appends a stray "None" line.
characters_metadata.info()



<class 'pandas.core.frame.DataFrame'>
RangeIndex: 450669 entries, 0 to 450668
Data columns (total 13 columns):
 #   Column              Non-Null Count   Dtype  
---  ------              --------------   -----  
 0   Wikipedia_movie_id  450669 non-null  int64  
 1   Freebase_movie_id   450669 non-null  object 
 2   Release_date        440674 non-null  object 
 3   Character_name      192794 non-null  object 
 4   Date_of_birth       344524 non-null  object 
 5   Gender              405060 non-null  object 
 6   Height              154824 non-null  float64
 7   Ethnicity           106058 non-null  object 
 8   Actor_name          449441 non-null  object 
 9   Actor_age           292556 non-null  float64
 10  Char_actor_id       450669 non-null  object 
 11  Freebase_char_id    192804 non-null  object 
 12  Freebase_actor_id   449854 non-null  object 
dtypes: float64(2), int64(1), object(10)
memory usage: 44.7+ MB
None


### 2.1.1. GENDER

### 2.1.2. HEIGHT

### 2.1.3. ETHNICITY

### 2.1.4. AGE

### 2.1.5. ACTORS IN THE SAME MOVIE

## 2.2. CHARACTER TYPES

In [18]:
# Load the character-type clusters: a tab-separated file without a header row,
# read into two columns named 'character_type' and 'metadata'.
character_type = pd.read_csv(data_folder+'tvtropes.clusters.txt', sep='\t', header=None,
                             names=['character_type', 'metadata'])

# DataFrame.info() prints its summary itself and returns None, so it is not
# wrapped in print() (which would add a stray "None" line to the output).
character_type.info()

# Compute the distinct character types once and reuse the result for both the
# listing and the count.
unique_character_types = character_type['character_type'].unique()
print(unique_character_types)
print(len(unique_character_types))


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 501 entries, 0 to 500
Data columns (total 2 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   character_type  501 non-null    object
 1   metadata        501 non-null    object
dtypes: object(2)
memory usage: 8.0+ KB
None
['absent_minded_professor' 'adventurer_archaeologist'
 'arrogant_kungfu_guy' 'big_man_on_campus' 'bounty_hunter'
 'brainless_beauty' 'broken_bird' 'bromantic_foil'
 'bruiser_with_a_soft_center' 'bully' 'byronic_hero' 'casanova'
 'chanteuse' 'charmer' 'child_prodigy' 'classy_cat_burglar'
 'consummate_professional' 'corrupt_corporate_executive' 'coward'
 'crazy_jealous_guy' 'crazy_survivalist' 'cultured_badass'
 'dean_bitterman' 'dirty_cop' 'ditz' 'doormat' 'drill_sargeant_nasty'
 'dumb_blonde' 'dumb_muscle' 'eccentric_mentor' 'egomaniac_hunter'
 'evil_prince' 'fastest_gun_in_the_west' 'father_to_his_men' 'final_girl'
 'gadgeteer_genius' 'gentleman_thief' 'granola_pers

## 2.3. CHARACTER NAMES 

In [20]:
# Load the character-name clusters: a tab-separated file without a header row,
# read into two columns named 'Cluster_name' and 'Char_actor_id'.
names = pd.read_csv(data_folder+'name.clusters.txt', sep='\t', header=None,
                    names=['Cluster_name', 'Char_actor_id'])

# DataFrame.info() prints its summary itself and returns None, so it is not
# wrapped in print() (which would add a stray "None" line to the output).
names.info()

# List every distinct cluster name and how many there are; unique() is
# evaluated once and reused for both prints.
unique_cluster_names = names['Cluster_name'].unique()
print(unique_cluster_names)
print(len(unique_cluster_names))

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2666 entries, 0 to 2665
Data columns (total 2 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   Cluster_name   2666 non-null   object
 1   Char_actor_id  2666 non-null   object
dtypes: object(2)
memory usage: 41.8+ KB
None
['Stuart Little' 'John Doe' 'Josh Framm' 'Caspian X' 'Apostle Peter'
 'Van Wilder' 'Max Cady' 'The Emperor of China' 'Ludo Dekker'
 'Veer Pratap Singh' 'John McClane' 'Jack Cates' 'Shorty Meeks'
 'Fievel Mousekewitz' 'Kazuya Mishima' 'Darth Vader' 'Queen Victoria'
 'Billy Fish' 'Ian Hawke' 'Ginger Fitzgerald' 'Le Chiffre' 'The Professor'
 'Jim Levenstein' 'Dave Robicheaux' "Jimmy 'The Tulip' Tudeski"
 'Pavel Chekov' 'Chow Mo-wan' 'Foghorn Leghorn' 'Walter Hill'
 'Dylan Sanders' 'The Girl' 'Sherlock Holmes' 'Emperor Nero'
 'Sonia Saxena' 'David King' 'Mr. Big' 'The Drifter' 'Molly O'
 'Judas Iscariot' 'Gloria Sullivan' 'Jennifer Parker' 'Roger Murtaugh'
 'Sharpay Evans