In [33]:
import pandas as pd
import matplotlib.pyplot as plt
import json

DATA_FOLDER = "MovieSummaries/"

In [41]:
# Read the text file into a DataFrame
summary_df = pd.read_csv(DATA_FOLDER + "plot_summaries.txt", sep='\t', header=None, names=["Wikipedia movie ID", "Summary"])

# Display the first few rows of the DataFrame

first_summary = summary_df.loc[0, 'Summary']
print(first_summary)

summary_df.head()


Shlykov, a hard-working taxi driver and Lyosha, a saxophonist, develop a bizarre love-hate relationship, and despite their prejudices, realize they aren't so different after all.


Unnamed: 0,Wikipedia movie ID,Summary
0,23890098,"Shlykov, a hard-working taxi driver and Lyosha..."
1,31186339,The nation of Panem consists of a wealthy Capi...
2,20663735,Poovalli Induchoodan is sentenced for six yea...
3,2231378,"The Lemon Drop Kid , a New York City swindler,..."
4,595909,Seventh-day Adventist Church pastor Michael Ch...


In [74]:
import re

# Read the TSV file into a DataFrame
movie_df = pd.read_csv(DATA_FOLDER + "movie.metadata.tsv", sep='\t', header=None,
                 names=["Wikipedia movie ID", "Freebase movie ID", "Movie name",
                        "Movie release date", "Movie box office revenue",
                        "Movie runtime", "Movie languages", "Movie countries", "Movie genres"])

# Remove the "/m/" prefix from the "Freebase movie ID" and "Movie languages" and "Movie genres" columns
columns_to_remove_prefix = ['Freebase movie ID','Movie languages','Movie genres','Movie countries']
for column in columns_to_remove_prefix:
    movie_df[column] = movie_df[column].str.replace('/m/', '', regex=False)

# Define a function to remove codes and extra characters
def remove_codes_and_characters(column):
    cleaned_column = re.sub(r'"\w+":\s*', '', column)  # Remove codes
    return re.sub(r'[{}"]', '', cleaned_column)  # Remove {, }, and double quotes

# Apply the function to the specified columns
columns_to_clean = ["Movie languages", "Movie countries", "Movie genres"]

for column in columns_to_clean:
    movie_df[column] = movie_df[column].apply(remove_codes_and_characters)

# Display the first few rows of the DataFrame
movie_df.head()

Unnamed: 0,Wikipedia movie ID,Freebase movie ID,Movie name,Movie release date,Movie box office revenue,Movie runtime,Movie languages,Movie countries,Movie genres
0,975900,03vyhn,Ghosts of Mars,2001-08-24,14010832.0,98.0,English Language,United States of America,"Thriller, Science Fiction, Horror, Adventure, ..."
1,3196793,08yl5d,Getting Away with Murder: The JonBenét Ramsey ...,2000-02-16,,95.0,English Language,United States of America,"Mystery, Biographical film, Drama, Crime Drama"
2,28463795,0crgdbh,Brun bitter,1988,,83.0,Norwegian Language,Norway,"Crime Fiction, Drama"
3,9363483,0285_cd,White Of The Eye,1987,,110.0,English Language,United Kingdom,"Thriller, Erotic thriller, Psychological thriller"
4,261236,01mrr1,A Woman in Flames,1983,,106.0,German Language,Germany,Drama


In [46]:
# Read the gzipped TSV file into a DataFrame
character_df = pd.read_csv(DATA_FOLDER+"character.metadata.tsv", sep='\t', header=None,
                 names=["Wikipedia movie ID", "Freebase movie ID", "Movie release date",
                        "Character name", "Actor date of birth", "Actor gender",
                        "Actor height (in meters)", "Actor ethnicity (Freebase ID)",
                        "Actor name", "Actor age at movie release", "Freebase character/actor map ID",
                        "Freebase character ID", "Freebase actor ID"])

# Remove the "/m/" prefix from specific columns
columns_to_remove_prefix = ["Freebase movie ID", "Actor ethnicity (Freebase ID)", "Freebase character ID", "Freebase actor ID","Freebase character/actor map ID"]
for column in columns_to_remove_prefix:
    character_df[column] = character_df[column].str.replace('/m/', '', regex=False)

# Display the first few rows of the DataFrame
character_df.head()

Unnamed: 0,Wikipedia movie ID,Freebase movie ID,Movie release date,Character name,Actor date of birth,Actor gender,Actor height (in meters),Actor ethnicity (Freebase ID),Actor name,Actor age at movie release,Freebase character/actor map ID,Freebase character ID,Freebase actor ID
0,975900,03vyhn,2001-08-24,Akooshay,1958-08-26,F,1.62,,Wanda De Jesus,42.0,0bgchxw,0bgcj3x,03wcfv7
1,975900,03vyhn,2001-08-24,Lieutenant Melanie Ballard,1974-08-15,F,1.78,044038p,Natasha Henstridge,27.0,0jys3m,0bgchn4,0346l4
2,975900,03vyhn,2001-08-24,Desolation Williams,1969-06-15,M,1.727,0x67,Ice Cube,32.0,0jys3g,0bgchn_,01vw26l
3,975900,03vyhn,2001-08-24,Sgt Jericho Butler,1967-09-12,M,1.75,,Jason Statham,33.0,02vchl6,0bgchnq,034hyc
4,975900,03vyhn,2001-08-24,Bashira Kincaid,1977-09-25,F,1.65,,Clea DuVall,23.0,02vbb3r,0bgchp9,01y9xg


In [70]:
# Read the text file into a DataFrame
char_cluster_df = pd.read_csv(DATA_FOLDER+"name.clusters.txt", sep='\t', header=None, names=['Name', 'ID'])

char_cluster_df["ID"] = char_cluster_df["ID"].str.replace('/m/', '', regex=False)

# Display the first few rows of the DataFrame
char_cluster_df.head()

Unnamed: 0,Name,ID
0,Stuart Little,0k3w9c
1,Stuart Little,0k3wcx
2,Stuart Little,0k3wbn
3,John Doe,0jyg35
4,John Doe,0k2_zn


In [69]:

# Read the text file into a DataFrame
tvtropes_df = pd.read_csv(DATA_FOLDER + "tvtropes.clusters.txt", delimiter='\t', header=None, names=['CharType', 'Values'])

# Define a function to extract specific JSON values
def extract_json_values(json_str, key):
    try:
        data = json.loads(json_str)
        return data.get(key, None)
    except json.JSONDecodeError:
        return None

# Extract specific JSON values into separate columns
tvtropes_df['Char'] = tvtropes_df['Values'].apply(lambda x: extract_json_values(x, 'char'))
tvtropes_df['Movie'] = tvtropes_df['Values'].apply(lambda x: extract_json_values(x, 'movie'))
tvtropes_df['ID'] = tvtropes_df['Values'].apply(lambda x: extract_json_values(x, 'id'))
tvtropes_df['Actor'] = tvtropes_df['Values'].apply(lambda x: extract_json_values(x, 'actor'))

tvtropes_df["ID"] = tvtropes_df["ID"].str.replace('/m/', '', regex=False)

# Delete the ClusterID column
tvtropes_df.drop(columns=['Values'], inplace=True)

# Display the first few rows of the DataFrame
tvtropes_df.head()

Unnamed: 0,CharType,Char,Movie,ID,Actor
0,absent_minded_professor,Professor Philip Brainard,Flubber,0jy9q0,Robin Williams
1,absent_minded_professor,Professor Keenbean,Richie Rich,02vchl3,Michael McShane
2,absent_minded_professor,Dr. Reinhardt Lane,The Shadow,0k6fkc,Ian McKellen
3,absent_minded_professor,Dr. Harold Medford,Them!,0k6_br,Edmund Gwenn
4,absent_minded_professor,Daniel Jackson,Stargate,0k3rhh,James Spader


Age-related desirability in movies

In [84]:
# Filter the DataFrame to find rows with specified genre keywords
filtered_df = movie_df[movie_df["Movie genres"].str.contains('erotic|romantic|romance', case=False, na=False)]

# Use a set to store unique genre values
unique_genres = set()

# Iterate through the filtered DataFrame and add unique genres to the set
for genre in filtered_df["Movie genres"]:
    unique_genres.update(genre.split(', '))  # Assumes genres are separated by a comma and a space

# Print the unique genre values
for genre in unique_genres:
    print(genre)

Hardcore pornography
New Hollywood
Television movie
Black comedy
Children's
Coming-of-age film
Romantic comedy
Heavenly Comedy
Romantic drama
Tamil cinema
Political drama
Creature Film
Melodrama
Escape Film
Pinku eiga
Science Fiction
Gothic Film
Dystopia
Sword and Sandal
Essay Film
Film noir
Auto racing
Thriller
Erotica
Political thriller
Blaxploitation
Homoeroticism
Tollywood
Comedy Thriller
Film-Opera
Computer Animation
Workplace Comedy
Romance Film
Beach Film
Ealing Comedies
Comedy of manners
Time travel
Tragedy
Christian film
Neo-noir
Stoner film
Expressionism
Natural disaster
Backstage Musical
Musical comedy
Psycho-biddy
Feminist Film
Libraries and librarians
Americana
Spaghetti Western
Anime
Japanese Movies
Sports
Psychological thriller
Satire
Mystery
Film \u00e0 clef
Superhero
Female buddy film
Buddy film
Holiday Film
Indie
Illnesses & Disabilities
Hagiography
Doomsday film
Boxing
Children's/Family
Slapstick
Steampunk
Comedy-drama
Anthology
Chase Movie
Zombie Film
Comedy film
Hi

In [89]:
# Iterate through the list and print values with "Erotic" or "Romantic"
love_genre = []
for genre in unique_genres:
    if "Erotic" in genre or "Roman" in genre:
        love_genre.append(genre)
print(love_genre)

['Romantic comedy', 'Romantic drama', 'Erotica', 'Romance Film', 'Erotic thriller', 'Erotic Drama', 'Romantic fantasy', 'Romantic thriller']
