# Data loading

In [38]:
import json
import os

import matplotlib.pyplot as plt
import pandas as pd

## *Characters*

In [None]:
# Load the character metadata. The file is tab-separated and is read with
# header=None, so its first line is treated as data and the columns stay
# unnamed until labels are assigned.
# NOTE(review): this absolute path only exists on one machine; consider
# deriving it from a configurable data directory.
characters_df = pd.read_csv('/home/sara/Dropbox/epfl/master/MA1/ADA/project/data/MovieSummaries/character.metadata.tsv', sep='\t', header=None)
# No reset_index() is needed here: without index_col, read_csv already
# returns a fresh 0..n-1 index, and skipping the in-place call keeps the
# cell free of hidden mutations.

characters_df.head()

In [40]:
# Label the columns after reading the file (it is loaded with header=None).
# Every label is written without leading or trailing whitespace so that a
# column can be selected by its plain name, e.g.
# characters_df['Movie release date'].
characters_df.columns = [
    'Wikipedia movie ID',
    'Freebase Movie ID',
    'Movie release date',
    'Character name',
    'Actor date of birth',
    'Actor gender',
    'Actor height',
    'Actor ethnicity',
    'Actor name',
    'Actor age at movie release',
    'Freebase character/actor map ID',
    'Freebase character ID',
    'Freebase actor ID',
]

In [None]:
# Display the characters frame as the cell output.
characters_df

In [None]:
# Number of rows in the characters table, reported as the number of characters.
# Stored under its own name so it is not confused with the movie count
# computed in the movies section.
characters_number = len(characters_df)
print('Number of characters:', characters_number)

In [None]:
# Gender representation among the listed actors.
# value_counts() tallies each distinct value of the column and leaves
# missing entries out of the tally.
gender_counts = characters_df['Actor gender'].value_counts()

# Convert the absolute counts into shares of the total.
proportions = gender_counts.div(gender_counts.sum())

# Pie chart with one wedge per gender value, labelled with its percentage.
plt.figure(figsize=(8, 8))
plt.pie(
    proportions,
    labels=proportions.index,
    autopct='%1.1f%%',
    colors=['skyblue', 'lightcoral'],
)
plt.title("Proportion d'hommes et de femmes parmi les acteurs")
plt.show()


## *Movies*

In [None]:
# Load the movie metadata. The file is tab-separated and is read with
# header=None, so its first line is treated as data rather than as labels.
movies_path = '/home/sara/Dropbox/epfl/master/MA1/ADA/project/data/MovieSummaries/movie.metadata.tsv'
movies_df = pd.read_csv(movies_path, sep='\t', header=None)

# Preview the first rows.
movies_df.head()

In [None]:
# Label the columns after reading the file (it is loaded with header=None).
# Every label is written without leading or trailing whitespace so that a
# column can be selected by its plain name, e.g. movies_df['Movie name'].
movies_df.columns = [
    'Wikipedia movie ID',
    'Freebase Movie ID',
    'Movie name',
    'Movie release date',
    'Movie box office revenue',
    'Movie runtime',
    'Movie language',
    'Movie countries',
    'Movie genre',
]

movies_df

In [None]:
# Number of movies (rows of the movie metadata table)
movies_number = len(movies_df)
print('Number of movies:', movies_number)

# The language column stores JSON objects that map a Freebase ID to a name,
# e.g. '{"/m/02h40lc": "English Language"}'; the genre column is assumed to
# use the same layout (TODO confirm). Calling nunique() on the raw strings
# would count distinct *combinations* of entries, so each object is parsed
# and its Freebase IDs are counted individually instead.

# Number of movie genres
genre_number = (
    movies_df['Movie genre']
    .dropna()
    .map(lambda raw: list(json.loads(raw)))  # keys of the object = Freebase IDs
    .explode()                               # one row per ID; empty objects become NaN
    .nunique()                               # NaN is not counted
)
print(f"Number of different genre : {genre_number}")


# Number of movie languages
movie_languages = (
    movies_df['Movie language']
    .dropna()
    .map(lambda raw: list(json.loads(raw)))
    .explode()
    .nunique()
)
print(f"Number of different languages : {movie_languages}")

# Number of english movies.
# This is an exact match on the single-entry object, so it keeps only the
# movies whose sole listed language is English.
movie_in_english = movies_df[movies_df['Movie language']=='{"/m/02h40lc": "English Language"}']
number_of_movies_in_english = len(movie_in_english)
print(f"Number of movies in english : {number_of_movies_in_english}")


In [None]:
# Time span covered by the release dates.
# errors='coerce' turns every value that cannot be parsed into NaT, and
# min()/max() skip NaT when looking for the extremes.
# NOTE(review): if the column mixes date formats, check how many rows end up
# as NaT after this conversion.
release_dates = pd.to_datetime(movies_df['Movie release date'], errors='coerce')
movies_df['Movie release date'] = release_dates

first_movie, last_movie = release_dates.min(), release_dates.max()

print(f"First movie released on : {first_movie}")
print(f"Last movie released on : {last_movie}")

In [None]:
# Number of distinct countries that have produced movies.
# The column stores JSON objects that map a Freebase ID to a country name,
# e.g. '{"/m/03_3d": "Japan"}'. Calling nunique() on the raw strings would
# count distinct *combinations* of countries, so each object is parsed and
# its Freebase IDs are counted individually instead.
unique_countries = (
    movies_df['Movie countries']
    .dropna()
    .map(lambda raw: list(json.loads(raw)))  # keys of the object = Freebase IDs
    .explode()                               # one row per ID; empty objects become NaN
    .nunique()                               # NaN is not counted
)

print(f"Number of countries that have been produced movies : {unique_countries}")

# The three filters below are exact matches on a single-entry object, so each
# one keeps only the movies that list that country alone.

# Number of american movies
american_movies = movies_df[movies_df['Movie countries']=='{"/m/09c7w0": "United States of America"}']
number_of_american_movies = len(american_movies)
print(f"Number of american movies : {number_of_american_movies}")

# Number of japanese movies
japan_movies = movies_df[movies_df['Movie countries']=='{"/m/03_3d": "Japan"}']
number_of_japan_movies = len(japan_movies)
print(f"Number of japan movies : {number_of_japan_movies}")

# Number of indian movies
indian_movies = movies_df[movies_df['Movie countries']=='{"/m/03rk0": "India"}']
number_of_indian_movies = len(indian_movies)
print(f"Number of indian: {number_of_indian_movies}")



## *Name cluster*

In [None]:
path_name = '/home/sara/Dropbox/epfl/master/MA1/ADA/project/data/MovieSummaries/name.clusters.txt'

# NOTE(review): this cell edits the raw data file in place. Loading it with
# pd.read_csv(..., names=[...]) would give the same labels without touching
# the file on disk.

# Read the content of the file.
# The encoding is given explicitly so that the read/write round trip does not
# depend on the platform default (the file is expected to be UTF-8 - TODO confirm).
with open(path_name, 'r', encoding='utf-8') as fichier:
    content = fichier.readlines()

# Add the header line only when it is missing, so that re-running the cell
# never stacks several headers. `not content` covers an empty file, where
# content[0] would raise an IndexError.
headers = "Unique character name\tFreebase actor ID\n"
if not content or content[0].strip() != headers.strip():
    content.insert(0, headers)

    # Write the new content to the file; this is skipped when nothing changed.
    with open(path_name, 'w', encoding='utf-8') as fichier:
        fichier.writelines(content)

# Read the file back to see its current content
with open(path_name, 'r', encoding='utf-8') as fichier:
    new_content = fichier.read()

print(new_content)

In [None]:
# Number of different names

# Read every line of the text file
with open(path_name, encoding='utf-8') as file:
    lines = file.readlines()

# Each record is "<character name>\t<Freebase ID>", as described by the header
# line that this notebook adds to the file. Keep only the name field:
# de-duplicating whole lines would count (name, ID) pairs instead of names.
# The header line and blank lines are not names and are skipped.
header_line = "Unique character name\tFreebase actor ID"
names = [
    line.split('\t')[0].strip()
    for line in lines
    if line.strip() and line.strip() != header_line
]

# Count the distinct names
unique_names = len(set(names))

print(f"Number of different names : {unique_names}")

## *Plot summary*

In [None]:
# Read the raw plot-summaries file in one go.
with open('/home/sara/Dropbox/epfl/master/MA1/ADA/project/data/MovieSummaries/plot_summaries.txt', 'r') as fichier:
    content = fichier.read()

# Show only the beginning of the file: printing every summary would flood the
# notebook output and bloat the saved notebook.
PREVIEW_CHARS = 2000
print(content[:PREVIEW_CHARS])

In [None]:
path_plot = '/home/sara/Dropbox/epfl/master/MA1/ADA/project/data/MovieSummaries/plot_summaries.txt'

# NOTE(review): this cell edits the raw data file in place. Loading it with
# pd.read_csv(..., names=[...]) would give the same labels without touching
# the file on disk.

# Read the content of the file.
# The encoding is given explicitly so that the read/write round trip does not
# depend on the platform default (the file is expected to be UTF-8 - TODO confirm).
with open(path_plot, 'r', encoding='utf-8') as fichier:
    content = fichier.readlines()

# Add the header line only when it is missing, so that re-running the cell
# never stacks several headers. `not content` covers an empty file, where
# content[0] would raise an IndexError.
headers = "Wikipedia movie ID\tPlot summaries\n"
if not content or content[0].strip() != headers.strip():
    content.insert(0, headers)

    # Write the new content to the file; this is skipped when nothing changed.
    with open(path_plot, 'w', encoding='utf-8') as fichier:
        fichier.writelines(content)

# Read the file back to check its current content
with open(path_plot, 'r', encoding='utf-8') as fichier:
    new_content = fichier.read()

# Show only the beginning of the file: printing every summary would flood the
# notebook output.
PREVIEW_CHARS = 2000
print(new_content[:PREVIEW_CHARS])

In [None]:
# Read the text file line by line, dropping blank lines.
# Each record is expected to sit on its own line as
# "<Wikipedia movie ID>\t<summary>", matching the header this notebook adds
# to the file (TODO confirm), so summaries are counted per line rather than
# per blank-line-separated paragraph.
with open(path_plot, 'r', encoding='utf-8') as file:
    summaries = [line for line in file if line.strip()]

# The header line is not a summary: leave it out when it is present
if summaries and summaries[0].strip() == "Wikipedia movie ID\tPlot summaries":
    summaries = summaries[1:]

# Count the summaries
number_of_summaries = len(summaries)

print(f"Le nombre de résumés de films dans le fichier est : {number_of_summaries}")

## *TV tropes clusters (stereotypes)*

In [None]:
# Load the TV tropes cluster file as a single string and print it.
tropes_path = '/home/sara/Dropbox/epfl/master/MA1/ADA/project/data/MovieSummaries/tvtropes.clusters.txt'
with open(tropes_path, 'r') as tropes_file:
    content = tropes_file.read()

print(content)
