In [1]:
import ast

import numpy as np
import pandas as pd

In [2]:
# Load the movie spreadsheet into `df`; the summary cells that follow all read from this frame.
# The path is relative, so the workbook has to sit in the notebook's working directory.
workbook_path = "Vietnam_Movies.xlsx"
df = pd.read_excel(workbook_path)

In [3]:
def _split_list_cell(raw):
    """Return the labels stored in one list-like spreadsheet cell.

    The text is first read as a Python list/tuple literal of strings, for
    example ['Ngo Thanh Van', "Lupita Nyong'o"]. Reading it as a literal keeps
    apostrophes and commas that belong to a label, which plain quote-stripping
    and comma-splitting would destroy.

    If the text is not such a literal, fall back to the legacy behaviour:
    remove brackets and both kinds of quote characters, then split on commas.

    Defined inside this cell so the cell runs on its own.

    Args:
        raw: cell content as a string.

    Returns:
        List of whitespace-stripped, non-empty labels in their original order.
    """
    try:
        parsed = ast.literal_eval(raw)
    except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
        parsed = None

    if isinstance(parsed, (list, tuple)) and all(isinstance(item, str) for item in parsed):
        labels = [item.strip() for item in parsed]
    else:
        bare = raw.replace("[", "").replace("]", "").replace("'", "").replace('"', "")
        labels = [piece.strip() for piece in bare.split(",")]

    # Blank labels carry no information, so they are never tallied.
    return [label for label in labels if label]


# actors_info maps star name -> [number of credits, [film titles in spreadsheet order]].
actors_info = {}

for stars_cell, title in zip(df['Stars'], df['Original Title']):
    if not isinstance(stars_cell, str):
        continue  # skip cells that are not text (such as empty cells)
    for actor in _split_list_cell(stars_cell):
        tally = actors_info.setdefault(actor, [0, []])
        tally[0] += 1
        tally[1].append(title)

# Most-credited stars first. sorted() is stable, so stars with equal counts
# stay in the order in which they were first seen.
sorted_actors = sorted(actors_info.items(), key=lambda item: item[1][0], reverse=True)

# One output row per (star, film) pair.
actor_data = [
    [actor, movie]
    for actor, (_count, movies) in sorted_actors
    for movie in movies
]

actors_df = pd.DataFrame(actor_data, columns=['Stars', 'Star - Movies'])

# Optional standalone export:
# actors_df.to_excel('actors_summary.xlsx', index=False)

In [4]:
# directors_info maps director -> [number of films, [film titles in spreadsheet order]].
# The 'Director' cell is used verbatim as the key (no splitting or trimming).
directors_info = {}

for _, record in df.iterrows():
    name = record['Director']
    if not isinstance(name, str):
        continue  # skip cells that are not text
    entry = directors_info.setdefault(name, [0, []])
    entry[0] += 1
    entry[1].append(record['Original Title'])

# Most prolific directors first; ties keep first-seen order because sorted() is stable.
sorted_directors = sorted(directors_info.items(), key=lambda pair: pair[1][0], reverse=True)

# One output row per (director, film) pair; an empty-string director is left out.
director_data = [
    [name, title]
    for name, (_count, titles) in sorted_directors
    if name != ""
    for title in titles
]

directors_df = pd.DataFrame(director_data, columns=['Directors', 'Director - Movies'])

# Optional standalone export:
# directors_df.to_excel('directors_summary.xlsx', index=False)

In [5]:
from collections import defaultdict

def _split_list_cell(raw):
    """Return the labels stored in one list-like spreadsheet cell.

    The text is first read as a Python list/tuple literal of strings, for
    example ['Drama', "Children's"]. Reading it as a literal keeps apostrophes
    and commas that belong to a label, which plain quote-stripping and
    comma-splitting would destroy.

    If the text is not such a literal, fall back to the legacy behaviour:
    remove brackets and both kinds of quote characters, then split on commas.

    Defined inside this cell so the cell runs on its own.

    Args:
        raw: cell content as a string.

    Returns:
        List of whitespace-stripped, non-empty labels in their original order.
    """
    try:
        parsed = ast.literal_eval(raw)
    except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
        parsed = None

    if isinstance(parsed, (list, tuple)) and all(isinstance(item, str) for item in parsed):
        labels = [item.strip() for item in parsed]
    else:
        bare = raw.replace("[", "").replace("]", "").replace("'", "").replace('"', "")
        labels = [piece.strip() for piece in bare.split(",")]

    # Blank labels carry no information, so they are never tallied.
    return [label for label in labels if label]


# genres_info[genre][country] counts how often the genre and the production
# country are listed on the same film.
genres_info = defaultdict(lambda: defaultdict(int))

for genres_cell, countries_cell in zip(df['Genres'], df['Production Countries']):
    # Both cells must be text; a film missing either one contributes nothing.
    if not (isinstance(genres_cell, str) and isinstance(countries_cell, str)):
        continue
    countries = _split_list_cell(countries_cell)
    for genre in _split_list_cell(genres_cell):
        for country in countries:
            genres_info[genre][country] += 1

# One output row per (genre, country) pair. Blank genres and blank countries
# were already dropped by _split_list_cell, so a row never has an empty label
# on either side (the previous `or` filter let half-empty rows through).
genre_data = [
    [genre, country, count]
    for genre, per_country in genres_info.items()
    for country, count in per_country.items()
]

genres_df = pd.DataFrame(genre_data, columns=['Genres', 'Genre - Country', 'Genre - Count'])

# Optional standalone export:
# genres_df.to_excel('genres_by_country_summary.xlsx', index=False)

In [6]:
def _split_list_cell(raw):
    """Return the labels stored in one list-like spreadsheet cell.

    The text is first read as a Python list/tuple literal of strings, for
    example ['Studio A', "Director's Cut, Inc."]. Reading it as a literal keeps
    apostrophes and commas that belong to a label, which plain quote-stripping
    and comma-splitting would destroy.

    If the text is not such a literal, fall back to the legacy behaviour:
    remove brackets and both kinds of quote characters, then split on commas.

    Defined inside this cell so the cell runs on its own.

    Args:
        raw: cell content as a string.

    Returns:
        List of whitespace-stripped, non-empty labels in their original order.
    """
    try:
        parsed = ast.literal_eval(raw)
    except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
        parsed = None

    if isinstance(parsed, (list, tuple)) and all(isinstance(item, str) for item in parsed):
        labels = [item.strip() for item in parsed]
    else:
        bare = raw.replace("[", "").replace("]", "").replace("'", "").replace('"', "")
        labels = [piece.strip() for piece in bare.split(",")]

    # Blank labels carry no information, so they are never tallied.
    return [label for label in labels if label]


# company_info maps company -> [number of credits, [film titles in spreadsheet order]].
company_info = {}

for companies_cell, title in zip(df['Production Companies'], df['Original Title']):
    if not isinstance(companies_cell, str):
        continue  # skip cells that are not text (such as empty cells)
    for company in _split_list_cell(companies_cell):
        tally = company_info.setdefault(company, [0, []])
        tally[0] += 1
        tally[1].append(title)

# Most-credited companies first. sorted() is stable, so companies with equal
# counts stay in the order in which they were first seen.
sorted_company = sorted(company_info.items(), key=lambda item: item[1][0], reverse=True)

# One output row per (company, film) pair.
company_data = [
    [company, movie]
    for company, (_count, movies) in sorted_company
    for movie in movies
]

company_df = pd.DataFrame(company_data, columns=['Production Companies', 'Production Company - Movies'])

# Optional standalone export:
# company_df.to_excel('companies_summary.xlsx', index=False)

In [7]:
def _split_list_cell(raw):
    """Return the labels stored in one list-like spreadsheet cell.

    The text is first read as a Python list/tuple literal of strings, for
    example ['Vietnam', "Cote d'Ivoire"]. Reading it as a literal keeps
    apostrophes and commas that belong to a label, which plain quote-stripping
    and comma-splitting would destroy.

    If the text is not such a literal, fall back to the legacy behaviour:
    remove brackets and both kinds of quote characters, then split on commas.

    Defined inside this cell so the cell runs on its own.

    Args:
        raw: cell content as a string.

    Returns:
        List of whitespace-stripped, non-empty labels in their original order.
    """
    try:
        parsed = ast.literal_eval(raw)
    except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
        parsed = None

    if isinstance(parsed, (list, tuple)) and all(isinstance(item, str) for item in parsed):
        labels = [item.strip() for item in parsed]
    else:
        bare = raw.replace("[", "").replace("]", "").replace("'", "").replace('"', "")
        labels = [piece.strip() for piece in bare.split(",")]

    # Blank labels carry no information, so they are never tallied.
    return [label for label in labels if label]


# country_info maps production country -> number of times it is listed across all films.
country_info = {}

for countries_cell in df['Production Countries']:
    if not isinstance(countries_cell, str):
        continue  # skip cells that are not text (such as empty cells)
    for country in _split_list_cell(countries_cell):
        country_info[country] = country_info.get(country, 0) + 1

# Most frequent countries first; ties keep first-seen order because sorted() is stable.
sorted_country = sorted(country_info.items(), key=lambda item: item[1], reverse=True)

country_data = [[country, count] for country, count in sorted_country]

country_df = pd.DataFrame(country_data, columns=['Production Countries', 'Production Country - Count'])

# Optional standalone export:
# country_df.to_excel('countries_summary.xlsx', index=False)

In [8]:
def _split_list_cell(raw):
    """Return the labels stored in one list-like spreadsheet cell.

    The text is first read as a Python list/tuple literal of strings, for
    example ['Vietnamese', 'English']. Reading it as a literal keeps
    apostrophes and commas that belong to a label, which plain quote-stripping
    and comma-splitting would destroy.

    If the text is not such a literal, fall back to the legacy behaviour:
    remove brackets and both kinds of quote characters, then split on commas.

    Defined inside this cell so the cell runs on its own.

    Args:
        raw: cell content as a string.

    Returns:
        List of whitespace-stripped, non-empty labels in their original order.
    """
    try:
        parsed = ast.literal_eval(raw)
    except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
        parsed = None

    if isinstance(parsed, (list, tuple)) and all(isinstance(item, str) for item in parsed):
        labels = [item.strip() for item in parsed]
    else:
        bare = raw.replace("[", "").replace("]", "").replace("'", "").replace('"', "")
        labels = [piece.strip() for piece in bare.split(",")]

    # Blank labels carry no information, so they are never tallied.
    return [label for label in labels if label]


# spoken_lang_info maps spoken language -> number of times it is listed across all films.
spoken_lang_info = {}

for languages_cell in df['Spoken Languages']:
    if not isinstance(languages_cell, str):
        continue  # skip cells that are not text (such as empty cells)
    for spoken_lang in _split_list_cell(languages_cell):
        spoken_lang_info[spoken_lang] = spoken_lang_info.get(spoken_lang, 0) + 1

# Most frequent languages first; ties keep first-seen order because sorted() is stable.
sorted_spoken_lang = sorted(spoken_lang_info.items(), key=lambda item: item[1], reverse=True)

spoken_lang_data = [[spoken_lang, count] for spoken_lang, count in sorted_spoken_lang]

spoken_lang_df = pd.DataFrame(spoken_lang_data, columns=['Spoken Languages', 'Spoken Language - Count'])

# Optional standalone export:
# spoken_lang_df.to_excel('spoken_languages_summary.xlsx', index=False)

In [10]:
# Put the six summary tables side by side (column-wise) and export them as one sheet.
# The tables are joined on their row index, so shorter tables are padded with
# empty cells below their last row.
summary_parts = [actors_df, directors_df, genres_df, company_df, country_df, spoken_lang_df]
summary_df = pd.concat(summary_parts, axis=1)
summary_df.to_excel('summary.xlsx', index=False)