# Topics Analysis

In [8]:
# Modules to import
import sys
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go

In [2]:
df_movies_with_topics = pd.read_csv('../../data/cultureData/df_movies_with_topics.csv')
df_movies_with_topics.head()

Unnamed: 0,wiki_id,cleaned_summary,language,topic,topic_name,freebase_id,original_title,release_date,revenue,runtime,languages,countries,genres,countries_freebase_id,languages_freebase_id,genres_freebase_id,region,year
0,20663735,poovalli induchoodan sentence six year prison ...,en,0,Love & Family,/m/051zjwb,Narasimham,2000-01-01,,175.0,['Malayalam Language'],India,Musical,['/m/03rk0'],['/m/0999q'],"['/m/04t36', '/m/02kdv5l', '/m/07s9rl0', '/m/0...",South Asia,2000
1,20663735,poovalli induchoodan sentence six year prison ...,en,0,Love & Family,/m/051zjwb,Narasimham,2000-01-01,,175.0,['Malayalam Language'],India,Action,['/m/03rk0'],['/m/0999q'],"['/m/04t36', '/m/02kdv5l', '/m/07s9rl0', '/m/0...",South Asia,2000
2,20663735,poovalli induchoodan sentence six year prison ...,en,0,Love & Family,/m/051zjwb,Narasimham,2000-01-01,,175.0,['Malayalam Language'],India,Drama,['/m/03rk0'],['/m/0999q'],"['/m/04t36', '/m/02kdv5l', '/m/07s9rl0', '/m/0...",South Asia,2000
3,20663735,poovalli induchoodan sentence six year prison ...,en,0,Love & Family,/m/051zjwb,Narasimham,2000-01-01,,175.0,['Malayalam Language'],India,Bollywood,['/m/03rk0'],['/m/0999q'],"['/m/04t36', '/m/02kdv5l', '/m/07s9rl0', '/m/0...",South Asia,2000
4,1952976,plot open 1974 young girl dahlia stand outside...,en,1,Crime,/m/068jvg,Dark Water,2005-06-27,49483352.0,105.0,['English Language'],United States of America,Thriller,['/m/09c7w0'],['/m/02h40lc'],"['/m/01jfsb', '/m/07s9rl0', '/m/03npn']",North America,2005


In [3]:
df = df_movies_with_topics[['wiki_id','topic_name','release_date','countries','region']]
df.head()

Unnamed: 0,wiki_id,topic_name,release_date,countries,region
0,20663735,Love & Family,2000-01-01,India,South Asia
1,20663735,Love & Family,2000-01-01,India,South Asia
2,20663735,Love & Family,2000-01-01,India,South Asia
3,20663735,Love & Family,2000-01-01,India,South Asia
4,1952976,Crime,2005-06-27,United States of America,North America


In [4]:
# List of topics
df['topic_name'].unique()

array(['Love & Family', 'Crime', 'Investigation', 'Pirates',
       'Middle East', 'Civil War', 'Africa', 'Cartoons', 'Martial Arts',
       'USSR', 'Betty Boop', 'French Life', 'WWII', 'Samurai',
       'Sci-Fi Earth', 'Politics', 'Family Drama', 'Stooges',
       'Musketeers', 'Charlie Brown', 'Christmas', 'Space', 'Soldiers',
       'Laurel & Hardy', 'Boxing', 'Sports', 'Tom & Jerry', 'College',
       'Roman Empire', 'Royalty', 'Jungle', 'Tokyo Life', 'Racing',
       'Yakuza', 'Monsters', 'Fantasy', 'Disney', 'Godzilla',
       'Pink Panther', 'School Life'], dtype=object)

In [5]:
# Count number of unique wiki_id per topic
df.groupby('topic_name')['wiki_id'].nunique().sort_values(ascending=False)

topic_name
Love & Family     6137
Crime             4408
Investigation     1254
Martial Arts      1036
Family Drama       598
Sci-Fi Earth       558
Civil War          439
French Life        354
Space              324
WWII               322
Pirates            256
Soldiers           216
USSR               201
Sports             199
Monsters           185
Samurai            153
Cartoons           149
Royalty            143
Stooges            124
Christmas          123
Africa             118
Charlie Brown       98
College             88
Middle East         86
Tokyo Life          84
Tom & Jerry         83
Fantasy             81
Yakuza              76
Jungle              73
Musketeers          69
Racing              69
Laurel & Hardy      68
Godzilla            53
Betty Boop          53
Disney              51
Roman Empire        50
Politics            49
Boxing              48
School Life         40
Pink Panther        20
Name: wiki_id, dtype: int64

### Analysis of the topics propagation

In [6]:
# Select a topic
topic = 'Martial Arts'

df_topic = df[df['topic_name'] == topic]

# Number of countries where the topic is present
print(f'Number of countries where the topic {topic} is present: {df_topic["countries"].nunique()}')

# Number of regions where the topic is present
print(f'Number of regions where the topic {topic} is present: {df_topic["region"].nunique()}')

# Number of movies where the topic is present
print(f'Number of movies where the topic {topic} is present: {df_topic["wiki_id"].nunique()}')


Number of countries where the topic Martial Arts is present: 33
Number of regions where the topic Martial Arts is present: 7
Number of movies where the topic Martial Arts is present: 1036


In [16]:
df_topic.head(2)

Unnamed: 0,wiki_id,topic_name,release_date,countries,region
75,5414895,Martial Arts,2008-06-30,China,East and Southeast Asia
76,5414895,Martial Arts,2008-06-30,China,East and Southeast Asia


In [None]:
# Remove duplicates for each movie by country and topic
df_unique = df.drop_duplicates(subset=['wiki_id', 'countries', 'topic_name'])

# Add a column indicating presence (1)
df_unique['presence'] = 1

# Unique list of available topics
topics = df_unique['topic_name'].unique()

# Create a figure
fig = go.Figure()

# Add traces for each topic
for topic in topics:
    df_topic = df_unique[df_unique['topic_name'] == topic]
    fig.add_trace(
        go.Choropleth(
            locations=df_topic['countries'],
            locationmode='country names',
            z=df_topic['presence'],
            colorbar=dict(title='Presence'),
            colorscale=[[0, 'lightgray'], [1, 'green']],
            showscale=False,
            visible=(topic == topics[0])  # Show only the first topic initially
        )
    )

# Create buttons for topic selection
buttons = []
for i, topic in enumerate(topics):
    buttons.append(
        dict(
            label=topic,
            method='update',
            args=[{'visible': [t == topic for t in topics]},
                  {'title': f'Presence of the topic: {topic}'}]
        )
    )

# Add dropdown menu
fig.update_layout(
    updatemenus=[
        dict(
            buttons=buttons,
            direction='down',
            showactive=True,
            x=0.1,
            xanchor='center',
            y=1.15,
            yanchor='top'
        )
    ],
    title=f'Presence of the topic: {topics[0]}',
    geo=dict(
        showframe=False,
        showcoastlines=True,
        projection_type='equirectangular'
    )
)

# Show the figure
fig.show()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_unique['presence'] = 1


In [17]:
# Supprimer les doublons pour chaque film par pays et sujet
df_unique = df.drop_duplicates(subset=['wiki_id', 'countries', 'topic_name'])

# Ajouter une colonne indiquant la présence (1)
df_unique['presence'] = 1

# Liste unique des sujets disponibles
topics = df_unique['topic_name'].unique()

# Initialisation de l'application Dash
app = Dash(__name__)

# Mise en page de l'application
app.layout = html.Div([
    html.H1("Interactive Topic Map", style={'textAlign': 'center'}),
    html.Label("Select a topic:", style={'marginTop': '20px'}),
    dcc.Dropdown(
        id='topic-dropdown',
        options=[{'label': topic, 'value': topic} for topic in topics],
        value=topics[0],  # Sujet par défaut
        clearable=False,
        style={'width': '50%'}
    ),
    dcc.Graph(id='choropleth-map')
])

# Callback pour mettre à jour la carte en fonction du sujet sélectionné
@app.callback(
    Output('choropleth-map', 'figure'),
    [Input('topic-dropdown', 'value')]
)
def update_map(selected_topic):
    # Filtrer les données pour le sujet sélectionné
    df_topic = df_unique[df_unique['topic_name'] == selected_topic]
    
    # Créer la carte choroplèthe
    fig = px.choropleth(
        df_topic,
        locations="countries",  # Colonne contenant les noms des pays
        locationmode="country names",  # Indique que les noms de pays sont utilisés
        color="presence",  # Colonne pour la couleur (présence ou absence)
        color_continuous_scale=[[0, "lightgray"], [1, "green"]],  # Palette de couleurs (binaire)
        title=f"Presence of the topic: {selected_topic}",
        labels={"presence": "Presence"}
    )
    
    # Mettre en page la carte
    fig.update_layout(
        width=1000,
        height=800,
        coloraxis_showscale=False  # Supprimer l'échelle des couleurs
    )
    return fig

# Lancer l'application
if __name__ == '__main__':
    app.run_server(debug=True, port=8060)



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

