# Geographical Analysis

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
from collections import Counter
import re
import networkx as nx
from wordcloud import WordCloud

# Load the dataset from CSV; the analysis below reads its 'countries' and
# 'year' columns.
df = pd.read_csv("preprocessing/processed_generative_ai_data_with_kmeans.csv")

def extract_countries(country_str):
    """Split a ';'-separated country field into a list of trimmed entries.

    Missing values (as detected by ``pd.isna``) and the empty string yield an
    empty list. Any other value is converted with ``str`` before splitting,
    and segments that are blank after stripping whitespace are discarded.
    """
    is_missing = pd.isna(country_str) or country_str == ''
    if is_missing:
        return []

    cleaned_entries = []
    for segment in str(country_str).split(';'):
        trimmed = segment.strip()
        if trimmed:
            cleaned_entries.append(trimmed)
    return cleaned_entries


# Flatten every non-null 'countries' cell into one list with one entry per
# (row, country) occurrence.
all_countries = [
    code
    for raw_field in df['countries'].dropna()
    for code in extract_countries(raw_field)
]

# Keep the 15 most frequent entries as (country, count) pairs and tabulate them.
country_frequency = Counter(all_countries)
country_counts = country_frequency.most_common(15)
country_df = pd.DataFrame(country_counts, columns=['country', 'count'])


# Display names for the two-letter country codes found in the 'country' column.
country_code_to_name = {
    'US': 'United States', 'CN': 'China', 'GB': 'United Kingdom', 'DE': 'Germany',
    'AU': 'Australia', 'CA': 'Canada', 'FR': 'France', 'IT': 'Italy',
    'IN': 'India', 'CH': 'Switzerland', 'JP': 'Japan', 'NL': 'Netherlands',
    'ES': 'Spain', 'KR': 'South Korea', 'SG': 'Singapore', 'SE': 'Sweden',
    'BR': 'Brazil', 'TW': 'Taiwan', 'BE': 'Belgium',
}

# Series.map produces NaN for any code that is not a key of the dictionary,
# which would leave that row without a label. Fall back to the raw code so
# every row keeps a usable name.
country_df['country_name'] = (
    country_df['country']
    .map(country_code_to_name)
    .fillna(country_df['country'])
)

## Top 15 contributing countries

In [None]:
# Horizontal bar chart: one bar per row of country_df, bar length = 'count'.
bar_fig, bar_ax = plt.subplots(figsize=(12, 8))
sns.barplot(
    data=country_df,
    x='count',
    y='country_name',
    palette='viridis',
    ax=bar_ax,
)
bar_ax.set_title(
    'Les 15 Pays les plus contribuants en recherches sur IA générative',
    fontsize=16,
)
bar_ax.set_xlabel("Nombre d'articles", fontsize=14)
bar_ax.set_ylabel('Pays', fontsize=14)
bar_fig.tight_layout()
bar_fig.savefig('figures/top_countries_bar.png')
plt.show()

In [None]:
# The 'country' column holds two-letter codes, but locationmode='ISO-3'
# matches three-letter ISO 3166-1 alpha-3 codes, so the codes are converted
# before plotting.
iso2_to_iso3 = {
    'US': 'USA', 'CN': 'CHN', 'GB': 'GBR', 'DE': 'DEU',
    'AU': 'AUS', 'CA': 'CAN', 'FR': 'FRA', 'IT': 'ITA',
    'IN': 'IND', 'CH': 'CHE', 'JP': 'JPN', 'NL': 'NLD',
    'ES': 'ESP', 'KR': 'KOR', 'SG': 'SGP', 'SE': 'SWE',
    'BR': 'BRA', 'TW': 'TWN', 'BE': 'BEL',
}

# Work on a copy so country_df itself is left untouched. Codes that are not
# in the table (for instance values that are already alpha-3) pass through
# unchanged.
map_df = country_df.assign(
    iso3=country_df['country'].map(iso2_to_iso3).fillna(country_df['country'])
)

fig = px.choropleth(map_df,
                    locations='iso3',
                    locationmode='ISO-3',
                    color='count',
                    hover_name='country_name',
                    color_continuous_scale='Viridis',
                    title='Distribution globale des recherches en IA générative')
fig.write_html('figures/world_map_research.html')

In [None]:
# Build one {'year', 'country'} record per country listed on each row that
# has both a 'countries' value and a 'year' value.
rows_with_year = df.dropna(subset=['countries', 'year'])
country_year_data = [
    {'year': int(raw_year), 'country': code}
    for raw_year, raw_field in zip(rows_with_year['year'], rows_with_year['countries'])
    for code in extract_countries(raw_field)
]

country_year_df = pd.DataFrame(country_year_data)

## Top 5 Countries

In [None]:
# Names of the five most frequent countries, most frequent first.
top5_ranking = Counter(all_countries).most_common(5)
top5_countries = [code for code, _count in top5_ranking]

# Restrict the (year, country) records to those five countries.
in_top5 = country_year_df['country'].isin(top5_countries)
top5_country_year = country_year_df[in_top5]

In [None]:
# Article counts per year (rows) and per country (columns) for the top-5 set.
country_counts_by_year = pd.crosstab(top5_country_year['year'], top5_country_year['country'])

# DataFrame.plot opens a figure of its own unless it is handed an Axes, which
# would leave a separately created 12x8 figure blank and draw the chart at
# the default size. Create the Axes explicitly and pass it in so the
# requested size applies to the chart that is saved.
trend_fig, trend_ax = plt.subplots(figsize=(12, 8))
country_counts_by_year.plot(kind='line', marker='o', linewidth=2.5, ax=trend_ax)
trend_ax.set_title('Les tendances de recherches des Top 5 pays (2019-2024)', fontsize=16)
trend_ax.set_xlabel('année', fontsize=14)
trend_ax.set_ylabel('Nombre d\'articles', fontsize=14)
trend_ax.grid(True, linestyle='--', alpha=0.7)
trend_ax.legend(title='Country')
# One tick per year of the 2019-2024 window named in the title.
trend_ax.set_xticks(range(2019, 2025))
trend_fig.tight_layout()
trend_fig.savefig('figures/country_trends.png')
plt.show()