# Exploratory Analysis

## Publication growth over time

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Bar chart with one bar per value of the 'year' column; bar height is the
# number of rows of `df` carrying that year.
fig, ax = plt.subplots(figsize=(10, 6))
sns.countplot(data=df, x='year', color='skyblue', ax=ax)
ax.set_title("Nombre de publications par année")
ax.set_xlabel("Année")
ax.set_ylabel("Nombre d’articles")
# Tilt the tick labels so neighbouring years do not overlap.
plt.setp(ax.get_xticklabels(), rotation=45)
ax.grid(True, linestyle='--', alpha=0.6)
fig.tight_layout()
plt.show()

## Keyword frequency by year

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

# Build a {year: text} mapping by joining every non-null cleaned text of a year
# into a single space-separated string.
yearly_text = df.groupby('year')['cleaned_text'].apply(lambda x: ' '.join(x.dropna()))

# Per-year vectorisation: the same vectorizer is re-fitted on each year's text,
# so the 500-feature cap applies to each year separately.
vectorizer = CountVectorizer(stop_words='english', max_features=500)
yearly_counts = {}

for year, text in yearly_text.items():
    try:
        X = vectorizer.fit_transform([text])
    except ValueError:
        # CountVectorizer raises ValueError when the vocabulary ends up empty
        # (the year has no text, or only stop words). Record an empty ranking
        # for that year instead of aborting the remaining years.
        yearly_counts[year] = []
        continue
    freq = X.toarray()[0]
    words = vectorizer.get_feature_names_out()
    # Keep the 20 (word, count) pairs with the highest counts.
    yearly_counts[year] = sorted(zip(words, freq), key=lambda x: -x[1])[:20]

# Print the ranking for each year.
for year, words in yearly_counts.items():
    print(f"\nTop mots en {year}:")
    for word, count in words:
        print(f"  {word}: {count}")

## Bigrams and Trigrams

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

# Count 2- and 3-word sequences over the whole corpus (missing texts are
# treated as empty strings), with the vocabulary capped at 20 entries.
vectorizer = CountVectorizer(ngram_range=(2, 3), stop_words='english', max_features=20)
documents = df['cleaned_text'].fillna("")
X = vectorizer.fit_transform(documents)
# Column-wise totals, looked up below as freq[0, column].
freq = X.sum(axis=0)

# Pair each n-gram with its total, then order by descending total; ties keep
# the order in which the vocabulary yields them.
bigrams = sorted(
    ((ngram, freq[0, column]) for ngram, column in vectorizer.vocabulary_.items()),
    key=lambda pair: pair[1],
    reverse=True,
)

print("Bigrams/trigrams les plus fréquents :")
for ngram, total in bigrams[:20]:
    print(f"{ngram}: {total}")

## Top concepts

In [None]:
from collections import Counter

# Flatten the ';'-separated 'concepts' column into one list of trimmed labels,
# ignoring missing cells and labels that are blank after trimming.
concept_list = [
    label.strip()
    for cell in df['concepts'].dropna()
    for label in cell.split(';')
    if label.strip() != ""
]

# The 20 labels that occur most often, with their occurrence counts.
top_concepts = Counter(concept_list).most_common(20)

print("Top concepts extraits :")
for label, occurrences in top_concepts:
    print(f"{label}: {occurrences}")

## Top Countries

In [None]:
# Flatten the ';'-separated 'countries' column into one list of trimmed names,
# ignoring missing cells and names that are blank after trimming.
country_list = [
    name.strip()
    for cell in df['countries'].dropna()
    for name in cell.split(';')
    if name.strip() != ""
]

# The 10 names that occur most often, with their occurrence counts.
top_countries = Counter(country_list).most_common(10)

print("Top pays représentés :")
for name, occurrences in top_countries:
    print(f"{name}: {occurrences}")

# Word clouds by period

### Global

In [None]:
from wordcloud import WordCloud
import matplotlib.pyplot as plt

# One word cloud built from every non-null cleaned text, joined with spaces.
text_all = " ".join(df['cleaned_text'].dropna())
cloud_builder = WordCloud(width=800, height=400, background_color='white')
wordcloud = cloud_builder.generate(text_all)

# Show the cloud as an image, without axes.
fig, ax = plt.subplots(figsize=(10, 5))
ax.imshow(wordcloud, interpolation='bilinear')
ax.axis("off")
ax.set_title("Nuage de mots global")
plt.show()

### Per year

In [None]:
import matplotlib.pyplot as plt

# Draw one word cloud per publication year, in ascending year order.
# Missing years are dropped up front: a NaN year never equals itself, so it
# could only ever select an empty subset.
years = sorted(df['year'].dropna().unique())
for year in years:
    subset = df[df['year'] == year]
    # Join every non-null cleaned text of the year into a single string.
    text_year = ' '.join(subset['cleaned_text'].dropna())
    try:
        wordcloud = WordCloud(width=800, height=400, background_color='white').generate(text_year)
    except ValueError:
        # WordCloud.generate raises ValueError when the text yields no usable
        # word (e.g. every cleaned_text of the year is missing). Skip that
        # year rather than aborting the remaining plots.
        continue

    plt.figure(figsize=(10, 4))
    plt.imshow(wordcloud, interpolation='bilinear')
    plt.title(f"Nuage de mots - {year}")
    plt.axis("off")
    plt.tight_layout()
    plt.show()