# Trend Analysis

## Topic Modeling - LDA

In [None]:
from gensim.utils import simple_preprocess
# Tokenise every document into lowercase, accent-stripped tokens (deacc=True).
# Missing values are replaced by an empty string first: simple_preprocess
# expects text, and the BERTopic section of this notebook explicitly guards
# against NaN in this same column. Filling (rather than dropping) keeps `texts`
# row-aligned with `df`, which later per-row assignments rely on.
texts = [
    simple_preprocess(text, deacc=True)
    for text in df['cleaned_text'].fillna('')
]
print("Exemple de tokenisation :", texts[0][:10])

In [None]:
from gensim.corpora import Dictionary
# Vocabulary over the tokenised documents, pruned with
# filter_extremes(no_below=5, no_above=0.8) before the documents are encoded.
dictionary = Dictionary(documents=texts)
dictionary.filter_extremes(no_below=5, no_above=0.8)

# Bag-of-words encoding, one entry per document, in the same order as `texts`.
corpus = list(map(dictionary.doc2bow, texts))

In [None]:
from gensim.models.ldamodel import LdaModel
# Number of latent topics to fit; the topic labels further down assume 6.
num_topics = 6

# All training options gathered in one place; random_state pins the seed so
# repeated runs use the same initialisation.
lda_settings = dict(
    corpus=corpus,
    id2word=dictionary,
    num_topics=num_topics,
    random_state=42,
    update_every=1,
    chunksize=100,
    passes=10,
    alpha='auto',
    per_word_topics=True,
)
lda_model = LdaModel(**lda_settings)

In [None]:
# Show the ten highest-weighted words of every fitted topic.
topics = lda_model.print_topics(num_words=10)
for entry in topics:
    # Each entry is an (index, formatted-terms) pair.
    idx, topic = entry
    print(f"Topic {idx}:\n{topic}\n")

## LDA Visualization

In [None]:
import gensim
import pyLDAvis
import pyLDAvis.gensim_models as gensimvis

import os

# Build the interactive pyLDAvis view from the fitted model, the bag-of-words
# corpus and the vocabulary.
lda_vis = gensimvis.prepare(lda_model, corpus, dictionary)

# Make sure the output folder exists before writing, so a fresh checkout
# without a `figures/` directory does not fail with FileNotFoundError.
os.makedirs('figures', exist_ok=True)
pyLDAvis.save_html(lda_vis, 'figures/lda_visualization.html')

# Hard-coded interpretation of the six LDA topics. Each row holds, in order,
# the values for the fields listed in _TOPIC_FIELDS; the row position is the
# topic id used as the dictionary key.
_TOPIC_FIELDS = ("name", "description", "keywords", "trends")
_topic_rows = [
    (
        "Medical and Structural Applications",
        "Research focused on multimodal applications in medical imaging, patient care, and materials with structure-property relationships.",
        ["multimodal", "editing", "design", "patient", "medical", "structure", "property", "imaging", "screening", "ranging"],
        "Shows increasing interest in medical applications of generative AI, particularly for diagnostics and structural design.",
    ),
    (
        "Generative AI Systems and Research",
        "Broad research on generative AI systems, tackling intelligence challenges and artificial intelligence applications.",
        ["generative", "system", "research", "intelligence", "challenge", "paper", "application", "artificial", "technology", "tool"],
        "Core theme showing the general expansion of generative AI research across domains.",
    ),
    (
        "Image Generation and Diffusion Models",
        "Research on generative image models, particularly diffusion models and text-to-image generation technologies.",
        ["image", "generative", "model", "diffusion", "text", "generation", "method", "generated", "quality", "synthetic"],
        "Became especially prominent after 2021 with the rise of diffusion models like DALL-E and Stable Diffusion.",
    ),
    (
        "Large Language Models and Natural Language Generation",
        "Work on large language models (LLMs), language tasks, human-AI interaction, and prompt engineering.",
        ["model", "language", "llm", "large", "task", "human", "generation", "agent", "prompt", "gpt"],
        "Exploded in popularity from 2022 onwards with the release of ChatGPT and subsequent LLMs.",
    ),
    (
        "Neural Networks and Learning Methods",
        "Fundamental research on neural network architectures, learning methods, and training approaches.",
        ["model", "data", "learning", "network", "based", "training", "method", "approach", "task", "algorithm"],
        "More prevalent in earlier years (2019-2021), representing foundational research that enabled later breakthroughs.",
    ),
    (
        "ChatGPT and Human-AI Interaction Studies",
        "Research specifically studying ChatGPT usage, user experience, and social implications of generative AI.",
        ["study", "chatgpt", "user", "generated", "human", "analysis", "social", "use", "using", "used"],
        "Emerged strongly in 2023 and 2024, reflecting the societal impact of accessible generative AI tools.",
    ),
]

# topic id -> {"name", "description", "keywords", "trends"}
topic_descriptions = {
    row_number: dict(zip(_TOPIC_FIELDS, row))
    for row_number, row in enumerate(_topic_rows)
}

# Print a four-line summary for every topic, in topic-id order.
print("\nEnhanced LDA Topic Interpretations:")
for topic_id, details in topic_descriptions.items():
    report_lines = (
        f"\nTopic {topic_id}: {details['name']}",
        f"Description: {details['description']}",
        f"Keywords: {', '.join(details['keywords'])}",
        f"Trends: {details['trends']}",
    )
    for report_line in report_lines:
        print(report_line)

# This summary needs df['dominant_topic'], which the notebook only assigns in
# a later cell. When the column is not there yet, derive it here from the
# fitted LDA model so the cell also works on a top-to-bottom run.
if 'dominant_topic' not in df.columns:
    df['dominant_topic'] = [
        # minimum_probability=0.0 asks for the full distribution so the
        # per-document topic list cannot come back empty.
        max(
            lda_model.get_document_topics(bow, minimum_probability=0.0),
            key=lambda pair: pair[1],
        )[0]
        for bow in corpus
    ]

# For every year keep the most frequent dominant topic
# (value_counts sorts by count, so index[0] is the top one).
topic_evolution = df[['year', 'dominant_topic']].groupby(['year']).agg(lambda x: x.value_counts().index[0])
print("\nDominant topic by year:")
for year, topic in topic_evolution.iterrows():
    topic_id = topic['dominant_topic']
    print(f"{year}: Topic {topic_id} - {topic_descriptions[topic_id]['name']}")


In [None]:
def get_dominant_topic(model, corpus):
    """Return the most probable topic id for every document in *corpus*.

    Args:
        model: Fitted topic model exposing
            ``get_document_topics(doc, minimum_probability=...)`` and
            returning ``(topic_id, probability)`` pairs (e.g. a gensim
            ``LdaModel``).
        corpus: Iterable of documents in the representation the model
            expects (bag-of-words for gensim).

    Returns:
        A list with one topic id per document, in corpus order. On ties the
        first topic returned by the model wins.
    """
    # Ask for the full distribution: with the model's default probability
    # threshold every topic of a document can be filtered out, which would
    # make max() fail on an empty sequence. The arg-max itself is unaffected.
    return [
        max(
            model.get_document_topics(doc, minimum_probability=0.0),
            key=lambda pair: pair[1],
        )[0]
        for doc in corpus
    ]

# Attach each document's strongest LDA topic; `corpus` must be row-aligned
# with `df` for this assignment to line up.
df['dominant_topic'] = get_dominant_topic(lda_model, corpus)

# Document counts per (year, dominant_topic): years as rows, topics as columns.
year_topic_counts = df.groupby(['year', 'dominant_topic']).size()
topic_evolution = year_topic_counts.unstack(fill_value=0)

# Convert counts to per-year shares so each row sums to 1.
docs_per_year = topic_evolution.sum(axis=1)
topic_evolution_norm = topic_evolution.div(docs_per_year, axis=0)

topic_evolution_norm.plot.line(title='Évolution des topics dominants par année')

## BERTopic

In [None]:
import pandas as pd
from bertopic import BERTopic
from sentence_transformers import SentenceTransformer
from sklearn.feature_extraction.text import CountVectorizer
import matplotlib.pyplot as plt


# Load the dataset that already carries the KMeans cluster column
# (written by the KMeans section of this notebook).
kmeans_csv_path = "preprocessing/processed_generative_ai_data_with_kmeans.csv"
df = pd.read_csv(kmeans_csv_path)


In [None]:
# Keep only rows that have text, renumbering the index from 0 so positions
# match the `docs` list built from it.
has_text = df['cleaned_text'].notna()
df_clean = df[has_text].reset_index(drop=True)
docs = df_clean['cleaned_text'].tolist()

# Sentence-embedding model used by BERTopic.
sentence_model = SentenceTransformer("all-MiniLM-L6-v2")

In [None]:
# Term extraction for the topic representations: unigrams and bigrams,
# English stop words removed, terms kept only when min_df=6 is satisfied.
vectorizer = CountVectorizer(
    ngram_range=(1, 2),
    min_df=6,
    stop_words="english",
)

bertopic_options = {
    "embedding_model": sentence_model,
    "vectorizer_model": vectorizer,
    "nr_topics": "auto",
    "verbose": True,
}
topic_model = BERTopic(**bertopic_options)

# Fit on all documents; yields one topic assignment per document plus the
# associated probabilities.
fit_result = topic_model.fit_transform(docs)
topics, probs = fit_result


In [None]:
# Store the BERTopic assignment of each document next to its row; `topics` is
# in the same order as `docs`, which was built from df_clean.
df_clean['bertopic'] = topics

In [None]:
# Summary table of the fitted topics; show its first ten rows.
topic_info = topic_model.get_topic_info()
first_rows = topic_info.iloc[:10]
print(first_rows)

In [None]:
# Print the structural summary (columns, dtypes, non-null counts) of the topic table.
topic_info.info()

In [None]:
# Render BERTopic's built-in topic visualisation; as the last expression of
# the cell, the returned figure is displayed by the notebook.
topic_model.visualize_topics()

In [None]:
import pandas as pd
# One row per document: its text, its BERTopic assignment and its year as an
# integer timestamp.
timeline_columns = {
    "Document": docs,
    "Topic": topics,
    "Timestamp": df_clean['year'].astype(int),
}
df_topics_over_time = pd.DataFrame(timeline_columns)

In [None]:
docs = df_clean['cleaned_text'].tolist()
# Reuse the assignments stored in df_clean['bertopic'] instead of calling
# fit_transform a second time: re-fitting would replace the model's topics, so
# the stored column (used in the cross-tab and the exported CSV) would no
# longer match the model that gets visualised and saved.
topics = df_clean['bertopic'].tolist()
years = df_clean['year'].astype(int).tolist()

print(f"Nombre de documents : {len(docs)}")
print(f"Nombre de topics : {len(topics)}")
print(f"Nombre d'années : {len(years)}")

# topics_over_time needs one topic and one timestamp per document.
if not (len(docs) == len(topics) == len(years)):
    print("Les listes ne sont pas de la même longueur !")
    raise ValueError("docs, topics et years doivent être de même longueur pour topics_over_time.")

topics_over_time = topic_model.topics_over_time(docs, years, topics)

In [None]:
# Display the topics-over-time result computed in the previous cell.
topics_over_time

In [None]:
time_plot_options = {
    # Show only the first 10 topics.
    "top_n_topics": 10,
    # Normalise frequencies to avoid bias from the number of articles per year.
    "normalize_frequency": True,
}
fig = topic_model.visualize_topics_over_time(topics_over_time, **time_plot_options)
fig.show()

In [None]:
# Contingency table: BERTopic assignments as rows, KMeans clusters as columns.
cross_table = pd.crosstab(
    index=df_clean['bertopic'],
    columns=df_clean['kmeans_cluster'],
)
print("\nBERTopic vs KMeans clusters:", cross_table, sep="\n")

In [None]:
import os

# Persist the fitted model. The target folder is created first so saving does
# not fail on a checkout where `models/` does not exist yet.
os.makedirs("models", exist_ok=True)
topic_model.save("models/bertopic_model")

# Export the cleaned rows together with their BERTopic assignment.
df_clean.to_csv("preprocessing/processed_data_with_bertopic.csv", index=False)

## Clustering - KMeans

### Determining the optimal k using the Silhouette score

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
import umap
from sklearn.metrics import silhouette_score
import seaborn as sns


# Input rows and their pre-computed reduced feature matrix.
df = pd.read_csv("preprocessing/processed_generative_ai_data.csv")
X_reduced = np.load('X_reduced.npy')

# Candidate cluster counts (2..14) and the silhouette score obtained for each.
range_clusters = range(2, 15)
silhouette_scores = []

for n_clusters in range_clusters:
    # Fit KMeans for this k, then score the resulting partition.
    kmeans = KMeans(n_clusters=n_clusters, random_state=42, n_init=10).fit(X_reduced)
    cluster_labels = kmeans.labels_
    score = silhouette_score(X_reduced, cluster_labels)
    silhouette_scores += [score]
    print(f"Silhouette score for {n_clusters} clusters: {score:.3f}")

In [None]:
# Silhouette score as a function of the number of clusters.
sil_fig, sil_ax = plt.subplots(figsize=(10, 6))
sil_ax.plot(range_clusters, silhouette_scores, 'o-')
sil_ax.set_xlabel('Nombre de Clusters')
sil_ax.set_ylabel('Silhouette Score')
sil_ax.set_title('Analyse de Silhouette Score pour KMeans Clustering')
sil_ax.grid(True)
plt.show()

In [None]:
# The best k is the candidate at the position of the highest silhouette score.
best_position = int(np.argmax(silhouette_scores))
optimal_clusters = range_clusters[best_position]
print(f"Valeur Optimale de clusters: {optimal_clusters}")

In [None]:
# Final clustering with the selected k; the per-row labels go into the dataframe.
kmeans = KMeans(n_clusters=optimal_clusters, random_state=42, n_init=10)
kmeans.fit(X_reduced)
df['kmeans_cluster'] = kmeans.labels_

## 2D Visualization with UMAP

In [None]:
# Project the reduced features with UMAP for plotting; the seed is fixed so
# the layout is repeatable.
umap_settings = {"random_state": 42}
reducer = umap.UMAP(**umap_settings)
umap_embedding = reducer.fit_transform(X_reduced)

In [None]:
# Scatter plot of the UMAP projection, coloured by KMeans cluster.
plt.figure(figsize=(14, 10))
sns.scatterplot(x=umap_embedding[:, 0], y=umap_embedding[:, 1],
                hue=df['kmeans_cluster'], palette='viridis',
                legend='full', alpha=0.7, s=40)
plt.title(f'KMeans Clustering with {optimal_clusters} Clusters (UMAP Visualization)')
# No plt.colorbar() here: seaborn draws through the Axes API, so pyplot has no
# "current mappable" and the call raises RuntimeError. The full legend
# (legend='full') already maps every colour to its cluster.
plt.tight_layout()
plt.show()

In [None]:
# Number of rows in each KMeans cluster, ordered by cluster id.
rows_per_cluster = df['kmeans_cluster'].value_counts()
cluster_counts = rows_per_cluster.sort_index()
print("Taille du cluster:", cluster_counts, sep="\n")

In [None]:
# Export the dataset with its KMeans cluster column (without the index).
kmeans_output_path = "preprocessing/processed_generative_ai_data_with_kmeans.csv"
df.to_csv(kmeans_output_path, index=False)