# Citation analysis

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter
import networkx as nx

# Load the dataset analysed in the rest of this notebook into `df`.
# The cells in this notebook read its `cited_by_count`, `title`, `year`
# and `kmeans_cluster` columns.
DATA_PATH = "preprocessing/processed_generative_ai_data_with_kmeans.csv"
df = pd.read_csv(DATA_PATH)

In [None]:
# Summary statistics (as returned by `Series.describe()`) of the
# per-article citation counts, printed under a heading line.
citation_stats = df['cited_by_count'].describe()
print("Statistiques des citations:", citation_stats, sep="\n")

## Identifying the most influential papers

In [None]:
# The 20 rows with the highest `cited_by_count`, listed as a ranked,
# 1-based list of "title (year) - Citations: n" lines.
top_cited = df.nlargest(20, 'cited_by_count')
print("\nTop 20 des articles cités:")
ranked_rows = zip(top_cited['title'], top_cited['year'], top_cited['cited_by_count'])
for rank, (paper_title, pub_year, n_citations) in enumerate(ranked_rows, start=1):
    print(f"{rank}. {paper_title} ({pub_year}) - Citations: {n_citations}")

## Article distribution based on citations

In [None]:
# Histogram of per-article citation counts, saved to
# figures/citation_distribution.png and then displayed.
plt.figure(figsize=(12, 8))

# `log=True` puts only the histogram (count) axis on a log scale; the
# citation-count axis stays linear with 50 equal-width bins.
plt.hist(df['cited_by_count'], bins=50, alpha=0.7, color='steelblue', log=True)
plt.title('Distribution des nombres de citation(Log Scale)', fontsize=16)
# Fix: the x-axis is linear, so its label must not claim a log scale.
plt.xlabel('Nombre de citations', fontsize=14)
plt.ylabel('Nombre d\'articles (log scale)', fontsize=14)
plt.grid(True, alpha=0.3)
plt.tight_layout()
# NOTE(review): savefig does not create the 'figures' directory; it is
# assumed to exist already.
plt.savefig('figures/citation_distribution.png')
plt.show()

## Distribution of citations by year

In [None]:
# Bar chart of the mean citation count per publication year, with each
# bar annotated by the number of articles published that year. Saved to
# figures/avg_citations_by_year.png and then displayed.
plt.figure(figsize=(12, 8))
citation_by_year = df.groupby('year')['cited_by_count'].mean()
year_counts = df.groupby('year').size()

plt.bar(citation_by_year.index, citation_by_year.values, alpha=0.7, color='darkorange')
plt.title('Moyenne des citations d\'un article par année de publication', fontsize=16)
plt.xlabel('Année de publication', fontsize=14)
plt.ylabel('Moyenne des citations', fontsize=14)
# Derive the ticks from the years actually present instead of hard-coding
# 2019-2024, so every bar gets a label if the dataset covers other years.
# One tick per calendar year between the first and last year in the data.
if not citation_by_year.empty:
    first_year = int(citation_by_year.index.min())
    last_year = int(citation_by_year.index.max())
    plt.xticks(range(first_year, last_year + 1))
plt.grid(True, axis='y', alpha=0.3)

# Write the sample size above each bar (offset of 5 in data units).
for year, avg_citations in zip(citation_by_year.index, citation_by_year.values):
    count = year_counts.loc[year]
    plt.text(year, avg_citations + 5, f'n={count}', ha='center')

plt.tight_layout()
plt.savefig('figures/avg_citations_by_year.png')
plt.show()

## Distribution of citations by cluster

In [None]:
# Bar chart of the mean citation count per k-means cluster, ordered from
# the most to the least cited cluster, with each bar annotated by the
# number of articles in that cluster. Saved to
# figures/avg_citations_by_cluster.png and then displayed.
plt.figure(figsize=(12, 8))
citation_by_cluster = df.groupby('kmeans_cluster')['cited_by_count'].mean().sort_values(ascending=False)
cluster_counts = df.groupby('kmeans_cluster').size()

# Fix: draw the bars at ordinal positions and label them with the cluster
# ids. Passing numeric cluster ids directly as x-coordinates would place
# the bars by id value and silently discard the descending sort above.
bar_positions = list(range(len(citation_by_cluster)))
plt.bar(bar_positions, citation_by_cluster.values, alpha=0.7, color='teal')
plt.xticks(bar_positions, [str(cluster) for cluster in citation_by_cluster.index])
plt.title('Moyenne des citations d\'un article par Cluster', fontsize=16)
plt.xlabel('Cluster de recherche', fontsize=14)
plt.ylabel('Moyenne des citations', fontsize=14)
plt.grid(True, axis='y', alpha=0.3)

# Write the sample size above each bar (offset of 5 in data units).
for position, (cluster, avg_citations) in enumerate(
        zip(citation_by_cluster.index, citation_by_cluster.values)):
    count = cluster_counts.loc[cluster]
    plt.text(position, avg_citations + 5, f'n={count}', ha='center')

plt.tight_layout()
plt.savefig('figures/avg_citations_by_cluster.png')
plt.show()

## Qualitative review of the 20 most-cited articles

In [None]:
# The 20 rows of `df` with the highest `cited_by_count`, in descending order.
top_20_papers = df.nlargest(n=20, columns='cited_by_count')

## Word frequency in the titles of the most-cited articles

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

# Count how often each word (English stop words excluded) occurs across
# the titles in `top_20_papers`.
vectorizer = CountVectorizer(stop_words='english')
X = vectorizer.fit_transform(top_20_papers['title'])
words = vectorizer.get_feature_names_out()
# Column sums of the document-term matrix: one total per vocabulary word.
count_values = X.toarray().sum(axis=0)

# (word, total) pairs, most frequent first. The sort is stable, so words
# with equal totals keep the vectorizer's vocabulary order.
top_words = sorted(zip(words, count_values), key=lambda pair: pair[1], reverse=True)
top_title_words = top_words[:15]

In [None]:
# Print the heading followed by one "word: count" line per entry of
# `top_title_words`.
report_lines = ["\nLes mots fréquents dans les titres 20 des articles cités:"]
report_lines.extend(f"{word}: {count}" for word, count in top_title_words)
print("\n".join(report_lines))