# EDA: Citation Distribution Analysis

Explore citation patterns:
1. Overall citation distribution
2. Citation statistics by year
3. High vs low impact papers
4. Citation growth over time
5. Top cited papers
6. Citation percentiles
7. Temporal validation split

In [None]:
import sys
sys.path.append('../')

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats

# Plot appearance: apply the matplotlib style sheet first, then the seaborn
# palette, so the palette is not overwritten by the style's own colour cycle.
plt.style.use('seaborn-v0_8')
sns.set_palette('husl')

# Show every column when a DataFrame is printed (no "..." truncation).
pd.options.display.max_columns = None

%matplotlib inline

## 1. Load Cleaned Data

In [None]:
# Load the cleaned dataset from the processed-data pickle.
df = pd.read_pickle('../data/processed/cleaned_data.pkl')

# Quick sanity check: (rows, columns) and the span of publication years.
print(f"Dataset: {df.shape}")
year_column = df['Year']
print(f"Years: {year_column.min()} - {year_column.max()}")

## 2. Citation Distribution

In [None]:
# Descriptive statistics plus shape measures for the citation counts.
print("Citation Statistics:")
citation_counts = df['Citations']
print(citation_counts.describe())

# Skewness and kurtosis quantify how asymmetric / heavy-tailed the counts are.
skewness = citation_counts.skew()
kurtosis = citation_counts.kurtosis()
print(f"\nSkewness: {skewness:.2f}")
print(f"Kurtosis: {kurtosis:.2f}")

In [None]:
# Three side-by-side views of the citation counts:
# raw histogram, log-transformed histogram, and a boxplot.
fig, axes = plt.subplots(1, 3, figsize=(18, 5))
raw_ax, log_ax, box_ax = axes

citation_counts = df['Citations']
median_citations = citation_counts.median()

# Raw histogram with the median marked by a dashed red line.
raw_ax.hist(citation_counts, bins=100, edgecolor='black', alpha=0.7)
raw_ax.set(
    xlabel='Citations',
    ylabel='Frequency',
    title='Citation Distribution (Raw)',
)
raw_ax.axvline(
    median_citations,
    color='red',
    linestyle='--',
    label=f'Median: {median_citations:.0f}',
)
raw_ax.legend()

# log1p (log(x + 1)) keeps zero-citation papers on the axis at 0.
log_ax.hist(
    np.log1p(citation_counts),
    bins=100,
    edgecolor='black',
    alpha=0.7,
    color='orange',
)
log_ax.set(
    xlabel='Log(Citations + 1)',
    ylabel='Frequency',
    title='Citation Distribution (Log-transformed)',
)

# Vertical boxplot of the raw counts.
box_ax.boxplot(citation_counts, vert=True)
box_ax.set(ylabel='Citations', title='Citation Boxplot')

plt.tight_layout()
plt.show()

## 3. Top 25% Classification Threshold

In [None]:
# Papers whose citation count is at or above the 75th percentile are flagged
# as high impact (1); everything else is low impact (0).
threshold = df['Citations'].quantile(0.75)
print(f"Top 25% threshold: {threshold:.0f} citations")

# NOTE: `>=` includes every paper tied at the threshold, so the high-impact
# share can exceed 25% when many papers have exactly `threshold` citations.
df['high_impact'] = (df['Citations'] >= threshold).astype(int)

print(f"\nHigh-impact papers (top 25%): {df['high_impact'].sum()}")
print(f"Low-impact papers (bottom 75%): {(df['high_impact'] == 0).sum()}")

# value_counts() orders rows by frequency, not by label, and omits classes
# with no members. Reindex to a fixed [low, high] order so the hard-coded
# tick labels below always line up with the correct bars.
impact_counts = df['high_impact'].value_counts().reindex([0, 1], fill_value=0)

fig, ax = plt.subplots(figsize=(8, 6))
impact_counts.plot(kind='bar', ax=ax, color=['steelblue', 'coral'])
ax.set_xlabel('Impact Level')
ax.set_ylabel('Count')
ax.set_title('Distribution of High vs Low Impact Papers')
ax.set_xticklabels(['Low Impact (<75%)', 'High Impact (≥75%)'], rotation=0)
plt.tight_layout()
plt.show()

## 4. Citations by Year

In [None]:
# Per-year summary of citation counts: number of papers, mean, median and
# standard deviation, with human-readable column names.
readable_names = {
    'count': 'Papers',
    'mean': 'Mean Citations',
    'median': 'Median Citations',
    'std': 'Std Dev',
}
year_stats = (
    df.groupby('Year')['Citations']
    .agg(['count', 'mean', 'median', 'std'])
    .reset_index()
    .rename(columns=readable_names)
)

print("Citation statistics by year:")
# Only the last ten rows (the ten highest years) are shown.
print(year_stats.tail(10))

In [None]:
# 2x2 overview of how publications and citations vary by year.
fig, axes = plt.subplots(2, 2, figsize=(16, 12))

# Top-left: number of papers published each year.
axes[0, 0].bar(year_stats['Year'], year_stats['Papers'], color='steelblue', edgecolor='black')
axes[0, 0].set_xlabel('Year')
axes[0, 0].set_ylabel('Number of Papers')
axes[0, 0].set_title('Publications per Year')

# Top-right: mean vs median citations per year. Labels are attached to each
# line so the legend cannot drift out of sync with the plotting order.
axes[0, 1].plot(year_stats['Year'], year_stats['Mean Citations'], marker='o', color='coral', linewidth=2, label='Mean')
axes[0, 1].plot(year_stats['Year'], year_stats['Median Citations'], marker='s', color='green', linewidth=2, label='Median')
axes[0, 1].set_xlabel('Year')
axes[0, 1].set_ylabel('Citations')
axes[0, 1].set_title('Mean and Median Citations by Year')
axes[0, 1].legend()

# Bottom-left: per-year boxplots of citation counts.
df.boxplot(column='Citations', by='Year', ax=axes[1, 0], rot=45)
# With `by=`, pandas adds a figure-level "Boxplot grouped by Year" suptitle
# that would sit above all four panels; clear it.
fig.suptitle('')
axes[1, 0].set_xlabel('Year')
axes[1, 0].set_ylabel('Citations')
axes[1, 0].set_title('Citation Distribution by Year')
# Keep the year labels rotated without switching pyplot's current axes.
axes[1, 0].tick_params(axis='x', labelrotation=45)

# Bottom-right: individual papers published in 2015 or later.
recent_years = df[df['Year'] >= 2015]
axes[1, 1].scatter(recent_years['Year'], recent_years['Citations'], alpha=0.3, s=10)
axes[1, 1].set_xlabel('Year')
axes[1, 1].set_ylabel('Citations')
axes[1, 1].set_title('Citation Scatter (2015+)')

plt.tight_layout()
plt.show()

## 5. Top Cited Papers

In [None]:
# The 20 papers with the highest citation counts, restricted to the columns
# worth showing in a printed table.
display_columns = ['Title', 'Year', 'Citations', 'Authors', 'Scopus Source title']
most_cited = df.nlargest(20, 'Citations')
top_papers = most_cited.loc[:, display_columns]

print("Top 20 Most Cited Papers:")
# index=False drops the DataFrame index from the printed table.
print(top_papers.to_string(index=False))

## 6. Citation Percentiles

In [None]:
# Citation counts at selected percentiles, printed and shown as a bar chart.
percentiles = [10, 25, 50, 75, 90, 95, 99]
quantile_levels = [pct / 100 for pct in percentiles]  # quantile() expects 0-1
citation_percentiles = df['Citations'].quantile(quantile_levels)

print("Citation Percentiles:")
for pct, value in zip(percentiles, citation_percentiles):
    print(f"{pct}th percentile: {value:.0f} citations")

# One bar per percentile, placed at evenly spaced positions.
bar_positions = range(len(percentiles))
tick_labels = [f'{pct}th' for pct in percentiles]

plt.figure(figsize=(10, 6))
plt.bar(bar_positions, citation_percentiles, color='coral', edgecolor='black')
plt.xticks(bar_positions, tick_labels)
plt.xlabel('Percentile')
plt.ylabel('Citation Count')
plt.title('Citation Distribution by Percentile')
plt.tight_layout()
plt.show()

## 7. Temporal Validation Split

In [None]:
# Split by publication year: the three earlier years form the training set,
# the three later years form the test set.
train_years = [2015, 2016, 2017]
test_years = [2018, 2019, 2020]

in_train = df['Year'].isin(train_years)
in_test = df['Year'].isin(test_years)
train_df = df[in_train]
test_df = df[in_test]

print("Temporal Validation Split:")
print(f"Train set (2015-2017): {len(train_df)} papers")
print(f"Test set (2018-2020): {len(test_df)} papers")

# Print the citation summary for each split so the two can be compared.
for split_name, split_df in (("Train", train_df), ("Test", test_df)):
    print(f"\n{split_name} citation stats:")
    print(split_df['Citations'].describe())

## 8. Summary

In [None]:
# Final recap of the headline numbers. Relies on `threshold` and the
# `high_impact` column having been created earlier in this notebook.
banner = "=" * 60
citation_counts = df['Citations']
impact_flags = df['high_impact']

summary_lines = [
    banner,
    "CITATION DISTRIBUTION SUMMARY",
    banner,
    f"Total papers: {len(df)}",
    f"Year range: {df['Year'].min()} - {df['Year'].max()}",
    f"\nCitation range: {citation_counts.min()} - {citation_counts.max()}",
    f"Median citations: {citation_counts.median():.0f}",
    f"Mean citations: {citation_counts.mean():.2f}",
    f"\nTop 25% threshold: {threshold:.0f} citations",
    # The mean of a 0/1 flag is the share of flagged rows.
    f"High-impact papers: {impact_flags.sum()} ({impact_flags.mean()*100:.1f}%)",
    "\nDistribution: Highly skewed (many low-cited, few highly-cited papers)",
    "Recommendation: Use log-transformation for regression models",
]
print("\n".join(summary_lines))