In [None]:
import pandas as pd

# Load dataset
df = pd.read_csv('metadata.csv')

# Inspect data: dimensions, per-column dtypes / non-null counts, first rows
print("Shape:", df.shape)
# DataFrame.info() writes its report to stdout itself and returns None, so it
# must not be wrapped in print() -- that would emit a stray "None" line.
df.info()
print(df.head())

# Check missing values: null counts for the first 10 columns only
print(df.isnull().sum().head(10))


In [None]:
# Convert publish_time to datetime FIRST. errors='coerce' turns unparseable
# values into NaT; doing this before the missing-value filter ensures those
# rows are removed too, instead of surviving with a NaN year.
df = df.assign(publish_time=pd.to_datetime(df['publish_time'], errors='coerce'))

# Drop rows with a missing title or a missing/unparseable publish_time
df = df.dropna(subset=['title', 'publish_time'])

# Derive the remaining columns via assign(), which returns a new frame rather
# than writing into the filtered one.
df = df.assign(
    # Publication year taken from the parsed timestamp
    year=df['publish_time'].dt.year,
    # Whitespace-separated word count of the abstract; a missing abstract is
    # treated as an empty string and therefore counts as 0 words
    abstract_word_count=df['abstract'].fillna("").apply(lambda text: len(text.split())),
)

# Check cleaned dataset
df.info()


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
from wordcloud import WordCloud

# 1. Publications per year
# Tally rows per value of 'year' and order the bars by year.
year_counts = df['year'].value_counts().sort_index()
fig, ax = plt.subplots()
ax.bar(year_counts.index, year_counts.values)
ax.set_title('Publications per Year')
ax.set_xlabel('Year')
ax.set_ylabel('Count')
plt.show()

# 2. Top journals
# The ten most frequent 'journal' values, drawn as horizontal bars.
top_journals = df['journal'].value_counts().head(10)
fig, ax = plt.subplots()
sns.barplot(y=top_journals.index, x=top_journals.values, ax=ax)
ax.set_title('Top 10 Journals')
ax.set_xlabel('Number of Publications')
plt.show()

# 3. Word Cloud for Titles
# Join every non-null title into one space-separated string for WordCloud.
text = " ".join(df['title'].dropna())
wordcloud = WordCloud(width=800, height=400, background_color='white').generate(text)
fig, ax = plt.subplots()
ax.imshow(wordcloud, interpolation='bilinear')
ax.axis('off')  # hide ticks and frame around the image
ax.set_title('Common Words in Titles')
plt.show()
