# In [None]:
# -----------------------------
# Part 1: Data Loading
# -----------------------------
import pandas as pd

# Load the smaller dataset
df = pd.read_csv("metadata_sample.csv")

print("Shape:", df.shape)
# DataFrame.info() writes its summary to stdout itself and returns None, so
# wrapping it in print() would emit a stray "None" line after the summary.
df.info()
# A bare `df.head()` is only rendered when it is the last expression of a
# notebook cell; print it explicitly so the preview is actually shown.
print(df.head())

# -----------------------------
# Part 2: Data Cleaning
# -----------------------------
# Convert publish_time to datetime; errors='coerce' turns values that cannot
# be parsed into NaT instead of raising.
df['publish_time'] = pd.to_datetime(df['publish_time'], errors='coerce')

# Extract year. Rows whose publish_time is NaT get a missing year, which
# makes this column floating point on the raw frame whenever any date is
# missing.
df['year'] = df['publish_time'].dt.year

# Abstract word count (optional); a missing abstract counts as 0 words.
df['abstract_word_count'] = df['abstract'].fillna("").apply(lambda x: len(x.split()))

# Drop rows without title or publish_time. Take an explicit copy so the
# column assignment below operates on an independent frame and leaves `df`
# untouched.
df_clean = df.dropna(subset=['title', 'publish_time']).copy()

# Every remaining row has a valid publish_time and therefore a valid year,
# so the column can safely be stored as integers (2020 rather than 2020.0).
# This keeps year counts and plot labels free of a trailing ".0".
df_clean['year'] = df_clean['year'].astype(int)

print("After cleaning:", df_clean.shape)

# -----------------------------
# Part 3: Analysis & Visualizations
# -----------------------------
import matplotlib.pyplot as plt
import seaborn as sns

# 1. Papers per year
# Count the cleaned rows for each publication year and order the result by
# year (the index) so the bars appear chronologically.
year_counts = df_clean['year'].value_counts().sort_index()

# Draw the bar chart on an explicitly created 8x5 inch figure/axes pair.
fig, ax = plt.subplots(figsize=(8, 5))
sns.barplot(x=year_counts.index, y=year_counts.values, ax=ax)
ax.set_title("Publications by Year")
plt.show()

# 2. Top journals
# value_counts() sorts by descending frequency, so the first ten entries are
# the ten most frequent journal values among the cleaned rows.
top_journals = df_clean['journal'].value_counts().head(10)

# Horizontal bars: journal names on the y axis, paper counts on the x axis.
fig, ax = plt.subplots(figsize=(8, 5))
sns.barplot(x=top_journals.values, y=top_journals.index, ax=ax)
ax.set_title("Top Journals Publishing COVID-19 Papers")
plt.show()

# 3. Word frequency in titles
from collections import Counter
import re

# Merge every non-missing title into one lower-cased string so the counts
# are case-insensitive.
title_series = df_clean['title'].dropna()
titles = " ".join(title_series).lower()

# Tokenize into runs of word characters delimited by word boundaries.
word_pattern = re.compile(r'\b\w+\b')
words = word_pattern.findall(titles)

# Keep the 20 most frequent tokens as (word, count) pairs.
word_counts = Counter(words)
common_words = word_counts.most_common(20)
print("Most frequent words in titles:", common_words)
