# CORD‑19 Metadata Analysis
This notebook loads, cleans, analyzes, and visualizes the CORD‑19 `metadata.csv` dataset.

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

# Ensure plots display inline
%matplotlib inline

## Load Dataset

In [None]:
# Location of the CORD-19 metadata file; replace with your actual path.
file_path = "metadata.csv"

try:
    df = pd.read_csv(file_path)
except FileNotFoundError:
    print("Error: File not found. Please upload metadata.csv first.")
    # Stop here instead of continuing: without the file, `df` is never bound,
    # so every later cell would fail with a confusing NameError (or silently
    # reuse a stale `df` left over from an earlier run of the notebook).
    raise
else:
    # Only report success once the read has actually completed.
    print("Dataset loaded successfully!")

## Preview Dataset

In [None]:
# Show the first rows of the raw frame as a quick sanity check of the columns.
df.head()

## Dataset Info & Missing Values

In [None]:
# Print column names, dtypes and non-null counts for the raw frame.
df.info()
# Number of missing values per column (displayed as the cell's output).
df.isna().sum()

## Basic Statistics (Numerical Columns)

In [None]:
# Summary statistics of the raw frame (count, mean, std, quartiles, ...).
df.describe()

## Clean Missing Data

In [None]:
# Work on a copy so the raw `df` stays untouched for the cells above.
df_clean = df.copy()

# Parse publication dates when the column exists; values that cannot be
# parsed become NaT instead of raising.
if 'publish_time' in df_clean.columns:
    df_clean['publish_time'] = pd.to_datetime(df_clean['publish_time'], errors='coerce')

# Keep only rows that have both a title and an abstract.
df_clean = df_clean.dropna(subset=['title', 'abstract'])

# Number of whitespace-separated tokens in each abstract.
df_clean['abstract_word_count'] = df_clean['abstract'].map(
    lambda text: len(str(text).split())
)

df_clean.head()

## Publications per Year

In [None]:
# Publication year of each paper. `.dt.year` turns into floats (e.g. 2020.0)
# as soon as any publish_time is NaT, so store it with the nullable integer
# dtype: years stay whole numbers and missing dates stay <NA>.
df_clean['year'] = df_clean['publish_time'].dt.year.astype('Int64')

# Papers per year, ordered chronologically. Rows without a year are left out
# and the index is cast to plain int so tables and plot axes show 2020, not 2020.0.
pub_per_year = df_clean['year'].dropna().astype(int).value_counts().sort_index()
pub_per_year

In [None]:
# Line chart of the yearly publication counts on a 10x5 inch figure.
fig, ax = plt.subplots(figsize=(10, 5))
pub_per_year.plot(kind='line', ax=ax)

# Labels are set after plotting so they override anything pandas filled in.
ax.set_title("Publications Over Time")
ax.set_xlabel("Year")
ax.set_ylabel("Count")
ax.grid(True)

plt.show()

## Top Journals

In [None]:
# value_counts() orders journals by frequency (most frequent first), so the
# first ten entries are the ten most common journals.
top_journals = df_clean['journal'].value_counts().iloc[:10]
top_journals

In [None]:
# Bar chart of the most frequent journals on a 10x5 inch figure.
fig, ax = plt.subplots(figsize=(10, 5))
top_journals.plot(kind='bar', ax=ax)

ax.set_title("Top Journals Publishing COVID‑19 Research")
ax.set_xlabel("Journal")
ax.set_ylabel("Count")

# Tilt the journal names and right-align them so long labels stay readable.
plt.setp(ax.get_xticklabels(), rotation=45, ha='right')

plt.show()