In [None]:
import pandas as pd

# Load the dataset (path is relative to the notebook's working directory)
df = pd.read_csv("metadata.csv")

# Explore
print("Shape:", df.shape)      # (rows, columns)
df.info()                      # Column dtypes / non-null counts; info() prints by
                               # itself and returns None, so it is not wrapped in print()
print(df.head())               # Preview first rows

# Missing-value counts for the first 10 columns (in column order, not sorted)
print(df.isnull().sum().head(10))

# Summary statistics for the numerical columns
print(df.describe())


In [None]:
# Keep only rows that have a title, an abstract and a *parseable* publication date.
# The dates are parsed before dropping: errors='coerce' turns unparseable values
# into NaT, and those rows have to be removed as well, otherwise 'year' would
# contain NaN (and silently become a float column).
df_clean = (
    df.assign(publish_time=pd.to_datetime(df['publish_time'], errors='coerce'))
      .dropna(subset=['title', 'abstract', 'publish_time'])
      .copy()  # independent frame, so the assignments below cannot trigger SettingWithCopyWarning
)

# Extract year (integer-valued, because no NaT is left at this point)
df_clean['year'] = df_clean['publish_time'].dt.year

# Word count for abstracts: whitespace-separated tokens, computed with the
# vectorized .str accessor instead of a per-row Python lambda
df_clean['abstract_word_count'] = df_clean['abstract'].astype(str).str.split().str.len()


In [None]:
import matplotlib.pyplot as plt
from wordcloud import WordCloud

# Count papers by year. Missing years are dropped and the rest cast to int so
# the bars sit on whole-year positions even if 'year' is stored as float.
year_counts = df_clean['year'].dropna().astype(int).value_counts().sort_index()
fig, ax = plt.subplots()  # explicit Axes: no reliance on pyplot's implicit "current figure"
ax.bar(year_counts.index, year_counts.values)
ax.set_title("Publications by Year")
ax.set_xlabel("Year")
ax.set_ylabel("Count")
ax.xaxis.get_major_locator().set_params(integer=True)  # avoid fractional-year ticks such as 2019.5
plt.show()

# Top journals: the 10 most frequent values of the 'journal' column
top_journals = df_clean['journal'].value_counts().head(10)
fig, ax = plt.subplots()
top_journals.plot(kind='barh', ax=ax, title="Top Journals")
ax.set_xlabel("Count")
ax.set_ylabel("Journal")
plt.show()

# Word cloud for titles: all titles are concatenated into one text blob
text = " ".join(df_clean['title'].dropna().astype(str))
wc = WordCloud(width=800, height=400, background_color="white").generate(text)
fig, ax = plt.subplots()
ax.imshow(wc, interpolation="bilinear")
ax.set_title("Most Frequent Words in Titles")
ax.axis("off")  # the image has no meaningful axes
plt.show()


In [None]:
import streamlit as st
import matplotlib.pyplot as plt

# NOTE(review): Streamlit widgets presumably only render when this code is run
# as a script with `streamlit run <file>.py`, not from a notebook kernel —
# confirm whether this cell is meant to be exported to its own app file.
st.title("CORD-19 Data Explorer")
st.write("Explore COVID-19 research metadata")

# Year filter
YEAR_MIN, YEAR_MAX = 2015, 2023   # slider bounds
DEFAULT_YEARS = (2019, 2021)      # initial (start, end) selection
year_range = st.slider("Select year range", YEAR_MIN, YEAR_MAX, DEFAULT_YEARS)
start_year, end_year = year_range
# between() is inclusive on both ends and excludes rows whose year is missing
filtered = df_clean[df_clean['year'].between(start_year, end_year)]

# Plot pubs over time
year_counts = filtered['year'].dropna().astype(int).value_counts().sort_index()
fig, ax = plt.subplots()
ax.bar(year_counts.index, year_counts.values)
ax.set_title("Publications by Year")
ax.set_xlabel("Year")
ax.set_ylabel("Count")
ax.xaxis.get_major_locator().set_params(integer=True)  # whole-year ticks only
st.pyplot(fig)
plt.close(fig)  # a new figure is created on every rerun; close it so figures do not accumulate

# Show data sample
st.dataframe(filtered.head(20))
