In [None]:
# Part 1: Data Loading and Basic Exploration

import pandas as pd

# Read the metadata CSV into a DataFrame
data = pd.read_csv('metadata.csv')

# Preview the leading rows
print("First few rows:")
print(data.head())

# Report how many rows and columns were loaded
n_rows, n_cols = data.shape
print("Dataset dimensions:")
print(f"Rows: {n_rows}, Columns: {n_cols}")

# Show the dtype pandas assigned to each column
print("Data types:")
print(data.dtypes)

# Tally null entries per column
null_counts = data.isna().sum()
print("Missing values in each column:")
print(null_counts)

# Summary statistics produced by describe()
print("Basic statistics:")
print(data.describe())


In [None]:
# Part 2: Data Cleaning and Preparation

import pandas as pd

# Load the dataset again (same file that Part 1 reads)
data = pd.read_csv('metadata.csv')

# Share of rows that are null in each column, expressed as a percentage
total_rows = data.shape[0]
missing_values = data.isna().sum()
missing_percentage = missing_values / total_rows * 100

# Show only the columns that actually have gaps, largest share first
has_missing = missing_percentage > 0
print(missing_percentage[has_missing].sort_values(ascending=False))


# Remove rows where 'title' or 'abstract' is missing.
# The explicit .copy() makes data_cleaned an independent DataFrame, so the
# column assignments below do not raise SettingWithCopyWarning and can never
# be mistaken for writes into a slice of `data`.
data_cleaned = data.dropna(subset=['title', 'abstract']).copy()

# In addition, fill missing values in the non-critical 'authors' column
data_cleaned['authors'] = data_cleaned['authors'].fillna('Unknown')

# Convert 'publish_time' to datetime; unparseable values become NaT
# because of errors='coerce'
data_cleaned['publish_time'] = pd.to_datetime(data_cleaned['publish_time'], errors='coerce')

# Inspect rows whose publish_time is NaT (originally missing or unparseable)
invalid_dates = data_cleaned[data_cleaned['publish_time'].isna()]
print(invalid_dates.head())

# Extract the year from 'publish_time' into a new column 'publish_year'
# (rows with NaT get NaN here)
data_cleaned['publish_year'] = data_cleaned['publish_time'].dt.year

# Create a new column 'abstract_word_count' holding the number of
# whitespace-separated words in each abstract
data_cleaned['abstract_word_count'] = data_cleaned['abstract'].apply(lambda x: len(str(x).split()))

# Check the cleaned data
print(data_cleaned.head())


In [None]:
# Part 3: Data Analysis and Visualization

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from wordcloud import WordCloud
from collections import Counter
import re

# Papers per publication year: size of each 'publish_year' group
year_groups = data_cleaned.groupby('publish_year')
papers_by_year = year_groups.size()
print(papers_by_year)

# Ten journals with the most papers (value_counts sorts descending)
journal_counts = data_cleaned['journal'].value_counts()
top_journals = journal_counts.head(10)
print(top_journals)



# Function to clean and tokenize the titles
def clean_and_tokenize(title):
    """Split a title into lowercase word tokens.

    Parameters
    ----------
    title : str
        The raw title text. Any non-string value (for example ``None`` or a
        ``NaN`` from a missing cell) is treated as having no words.

    Returns
    -------
    list of str
        The lowercase ``\\w+`` matches in order of appearance; an empty list
        for an empty or non-string title.
    """
    # Missing cells arrive as None/NaN, which have no .lower(); return no
    # tokens instead of raising AttributeError.
    if not isinstance(title, str):
        return []
    # \w+ keeps runs of letters, digits and underscores, dropping punctuation
    return re.findall(r'\w+', title.lower())

# Tokenize all titles and count the frequency of words.
# The token lists are flattened with a comprehension rather than Series.sum():
# summing lists concatenates them pairwise, which gets very slow on many
# titles, and sum() over an empty Series yields 0, which Counter cannot iterate.
tokenized_titles = data_cleaned['title'].apply(clean_and_tokenize)
all_words = [word for title_words in tokenized_titles for word in title_words]
word_counts = Counter(all_words)

# Display the most common words
print(word_counts.most_common(10))


# Create visualizations


# Count plot of papers per publication year, drawn on an explicit Axes
fig, ax = plt.subplots(figsize=(10, 6))
sns.countplot(data=data_cleaned, x='publish_year', palette='viridis', ax=ax)
ax.set_title('Number of Publications Over Time')
ax.set_xlabel('Publication Year')
ax.set_ylabel('Number of Publications')
# Tilt the year labels so neighbouring ticks do not overlap
ax.tick_params(axis='x', labelrotation=45)
plt.show()


# Bar chart of the journals held in top_journals (the ten most frequent)
fig, ax = plt.subplots(figsize=(10, 6))
top_journals.plot(kind='bar', color='skyblue', ax=ax)
ax.set_title('Top Journals Publishing COVID-19 Research')
ax.set_xlabel('Journal')
ax.set_ylabel('Number of Publications')
# Tilt the journal names so long labels stay readable
ax.tick_params(axis='x', labelrotation=45)
plt.show()

# Word cloud built from the paper titles

# Join every non-null title into one space-separated string
non_null_titles = data_cleaned['title'].dropna()
text = ' '.join(non_null_titles)

# Configure the cloud, then build it from the combined text
cloud_builder = WordCloud(width=800, height=400, background_color='white')
wordcloud = cloud_builder.generate(text)

# Render the cloud as an image with the axes hidden
fig, ax = plt.subplots(figsize=(10, 6))
ax.imshow(wordcloud, interpolation='bilinear')
ax.axis('off')
ax.set_title('Word Cloud of Paper Titles')
plt.show()

# Count plot of papers per value of the 'source_x' column
fig, ax = plt.subplots(figsize=(10, 6))
sns.countplot(data=data_cleaned, x='source_x', palette='viridis', ax=ax)
ax.set_title('Distribution of Paper Counts by Source')
ax.set_xlabel('Source')
ax.set_ylabel('Number of Publications')
# Tilt the source labels so they do not overlap
ax.tick_params(axis='x', labelrotation=45)
plt.show()
