In [None]:
import pandas as pd

# Read the cleaned article table from the Excel workbook
df = pd.read_excel("ai_articles_with_person_cleaned.xlsx")

# Parse the 'date' values using the year-month format ('%Y-%m') once,
# then reuse the parsed series for both the 'date' and 'year' columns
parsed_dates = pd.to_datetime(df['date'], format='%Y-%m')
df['date'] = parsed_dates
df['year'] = parsed_dates.dt.year

# Persist the augmented table (without the index) to a new workbook
df.to_excel("ai_articles_with_person_year.xlsx", index=False)

# Preview the first rows of the parsed date next to the derived year
print(df.loc[:, ['date', 'year']].head())


In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# -----------------------------
# Load dataset
# -----------------------------
df = pd.read_excel("ai_articles_with_person_year.xlsx")


def _count_words(text):
    """Return the number of whitespace-separated tokens in ``text``.

    Missing cells (NaN / None / NaT) count as 0 words. Without this guard,
    ``str(nan)`` yields the literal string "nan", which would be counted as
    a one-word article. Any other non-string value is stringified before
    splitting, exactly as before.
    """
    if pd.isna(text):
        return 0
    return len(str(text).split())


# Compute word count for each article (missing text -> 0 words)
df['word_count'] = df['text_cleaned_final'].apply(_count_words)

# -----------------------------
# 1. Number of articles per year
# -----------------------------
# Tally the rows for each year and order the tallies by year
year_counts = df['year'].value_counts().sort_index()

# Draw on an explicit Axes rather than the implicit pyplot "current" figure
fig, ax = plt.subplots(figsize=(10, 5))
sns.barplot(
    x=year_counts.index,
    y=year_counts.values,
    color="steelblue",
    ax=ax,
)
# Tilt the year labels by 45 degrees
plt.setp(ax.get_xticklabels(), rotation=45)
ax.set(
    title="Number of AI Articles with Person Images per Year",
    xlabel="Year",
    ylabel="Article Count",
)
fig.tight_layout()
plt.show()

# -----------------------------
# 2. Distribution of article length
# -----------------------------
# Histogram (50 bins) of per-article word counts with a KDE curve overlaid
fig, ax = plt.subplots(figsize=(10, 5))
sns.histplot(
    df['word_count'],
    bins=50,
    kde=True,
    color="darkred",
    ax=ax,
)
ax.set(
    title="Distribution of Article Word Counts",
    xlabel="Word Count",
    ylabel="Frequency",
)
fig.tight_layout()
plt.show()

# -----------------------------
# 3. Average article length by year
# -----------------------------
# Mean word count of the articles in each year, indexed by year
avg_length_by_year = df['word_count'].groupby(df['year']).mean()

# One marked point per year, joined by a line
fig, ax = plt.subplots(figsize=(10, 5))
sns.lineplot(
    x=avg_length_by_year.index,
    y=avg_length_by_year.values,
    marker="o",
    color="darkgreen",
    ax=ax,
)
ax.set(
    title="Average Article Word Count per Year",
    xlabel="Year",
    ylabel="Average Word Count",
)
fig.tight_layout()
plt.show()
