In [None]:
import json
import requests
import csv
import pandas as pd
import matplotlib.pyplot as plt
from ast import literal_eval
import numpy as np
import datetime
import seaborn as sns

In [None]:
# Evolution of the number of SO questions in each topic, drawn as one stacked bar per year
sns.set_theme(style="whitegrid")

# Read the question table and rename the 'Compute capabilities' topic
df = pd.read_csv('HF_SO_2019_2024.csv')
df['topic_label'] = df['topic_label'].replace(
    {'Compute capabilities': 'Distributed Computing and Resource Management'}
)

# Keep every row whose topic is not 'Other'
is_other_topic = df['topic_label'].isin(['Other'])
df_filtered = df[~is_other_topic]

# Rows are years, columns are topics, cells are question counts
# (0 where a year/topic pair never occurs)
grouped_df = (
    df_filtered
    .groupby(['year', 'topic_label'])
    .size()
    .unstack(fill_value=0)
)

# One tab20 colour per topic column
colors = sns.color_palette("tab20", n_colors=grouped_df.shape[1])

# Small canvas so the figure fits a single column
fig, ax = plt.subplots(figsize=(5, 3.7))

# Stacked bars, drawn on the axes created above
grouped_df.plot.bar(
    stacked=True,
    ax=ax,
    color=colors,
    width=0.5,
    edgecolor='black',
)

# Axis titles and tick-label styling
ax.set_xlabel('Year', fontsize=9)
ax.set_ylabel('Number of Questions', fontsize=9)
plt.setp(ax.get_xticklabels(), rotation=45, ha='right', fontsize=8)
plt.setp(ax.get_yticklabels(), fontsize=8)

# Legend anchored inside the axes, near the upper-left corner
ax.legend(
    title='Topic Label',
    bbox_to_anchor=(0.02, 0.98),
    loc='upper left',
    fontsize=7,
    title_fontsize=8,
    frameon=True,
    shadow=False,
)

# Tighten the layout, export a PDF copy, then render the figure
fig.tight_layout()
fig.savefig('so_questions_evolution_compact.pdf', format='pdf', bbox_inches='tight', dpi=300)
plt.show()


In [None]:
# Number of questions per topic over the whole table (df), largest topic first
questions_per_topic = (
    df.groupby('topic_label')
    .size()
    .reset_index(name='number_of_questions')
    .sort_values(by='number_of_questions', ascending=False)
)
print(questions_per_topic)

In [None]:
# Per-topic mean of view_count and score (computed on df_filtered),
# rounded to one decimal place; the values remain floats.
avg_stats = (
    df_filtered
    .groupby('topic_label')
    .agg(
        avg_view_count=pd.NamedAgg(column='view_count', aggfunc='mean'),
        avg_score=pd.NamedAgg(column='score', aggfunc='mean'),
    )
    .round(1)
    .reset_index()
)

# Show the per-topic averages
print(avg_stats)


In [None]:
# Per-topic mean of 'duration_accepted_answer' (computed on df_filtered),
# rounded to one decimal place.
# NOTE(review): the unit of duration_accepted_answer is not visible in this
# notebook — presumably days; confirm against the dataset.
average_period_per_topic = (
    df_filtered
    .groupby('topic_label')['duration_accepted_answer']
    .mean()
    .reset_index()
    .round({'duration_accepted_answer': 1})
)

# Show the per-topic averages
print(average_period_per_topic)

In [None]:


# Percentage of questions per topic that have no accepted answer
# (accepted_answer_id is missing), relative to all questions in that topic.

# Total number of questions per topic
total_questions_per_topic = df.groupby('topic_label').size()

# Number of questions without an accepted answer per topic.
# Reindex onto the full topic list so that a topic in which every question has
# an accepted answer counts as 0; otherwise it would be absent here and the
# index-aligned division below would yield NaN instead of 0.0.
questions_without_answer_per_topic = (
    df[df['accepted_answer_id'].isna()]
    .groupby('topic_label')
    .size()
    .reindex(total_questions_per_topic.index, fill_value=0)
)

# Share of questions without an accepted answer, in percent
percentage_without_answer = (questions_without_answer_per_topic / total_questions_per_topic) * 100

# Turn the Series into a DataFrame with a descriptive column name,
# highest percentage first
percentage_without_answer = (
    percentage_without_answer
    .round(1)
    .reset_index(name='percentage_without_accepted_answer')
    .sort_values(by='percentage_without_accepted_answer', ascending=False)
)

# Display the result
print(percentage_without_answer)

In [None]:

# Percentage of questions per topic that received no answer at all
# (answer_count == 0), relative to all questions in that topic.
# Note: this cell assigns the same variable names as the accepted-answer
# percentage cell, so running it replaces those values.

# Total number of questions per topic
total_questions_per_topic = df.groupby('topic_label').size()

# Number of questions with zero answers per topic.
# Reindex onto the full topic list so that a topic in which every question has
# at least one answer counts as 0; otherwise it would be absent here and the
# index-aligned division below would yield NaN instead of 0.0.
questions_without_answer_per_topic = (
    df[df['answer_count'] == 0]
    .groupby('topic_label')
    .size()
    .reindex(total_questions_per_topic.index, fill_value=0)
)

# Share of unanswered questions, in percent
percentage_without_answer = (questions_without_answer_per_topic / total_questions_per_topic) * 100

# Turn the Series into a DataFrame with a descriptive column name
percentage_without_answer = percentage_without_answer.round(1).reset_index(name='percentage_without_answer')

# Display the result
print(percentage_without_answer)

In [None]:
# Step 1: number of questions for every (topic, year) pair present in df
questions_per_year = (
    df.groupby(['topic_label', 'year'])
    .size()
    .reset_index(name='number_of_questions')
)

# Step 2: percentage change of each row versus the previous row of the same topic
counts_by_topic = questions_per_year.groupby('topic_label')['number_of_questions']
questions_per_year['yearly_increase_percentage'] = counts_by_topic.pct_change() * 100

# Step 3: drop the 2020 row of a topic when it is that topic's second row,
# so the 2019-2020 increase does not enter the average
position_in_topic = questions_per_year.groupby('topic_label').cumcount()
is_second_row_in_2020 = (questions_per_year['year'] == 2020) & (position_in_topic == 1)
questions_per_year_filtered = questions_per_year[~is_second_row_in_2020]

# Step 4: mean yearly increase percentage per topic over the remaining rows,
# rounded to one decimal place
average_increase_percentage = (
    questions_per_year_filtered
    .groupby('topic_label')['yearly_increase_percentage']
    .mean()
    .round(1)
    .reset_index(name='average_yearly_increase_percentage')
)

# Show the per-year table (all rows, including the excluded one), then the averages
print (questions_per_year)
print(average_increase_percentage)

In [None]:
# Restrict the table to the years 2020 through 2024
years_of_interest = [2020, 2021, 2022, 2023, 2024]
data_filtered = df[df['year'].isin(years_of_interest)]

# Questions per year, plus the percentage change versus the previous row
yearly_data = (
    data_filtered
    .groupby('year')
    .size()
    .reset_index(name='question_count')
    .assign(growth_rate=lambda table: table['question_count'].pct_change() * 100)
)

# Mean of the growth rates, leaving out the first row (it has no predecessor, so it is NaN)
growth_after_first_year = yearly_data['growth_rate'].iloc[1:]
average_yearly_growth_rate = growth_after_first_year.mean()

print(yearly_data)
print(f"The average yearly growth rate of questions from 2020 to 2024 is {average_yearly_growth_rate:.2f}%")

In [None]:
# Count the titles that contain the standalone word 'how' or 'How'
how_pattern = r'\b[Hh]ow\b'
title_has_how = df['title'].str.contains(how_pattern, regex=True)
count_how_titles = title_has_how.sum()

print(f"Number of titles containing 'how' or 'How': {count_how_titles}")

In [None]:
# Count the titles that contain the standalone word 'why' or 'Why'.
# The result gets its own variable name instead of reusing count_how_titles,
# so the 'how' count is not silently replaced by the 'why' count.
count_why_titles = df['title'].str.contains(r'\b[Ww]hy\b', regex=True).sum()

print(f"Number of titles containing 'why' or 'Why': {count_why_titles}")