In [None]:
import json
import requests
import csv
import pandas as pd
import matplotlib.pyplot as plt
from ast import literal_eval
import numpy as np
import datetime

In [None]:
# Load the collected CSV; later cells treat each row as one question.
dataset = pd.read_csv("HF_SO_2019_2024.csv")

# Only the last expression of a cell is rendered, so the column dtypes have to
# be printed explicitly or they are computed and silently discarded.
print(dataset.dtypes)

# Preview the first 12 rows.
dataset.head(12)

In [None]:

# Yearly question volume: bar chart of how many rows were created in each calendar year.
dataset["creation_date"] = pd.to_datetime(dataset["creation_date"])
dataset["year"] = dataset["creation_date"].dt.year
questions_per_year = dataset.groupby("year").size()

fig, ax = plt.subplots(figsize=(10, 6))
ax.bar(questions_per_year.index, questions_per_year.values, color='b')

ax.set_xlabel('Year')
ax.set_ylabel('Number of Questions')

# One tick per year present in the data, with slanted labels.
ax.set_xticks(questions_per_year.index)
plt.setp(ax.get_xticklabels(), rotation=45)

# Horizontal grid lines only.
ax.grid(True, axis='y')

fig.tight_layout()
fig.savefig('Nu_question_growth.pdf')
plt.show()

In [None]:
# Corpus size. Each CSV row counts as one question; posts = questions + the
# sum of their answer_count values.
df = pd.read_csv("HF_SO_2019_2024.csv")
total_questions = df.shape[0]
total_answers = df['answer_count'].sum()
total_posts = total_answers + total_questions

# Questions that received at least one answer.
has_answer = df['answer_count'] > 0
answered_questions = df.loc[has_answer]
num_answered_questions = answered_questions.shape[0]

# Questions whose accepted_answer_id is filled in.
has_accepted_answer = df['accepted_answer_id'].notnull()
accepted_answers = df.loc[has_accepted_answer]
num_accepted_answers = accepted_answers.shape[0]

print(total_questions, num_answered_questions, num_accepted_answers, sep='\n')

In [None]:
def _parse_tags(cell):
    """Evaluate a stringified Python literal; return non-string cells unchanged."""
    if isinstance(cell, str):
        return literal_eval(cell)
    return cell


def _tag_total(cell):
    """Number of tags in a parsed cell; anything that is not a list counts as 0."""
    return len(cell) if isinstance(cell, list) else 0


df['tags'] = df['tags'].map(_parse_tags)
print("Data type of 'tags' column after conversion:", type(df['tags'].iloc[0]))

# Every tag that appears on at least one row.
unique_tags = {
    tag
    for tags_list in df['tags']
    if isinstance(tags_list, list)
    for tag in tags_list
}
num_distinct_tags = len(unique_tags)

df['num_tags'] = df['tags'].map(_tag_total)

# Both averages are rounded to a whole number (round() without ndigits).
mean_tags = df['num_tags'].mean()
average_tags_per_question = round(mean_tags)

mean_answers = df['answer_count'].mean()
average_answers_per_question = round(mean_answers)

In [None]:
# Build a long-format record of who was involved in which way and when:
# one row per question, answer, comment on an answer, and comment on a question.
# The frame is re-read so the nested columns start from their raw CSV text.
df = pd.read_csv("HF_SO_2019_2024.csv")
user_involvment = {'owner': [], 'creation_date': [], 'involvment_type': []}


def _parse_cell(cell):
    """Evaluate a stringified Python literal; return any non-string value unchanged."""
    return literal_eval(cell) if isinstance(cell, str) else cell


def _has_account(post):
    """Return True when `post` is a dict whose 'owner' dict carries an 'account_id'."""
    owner = post.get('owner') if isinstance(post, dict) else None
    return isinstance(owner, dict) and 'account_id' in owner


def _epoch_to_day(epoch_seconds):
    """Format a Unix timestamp as 'YYYY-MM-DD'.

    The conversion is pinned to UTC so the resulting day (and therefore the
    per-year statistics derived from it) does not depend on the timezone of
    the machine running the notebook.
    """
    moment = datetime.datetime.fromtimestamp(epoch_seconds, tz=datetime.timezone.utc)
    return moment.strftime('%Y-%m-%d')


def _collect(posts, involvment_type):
    """Record every post in `posts` that has an identifiable owner.

    Appends one row to `user_involvment` per such post and returns the list of
    their account ids (duplicates kept, in iteration order).
    """
    account_ids = []
    for post in posts:
        if not _has_account(post):
            continue
        account_id = post['owner']['account_id']
        user_involvment['owner'].append(account_id)
        user_involvment['creation_date'].append(_epoch_to_day(post['creation_date']))
        user_involvment['involvment_type'].append(involvment_type)
        account_ids.append(account_id)
    return account_ids


# --- Questions: owner and creation date come straight from the row. ---
df['owner'] = df['owner'].apply(_parse_cell)
question_owners = []
for owner, creation_date in zip(df['owner'], df['creation_date']):
    # An empty CSV cell is read back as NaN (a float); `in` on it would raise.
    if isinstance(owner, dict) and 'account_id' in owner:
        user_involvment['owner'].append(owner['account_id'])
        user_involvment['creation_date'].append(creation_date)
        user_involvment['involvment_type'].append("question")
        question_owners.append(owner['account_id'])

print (len(set(question_owners)))

# --- Answers, then comments on answers (two passes keep the row order:
#     all answers first, then all answer comments). ---
df['answers'] = df['answers'].apply(_parse_cell)
answer_lists = [answers for answers in df['answers'] if isinstance(answers, list)]

answer_owners = _collect(
    (answer for answers in answer_lists for answer in answers),
    "answer",
)

answer_comment_owners = _collect(
    (
        comment
        for answers in answer_lists
        for answer in answers
        if 'comments' in answer
        for comment in answer['comments']
    ),
    "comment_to_answer",
)

# --- Comments on questions. ---
df['comments'] = df['comments'].apply(_parse_cell)
comment_owners = _collect(
    (
        comment
        for comments_list in df['comments']
        if isinstance(comments_list, list)
        for comment in comments_list
    ),
    "comment_to_question",
)

# Distinct users across every kind of involvement.
all_owners = list(set(question_owners + answer_owners + answer_comment_owners + comment_owners))
num_distinct_users = len(all_owners)

# Sanity check: the three parallel lists must have the same length.
print(len(user_involvment["owner"]))
print(len(user_involvment["creation_date"]))
print(len(user_involvment["involvment_type"]))

user_involvment_df = pd.DataFrame(user_involvment)

# `question_owners` already holds exactly the owners recorded with type
# 'question', so it does not need to be rebuilt from `user_involvment`.
distinct_question_owners = len(set(question_owners))

print(f"Number of distinct owners with involvement type 'question': {distinct_question_owners}")



In [None]:
# Parse the answers column if it still holds text (no effect once it holds lists).
df['answers'] = df['answers'].apply(lambda raw: raw if not isinstance(raw, str) else literal_eval(raw))

# Account id of the author of every accepted answer (one entry per answer).
accepted_answer_owners = [
    answer['owner']['account_id']
    for answers_list in df['answers']
    if isinstance(answers_list, list)
    for answer in answers_list
    if 'owner' in answer and 'account_id' in answer['owner'] and answer['is_accepted'] == True
]
print(len(set(accepted_answer_owners)))

# account id -> number of accepted answers written by that user.
user_accepted_answer_count = {}
for owner_id in accepted_answer_owners:
    user_accepted_answer_count[owner_id] = user_accepted_answer_count.get(owner_id, 0) + 1

# number of accepted answers -> how many users have exactly that many.
distribution = {}
for accepted_total in user_accepted_answer_count.values():
    distribution[accepted_total] = distribution.get(accepted_total, 0) + 1

for num_accepted_answer, num_users in distribution.items():
    print(f"{num_users} distinct users have {num_accepted_answer} accepted answer(s).")

print(distribution)

In [None]:
# Parse the answers column if it still holds text (no effect once it holds lists).
df['answers'] = df['answers'].apply(lambda raw: raw if not isinstance(raw, str) else literal_eval(raw))

# Account id of the author of every answer that was not accepted (one entry per answer).
non_accepted_answer_owners = [
    answer['owner']['account_id']
    for answers_list in df['answers']
    if isinstance(answers_list, list)
    for answer in answers_list
    if 'owner' in answer and 'account_id' in answer['owner'] and answer['is_accepted'] == False
]
print(len(set(non_accepted_answer_owners)))

# account id -> number of non-accepted answers written by that user.
user_non_accepted_answer_count = {}
for owner_id in non_accepted_answer_owners:
    user_non_accepted_answer_count[owner_id] = user_non_accepted_answer_count.get(owner_id, 0) + 1

# number of non-accepted answers -> how many users have exactly that many.
distribution = {}
for non_accepted_total in user_non_accepted_answer_count.values():
    distribution[non_accepted_total] = distribution.get(non_accepted_total, 0) + 1

for num_non_accepted_answer, num_users in distribution.items():
    print(f"{num_users} distinct users have {num_non_accepted_answer} non_accepted answer(s).")

In [None]:
# account id -> number of questions asked, taken from the 'question' rows
# of the involvement record.
question_count = {}
involvement_rows = zip(user_involvment['owner'], user_involvment['involvment_type'])
for user_id, involvement_kind in involvement_rows:
    if involvement_kind != 'question':
        continue
    question_count[user_id] = question_count.get(user_id, 0) + 1

# number of questions -> how many users asked exactly that many.
distribution = {}
for asked_total in question_count.values():
    distribution[asked_total] = distribution.get(asked_total, 0) + 1

for num_questions, num_users in distribution.items():
    print(f"{num_users} distinct users asked {num_questions} question(s).")

In [None]:
# Summary figures computed in the earlier cells, paired with their labels.
stat_rows = [
    ('Number of posts', total_posts),
    ('Number of questions', total_questions),
    ('Number of answered questions', num_answered_questions),
    ('Number of accepted answers', num_accepted_answers),
    ('Number of distinct tags', num_distinct_tags),
    ('Number of distinct users', num_distinct_users),
    ('Average number of tags per question', average_tags_per_question),
    ('Average number of answers per question', average_answers_per_question),
]
Statistics = {
    'Item': [label for label, _value in stat_rows],
    'Value': [value for _label, value in stat_rows],
}
Statistics_df = pd.DataFrame(Statistics)

# Render the frame as a matplotlib table on an otherwise empty axes.
fig, ax = plt.subplots()
ax.set_axis_off()
table_title = "Statistics about the collected data"
ax.set_title(table_title, fontsize=14)

header_colours = ['lightblue'] * len(Statistics_df.columns)
table = ax.table(
    cellText=Statistics_df.values,
    colLabels=Statistics_df.columns,
    loc='center',
    cellLoc='center',
    colColours=header_colours,
)
table.scale(2, 2)

plt.show()


In [None]:
import matplotlib.pyplot as plt
import pandas as pd

# Parse every involvement date (unparseable values become NaT) and derive its year.
parsed_dates = pd.to_datetime(user_involvment_df['creation_date'], format='mixed', errors='coerce')
user_involvment_df['creation_date'] = parsed_dates
user_involvment_df['year'] = user_involvment_df['creation_date'].dt.year

# Number of different account ids seen in each year, across all involvement types.
distinct_users_per_year = user_involvment_df.groupby('year')['owner'].nunique()

# Font sizes for this and any later figure (rcParams are global).
plt.rcParams.update({
    'font.size': 12,        # General font size
    'axes.titlesize': 14,   # Title font size
    'axes.labelsize': 12,   # Axis label font size
    'xtick.labelsize': 10,  # X-tick label font size
    'ytick.labelsize': 10,  # Y-tick label font size
})

fig, ax = plt.subplots(figsize=(7, 6))  # Adjust size for a two-column LaTeX document
distinct_users_per_year.plot(kind='bar', color='#4C72B0', edgecolor='black', ax=ax)

ax.set_xlabel('Year')
ax.set_ylabel('Number of Distinct Users Involved per Year')
ax.grid(axis='y', linestyle='--', alpha=0.6)  # Y-axis grid only

# Write each bar's value just above its top edge, centred horizontally.
for bar in ax.patches:
    bar_top = bar.get_height()
    bar_centre = bar.get_x() + bar.get_width() / 2
    ax.annotate(f'{int(bar_top)}',
                (bar_centre, bar_top),
                ha='center', va='bottom', fontsize=10, color='black', rotation=0)

plt.xticks(rotation=45, ha='right')  # Rotate x-ticks for better spacing
fig.tight_layout()                   # Adjust layout to avoid clipping

fig.savefig('distinct_users_per_year.pdf', bbox_inches='tight', dpi=300)
plt.show()


In [None]:


# Parse the tags column if it still holds text (no effect once it holds lists).
df['tags'] = df['tags'].apply(lambda raw: raw if not isinstance(raw, str) else literal_eval(raw))

# One row per (question, tag) pair, leaving out the 'huggingface' tag itself.
tags_exploded = df.explode('tags')
is_other_tag = tags_exploded['tags'] != 'huggingface'
tags_exploded = tags_exploded[is_other_tag]

# Tag usage counts, most frequent first, and each tag's share of all tag uses.
tag_counts = tags_exploded['tags'].value_counts()
total_tag_uses = tag_counts.sum()

tag_percentages = tag_counts / total_tag_uses * 100
tag_percentages = tag_percentages.round(2).astype(str) + '%'

tag_summary = pd.DataFrame({'Tag_Count': tag_counts, 'Tag_Percentage': tag_percentages})

# The ten most used tags, plus a single 'Other' row for everything after them.
top_10_tags = tag_summary.iloc[:10]

other_count = tag_counts.iloc[10:].sum()
other_percentage = other_count / total_tag_uses * 100

other_summary = pd.DataFrame(
    {'Tag_Count': [other_count], 'Tag_Percentage': [f'{other_percentage:.2f}%']},
    index=['Other'],
)

final_summary = pd.concat([top_10_tags, other_summary])

print(final_summary)


In [None]:
import pandas as pd
from ast import literal_eval

# Parse the tags column if it still holds text (no effect once it holds lists).
df['tags'] = df['tags'].apply(lambda x: literal_eval(x) if isinstance(x, str) else x)

# One row per (question, tag) pair, leaving out the 'huggingface' tag itself.
# The explicit .copy() makes the filtered frame independent of the exploded one,
# so adding the 'year' column below never relies on pandas' copy/view tracking.
tags_exploded = df.explode('tags')
tags_exploded = tags_exploded[tags_exploded['tags'] != 'huggingface'].copy()

tags_exploded['year'] = pd.to_datetime(tags_exploded['creation_date']).dt.year

# Rows = tags, columns = years, cells = number of uses of that tag in that year.
tag_counts_over_time = tags_exploded.groupby(['tags', 'year']).size().unstack(fill_value=0)

tag_counts = tag_counts_over_time.sum(axis=1)  # Sum across years for total count
tag_percentages = (tag_counts / tag_counts.sum()) * 100  # Percentage for each tag

# Year-over-year change per tag; the first year has no predecessor, so its
# difference is NaN and is skipped by mean().
yearly_diff = tag_counts_over_time.diff(axis=1)
average_yearly_increase = yearly_diff.mean(axis=1)

tag_summary = pd.DataFrame({
    'Tag_Count': tag_counts,
    'Tag_Percentage': tag_percentages.round(2).astype(str) + '%',
    'Average_Yearly_Increase': average_yearly_increase.round(2)
})

top_10_tags = tag_summary.nlargest(10, 'Tag_Count')

# Everything outside the top ten is folded into a single 'Other' row.
is_other = ~tag_counts.index.isin(top_10_tags.index)
other_count = tag_counts[is_other].sum()
other_percentage = (other_count / tag_counts.sum()) * 100
# Note: this is the mean of the per-tag averages, not the increase of the summed
# 'Other' count. Rounded to 2 decimals to match the top-10 rows.
other_avg_increase = round(average_yearly_increase[is_other].mean(), 2)

other_summary = pd.DataFrame({
    'Tag_Count': [other_count],
    'Tag_Percentage': [f'{other_percentage:.2f}%'],
    'Average_Yearly_Increase': [other_avg_increase]
}, index=['Other'])

final_summary = pd.concat([top_10_tags, other_summary])

print(final_summary)

In [None]:
# Count tag uses per (creation year, tag) and pivot to rows = years, columns = tags.
# assign() rebinds `tags_exploded` to a new frame with parsed dates instead of
# writing into a frame that was produced by filtering another one.
tags_exploded = tags_exploded.assign(
    creation_date=pd.to_datetime(tags_exploded["creation_date"], format='mixed', errors='coerce')
)
tag_counts_over_time = tags_exploded.groupby([tags_exploded['creation_date'].dt.year, 'tags']).size().unstack(fill_value=0)

# errors='coerce' turns unparseable dates into NaT, which makes .dt.year a float
# column containing NaN. groupby drops the NaN key, so the pivot index holds only
# real years; cast it back to int so ticks and labels read 2019, not 2019.0.
tag_counts_over_time.index = tag_counts_over_time.index.astype(int)

# Get the top N popular tags (adjust N as needed)
top_N_tags = tag_counts_over_time.sum().nlargest(10).index.tolist()
top_N_tags_data = tag_counts_over_time[top_N_tags]

# Set font sizes (rcParams are global and also affect later figures)
plt.rcParams.update({'font.size': 14})  # General font size
plt.rcParams.update({'axes.titlesize': 16})  # Title font size
plt.rcParams.update({'axes.labelsize': 14})  # Axis label font size
plt.rcParams.update({'xtick.labelsize': 16})  # X-tick label font size
plt.rcParams.update({'ytick.labelsize': 16})  # Y-tick label font size

# Plotting the time series graph for the top N popular tags
ax = top_N_tags_data.plot(kind='line', marker='o', figsize=(12, 8))

# One tick per plotted year. Taking the years from the pivot index (rather than
# from the raw column) keeps NaN out of the tick positions and labels.
specific_years = tag_counts_over_time.index.tolist()
ax.set_xticks(specific_years)
ax.set_xticklabels([str(year) for year in specific_years])

plt.xlabel('Year')
plt.ylabel('Tag Count')
plt.legend(title='Tags')
plt.grid(True)
plt.savefig('Growth_10 Popular_Tags.pdf')
plt.show()


In [None]:
# Import necessary libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Set a theme for aesthetics
sns.set_theme(style="whitegrid")

# `df` is re-read from the CSV in earlier cells and never receives a 'year'
# column (only `dataset` and `tags_exploded` do), so derive it here; without
# this the groupby below raises KeyError on a fresh Restart & Run All.
df['year'] = pd.to_datetime(df['creation_date']).dt.year

# Mark questions as either 'With Accepted Answer' or 'Without Accepted Answer'
df['acceptance_status'] = np.where(df['accepted_answer_id'].notnull(), 'With Accepted Answer', 'Without Accepted Answer')

# Group by year and acceptance status, count the number of questions.
# reindex guarantees both status columns exist (filled with 0) even if one
# status never occurs in the data, so the plotting loop cannot hit a KeyError.
status_columns = ['With Accepted Answer', 'Without Accepted Answer']
grouped = (
    df.groupby(['year', 'acceptance_status'])
    .size()
    .unstack(fill_value=0)
    .reindex(columns=status_columns, fill_value=0)
)
grouped['All Questions'] = grouped.sum(axis=1)
years = grouped.index.tolist()
bar_width = 0.25
y = np.arange(len(years))

# Define custom colors
colors = ["#A2D2FF", "#B2A4FF", "#FFC39E"]  # Adjust these to match the example color scheme

# Plotting the horizontal bars and adding values at the end of each bar
plt.figure(figsize=(6, 4.5))  # Compact size to fit in a single column
for i, (status, color) in enumerate(zip(status_columns + ['All Questions'], colors)):
    # Each status gets its own row of bars, shifted by one bar width per status.
    plt.barh(y + i * bar_width, grouped[status], color=color, height=bar_width, label=status, edgecolor='black')
    # Add text at the end of each bar (offset of 50 is in data units, i.e. questions)
    for j, value in enumerate(grouped[status]):
        plt.text(value + 50, y[j] + i * bar_width, f'{value:,}', va='center', fontsize=8, color='black')  # Comma separator for readability

# Customize labels and ticks; the year label sits on the middle bar of each group
plt.ylabel('Years', fontsize=10, fontweight='bold')
plt.xlabel('Number of Questions', fontsize=10, fontweight='bold')
plt.yticks(y + bar_width, years, fontsize=9)
plt.xticks(fontsize=9)
plt.title("Yearly Distribution of Questions with and without Accepted Answers", fontsize=12, fontweight='bold', pad=10)

# Place legend inside the bottom-right corner of the plot
plt.legend(title='Acceptance Status', title_fontsize=10, fontsize=9, loc='lower right', bbox_to_anchor=(0.98, 0.02), frameon=True, shadow=True, framealpha=0.8)

# Remove the top and right spines for a cleaner look
sns.despine()

# Add grid lines only for the x-axis
plt.grid(axis='x', linestyle='--', alpha=0.7)

plt.tight_layout()  # Adjust layout to fit everything nicely
plt.savefig('Nu_Q_W_WO_Answer_horizontal.pdf', bbox_inches='tight', dpi=300)
plt.show()
