# Year of creation

This notebook plots a histogram of the number of repositories created per year, based on the `repo_created_at` field of the most recent code-search results.

In [None]:
import glob
import json
import matplotlib.pyplot as plt
import os
import pandas as pd


In [None]:
# Locate the most recently modified code_search_* entry under data/processed
# (relative to the current working directory) and build the path of the
# JSONL file inside its "results" sub-directory.
PROCESSED_DATA_DIR = os.path.join(os.getcwd(), "..", "data", "processed")
latest_dir_pattern = os.path.join(PROCESSED_DATA_DIR, "code_search_*")
candidate_dirs = glob.glob(latest_dir_pattern)

# "Latest" is decided by filesystem modification time, not by the name suffix.
latest_dir = max(candidate_dirs, key=os.path.getmtime, default=None)

# Fail early when there is nothing to work with.
if not latest_dir:
    raise FileNotFoundError("No matching code_search_YYYYMMDD_hhmmss directory found.")

# The results directory is created if it does not exist yet.
RESULTS_DIR = os.path.join(latest_dir, "results")
os.makedirs(RESULTS_DIR, exist_ok=True)

INPUT_FILENAME = os.path.join(RESULTS_DIR, "aws_provider_repos.jsonl")

In [None]:

# Load the repository records from the JSONL file (one JSON object per line).
data = []
# Decode explicitly as UTF-8: without an encoding argument open() uses the
# platform's locale encoding, which can fail or mis-decode non-ASCII text.
with open(INPUT_FILENAME, 'r', encoding='utf-8') as f:
    for line_number, line in enumerate(f, start=1):
        # Blank lines (e.g. a trailing newline at the end of the file) are
        # not records; skip them instead of reporting a decode error.
        if not line.strip():
            continue
        try:
            data.append(json.loads(line))
        except json.JSONDecodeError as e:
            # Malformed lines are reported and skipped so that one bad record
            # does not abort the whole load; the line number locates it.
            print(f"Error decoding JSON on line {line_number}: {e}")

# Convert the list of parsed records to a DataFrame
df = pd.DataFrame(data)

print("Data loaded successfully!")
# Last expression of the cell: shows the first rows in the notebook output
df.head()

In [None]:
# Parse the 'repo_created_at' strings into datetimes and promote that column
# to the DataFrame index, so the rows can be resampled by time later on.
created_at = pd.to_datetime(df['repo_created_at'])
df = df.assign(repo_created_at=created_at).set_index('repo_created_at')

print("Converted 'repo_created_at' to datetime and set as index.")
# Summary of the resulting index and column dtypes
df.info()

In [None]:
# Count the repositories created in each calendar year.
# 'YE' is the year-end frequency alias: every bin is labelled with the last
# day of its year, and years without any repository get a count of 0.
yearly_counts = df.resample('YE').size()

# Replace the year-end timestamps by the plain year number for cleaner
# labels, and name the index accordingly.
yearly_counts.index = yearly_counts.index.year.rename('Year')

print("Repository Counts per Year:")
print(yearly_counts)

In [None]:
# Select the plot style and create a 10x6 inch figure with a single axes
plt.style.use('seaborn-v0_8-whitegrid')
fig, ax = plt.subplots(figsize=(10, 6))

# Draw one bar per year on that axes
yearly_counts.plot(kind='bar', color='#e74c3c', width=0.6, ax=ax)

# Title and axis labels
ax.set_title('Number of Repositories Created Per Year', fontsize=16, weight='bold')
ax.set_xlabel('Year', fontsize=12)
ax.set_ylabel('Number of Repositories', fontsize=12)

# Keep the year labels horizontal
plt.setp(ax.get_xticklabels(), rotation=0)

# Adjust the layout so that labels and title are not clipped
fig.tight_layout()

# Write the figure to a PDF before displaying it; the output directory is
# created on demand, and bbox_inches='tight' crops the saved figure to a
# tight bounding box.
os.makedirs('../paper/figs', exist_ok=True)
fig.savefig('../paper/figs/yearly_histogram.pdf', bbox_inches='tight')

# Display the plot
plt.show()

print("Histogram has been exported as 'yearly_histogram.pdf'")