# HackerNews Score Distribution Analysis

This notebook analyzes the distribution of HackerNews post scores using data from `summary-scores.csv`.

## Import Required Libraries

Import pandas for data manipulation, matplotlib.pyplot for visualization, and NumPy for array operations.

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

## Load the Dataset

Use pandas to load `analysis/summary-scores.csv` into a DataFrame, then inspect its shape, first rows, column info, and summary statistics.

In [None]:
# Load the summary-scores.csv file into a DataFrame.
# The path is relative to the directory the notebook kernel was started in.
df = pd.read_csv('analysis/summary-scores.csv')

# Display basic information about the dataset
print("Dataset shape:", df.shape)
print("\nFirst 10 rows:")
print(df.head(10))

print("\nDataset info:")
# DataFrame.info() writes its report to stdout itself and returns None,
# so wrapping it in print() would append a stray "None" line to the output.
df.info()

print("\nBasic statistics:")
print(df.describe())

## Visualize Data with a Histogram

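Use Matplotlib to create a histogram of post scores. The dataset is aggregated — each row holds a `score` and the `count` of posts with that score — so each score is repeated `count` times before plotting.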

In [None]:
# Create a histogram of scores weighted by count.
# The data is aggregated as (score, count) rows, so each score has to be
# repeated `count` times to recover one entry per post.

# np.repeat performs the expansion in a single vectorized call. A row-wise
# df.iterrows() loop is much slower and, because iterrows() upcasts every row
# to one common dtype, a float column would turn `count` into a float and make
# `[score] * count` raise TypeError. The counts are therefore cast to integers
# explicitly before repeating.
post_counts = df['count'].astype('int64').to_numpy()
scores_expanded = np.repeat(df['score'].to_numpy(), post_counts)

# Create the histogram
fig, ax = plt.subplots(figsize=(12, 8))
ax.hist(scores_expanded, bins=50, alpha=0.7, color='skyblue', edgecolor='black')
ax.set_xlabel('Score')
ax.set_ylabel('Frequency')
ax.set_title('Distribution of HackerNews Post Scores')
ax.grid(True, alpha=0.3)
plt.show()

# Summary statistics over the expanded (per-post) scores
print(f"Total number of posts: {len(scores_expanded):,}")
print(f"Score range: {scores_expanded.min()} to {scores_expanded.max()}")
print(f"Mean score: {scores_expanded.mean():.2f}")
print(f"Median score: {np.median(scores_expanded):.2f}")

In [None]:
# Same per-post scores as before, drawn with a logarithmic y-axis so the
# shape of the distribution is easier to see.
fig_log, ax_log = plt.subplots(figsize=(12, 8))
ax_log.hist(scores_expanded, bins=100, alpha=0.7, color='lightcoral', edgecolor='black')
ax_log.set_xlabel('Score')
ax_log.set_ylabel('Frequency (Log Scale)')
ax_log.set_yscale('log')
ax_log.set_title('Distribution of HackerNews Post Scores (Log Scale)')
ax_log.grid(True, alpha=0.3)
plt.show()

# Zoom in on scores of at most 100 for a more detailed view
at_most_100 = scores_expanded <= 100
scores_filtered = scores_expanded[at_most_100]

fig_zoom, ax_zoom = plt.subplots(figsize=(12, 8))
ax_zoom.hist(scores_filtered, bins=50, alpha=0.7, color='lightgreen', edgecolor='black')
ax_zoom.set_xlabel('Score')
ax_zoom.set_ylabel('Frequency')
ax_zoom.set_title('Distribution of HackerNews Post Scores (Scores ≤ 100)')
ax_zoom.grid(True, alpha=0.3)
plt.show()

# Share of all posts that fall inside the zoomed range, as a percentage
filtered_pct = len(scores_filtered) / len(scores_expanded) * 100
print(f"Posts with score ≤ 100: {len(scores_filtered):,} ({filtered_pct:.1f}%)")