In [None]:
# Third take on monthly counts, building from daily totals
# https://chatgpt.com/share/684ad316-85dc-800c-a825-6d7e68143383 - basic setup and structure
# https://chatgpt.com/share/684f1411-b5c8-800c-ba94-41927d889226



In [None]:
import pandas as pd

In [None]:
# Load the two input tables (paths are relative to the working directory).
# df: later cells read its 'pub_date', 'is_H' and '*_count' topic columns
# (presumably one row per topic-relevant article — confirm against the CSV).
df = pd.read_csv('data/new_KW_count.csv')
# daily_count: later cells read its 'pub_date' and 'count' columns.
daily_count = pd.read_csv('data/counts_by_day.csv')

In [None]:
import pandas as pd

# Step 1: Aggregate the overall corpus to months.
# Reads daily_count['pub_date'] and daily_count['count']. Dates that fail to
# parse become NaT and are left out of the groupby below.
daily_count['pub_date'] = pd.to_datetime(daily_count['pub_date'], errors='coerce')
daily_count['month'] = daily_count['pub_date'].dt.to_period('M').dt.to_timestamp()
monthly_total = daily_count.groupby('month')['count'].sum()

# Step 2: Aggregate the subset to months (month_count = rows of df per month).
# Reads df['pub_date'].
df['pub_date'] = pd.to_datetime(df['pub_date'], errors='coerce')
df['month'] = df['pub_date'].dt.to_period('M').dt.to_timestamp()
month_count = df.groupby('month').size()

# Step 3: Align both series on one gap-free month-start index.
# Months missing from either series are filled with 0.
all_months = pd.date_range(
    start=min(monthly_total.index.min(), month_count.index.min()),
    end=max(monthly_total.index.max(), month_count.index.max()),
    freq='MS'
)
monthly_total = monthly_total.reindex(all_months, fill_value=0)
month_count = month_count.reindex(all_months, fill_value=0)

# Build a single DataFrame indexed by month.
monthly_df = pd.DataFrame({
    'monthly_total': monthly_total,
    'month_count':   month_count
}, index=all_months)

# Compute the proportion. A zero total is masked to NaN first: dividing by a
# literal 0 would give inf (not NaN) whenever month_count > 0, and inf is not
# removed by a later fillna()/dropna().
nonzero_total = monthly_df['monthly_total'].where(monthly_df['monthly_total'] != 0)
monthly_df['proportion'] = monthly_df['month_count'] / nonzero_total

# Quick look
print(monthly_df.head())


In [None]:
# Treat months with an undefined proportion as zero coverage and give the
# column its final name, 'space_proportion', in one chained pass.
monthly_df = (
    monthly_df
      .assign(proportion=monthly_df['proportion'].fillna(0))
      .rename(columns={'proportion': 'space_proportion'})
)

# Preview the three columns used downstream.
preview_cols = ['monthly_total', 'month_count', 'space_proportion']
print(monthly_df.loc[:, preview_cols].head())


In [None]:
import numpy as np
import statsmodels.api as sm

# Linear time trend: regress space_proportion on a running month counter.

# Move the month index into a regular 'month' column ...
monthly_df = monthly_df.reset_index()
monthly_df = monthly_df.rename(columns={'index': 'month'})
# ... and number the rows 0, 1, 2, ... to serve as the trend regressor.
monthly_df = monthly_df.assign(month_num=np.arange(len(monthly_df)))

# Keep only rows with a defined response.
# NOTE(review): if NaNs were already filled with 0 upstream, this filter removes
# nothing and zero-total months enter the fit as 0 — confirm that is intended.
data = monthly_df[monthly_df['space_proportion'].notna()]

# Response and design matrix (intercept + month_num).
y = data['space_proportion']
X = sm.add_constant(data['month_num'])

# Fit and report; the month_num coefficient is the per-month slope.
model = sm.OLS(y, X).fit()
print(model.summary())


In [None]:
# Print df's columns, dtypes and non-null counts as a quick structural check.
df.info()

In [None]:
# Load the events table; later cells read its 'clean_date', 'clean_date_dt',
# 'event_month' and 'event_key' columns.
events_df=pd.read_csv('data/new_astro_df.csv')

In [None]:
# Drop the parsed-datetime column so only the original date strings remain.
# errors='ignore' makes the cell safe to re-run (and safe on a CSV that lacks
# the column): without it, drop() raises KeyError when 'clean_date_dt' is absent.
events_df = events_df.drop(columns=['clean_date_dt'], errors='ignore')

# Quick check of the remaining columns.
print(events_df.columns)


In [None]:
import pandas as pd

# Normalise events_df['clean_date'] to uniform 'YYYY-MM-DD' strings.

# Literal '?' separators become '-' (regex=False: '?' is not a pattern here).
events_df['clean_date'] = events_df['clean_date'].str.replace('?', '-', regex=False)

# Parse the whole column once per supported layout; values that do not match a
# layout come back as NaT for that attempt.
parsed_iso, parsed_slash = (
    pd.to_datetime(events_df['clean_date'], format=layout, errors='coerce')
    for layout in ('%Y-%m-%d', '%m/%d/%Y')
)

# Prefer the ISO result and fall back to the slash result where ISO failed.
parsed_all = parsed_iso.where(parsed_iso.notna(), parsed_slash)

# Report the distinct raw values that neither layout could parse.
still_bad = events_df['clean_date'][parsed_all.isna()].unique()
if len(still_bad) == 0:
    print("✅ All dates parsed successfully!")
else:
    print("⚠️ These still failed to parse:", still_bad)

# Write the parsed dates back as ISO strings (unparsed rows become missing).
events_df['clean_date'] = parsed_all.dt.strftime('%Y-%m-%d')


In [None]:
# Diagnostic: which day-of-month values occur in events_df['event_month']?
em = pd.to_datetime(events_df['event_month'], errors='coerce')
day_of_month = em.dt.day

# Frequency table of the day component.
print(day_of_month.value_counts())

# Rows whose day is anything other than 1. Values that failed to parse have no
# day at all, so they are listed here as well.
bad = events_df.loc[day_of_month.ne(1), ['clean_date', 'event_month']]
print("\nThese rows are not on the 1st of the month:\n", bad)


In [None]:
import pandas as pd

# Rebuild event_month from clean_date so that every value is a month start.

# Parse the cleaned date strings; unparseable values become NaT.
clean_dt = pd.to_datetime(events_df['clean_date'], errors='coerce')
events_df['clean_date_dt'] = clean_dt

# Round-trip through a monthly Period to snap each date to the 1st of its month.
events_df['event_month'] = clean_dt.dt.to_period('M').dt.to_timestamp()

# Verify: the day component should now only ever be 1.
day_counts = events_df['event_month'].dt.day.value_counts()
print("Days in event_month:", day_counts.to_dict())

# List any rows that still do not fall on the 1st (NaT rows count as such).
not_first = events_df['event_month'].dt.day.ne(1)
bad = events_df.loc[not_first, ['clean_date', 'event_month']]
if bad.empty:
    print("All event_month values are now the first of the month.")
else:
    print("Still off-1 rows:")
    print(bad)


In [None]:
import pandas as pd
import numpy as np
import statsmodels.api as sm
import matplotlib.pyplot as plt

# Yearly topic proportions vs. event-year dummies: difference in means, OLS per
# event key, and one example plot.

# --- 1) Parse dates and keep only rows dated 1850-01-01 or later
#        (rows whose date failed to parse are dropped by the comparison).
df['pub_date'] = pd.to_datetime(df['pub_date'], errors='coerce')
df = df.loc[df['pub_date'] >= '1850-01-01'].copy()

events_df['clean_date_dt'] = pd.to_datetime(events_df['clean_date'], errors='coerce')
events_df = events_df.loc[events_df['clean_date_dt'] >= '1850-01-01'].copy()

# --- 2) Calendar year of each article / event
df['year'] = df['pub_date'].dt.year
events_df['year'] = events_df['clean_date_dt'].dt.year

# --- 3) Per-year row counts and per-year sums of each topic column
topic_cols = [
    'moon_count', 'mars_count', 'jupiter_count', 'venus_count',
    'planet_count', 'comet_count', 'eclipse_count',
    'mercury_count', 'saturn_count', 'neptune_count',
    'uranus_count', 'satellite_count', 'rocket_count',
]
articles_by_year = df.groupby('year')
yearly_total = articles_by_year.size()
yearly_topics = articles_by_year[topic_cols].sum()

# --- 4) Topic sum divided by the number of rows in that year
yearly_props = yearly_topics.div(yearly_total, axis=0)

# --- 5) year x event_key table: 1 if at least one event of that key, else 0
evt_year_counts = events_df.groupby(['year', 'event_key']).size().unstack(fill_value=0)
evt_year_dummy = evt_year_counts.gt(0).astype(int)

# --- 6) Attach the dummies to the proportions; years without any event row
#        get 0 for every dummy.
yearly_analysis = pd.merge(
    yearly_props, evt_year_dummy,
    left_index=True, right_index=True, how='left',
).fillna(0)


def _matching_prop_col(key):
    """Return the first yearly_props column containing key.lower(), or None."""
    return next((c for c in yearly_props.columns if key.lower() in c), None)


# --- 7) Mean proportion in event years minus mean in non-event years
print("=== Mean Δ in Proportion (Event‐Year vs Non‐Event‐Year) ===")
for key in evt_year_dummy.columns:
    prop_col = _matching_prop_col(key)
    if not prop_col:
        continue  # this event key has no topic column to compare against
    proportions = yearly_analysis[prop_col]
    event_flag = yearly_analysis[key]
    with_ev = proportions[event_flag == 1].mean()
    without_ev = proportions[event_flag == 0].mean()
    print(f"{key:15s} Δ = {with_ev - without_ev:+.4f} in {prop_col}")

# --- 8) One OLS fit per event key: topic proportion on intercept + dummy
print("\n=== OLS: Proportion ~ Event‐Year Dummy ===")
for key in evt_year_dummy.columns:
    prop_col = _matching_prop_col(key)
    if not prop_col:
        continue
    y = yearly_analysis[prop_col]
    X = sm.add_constant(yearly_analysis[key])
    res = sm.OLS(y, X).fit()
    print(f"\n--- {prop_col} ~ {key} ---")
    print(res.summary())

# --- 9) Example plot for a single event key.
#        NOTE(review): assumes an event_key named 'Mars' exists — confirm.
example_key = 'Mars'
prop_col = next(c for c in yearly_props.columns if 'mars' in c)
fig, ax = plt.subplots(figsize=(8, 4))
ax.plot(yearly_analysis.index, yearly_analysis[prop_col], marker='o', label=prop_col)
event_years = yearly_analysis.index[yearly_analysis[example_key] == 1]
for yr in event_years:
    ax.axvline(yr, color='gray', alpha=0.3)
ax.set(
    title=f"{prop_col} with '{example_key}' event‐years highlighted",
    xlabel='Year',
    ylabel='Proportion',
)
ax.legend()
plt.show()


In [None]:
import pandas as pd
import matplotlib.pyplot as plt

# Yearly space_proportion and its strict local maxima.

# Parse dates and keep 1850 onward (unparseable dates drop out here).
df['pub_date'] = pd.to_datetime(df['pub_date'], errors='coerce')
df = df.loc[df['pub_date'].dt.year >= 1850].copy()

# Per-year mean of the is_H column (presumably a 0/1 flag, which would make the
# mean a share — confirm against the data).
pub_year = df['pub_date'].dt.year
yearly_space = (
    df['is_H']
      .groupby(pub_year)
      .mean()
      .rename('space_proportion')
      .rename_axis('year')
)

# A peak is strictly above both neighbouring rows. shift() is positional, so
# "neighbour" means adjacent row; this equals adjacent year only if no year is
# missing from the index. The first and last rows can never qualify.
above_previous = yearly_space.gt(yearly_space.shift(1))
above_next = yearly_space.gt(yearly_space.shift(-1))
peak_mask = above_previous & above_next
annual_peaks = yearly_space.loc[peak_mask]

# Report the peak years and their values.
print("Annual space_proportion peaks:")
print(annual_peaks)

# Full series with the peaks marked in red.
fig, ax = plt.subplots(figsize=(8, 4))
ax.plot(yearly_space.index, yearly_space.values, marker='o', label='space_proportion')
ax.scatter(annual_peaks.index, annual_peaks.values, color='red', zorder=5, label='peaks')
ax.set(
    xlabel='Year',
    ylabel='Space Proportion',
    title='Yearly Space Proportion with Local Peaks',
)
ax.legend()
plt.show()


In [None]:
import pandas as pd

# Annual "orphan" peaks: local maxima of a topic's yearly proportion in years
# where the matching event_key has no event.

# 1) Map each event_key to the first yearly_props column whose name contains
#    the lower-cased key; keys without a match are skipped.
mapping = {}
for key in evt_year_dummy.columns:
    matches = [c for c in yearly_props.columns if key.lower() in c]
    if matches:
        mapping[key] = matches[0]

# 2) Scan each mapped topic for local-max years with no event
orphan_list = []
for key, prop_col in mapping.items():
    s = yearly_props[prop_col]
    # strict local peaks: greater than both the previous and the next row
    peak_mask = (s > s.shift(1)) & (s > s.shift(-1))
    peaks = s[peak_mask].dropna()

    # align the event dummy to the peak years; years absent from the event
    # table count as "no event"
    ev = evt_year_dummy[key].reindex(peaks.index, fill_value=0)

    # keep only the peaks where ev == 0 (no event that year)
    orphan_years = peaks[ev == 0]
    orphan_list.extend(
        {'event_key': key, 'peak_year': year, 'proportion': val}
        for year, val in orphan_years.items()
    )

# 3) Build a DataFrame and sort. The columns are passed explicitly so an empty
#    result still has them; otherwise sort_values() raises KeyError when no
#    orphan peaks are found.
orphan_peaks_annual = (
    pd.DataFrame(orphan_list, columns=['event_key', 'peak_year', 'proportion'])
      .sort_values(['event_key', 'peak_year'])
      .reset_index(drop=True)
)

# 4) Inspect
print(orphan_peaks_annual)


In [None]:
# Keep only the orphan peaks at or above the 75th percentile of 'proportion'.
threshold = orphan_peaks_annual['proportion'].quantile(0.75)
is_strong = orphan_peaks_annual['proportion'].ge(threshold)
strong_orphans = orphan_peaks_annual.loc[is_strong]


In [None]:
import matplotlib.pyplot as plt

# One figure per event key: the yearly proportion line, grey markers for event
# years, and red X markers for the strong orphan peaks of that key.
for key in evt_year_dummy.columns:
    # First proportion column whose name contains the lower-cased key
    # (e.g. 'Mars' -> 'mars_count'); keys with no such column are skipped.
    candidates = [c for c in yearly_props.columns if key.lower() in c]
    if not candidates:
        continue
    prop_col = candidates[0]

    fig, ax = plt.subplots(figsize=(8, 4))
    years = yearly_analysis.index

    # Yearly proportion series
    ax.plot(years, yearly_analysis[prop_col], marker='o', label=prop_col)

    # A faint vertical line for every year flagged with this event
    for yr in years[yearly_analysis[key] == 1]:
        ax.axvline(yr, color='gray', alpha=0.3)

    # Strong orphan peaks belonging to this key, each labelled with its year
    peaks = strong_orphans[strong_orphans['event_key'] == key]
    if not peaks.empty:
        ax.scatter(peaks['peak_year'], peaks['proportion'],
                   color='red', marker='X', s=100, label='Orphan peaks')
        for peak_year, proportion in zip(peaks['peak_year'], peaks['proportion']):
            ax.text(peak_year, proportion, str(int(peak_year)),
                    color='red', fontsize=9, ha='left', va='bottom')

    # Titles, axis labels, legend
    ax.set(
        title=f"{prop_col} with '{key}' Events & Orphan Peaks",
        xlabel='Year',
        ylabel='Proportion',
    )
    ax.legend(loc='upper left')
    plt.tight_layout()
    plt.show()


In [None]:
import pandas as pd

# Month x event_key indicator table (evt_dummy) for the monthly analyses.

# Parse event_month to datetimes; unparseable values become NaT.
event_month_parsed = pd.to_datetime(events_df['event_month'], errors='coerce')
events_df['event_month_dt'] = event_month_parsed

# Snap every date to the first day of its month.
events_df['month'] = event_month_parsed.dt.to_period('M').dt.to_timestamp()

# Count events per (month, event_key) and spread the keys into columns;
# combinations that never occur are 0.
events_per_month_key = events_df.groupby(['month', 'event_key']).size()
evt_counts = events_per_month_key.unstack(fill_value=0)

# Collapse counts to an indicator: 1 if at least one event, else 0.
evt_dummy = evt_counts.gt(0).astype(int)

# evt_dummy is the monthly counterpart of the yearly event dummies.


In [None]:
import pandas as pd

# Monthly topic proportions: per-month topic sums divided by per-month row counts.

# Parse article dates and snap each to the first day of its month.
df['pub_date'] = pd.to_datetime(df['pub_date'], errors='coerce')
df['month'] = df['pub_date'].dt.to_period('M').dt.to_timestamp()

# Topic count columns to convert into proportions.
topic_cols = [
    'moon_count', 'mars_count', 'jupiter_count', 'venus_count',
    'planet_count', 'comet_count', 'eclipse_count',
    'mercury_count', 'saturn_count', 'neptune_count',
    'uranus_count', 'satellite_count', 'rocket_count',
]

# Rows per month, and the sum of each topic column per month.
articles_by_month = df.groupby('month')
monthly_total = articles_by_month.size()
monthly_topics = articles_by_month[topic_cols].sum()

# Re-align both onto a gap-free month-start index; months with no rows get 0.
all_months = pd.date_range(
    monthly_total.index.min(),
    monthly_total.index.max(),
    freq='MS',
)
monthly_total = monthly_total.reindex(all_months, fill_value=0)
monthly_topics = monthly_topics.reindex(all_months, fill_value=0)

# Proportion = topic sum / row count. Filled-in months are 0/0, i.e. NaN.
monthly_props = monthly_topics.div(monthly_total, axis=0)

# monthly_props now has one row per month and one column per topic.


In [None]:
import pandas as pd

# Monthly orphan peaks: local maxima of a topic's monthly proportion in months
# where no event of ANY key occurred. Requires monthly_props and evt_dummy.


def _strict_local_peaks(series):
    """Return the entries of series strictly greater than both adjacent rows."""
    above_previous = series > series.shift(1)
    above_next = series > series.shift(-1)
    return series[above_previous & above_next].dropna()


# True for months in which at least one event (of any key) is recorded.
any_event_months = evt_dummy.sum(axis=1).gt(0)

# 1) Collect the orphan peaks topic by topic
monthly_orphans = []
for prop_col in monthly_props.columns:
    peaks = _strict_local_peaks(monthly_props[prop_col])
    # Months absent from the event table count as "no event".
    has_event = any_event_months.reindex(peaks.index, fill_value=False)
    orphan_months = peaks[~has_event]
    monthly_orphans.extend(
        {'topic': prop_col, 'peak_month': month, 'proportion': val}
        for month, val in orphan_months.items()
    )

# 2) Build the DataFrame
monthly_orphans_df = pd.DataFrame(monthly_orphans)

# 3) Show the sorted result, or explain why there is none
if not monthly_orphans_df.empty:
    monthly_orphans_df = (
        monthly_orphans_df
          .sort_values(['topic', 'peak_month'])
          .reset_index(drop=True)
    )
    print(monthly_orphans_df)
else:
    print("🔍 No monthly orphan peaks found — every local peak coincides with at least one event.")
    # For reference, list every local peak regardless of events.
    print("\nAll local peaks by topic (for reference):")
    for prop_col in monthly_props.columns:
        peaks = _strict_local_peaks(monthly_props[prop_col])
        print(f" • {prop_col}: {peaks.index.tolist()}")


In [None]:
# 1) Per-topic summary of the monthly orphan peaks: how many there are and the
#    largest proportion among them, sorted by that maximum (descending).
#    An empty monthly_orphans_df (built from an empty list) has no 'topic'
#    column, so groupby('topic') would raise KeyError; return an empty summary
#    with the expected columns instead.
if monthly_orphans_df.empty:
    summary = pd.DataFrame(columns=['topic', 'orphan_count', 'max_proportion'])
else:
    summary = (
        monthly_orphans_df
          .groupby('topic')
          .agg(
              orphan_count     = ('peak_month', 'count'),
              max_proportion   = ('proportion', 'max')
          )
          .reset_index()
          .sort_values(by='max_proportion', ascending=False)
    )

# 2) Show the result
print(summary)
