In [1]:
import requests
import pandas as pd
import re

Get data from the last 50 climatological reports that describe "yesterday", keeping only the latest report for each date.

In [2]:
import requests
import pandas as pd
import re

def get_text(v):
    """Download one version of the LAX climate (CLI) text product.

    Parameters
    ----------
    v : int or str
        Value substituted into the ``version`` query parameter.

    Returns
    -------
    str
        Body of the HTTP response as text.

    Raises
    ------
    requests.HTTPError
        When the server answers with an error status.
    """
    endpoint = f"https://forecast.weather.gov/product.php?site=LOX&issuedby=LAX&product=CLI&format=TXT&version={v}&glossary=0"
    response = requests.get(endpoint, timeout=30)
    # Fail loudly on 4xx/5xx instead of handing an error page to the parser.
    response.raise_for_status()
    return response.text

def normalize_cli_text(text: str) -> str:
    """Canonicalise the whitespace of a CLI report so the regexes match reliably.

    Steps, applied in order:
      1. convert ``\\r\\n`` and bare ``\\r`` line endings to ``\\n``;
      2. squeeze every run of spaces/tabs into one space;
      3. remove a space/tab run sitting right before a newline;
      4. squeeze two or more consecutive newlines into exactly two;
      5. strip leading and trailing whitespace from the whole text.
    """
    substitutions = (
        (r"\r\n?", "\n"),
        (r"[ \t]+", " "),
        (r"[ \t]+\n", "\n"),
        (r"\n{2,}", "\n\n"),
    )
    cleaned = text
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)
    return cleaned.strip()

def extract_cli_yesterday(version = 50):
    """Collect the "yesterday" observations from recent CLI report versions.

    Product versions ``1`` through ``version`` (inclusive) are downloaded with
    ``get_text`` and cleaned with ``normalize_cli_text``. A version whose text
    never mentions ``YESTERDAY`` is skipped. For the others, one value per
    field is extracted with the regular expressions below.

    Parameters
    ----------
    version : int
        Highest product version to request. The scan starts at version 1.

    Returns
    -------
    pandas.DataFrame
        Columns ``DATE`` (datetime) and ``TMAX``, ``TMIN``, ``PRCP``, ``AWND``,
        ``WDF2``, ``WSF2`` (numeric, NaN where the pattern did not match), with
        one row per distinct ``DATE``. When several versions share a date, the
        one scanned first (lowest version number) is kept. If no version
        matches, an empty frame with these columns is returned.
    """
    patterns = {
        "DATE": r"CA CLIMATE SUMMARY FOR (\w+ \d{1,2} \d{4})",
        "TMAX": r"YESTERDAY\s+MAXIMUM\s+(\d+)",
        "TMIN": r"MINIMUM\s+(\d+)",
        "PRCP": r"PRECIPITATION\s*\(IN\)\s*YESTERDAY\s+([0-9]+(?:\.[0-9]+)?)",
        "AWND": r"AVERAGE WIND SPEED\s+([\d.]+)",
        "WDF2": r"HIGHEST WIND DIRECTION\s+\w+\s+\((\d+)\)",
        "WSF2": r"HIGHEST WIND SPEED\s+(\d+)"
    }
    numeric_cols = ["TMAX", "TMIN", "WSF2", "WDF2", "PRCP", "AWND"]

    rows = []
    # `version + 1` so the last requested version is included; the previous
    # `range(1, version)` silently stopped one report short.
    for n in range(1, version + 1):
        text = normalize_cli_text(get_text(n))

        # Skip products whose text does not contain a YESTERDAY section.
        if re.search(r"YESTERDAY", text) is None:
            print(f"no report available for version {n}")
            continue
        print("downloading report")

        out = {}
        for k, p in patterns.items():
            m = re.search(p, text, flags=re.MULTILINE | re.DOTALL)
            out[k] = m.group(1) if m else None
        rows.append(out)

    # Naming the columns keeps the schema intact when `rows` is empty, so the
    # conversions below do not fail with a KeyError.
    df = pd.DataFrame(rows, columns=list(patterns))
    df["DATE"] = pd.to_datetime(df["DATE"], format = "mixed")
    for c in numeric_cols:
        df[c] = pd.to_numeric(df[c], errors="coerce")
    return df.drop_duplicates(subset = ["DATE"], keep = "first")

#changed version for testing
def extract_cli_today(version = 1):
    """Extract the "today" observations from a single CLI report version.

    Parameters
    ----------
    version : int
        Product version passed to ``get_text``.

    Returns
    -------
    pandas.DataFrame or None
        A one-row frame with ``DATE`` (datetime, NaT if unparsable) and the
        numeric fields ``TMAX``, ``TMIN``, ``PRCP``, ``AWND``, ``WDF2``,
        ``WSF2``. ``None`` when the report has no ``TODAY`` marker after the
        ``TEMPERATURE (F)`` header.
    """
    field_patterns = {
        "DATE": r"CA CLIMATE SUMMARY FOR (\w+ \d{1,2} \d{4})",
        "TMAX": r"TODAY\s+MAXIMUM\s+(\d+)",
        "TMIN": r"MINIMUM\s+(\d+)",
        "PRCP": r"PRECIPITATION\s*\(IN\)\s*TODAY\s+([0-9]+(?:\.[0-9]+)?)",
        "AWND": r"AVERAGE WIND SPEED\s+([\d.]+)",
        "WDF2": r"HIGHEST WIND DIRECTION\s+\w+\s+\((\d+)\)",
        "WSF2": r"HIGHEST WIND SPEED\s+(\d+)",
    }
    numeric_fields = ["TMAX", "TMIN", "WSF2", "WDF2", "PRCP", "AWND"]
    search_flags = re.IGNORECASE | re.MULTILINE | re.DOTALL

    report = normalize_cli_text(get_text(version))

    # Guard clause: only reports with a TODAY block under the temperature
    # header are usable.
    if re.search(r"TEMPERATURE\s*\(F\)[\s\S]*?\bTODAY\b", report) is None:
        print("no report available for today")
        return None
    print("downloading report")

    record = {}
    for field, pattern in field_patterns.items():
        hit = re.search(pattern, report, flags=search_flags)
        record[field] = None if hit is None else hit.group(1)

    frame = pd.DataFrame([record])
    frame["DATE"] = pd.to_datetime(frame["DATE"], errors="coerce")
    for field in numeric_fields:
        frame[field] = pd.to_numeric(frame[field], errors="coerce")
    return frame

Extract the forecasted maximum temperature for tomorrow.

In [3]:
import requests
import datetime as dt

#changed date for testing: subtracted 1 day from today
def get_forecast():
    """Fetch the hourly forecast and return tomorrow's maximum temperature.

    The hourly periods are read from the api.weather.gov gridpoint endpoint.
    The maximum ``temperature`` is taken over the periods whose ``startTime``
    date prefix equals tomorrow's date.

    Returns
    -------
    pandas.DataFrame
        One row with ``DATE`` (today's date as a datetime, so the value can be
        joined onto today's observation row) and ``forecasted_TMAX``. When no
        period falls on tomorrow, ``forecasted_TMAX`` is NaN rather than a
        numeric sentinel, so a missing forecast cannot be mistaken for a real
        temperature downstream.

    Raises
    ------
    requests.HTTPError
        When the API answers with an error status.
    """
    print("fetching forecast")
    HEADERS = {"User-Agent": "giulio"}
    URL = "https://api.weather.gov/gridpoints/LOX/149,41/forecast/hourly"

    # Read the clock once so `today` and `tomorrow` cannot straddle midnight.
    current_day = dt.date.today()
    today = current_day.isoformat()
    tomorrow = (current_day + dt.timedelta(days=1)).isoformat()

    # Timeout and status check mirror get_text: never hang, never parse an
    # error payload as a forecast.
    r = requests.get(URL, headers=HEADERS, timeout=30)
    r.raise_for_status()
    periods = r.json()["properties"]["periods"]

    # The first 10 characters of startTime are the YYYY-MM-DD date.
    temps = [p["temperature"] for p in periods if p["startTime"][:10] == tomorrow]
    if temps:
        tmax = max(temps)
        print("forecast downloaded")
    else:
        tmax = float("nan")
        print("cannot fetch forecast")

    max_temp_today = {"DATE": today, "forecasted_TMAX": tmax}
    df = pd.DataFrame([max_temp_today])
    df["DATE"] = pd.to_datetime(df["DATE"], format = "mixed")
    df["forecasted_TMAX"] = pd.to_numeric(df["forecasted_TMAX"], errors = "coerce")
    return df

In [4]:
def merge_data():
    """Assemble the inference table from observations and the forecast.

    The "yesterday" history and the "today" row are stacked, a ``year`` column
    is derived from ``DATE``, and the forecast is left-joined on ``DATE``.
    The result is sorted newest first; its first eight rows are printed.

    Returns
    -------
    pandas.DataFrame
        The merged frame, sorted by ``DATE`` descending.
    """
    history = extract_cli_yesterday()
    current = extract_cli_today()
    forecast = get_forecast()

    observations = pd.concat([history, current], axis=0)
    observations["year"] = observations["DATE"].dt.year

    combined = pd.merge(observations, forecast, how = "left", on = "DATE")
    combined = combined.sort_values("DATE", ascending=False)
    print(combined.head(8))
    return combined

In [5]:
import numpy as np
def feature_engineering(df):
    """Build the single-row feature frame for the most recent date.

    The input is ordered chronologically, gaps in the base columns are
    forward-filled, and calendar, wind-direction, lag and rolling-mean
    features are added. Only the row with the latest ``DATE`` is returned,
    with ``DATE`` itself dropped.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``DATE`` (datetime), ``TMAX``, ``TMIN``, ``PRCP``,
        ``AWND``, ``WSF2``, ``WDF2`` and ``forecasted_TMAX``.

    Returns
    -------
    pandas.DataFrame
        One row; original non-date columns followed by the derived features.
    """
    print("starting feature engineering")
    frame = df.sort_values("DATE").reset_index(drop=True)

    base_cols = ["DATE", "TMAX", "TMIN", "PRCP", "AWND", "WSF2", "WDF2", "forecasted_TMAX"]
    frame[base_cols] = frame[base_cols].ffill()

    # Calendar features, with day-of-year also encoded on the unit circle.
    when = frame["DATE"].dt
    frame["doy"] = when.dayofyear
    frame["dow"] = when.dayofweek
    frame["month"] = when.month
    frame["doy_sin"] = np.sin(2*np.pi*frame["doy"]/365.25)
    frame["doy_cos"] = np.cos(2*np.pi*frame["doy"]/365.25)

    frame["diurnal_range"] = frame["TMAX"] - frame["TMIN"]
    # Wind direction in degrees -> sine/cosine pair.
    heading = np.deg2rad(frame["WDF2"])
    frame["wind_dir_sin"] = np.sin(heading)
    frame["wind_dir_cos"] = np.cos(heading)

    # Lagged temperatures (1, 2, 3 and 7 rows back).
    for c in ("TMAX", "TMIN", "diurnal_range"):
        for k in (1, 2, 3, 7):
            frame[f"{c}_lag{k}"] = frame[c].shift(k)
    # Lagged precipitation (1 to 4 rows back); note the different name format.
    for k in range(1, 5):
        frame[f"PRCP_lag_{k}"] = frame["PRCP"].shift(k)

    # Trailing means of TMAX that include the current row.
    tmax = frame["TMAX"]
    frame["rolling_3"] = tmax.rolling(3).mean()
    frame["rolling_7"] = tmax.rolling(7).mean()

    newest_first = frame.sort_values("DATE", ascending = False).reset_index(drop=True)
    return newest_first.iloc[:1].drop(columns = ["DATE"])

In [6]:
from xgboost import XGBRegressor

def make_prediction(df, model_path="/Users/giulioelmi/Desktop/kelshi_trading/inference/best1.1.json"):
    """Correct the forecasted maximum temperature with the saved XGBoost model.

    The model output is treated as the predicted forecast error and is added
    to the ``forecasted_TMAX`` value of the first row.

    Parameters
    ----------
    df : pandas.DataFrame
        Feature frame passed to ``model.predict``; only its first row is used.
        Must contain a ``forecasted_TMAX`` column.
    model_path : str, optional
        Location of the saved model file. Defaults to the path that was
        previously hard-coded, so existing calls behave as before. Pass a
        different path to run on another machine.

    Returns
    -------
    float
        The adjusted forecast (``forecasted_TMAX`` + predicted error). It is
        returned, not only printed, so callers that assign the result get a
        value instead of ``None``.
    """
    model = XGBRegressor()
    model.load_model(model_path)

    pred_error = float(model.predict(df)[0])
    forecast = df["forecasted_TMAX"].iloc[0]
    adjusted_forecast = forecast  + pred_error

    print("------------------")
    print("Predicted error (TMAX_obs - TMAX_forecast):", pred_error)
    print("Forecast: ", forecast)
    print("Adjusted forecast: ", adjusted_forecast)
    return adjusted_forecast

In [7]:
# Download the CLI reports and the hourly forecast and merge them into one frame.
df = merge_data()

no report available for version 1
downloading report
no report available for version 3
downloading report
no report available for version 5
downloading report
no report available for version 7
downloading report
no report available for version 9
downloading report
no report available for version 11
downloading report
downloading report
no report available for version 14
downloading report
no report available for version 16
downloading report
no report available for version 18
no report available for version 19
downloading report
no report available for version 21
no report available for version 22
downloading report
no report available for version 24
downloading report
downloading report
no report available for version 27
downloading report
no report available for version 29
downloading report
no report available for version 31
downloading report
no report available for version 33
downloading report
no report available for version 35
no report available for version 36
downloading repor

In [8]:
# Inspect dtypes and non-null counts of the merged frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 22 entries, 21 to 20
Data columns (total 9 columns):
 #   Column           Non-Null Count  Dtype         
---  ------           --------------  -----         
 0   DATE             22 non-null     datetime64[ns]
 1   TMAX             22 non-null     int64         
 2   TMIN             22 non-null     int64         
 3   PRCP             21 non-null     float64       
 4   AWND             22 non-null     float64       
 5   WDF2             22 non-null     int64         
 6   WSF2             22 non-null     int64         
 7   year             22 non-null     int32         
 8   forecasted_TMAX  1 non-null      float64       
dtypes: datetime64[ns](1), float64(3), int32(1), int64(4)
memory usage: 1.6 KB


In [9]:
# Run the whole pipeline: merge -> feature engineering -> model prediction.
# Note: merge_data() is called again here, so the reports and the forecast are
# downloaded a second time rather than reusing the `df` built in an earlier cell.
latest_prediction = make_prediction(feature_engineering(merge_data()))
latest_prediction

no report available for version 1
downloading report
no report available for version 3
downloading report
no report available for version 5
downloading report
no report available for version 7
downloading report
no report available for version 9
downloading report
no report available for version 11
downloading report
downloading report
no report available for version 14
downloading report
no report available for version 16
downloading report
no report available for version 18
no report available for version 19
downloading report
no report available for version 21
no report available for version 22
downloading report
no report available for version 24
downloading report
downloading report
no report available for version 27
downloading report
no report available for version 29
downloading report
no report available for version 31
downloading report
no report available for version 33
downloading report
no report available for version 35
no report available for version 36
downloading repor

XGBoostError: [22:55:08] /Users/runner/work/xgboost/xgboost/src/common/io.cc:144: Opening /Users/giulioelmi/Desktop/kelshi_trading/inference/best1.1.json failed: No such file or directory
Stack trace:
  [bt] (0) 1   libxgboost.dylib                    0x0000000117088545 dmlc::LogMessageFatal::~LogMessageFatal() + 117
  [bt] (1) 2   libxgboost.dylib                    0x00000001171c7656 xgboost::common::LoadSequentialFile(std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char>>) + 790
  [bt] (2) 3   libxgboost.dylib                    0x000000011715b36b XGBoosterLoadModel::$_0::operator()() const + 171
  [bt] (3) 4   libxgboost.dylib                    0x000000011715ad10 XGBoosterLoadModel + 1040
  [bt] (4) 5   libffi.8.dylib                      0x0000000108b52972 ffi_call_unix64 + 82
  [bt] (5) 6   ???                                 0x00007ff7b8b8b7d0 0x0 + 140701932763088



In [None]:
# Get the open markets for the KXHIGHLAX series
from datetime import datetime
ticker = "KXHIGHLAX"
def get_markets_data(ticker):
    """Download the open markets of a series and keep those dated tomorrow.

    Parameters
    ----------
    ticker : str
        Series ticker placed in the ``series_ticker`` query parameter.

    Returns
    -------
    pandas.DataFrame
        Columns ``date`` (datetime), ``ticker`` (the market's event ticker),
        ``floor``, ``cap``, ``no_ask``, ``yes_ask`` (numeric). Only rows whose
        date equals tomorrow (local clock) are kept, sorted by ``cap``. The
        frame is empty, with the same columns, when the API lists no markets.

    Raises
    ------
    requests.HTTPError
        When the API answers with an error status.
    KeyError
        When the response body has no ``markets`` entry.
    """
    columns = ["date", "ticker", "floor", "cap", "no_ask", "yes_ask"]

    markets_url = f"https://api.elections.kalshi.com/trade-api/v2/markets?series_ticker={ticker}&status=open"
    # Bound the wait and reject error responses before parsing the payload.
    markets_response = requests.get(markets_url, timeout=30)
    markets_response.raise_for_status()
    markets_data = markets_response.json()

    rows = []
    for market in markets_data['markets']:
        # Separate local names: the loop no longer overwrites the `ticker`
        # argument, and the date code no longer reuses the name `dt`.
        event_ticker = market.get("event_ticker")
        # The second dash-separated field is parsed as a YYMonDD date code.
        date_code = event_ticker.split("-")[1]
        date = datetime.strptime("20" + date_code, "%Y%b%d").date()
        rows.append({
            "date": date,
            "ticker": event_ticker,
            "floor": market.get("floor_strike"),
            "cap": market.get("cap_strike"),
            "no_ask": market.get("no_ask"),
            "yes_ask": market.get("yes_ask"),
        })

    # Explicit columns keep the schema when `rows` is empty, so the
    # conversions and the filter below do not raise KeyError.
    df = pd.DataFrame(rows, columns=columns)
    df["date"] = pd.to_datetime(df["date"])
    for c in ["floor", "cap", "no_ask", "yes_ask"]:
        df[c] = pd.to_numeric(df[c], errors="coerce")

    tomorrow = pd.Timestamp.today().normalize() + pd.Timedelta(days=1)
    df = df[df["date"] == tomorrow]
    df = df.sort_values(by = "cap")
    return df
    
# Fetch tomorrow's open markets for the series named by `ticker`.
market_data = get_markets_data(ticker)


In [11]:
# Preview the first rows of the market table.
market_data.head()

Unnamed: 0,date,ticker,floor,cap,no_ask,yes_ask
1,2026-01-16,KXHIGHLAX-26JAN16,,74.0,82,19
5,2026-01-16,KXHIGHLAX-26JAN16,74.0,75.0,77,24
4,2026-01-16,KXHIGHLAX-26JAN16,76.0,77.0,72,31
3,2026-01-16,KXHIGHLAX-26JAN16,78.0,79.0,80,22
2,2026-01-16,KXHIGHLAX-26JAN16,80.0,81.0,89,14


In [None]:
import math
import pandas as pd

def max_prices_df(markets: pd.DataFrame, mu: float, sigma: float) -> pd.DataFrame:
    """
    For each Kalshi contract row, compute:
      - p_yes / p_no: model-implied probability that YES / NO settles to 1,
        with the temperature T modelled as Normal(mu, sigma)
      - max_yes_cents / max_no_cents: maximum price (in cents) you're willing
        to pay for YES/NO
      - edge_*_cents and buy_* flags versus the current asks (only when the
        matching ask column is present)

    Assumptions about contract encoding:
      - floor & cap present  -> bucket: floor <= T < cap
      - floor present only   -> unilateral: T >= floor
      - cap present only     -> unilateral: T < cap
      - neither present      -> p_yes is NaN

    NOTE(review): buckets are priced as the continuous interval [floor, cap).
    If the strikes denote inclusive whole-degree ranges, a continuity
    correction may be needed -- confirm against the contract rules.

    Prices in the input are assumed to be in cents (0..100).

    Parameters
    ----------
    markets : pandas.DataFrame
        One row per contract, with ``floor`` and ``cap`` columns and
        optionally ``yes_ask`` / ``no_ask``. The frame is not modified.
    mu : float
        Mean of the temperature distribution.
    sigma : float
        Standard deviation of the temperature distribution; must be > 0.

    Returns
    -------
    pandas.DataFrame
        A copy of ``markets`` with the columns described above appended.

    Raises
    ------
    ValueError
        If ``sigma`` is not strictly positive.
    """
    # A zero, negative or NaN sigma would divide by zero or produce
    # meaningless probabilities, so reject it up front.
    if not sigma > 0:
        raise ValueError(f"sigma must be positive, got {sigma!r}")

    def norm_cdf(x: float) -> float:
        # Standard normal CDF using erf (no scipy dependency)
        return 0.5 * (1.0 + math.erf(x / math.sqrt(2.0)))

    def prob_yes(row) -> float:
        floor = row.get("floor")
        cap = row.get("cap")

        has_floor = pd.notna(floor)
        has_cap = pd.notna(cap)

        # Every branch returns its value; the earlier version computed `p`
        # but never returned it, so every probability came back as None.
        if has_floor and has_cap:
            # P(floor <= T < cap)
            return norm_cdf((cap - mu) / sigma) - norm_cdf((floor - mu) / sigma)
        if has_floor:
            # P(T >= floor)
            return 1.0 - norm_cdf((floor - mu) / sigma)
        if has_cap:
            # P(T < cap)
            return norm_cdf((cap - mu) / sigma)
        # If both are missing, the contract cannot be interpreted
        return float("nan")

    out = markets.copy()

    # Building the column from a list (instead of DataFrame.apply with axis=1)
    # always yields a float Series, including for an empty frame.
    p_yes = [prob_yes(row) for _, row in out.iterrows()]
    out["p_yes"] = pd.Series(p_yes, index=out.index, dtype="float64")
    out["p_no"] = 1.0 - out["p_yes"]

    # Maximum you're willing to pay (in cents)
    out["max_yes_cents"] = (100.0 * out["p_yes"]).round(2)
    out["max_no_cents"] = (100.0 * out["p_no"]).round(2)

    # Compare to current asks (also in cents); flag strictly positive edges
    if "yes_ask" in out.columns:
        out["edge_yes_cents"] = (out["max_yes_cents"] - out["yes_ask"]).round(2)
        out["buy_yes"] = out["edge_yes_cents"] > 0
    if "no_ask" in out.columns:
        out["edge_no_cents"] = (out["max_no_cents"] - out["no_ask"]).round(2)
        out["buy_no"] = out["edge_no_cents"] > 0

    return out


In [None]:
# Price a freshly downloaded Kalshi markets frame against Normal(mu, sigma).
# NOTE(review): mu is hard-coded here and sigma is fixed at 1 -- presumably mu
# was copied from an earlier adjusted forecast; confirm before reusing the cell.
result = max_prices_df(get_markets_data(ticker), mu=66.48624873161316, sigma=1)
result

In [None]:
# Print the interpreter, xgboost and scikit-learn versions of this kernel.
import xgboost, sklearn, sys
print(sys.version)
print(xgboost.__version__)
print(sklearn.__version__)


In [None]:
# Print the plain-text form of the market table fetched earlier.
print(market_data)