# Let's start some basic analytics

In [1]:
import sys

import sqlite3
from collections import Counter

import pandas as pd

sys.path.append("..")

import src.utils as utils
import src.scraper as sc


In [2]:
db_path = utils.root_path / "data" / "data.db"
db_path.exists()

True

In [3]:
conn = sqlite3.connect(str(db_path))

In [4]:
query = """SELECT forecasts.CITY, 
        date(FORECAST_DATE) as forecast_date, 
        PLATFORM, 
        forecasts.TEMPERATURE as forecast_temperature,
        date(DATE) as observed_date,
        observed_temp.TEMPERATURE as observed_temperature
 FROM forecasts
LEFT JOIN observed_temp
ON forecasts.city = observed_temp.city
AND date(forecasts.forecast_date) = date(observed_temp.date);
"""


In [5]:
df = pd.read_sql_query(query, conn)
conn.close()
df

Unnamed: 0,CITY,forecast_date,PLATFORM,forecast_temperature,observed_date,observed_temperature
0,CHI,2022-11-05,TWC,65,2022-11-05,71.0
1,NYC,2022-11-05,TWC,73,2022-11-05,76.0
2,NYC,2022-11-05,Accuweather,73,2022-11-05,76.0
3,CHI,2022-11-05,Accuweather,63,2022-11-05,71.0
4,NYC,2022-11-05,NWS,72,2022-11-05,76.0
...,...,...,...,...,...,...
344,CHI,2022-12-09,Accuweather,40,,
345,NYC,2022-12-09,NWS,46,,
346,CHI,2022-12-09,NWS,39,,
347,NYC,2022-12-09,foreca,46,,


In [6]:
df = df.dropna(subset=['observed_temperature', "forecast_temperature"])
df

Unnamed: 0,CITY,forecast_date,PLATFORM,forecast_temperature,observed_date,observed_temperature
0,CHI,2022-11-05,TWC,65,2022-11-05,71.0
1,NYC,2022-11-05,TWC,73,2022-11-05,76.0
2,NYC,2022-11-05,Accuweather,73,2022-11-05,76.0
3,CHI,2022-11-05,Accuweather,63,2022-11-05,71.0
4,NYC,2022-11-05,NWS,72,2022-11-05,76.0
...,...,...,...,...,...,...
328,CHI,2022-12-07,Accuweather,46,2022-12-07,43.0
329,NYC,2022-12-07,NWS,57,2022-12-07,59.0
330,CHI,2022-12-07,NWS,45,2022-12-07,43.0
331,NYC,2022-12-07,foreca,57,2022-12-07,59.0


In [7]:
# this should be nothing, but it's worth a check
df[df["forecast_temperature"] == '']

Unnamed: 0,CITY,forecast_date,PLATFORM,forecast_temperature,observed_date,observed_temperature


In [8]:
df["observed_temperature"] = df["observed_temperature"].astype(int)
df["forecast_temperature"] = df["forecast_temperature"].astype(int)
df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["observed_temperature"] = df["observed_temperature"].astype(int)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["forecast_temperature"] = df["forecast_temperature"].astype(int)


Unnamed: 0,CITY,forecast_date,PLATFORM,forecast_temperature,observed_date,observed_temperature
0,CHI,2022-11-05,TWC,65,2022-11-05,71
1,NYC,2022-11-05,TWC,73,2022-11-05,76
2,NYC,2022-11-05,Accuweather,73,2022-11-05,76
3,CHI,2022-11-05,Accuweather,63,2022-11-05,71
4,NYC,2022-11-05,NWS,72,2022-11-05,76
...,...,...,...,...,...,...
328,CHI,2022-12-07,Accuweather,46,2022-12-07,43
329,NYC,2022-12-07,NWS,57,2022-12-07,59
330,CHI,2022-12-07,NWS,45,2022-12-07,43
331,NYC,2022-12-07,foreca,57,2022-12-07,59


In [9]:
df["error"] = df["observed_temperature"] -  df["forecast_temperature"] 
df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["error"] = df["observed_temperature"] -  df["forecast_temperature"]


Unnamed: 0,CITY,forecast_date,PLATFORM,forecast_temperature,observed_date,observed_temperature,error
0,CHI,2022-11-05,TWC,65,2022-11-05,71,6
1,NYC,2022-11-05,TWC,73,2022-11-05,76,3
2,NYC,2022-11-05,Accuweather,73,2022-11-05,76,3
3,CHI,2022-11-05,Accuweather,63,2022-11-05,71,8
4,NYC,2022-11-05,NWS,72,2022-11-05,76,4
...,...,...,...,...,...,...,...
328,CHI,2022-12-07,Accuweather,46,2022-12-07,43,-3
329,NYC,2022-12-07,NWS,57,2022-12-07,59,2
330,CHI,2022-12-07,NWS,45,2022-12-07,43,-2
331,NYC,2022-12-07,foreca,57,2022-12-07,59,2


In [10]:
df_chi = df[df["CITY"]=='CHI']

In [11]:
errors = df_chi[["PLATFORM", "error"]].groupby('PLATFORM')["error"].value_counts()
errors

PLATFORM     error
Accuweather   0       8
              2       6
              1       4
             -3       3
             -1       3
              4       3
             -4       2
              3       2
              8       2
              5       1
              6       1
              9       1
              13      1
              19      1
NWS           4       7
              3       6
              1       5
              0       4
             -2       3
              2       3
             -1       2
              5       2
              9       2
              10      2
              13      1
              21      1
TWC           1       8
              0       7
             -1       5
              2       3
              3       3
             -3       2
             -2       2
             -5       1
              4       1
              5       1
              6       1
              7       1
              8       1
              12      1
              21     

In [12]:
accu_error = errors["Accuweather"]
accu_error

error
 0     8
 2     6
 1     4
-3     3
-1     3
 4     3
-4     2
 3     2
 8     2
 5     1
 6     1
 9     1
 13    1
 19    1
Name: error, dtype: int64

In [13]:
errors_dict = errors["Accuweather"].to_dict()
errors_dict

{0: 8,
 2: 6,
 1: 4,
 -3: 3,
 -1: 3,
 4: 3,
 -4: 2,
 3: 2,
 8: 2,
 5: 1,
 6: 1,
 9: 1,
 13: 1,
 19: 1}

## Mean

Okay I want to see what the mean looks like as well

In [14]:
means = df_chi.groupby("forecast_date")[["forecast_temperature", "observed_temperature"]].mean()
means

Unnamed: 0_level_0,forecast_temperature,observed_temperature
forecast_date,Unnamed: 1_level_1,Unnamed: 2_level_1
2022-10-15,54.5,55.0
2022-10-16,55.75,57.0
2022-10-17,42.0,45.0
2022-10-18,46.5,50.0
2022-10-23,78.25,81.0
2022-10-24,76.333333,79.0
2022-10-25,64.333333,71.0
2022-10-26,56.5,57.0
2022-10-27,56.25,54.0
2022-10-28,60.5,61.0


In [15]:
means["f_t_round"] = means["forecast_temperature"].round().astype(int)
means["error"] = means["observed_temperature"] - means["f_t_round"]

In [16]:
means["error"].mean()

2.5

In [17]:
means["error"].abs().mean()

3.236842105263158

### Metrics

Okay so I guess we want to get the mean and stdev?

In [18]:
df_chi_accu = df_chi[df_chi["PLATFORM"] == "Accuweather"]
df_chi_accu

Unnamed: 0,CITY,forecast_date,PLATFORM,forecast_temperature,observed_date,observed_temperature,error
3,CHI,2022-11-05,Accuweather,63,2022-11-05,71,8
11,CHI,2022-11-06,Accuweather,63,2022-11-06,64,1
19,CHI,2022-11-07,Accuweather,56,2022-11-07,58,2
27,CHI,2022-11-08,Accuweather,58,2022-11-08,54,-4
35,CHI,2022-11-09,Accuweather,68,2022-11-09,70,2
124,CHI,2022-10-31,Accuweather,61,2022-10-31,61,0
128,CHI,2022-10-30,Accuweather,63,2022-10-30,67,4
132,CHI,2022-10-29,Accuweather,65,2022-10-29,67,2
136,CHI,2022-10-28,Accuweather,61,2022-10-28,61,0
140,CHI,2022-10-27,Accuweather,58,2022-10-27,54,-4


In [19]:
df_chi_accu["error"].mean()
df_chi_accu["error"].abs().mean()

3.210526315789474

In [20]:
def metrics(df, platform, city):
    df = df.copy()
    
    df = df[df["CITY"]== city]
    df = df[df["PLATFORM"] == platform]
    
    mean = df["error"].mean()
    abs_mean = df["error"].abs().mean()
    
    print(f'Mean Error for {platform} in {city}: {mean}')
    print(f'Mean Absolute Error for {platform} in {city}: {abs_mean}')
    
metrics(df, "foreca", "CHI")

Mean Error for foreca in CHI: 2.3055555555555554
Mean Absolute Error for foreca in CHI: 2.9722222222222223


In [21]:
for city in set(df["CITY"]):
    
    for platform in set(df["PLATFORM"]):
        
        metrics(df, platform, city)
        print("\n")

Mean Error for NWS in CHI: 3.4473684210526314
Mean Absolute Error for NWS in CHI: 3.8684210526315788


Mean Error for foreca in CHI: 2.3055555555555554
Mean Absolute Error for foreca in CHI: 2.9722222222222223


Mean Error for TWC in CHI: 1.736842105263158
Mean Absolute Error for TWC in CHI: 2.789473684210526


Mean Error for Accuweather in CHI: 2.1578947368421053
Mean Absolute Error for Accuweather in CHI: 3.210526315789474


Mean Error for NWS in NYC: 0.6842105263157895
Mean Absolute Error for NWS in NYC: 2.1052631578947367


Mean Error for foreca in NYC: 1.2972972972972974
Mean Absolute Error for foreca in NYC: 2.2162162162162162


Mean Error for TWC in NYC: 0.9473684210526315
Mean Absolute Error for TWC in NYC: 1.631578947368421


Mean Error for Accuweather in NYC: 0.15789473684210525
Mean Absolute Error for Accuweather in NYC: 1.7894736842105263




### Get Odds

Okay this is going to be super crazy but whatever

Heres' a good start: https://stackoverflow.com/questions/38711541/how-to-compute-the-probability-of-a-value-given-a-list-of-samples-from-a-distrib

In [22]:
def odds(df, platform, city):
    df = df.copy()
    df = df[df["CITY"]== city]
    df = df[df["PLATFORM"] == platform]
    
    counts = df["error"].value_counts()
    total = counts.sum()
    
    
    
    return counts / total

error_odds = odds(df, "foreca", "CHI")
error_odds

 0     0.222222
 4     0.138889
 2     0.138889
 3     0.111111
-3     0.083333
-1     0.083333
 5     0.055556
 1     0.055556
 8     0.027778
 9     0.027778
 14    0.027778
 10    0.027778
Name: error, dtype: float64

### And now given that table

We want to be able to say, what is the probability of it being 71 deg when the forecast is 72.

In [23]:
def probability_blaster(forecast, threshold, operator, odds):
    diff = forecast - threshold
    
    if operator == "lt":
        new_odds = odds[odds.keys() < diff]
    
    return new_odds.sum()
    
probability_blaster(71, 72, "lt", error_odds)

0.08333333333333333

### Here is the next step!!!!

So let's say you wanted to have an ensemble. You can't just get the probabilities from each of them. Just think bout it it now that you have this.

So what you have to do is get the numbers and the counts for each platform and then go from there.

NOW! this only matters if it's a different number of observations. This is worth investigating.

df_chi[["PLATFORM", "error"]].groupby('PLATFORM')["error"].value_counts()

In [24]:
df[df["CITY"] == "CHI"]

Unnamed: 0,CITY,forecast_date,PLATFORM,forecast_temperature,observed_date,observed_temperature,error
0,CHI,2022-11-05,TWC,65,2022-11-05,71,6
3,CHI,2022-11-05,Accuweather,63,2022-11-05,71,8
5,CHI,2022-11-05,NWS,61,2022-11-05,71,10
7,CHI,2022-11-05,foreca,66,2022-11-05,71,5
8,CHI,2022-11-06,TWC,64,2022-11-06,64,0
...,...,...,...,...,...,...,...
324,CHI,2022-12-06,foreca,45,2022-12-06,44,-1
325,CHI,2022-12-07,TWC,44,2022-12-07,43,-1
328,CHI,2022-12-07,Accuweather,46,2022-12-07,43,-3
330,CHI,2022-12-07,NWS,45,2022-12-07,43,-2


In [25]:
def error_counts(df, city):
    df = df[df["CITY"] == city].copy()
    value_counts = df[["PLATFORM", "error"]].groupby('PLATFORM')["error"].value_counts().rename("error_count")
    value_counts = value_counts.to_frame()
    value_counts = value_counts.reset_index(level='error')
    
    return value_counts

df_co = error_counts(df, "CHI")
df_co

Unnamed: 0_level_0,error,error_count
PLATFORM,Unnamed: 1_level_1,Unnamed: 2_level_1
Accuweather,0,8
Accuweather,2,6
Accuweather,1,4
Accuweather,-3,3
Accuweather,-1,3
Accuweather,4,3
Accuweather,-4,2
Accuweather,3,2
Accuweather,8,2
Accuweather,5,1


In [26]:
def error_samples(df, city):
    df = df[df["CITY"] == city].copy()
    counts = df[["PLATFORM", "error"]].groupby("PLATFORM").count()
    counts.columns=["samples"]
    
    return counts

error_samples(df, "CHI")

Unnamed: 0_level_0,samples
PLATFORM,Unnamed: 1_level_1
Accuweather,38
NWS,38
TWC,38
foreca,36


error = df["observed_temperature"] -  df["forecast_temperature"] 

In [27]:
def combined_odds(df, city, forecast, operator, threshold):
    
    errors = error_counts(df, city)
    error_to_predict = threshold - forecast
    
    # trim the erros based on the operator
    if operator == "lt=":
        errors = errors[errors["error"] <= error_to_predict]
    
    summed_errors = errors.groupby("PLATFORM")["error_count"].sum().to_frame()
    
    samples = error_samples(df, city)
    
    fin = samples.join(summed_errors).fillna(0)
    
    total_errors = fin["error_count"].sum()
    total_samples = fin["samples"].sum()
    
    probability = total_errors / total_samples
    
    fin["probability"] = fin["error_count"] / fin["samples"]
    
    return fin, probability

k = combined_odds(df, "CHI", 75, "lt=", 71)
k

(             samples  error_count  probability
 PLATFORM                                      
 Accuweather       38          2.0     0.052632
 NWS               38          0.0     0.000000
 TWC               38          1.0     0.026316
 foreca            36          0.0     0.000000,
 0.02)

## Except that's not it, is it

Now we have to

1) Get the forecast for each platoform

2) Trim each one based on that

3) THEN do what we've done above

In [28]:
query = """SELECT * FROM forecasts
    WHERE date(FORECAST_DATE) =
        (SELECT date(FORECAST_DATE) FROM forecasts
        ORDER BY date(FORECAST_DATE) desc
        LIMIT 1);"""

In [29]:
conn = sqlite3.connect(str(db_path))

In [30]:
df_f = pd.read_sql_query(query, conn)
conn.close()
df_f

Unnamed: 0,ID,CITY,FORECAST_DATE,TIME_OF_FORECAST,PLATFORM,TEMPERATURE
0,,CHI,2022-12-09 21:32:24.496395,2022-12-08 21:32:24.496391,TWC,40
1,,NYC,2022-12-09 21:32:24.496395,2022-12-08 21:32:24.496391,TWC,44
2,,NYC,2022-12-09 21:32:24.496395,2022-12-08 21:32:24.496391,Accuweather,47
3,,CHI,2022-12-09 21:32:24.496395,2022-12-08 21:32:24.496391,Accuweather,40
4,,NYC,2022-12-09 21:32:24.496395,2022-12-08 21:32:24.496391,NWS,46
5,,CHI,2022-12-09 21:32:24.496395,2022-12-08 21:32:24.496391,NWS,39
6,,NYC,2022-12-09 21:32:24.496395,2022-12-08 21:32:24.496391,foreca,46
7,,CHI,2022-12-09 21:32:24.496395,2022-12-08 21:32:24.496391,foreca,39


In [31]:
(df_f["CITY"] == "CHI") & (df_f["PLATFORM"] == "TWC")

0     True
1    False
2    False
3    False
4    False
5    False
6    False
7    False
dtype: bool

    if operator == "lt=":
        errors = errors[errors["error"] <= error_to_predict]

In [32]:
def city_platform(df, city, platform):
    
    df = df.copy()
    mask = (df["CITY"] == city) & (df["PLATFORM"] == platform)
    
    return df[mask]
    
    

In [33]:
def trim_errors(df, error, operator):
    df = df.copy()
    
    if operator == "lt=":
        df = df[df["error"] <= error]
    elif operator == "gt=":
        df = df[df["error"] >= error]
    elif operator == "bw":
        mask = (df["error"] >= error[0]) & (df["error"] <= error[1])
        df = df[mask]
        
    return df

In [34]:
def errors_by_platform(df, city, platform, forecast, threshold, operator):
    
    if isinstance(threshold, type(int())):
        error = threshold - forecast
    elif isinstance(threshold, type(tuple())):
        error = (threshold[0] - forecast, threshold[1] - forecast)
    
    k = city_platform(df, city, platform)
    samples = len(k)

    
    k = trim_errors(k, error, operator)
    
    errors = len(k)
    
    return errors, samples
    

errors_by_platform(df, "CHI", "TWC", 40, (38, 39), "bw")

(7, 38)

In [35]:
errors_by_platform(df, "CHI", "TWC", 40, 37, "lt=")

(3, 38)

In [36]:
errors_by_platform(df, "CHI", "TWC", 40, 40, "gt=")

(28, 38)

In [37]:
def city_odds(df, city, forecast, threshold, operator):
    
    # get a list of all unique platforms
    platforms = list(df[df["CITY"] == city]["PLATFORM"].unique())
    
    errors_dict = dict()
    
    for platform in platforms:
        t = errors_by_platform(df, city, platform, forecast, threshold, operator)
        errors_dict[platform] = t
    
    return errors_dict

    
errors = city_odds(df, "CHI", 40, 38, "lt=")

In [38]:
df_e = pd.DataFrame.from_dict(errors, orient='index', columns=["errors", "samples"])
df_e

Unnamed: 0,errors,samples
TWC,5,38
Accuweather,5,38
NWS,3,38
foreca,3,36


In [39]:
to_add = {"errors":df_e["errors"].sum(), "samples":df_e["samples"].sum()}
df_e = df_e.append(pd.Series(to_add, name="Total"), ignore_index=False)

In [40]:
df_e["probability"] = df_e["errors"] / df_e["samples"]
df_e

Unnamed: 0,errors,samples,probability
TWC,5,38,0.131579
Accuweather,5,38,0.131579
NWS,3,38,0.078947
foreca,3,36,0.083333
Total,16,150,0.106667


## Okay next steps

Okay now you need to get the df_f and then go through each forecast. In addition, you need to be able to do something 'between' something.


In [41]:
def errors_by_city(df_f, df, city, threshold, operator):
    
    df_f = df_f[df_f["CITY"] == city].copy()
    
    errors_dict = dict()
    
    for row in df_f.itertuples():
        
        # get each platform and forecast
        t = errors_by_platform(df, city, row[5], row[6], threshold, operator)
        errors_dict[row[5]] = t
    return errors_dict

errors_by_city(df_f, df, "CHI", 51, "lt=")

{'TWC': (36, 38), 'Accuweather': (36, 38), 'NWS': (36, 38), 'foreca': (35, 36)}

In [42]:
def odds_by_city(df_f, df, city, threshold, operator):
    
    errors = errors_by_city(df_f, df, city, threshold, operator)
    df_e = pd.DataFrame.from_dict(errors, orient='index', columns=["errors", "samples"])
    
    to_add = {"errors":df_e["errors"].sum(), "samples":df_e["samples"].sum()}
    df_e = df_e.append(pd.Series(to_add, name="Total"), ignore_index=False)
    
    df_e["probability"] = df_e["errors"] / df_e["samples"]
    
    return df_e
    
    
    
odds_by_city(df_f, df, "CHI", 47, "lt=")

Unnamed: 0,errors,samples,probability
TWC,35,38,0.921053
Accuweather,33,38,0.868421
NWS,32,38,0.842105
foreca,33,36,0.916667
Total,133,150,0.886667


In [43]:
odds_by_city(df_f, df, "CHI", (34, 35), "bw")

Unnamed: 0,errors,samples,probability
TWC,1,38,0.026316
Accuweather,0,38,0.0
NWS,0,38,0.0
foreca,0,36,0.0
Total,1,150,0.006667


In [44]:
df_f

Unnamed: 0,ID,CITY,FORECAST_DATE,TIME_OF_FORECAST,PLATFORM,TEMPERATURE
0,,CHI,2022-12-09 21:32:24.496395,2022-12-08 21:32:24.496391,TWC,40
1,,NYC,2022-12-09 21:32:24.496395,2022-12-08 21:32:24.496391,TWC,44
2,,NYC,2022-12-09 21:32:24.496395,2022-12-08 21:32:24.496391,Accuweather,47
3,,CHI,2022-12-09 21:32:24.496395,2022-12-08 21:32:24.496391,Accuweather,40
4,,NYC,2022-12-09 21:32:24.496395,2022-12-08 21:32:24.496391,NWS,46
5,,CHI,2022-12-09 21:32:24.496395,2022-12-08 21:32:24.496391,NWS,39
6,,NYC,2022-12-09 21:32:24.496395,2022-12-08 21:32:24.496391,foreca,46
7,,CHI,2022-12-09 21:32:24.496395,2022-12-08 21:32:24.496391,foreca,39


## OKay let's put it all together

In [45]:
boop = sc.scrape_websites()
boop

TWCCHI
should already be fahrenheit
TWCCHI
TWCNYC
AccuweatherNYC
AccuweatherCHI
AccuweatherCHI
NWSNYC
NWSCHI
forecaNYC
forecaCHI


[['CHI',
  '2022-12-10 21:37:30.405755',
  '2022-12-09 21:37:30.405750',
  'TWC',
  '43'],
 ['NYC',
  '2022-12-10 21:37:30.405755',
  '2022-12-09 21:37:30.405750',
  'TWC',
  '40'],
 ['NYC',
  '2022-12-10 21:37:30.405755',
  '2022-12-09 21:37:30.405750',
  'Accuweather',
  '43'],
 ['CHI',
  '2022-12-10 21:37:30.405755',
  '2022-12-09 21:37:30.405750',
  'Accuweather',
  '43'],
 ['NYC',
  '2022-12-10 21:37:30.405755',
  '2022-12-09 21:37:30.405750',
  'NWS',
  '41'],
 ['CHI',
  '2022-12-10 21:37:30.405755',
  '2022-12-09 21:37:30.405750',
  'NWS',
  '40'],
 ['NYC',
  '2022-12-10 21:37:30.405755',
  '2022-12-09 21:37:30.405750',
  'foreca',
  '39'],
 ['CHI',
  '2022-12-10 21:37:30.405755',
  '2022-12-09 21:37:30.405750',
  'foreca',
  '43']]

In [46]:
def forecasts():
    # change
    conn = sqlite3.connect(str(db_path))
    
    query = """SELECT forecasts.CITY, 
        date(FORECAST_DATE) as forecast_date, 
        PLATFORM, 
        forecasts.TEMPERATURE as forecast_temperature,
        date(DATE) as observed_date,
        observed_temp.TEMPERATURE as observed_temperature
         FROM forecasts
        LEFT JOIN observed_temp
        ON forecasts.city = observed_temp.city
        AND date(forecasts.forecast_date) = date(observed_temp.date);
        """
    df = pd.read_sql_query(query, conn)
    conn.close()
    df = df.dropna(subset=['observed_temperature', "forecast_temperature"])
    df["observed_temperature"] = df["observed_temperature"].astype(int)
    df["forecast_temperature"] = df["forecast_temperature"].astype(int)
    df["error"] = df["observed_temperature"] -  df["forecast_temperature"] 
    return df
df = forecasts()
df

Unnamed: 0,CITY,forecast_date,PLATFORM,forecast_temperature,observed_date,observed_temperature,error
0,CHI,2022-11-05,TWC,65,2022-11-05,71,6
1,NYC,2022-11-05,TWC,73,2022-11-05,76,3
2,NYC,2022-11-05,Accuweather,73,2022-11-05,76,3
3,CHI,2022-11-05,Accuweather,63,2022-11-05,71,8
4,NYC,2022-11-05,NWS,72,2022-11-05,76,4
...,...,...,...,...,...,...,...
328,CHI,2022-12-07,Accuweather,46,2022-12-07,43,-3
329,NYC,2022-12-07,NWS,57,2022-12-07,59,2
330,CHI,2022-12-07,NWS,45,2022-12-07,43,-2
331,NYC,2022-12-07,foreca,57,2022-12-07,59,2


In [47]:
def list_to_dataframe(forecast_list):
    
    df = pd.DataFrame(forecast_list, columns=["CITY", "FORECAST_DATE", "TIME_OF_FORECAST", "PLATFORM", "TEMPERATURE"])
    df["ID"] = None
    df = df[["ID", "CITY", "FORECAST_DATE", "TIME_OF_FORECAST", "PLATFORM", "TEMPERATURE"]]
    df["TEMPERATURE"] = df["TEMPERATURE"].astype(int)
    return df

list_to_dataframe(boop)

Unnamed: 0,ID,CITY,FORECAST_DATE,TIME_OF_FORECAST,PLATFORM,TEMPERATURE
0,,CHI,2022-12-10 21:37:30.405755,2022-12-09 21:37:30.405750,TWC,43
1,,NYC,2022-12-10 21:37:30.405755,2022-12-09 21:37:30.405750,TWC,40
2,,NYC,2022-12-10 21:37:30.405755,2022-12-09 21:37:30.405750,Accuweather,43
3,,CHI,2022-12-10 21:37:30.405755,2022-12-09 21:37:30.405750,Accuweather,43
4,,NYC,2022-12-10 21:37:30.405755,2022-12-09 21:37:30.405750,NWS,41
5,,CHI,2022-12-10 21:37:30.405755,2022-12-09 21:37:30.405750,NWS,40
6,,NYC,2022-12-10 21:37:30.405755,2022-12-09 21:37:30.405750,foreca,39
7,,CHI,2022-12-10 21:37:30.405755,2022-12-09 21:37:30.405750,foreca,43


## Okay this is super janky

But here's an example of what a read-in yaml is going to look like:

In [48]:
handles = {"CHI":
    {
        "lt=":42,
        "bw":[(43,44), (45,46)],
        "gt=":47
    },
 "NYC":
    {
        "lt=":40,
        "bw":[(41,42), (43,44)],
        "gt=":45
    }
}
handles

{'CHI': {'lt=': 42, 'bw': [(43, 44), (45, 46)], 'gt=': 47},
 'NYC': {'lt=': 40, 'bw': [(41, 42), (43, 44)], 'gt=': 45}}

In [49]:
def key_gen(city, operator, threshold):
    
    return f"{city} - {operator} - {threshold}"

In [50]:
# odds_by_city(df_f, df, "CHI", (34, 35), "bw")



def odds(forecast_list, handles):
    
    odds_dict = dict()

    df = forecasts()
    df_f = list_to_dataframe(forecast_list)
    
    for city in handles:
        for operator in handles[city]:
            if operator == "bw":
                for pair in handles[city][operator]:
                    
                    k = odds_by_city(df_f, df, city, pair, operator)
                    
                    key = key_gen(city, operator, pair)
                    
                    odds_dict[key] = k
            else:
                threshold = handles[city][operator]
                k = odds_by_city(df_f, df, city, threshold, operator)
                key = key_gen(city, operator, threshold)
                odds_dict[key] = k
                    
    return odds_dict

    
ret = odds(boop, handles)

In [51]:
boop

[['CHI',
  '2022-12-10 21:37:30.405755',
  '2022-12-09 21:37:30.405750',
  'TWC',
  '43'],
 ['NYC',
  '2022-12-10 21:37:30.405755',
  '2022-12-09 21:37:30.405750',
  'TWC',
  '40'],
 ['NYC',
  '2022-12-10 21:37:30.405755',
  '2022-12-09 21:37:30.405750',
  'Accuweather',
  '43'],
 ['CHI',
  '2022-12-10 21:37:30.405755',
  '2022-12-09 21:37:30.405750',
  'Accuweather',
  '43'],
 ['NYC',
  '2022-12-10 21:37:30.405755',
  '2022-12-09 21:37:30.405750',
  'NWS',
  '41'],
 ['CHI',
  '2022-12-10 21:37:30.405755',
  '2022-12-09 21:37:30.405750',
  'NWS',
  '40'],
 ['NYC',
  '2022-12-10 21:37:30.405755',
  '2022-12-09 21:37:30.405750',
  'foreca',
  '39'],
 ['CHI',
  '2022-12-10 21:37:30.405755',
  '2022-12-09 21:37:30.405750',
  'foreca',
  '43']]

In [52]:
for i in ret:
    print(i)
    print(ret[i])
    print()

CHI - lt= - 42
             errors  samples  probability
TWC              10       38     0.263158
Accuweather       8       38     0.210526
NWS              17       38     0.447368
foreca            6       36     0.166667
Total            41      150     0.273333

CHI - bw - (43, 44)
             errors  samples  probability
TWC              15       38     0.394737
Accuweather      12       38     0.315789
NWS              13       38     0.342105
foreca           10       36     0.277778
Total            50      150     0.333333

CHI - bw - (45, 46)
             errors  samples  probability
TWC               6       38     0.157895
Accuweather       8       38     0.210526
NWS               2       38     0.052632
foreca            9       36     0.250000
Total            25      150     0.166667

CHI - gt= - 47
             errors  samples  probability
TWC               7       38     0.184211
Accuweather      10       38     0.263158
NWS               6       38     0.157895
for

### TODO:

    you need to finish the module: `/src/analytics.py` 
    you need to make a reader for the yaml file

In [53]:
import src.analytics as al

In [54]:
al.odds(boop, handles)

{'CHI - lt= - 42':              errors  samples  probability
 TWC              10       38     0.263158
 Accuweather       8       38     0.210526
 NWS              17       38     0.447368
 foreca            6       36     0.166667
 Total            41      150     0.273333,
 'CHI - bw - (43, 44)':              errors  samples  probability
 TWC              15       38     0.394737
 Accuweather      12       38     0.315789
 NWS              13       38     0.342105
 foreca           10       36     0.277778
 Total            50      150     0.333333,
 'CHI - bw - (45, 46)':              errors  samples  probability
 TWC               6       38     0.157895
 Accuweather       8       38     0.210526
 NWS               2       38     0.052632
 foreca            9       36     0.250000
 Total            25      150     0.166667,
 'CHI - gt= - 47':              errors  samples  probability
 TWC               7       38     0.184211
 Accuweather      10       38     0.263158
 NWS        