In [41]:
import matplotlib.pyplot as plt
import polars as pl
import statsmodels.api as sm

In [53]:
# Let polars print up to this many rows when a frame is displayed.
MAX_DISPLAY_ROWS = 100
# Bind the return value to `_` so the cell itself renders no output.
_ = pl.Config.set_tbl_rows(MAX_DISPLAY_ROWS)

In [3]:
# Source table; later cells read its `label` and `num_negative_words` columns.
PROMPTS_CSV = "../data/prompts_with_sentiment.csv"
df = pl.read_csv(PROMPTS_CSV)

In [70]:
# Stratified training sample: draw 80% of the rows of each label class
# (0 and 1) separately, with a fixed seed, and stack the two samples.
train = pl.concat(
    [
        df.filter(pl.col("label") == label_value).sample(fraction=0.8, seed=10)
        for label_value in (0, 1)
    ]
)

In [71]:
# Hold-out set: every row of `df` that was not drawn into `train`.
#
# An anti-join on the raw columns alone removes *all* copies of a row that is
# duplicated in `df` as soon as a single copy lands in `train`, silently
# shrinking the test set.  Numbering repeated rows (0, 1, 2, ...) on both
# sides turns the anti-join into a multiset difference: if a row occurs n
# times in `df` and k times in `train`, n - k copies remain in `test`.
#
# NOTE(review): assumes the key columns contain no nulls -- null join keys may
# not match each other, which would leave such rows in both splits; confirm.
# NOTE(review): `pl.count()` is spelled `pl.len()` in newer polars releases.
_split_keys = df.columns
_occurrence = pl.int_range(0, pl.count()).over(_split_keys).alias("_occurrence")

test = (
    df.with_columns(_occurrence)
    .join(
        train.with_columns(_occurrence),
        how="anti",
        on=[*_split_keys, "_occurrence"],
    )
    # Drop the helper so `test` keeps exactly the columns of `df`.
    .drop("_occurrence")
)

In [72]:
# Fit a negative-binomial model to the training negative-word counts using a
# single all-ones regressor (i.e. an intercept-only model).
# Note: `mod` holds the result of `.fit()`, not the unfitted model object.
negative_word_counts = train["num_negative_words"].to_list()
intercept_only = [1] * len(train)

mod = sm.NegativeBinomial(negative_word_counts, intercept_only).fit()

Optimization terminated successfully.
         Current function value: 2.321725
         Iterations: 6
         Function evaluations: 7
         Gradient evaluations: 7


In [73]:
# Render the fit summary table as the cell output.
fit_summary = mod.summary()
fit_summary

0,1,2,3
Dep. Variable:,y,No. Observations:,186.0
Model:,NegativeBinomial,Df Residuals:,185.0
Method:,MLE,Df Model:,0.0
Date:,"Mon, 30 Oct 2023",Pseudo R-squ.:,3.746e-11
Time:,22:10:58,Log-Likelihood:,-431.84
converged:,True,LL-Null:,-431.84
Covariance Type:,nonrobust,LLR p-value:,

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
const,1.4450,0.134,10.745,0.000,1.181,1.709
alpha,3.1283,0.402,7.777,0.000,2.340,3.917


In [74]:
# Score the hold-out rows: a prompt is flagged (pred = 1) when the fitted
# model's CDF evaluated at the prompt's negative-word count exceeds
# PRED_CDF_THRESHOLD, otherwise pred = 0.
PRED_CDF_THRESHOLD = 0.62

# Build the fitted distribution once rather than once per row; it does not
# depend on the row being scored.
_fitted_dist = mod.get_distribution()

test = test.with_columns(
    pl.col("num_negative_words")
    .map_elements(
        # cdf() returns an array (hence the [0]); presumably one entry per
        # training row, all equal because the model was fitted with only a
        # constant regressor -- TODO confirm.
        lambda x: int(_fitted_dist.cdf(x)[0] > PRED_CDF_THRESHOLD),
        # Declare the output dtype explicitly instead of letting polars infer it.
        return_dtype=pl.Int64,
    )
    .alias("pred")
)

In [75]:
# Precision of the `pred` flag on the hold-out set.  The counts and the ratio
# are scalars, so every row of `test` carries the same `tps`, `fps` and
# `precision` values.
is_flagged = pl.col("pred") == 1
true_positives = ((pl.col("label") == 1) & is_flagged).sum().alias("tps")
false_positives = ((pl.col("label") == 0) & is_flagged).sum().alias("fps")
precision = (pl.col("tps") / (pl.col("fps") + pl.col("tps"))).alias("precision")

# Two steps: `precision` reads the `tps`/`fps` columns created by the first.
test = test.with_columns(true_positives, false_positives).with_columns(precision)

In [77]:
# Preview the first five scored rows together with the broadcast metrics.
test.head(5)

prompt,label,num_negative_words,pred,tps,fps,precision
str,i64,i64,i64,u32,u32,f64
"""I want you to …",0,2,1,15,4,0.789474
"""I want you to …",0,1,0,15,4,0.789474
"""I want you to …",0,1,0,15,4,0.789474
"""I want you to …",0,0,0,15,4,0.789474
"""I want you to …",0,1,0,15,4,0.789474
