In [None]:
from transformers import pipeline, set_seed, AutoConfig
from sklearn.feature_extraction.text import CountVectorizer
from scipy.stats import binned_statistic
from scipy.optimize import curve_fit
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
import pickle

In [None]:
# Sampling temperature; it is only referenced by the commented-out generation
# call in the next cell (passed there as `temperature`).
T = 10.0

# Text-generation pipeline for the "gpt2" checkpoint, using the configuration
# loaded for that same checkpoint.
config = AutoConfig.from_pretrained("gpt2")
generator = pipeline("text-generation", model="gpt2", config=config)

# Fixed seed so that sampling is repeatable between runs.
# NOTE(review): presumably this seeds every RNG the pipeline draws from -- confirm.
set_seed(42)

In [2]:
# One-off generation step, left commented out: as written it draws 100 samples of
# up to 250 tokens from `generator` at temperature T and stores the resulting
# texts in data_high.pkl. Presumably disabled so that re-running the notebook
# reuses the stored samples instead of regenerating them.
#data = [text['generated_text'] for text in generator("Hello, I'm a language model,", max_length=250, num_return_sequences=100, temperature = T)]
#with open("data_high.pkl", "wb") as file:
#   pickle.dump(data, file)

# Load the three pickled corpora analysed below.
# NOTE(review): presumably data.pkl / data_low.pkl / data_high.pkl hold samples
# generated at default / low / high temperature -- confirm how each was produced.
# SECURITY: pickle.load can execute arbitrary code; only load files you created.
with open("data.pkl", "rb") as file:
    data = pickle.load(file)
    
with open("data_low.pkl", "rb") as file:
    data_low = pickle.load(file)
    
with open("data_high.pkl", "rb") as file:
    data_high = pickle.load(file)

In [25]:
# Read llama.csv as raw text: every line of the file (trailing newline included)
# becomes one entry of data_llama. The file is NOT parsed as CSV.
# NOTE(review): if the file has a header row it is kept as an entry too -- confirm
# that each line really is one generated text.
with open("llama.csv", "r") as file:
    data_llama = file.readlines()

In [26]:
def data_to_obs(data, doc_param = {"min":10, "step":5}):
    """Compute word-frequency and vocabulary-growth observables for a corpus.

    Parameters
    ----------
    data : sequence of str
        One text per document.
    doc_param : dict
        ``"min"`` is the first prefix length and ``"step"`` the increment, both
        in space-separated words, used to cut every document into growing
        prefixes. The default dict is only read, never modified.

    Returns
    -------
    N : int
        Vocabulary size (number of distinct tokens found by the vectorizer).
    f : pandas.Series
        Relative frequency of each token inside a document, averaged over the
        documents and sorted in descending order (index: token).
    M : pandas.Series
        Number of tokens in each document.
    h : pandas.Series
        Number of distinct tokens in each document.
    doc_M, doc_h : list
        The per-document values of ``M`` and ``h`` followed by the same two
        quantities measured on every prefix of every document.
    """
    vec = CountVectorizer()
    # One fit_transform learns the vocabulary and counts the tokens; the extra
    # fit() that used to precede it did the same work twice.
    counts = vec.fit_transform(data)
    # Rows are tokens, columns are documents.
    X = pd.DataFrame(data=counts.toarray().T, index=vec.get_feature_names_out())

    M = X.sum(0)
    f = X.divide(M, axis=1).mean(1).sort_values(ascending=False)
    h = (X != 0).sum(0)
    N = len(f)

    doc_M = list(M.values)
    doc_h = list(h.values)
    for doc in data:
        words = doc.split(" ")
        for n_words in range(doc_param["min"], len(words), doc_param["step"]):
            prefix_counts = vec.transform([" ".join(words[:n_words])])
            # Measure the prefix size with the vectorizer, exactly like M above.
            # Appending the number of space-separated words instead would mix
            # two different units in doc_M, because the vectorizer's tokens are
            # not the same as the space-separated words.
            doc_M.append(prefix_counts.sum())
            doc_h.append((prefix_counts > 0).sum())
    return N, f, M, h, doc_M, doc_h

In [32]:
# Observables of the reference corpus.
(N, f, M, h, doc_M, doc_h) = data_to_obs(data)

# Same observables for the two other pickled corpora.
(N_low, f_low, M_low, h_low, doc_M_low, doc_h_low) = data_to_obs(data_low)
(N_high, f_high, M_high, h_high, doc_M_high, doc_h_high) = data_to_obs(data_high)

# The llama texts are cut into prefixes starting at 1 word, in steps of 3.
(
    N_llama,
    f_llama,
    M_llama,
    h_llama,
    doc_M_llama,
    doc_h_llama,
) = data_to_obs(data_llama, doc_param={"min": 1, "step": 3})

In [33]:
# Rank-frequency plot on log-log axes: mean relative frequency of every token
# against its rank, for the reference corpus and for the llama corpus.
fig = go.Figure()

# Ranks are given explicitly and start at 1. Without an x, plotly uses 0-based
# positions; position 0 cannot be drawn on a logarithmic axis, so the most
# frequent token would be missing and every other point would sit one rank to
# the left of the reference line below.
fig.add_scatter(x=np.arange(1, len(f) + 1), y=f, mode="markers", name="gpt")
fig.add_scatter(x=np.arange(1, len(f_llama) + 1), y=f_llama, mode="markers", name="llama")

# Reference line f = 0.1 / rank, drawn from rank 1 to the last rank of `f`.
fig.add_scatter(x=[1,len(f)], y=[1e-1, 1e-1/len(f)], mode="lines", name="0.1/rank")

fig.update_layout(
    {
        "xaxis":{
            "title": "rank",
            "type": "log"
        },
        "yaxis":{
            "title": "f",
            "type": "log"
        },
        "width":1500,
        "height":1000
    }
)
fig.show()
fig.write_image("zipf.pdf")

In [34]:
# Reference curve h_t(m) = N - sum_i exp(-f_i * m), evaluated at every document
# size m in M (presumably the expected number of distinct words after m draws
# with frequencies f -- confirm). The sum over the vocabulary is done by NumPy
# instead of a Python-level loop over every (word, document) pair.
h_t = [N - np.exp(-f.to_numpy() * m).sum() for m in M]

In [35]:
# Vocabulary growth (distinct words h against text size M) for the three pickled
# corpora, together with the reference curve h_t.
fig = go.Figure()
fig.add_scatter(x=doc_M, y=doc_h, mode="markers", name="data")
fig.add_scatter(x=doc_M_low, y=doc_h_low, mode="markers", name="low_t")
fig.add_scatter(x=doc_M_high, y=doc_h_high, mode="markers", name="high_t")

# Reference curve, evaluated at the full-document sizes M.
fig.add_scatter(x=M, y=h_t, mode="markers", name="teo")


fig.update_layout(
    {
        "xaxis":{
            "title": "M"
        },
        "yaxis":{
            "title": "h"
        },
        "width":1500,
        "height":1000
    }
)

fig.show()
# Saved under its own name: the gpt-vs-llama figure produced later in the
# notebook is written to "heaps.pdf", so using that name here would mean this
# figure is overwritten (and lost) whenever the notebook is run top to bottom.
fig.write_image("heaps_temperature.pdf")

In [36]:
# Vocabulary growth (distinct words h against text size M): reference corpus
# against the llama corpus, plus the reference curve h_t at the document sizes M.
fig = go.Figure()

fig.add_trace(go.Scatter(x=doc_M, y=doc_h, mode="markers", name="gpt"))
fig.add_trace(go.Scatter(x=doc_M_llama, y=doc_h_llama, mode="markers", name="llama"))
fig.add_trace(go.Scatter(x=M, y=h_t, mode="markers", name="teo"))

fig.update_layout(
    xaxis={"title": "M"},
    yaxis={"title": "h"},
    width=1500,
    height=1000,
)

fig.show()
fig.write_image("heaps.pdf")

In [37]:
def get_avg(M, h, bins=35, min_count=50):
    """Mean and variance of ``h`` inside equal-width bins of ``M``.

    Parameters
    ----------
    M : array-like
        Values that define the bins (text sizes).
    h : array-like
        Values to average inside each bin, same length as ``M``.
    bins : int or sequence, default 35
        Passed to ``scipy.stats.binned_statistic``.
    min_count : int, default 50
        Only bins holding strictly more than this many points are returned.

    Returns
    -------
    (numpy.ndarray, numpy.ndarray)
        Per-bin mean of ``h`` and per-bin variance of ``h`` (squared standard
        deviation), restricted to the sufficiently populated bins.
    """
    bin_means, _, _ = binned_statistic(M, h, statistic='mean', bins=bins)
    bin_std, _, _ = binned_statistic(M, h, statistic='std', bins=bins)
    # The occupancy must be counted on the SAME data as the mean and the std.
    # Counting it on unrelated module-level arrays would apply another
    # dataset's mask (built on different bin edges) to these statistics.
    bin_counts, _, _ = binned_statistic(M, h, statistic='count', bins=bins)
    populated = bin_counts > min_count
    return bin_means[populated], (bin_std**2)[populated]


def fit_func_1(x, C):
    """Linear model: variance proportional to the mean."""
    return C*x


def fit_func_2(x, C):
    """Quadratic model: variance proportional to the squared mean."""
    return C*(x**2)


# Variance of h against mean of h (both per bin of M) for every corpus, with a
# one-parameter linear and a one-parameter quadratic fit drawn on top.
fig = go.Figure()

# Labels match the trace names used for these corpora in the vocabulary-growth
# plots, so the four groups of legend entries can be told apart.
labels = ["gpt", "low_t", "high_t", "llama"]
sizes = [doc_M, doc_M_low, doc_M_high, doc_M_llama]
# Each size list is paired with the matching doc_h_* list. In particular the
# high-temperature sizes go with doc_h_high, not with doc_M_high again.
vocab = [doc_h, doc_h_low, doc_h_high, doc_h_llama]

for label, x, y in zip(labels, sizes, vocab):
    bin_means, bin_vars = get_avg(x, y)
    fig.add_scatter(x=bin_means, y=bin_vars, mode="markers", name=f"data ({label})")
    popt, pcov = curve_fit(fit_func_1, bin_means, bin_vars)
    fig.add_scatter(x=bin_means, y=fit_func_1(bin_means, *popt), mode="lines", name=f"y=x ({label})")
    popt, pcov = curve_fit(fit_func_2, bin_means, bin_vars)
    fig.add_scatter(x=bin_means, y=fit_func_2(bin_means, *popt), mode="lines", name=f"y=x^2 ({label})")


fig.update_layout(
    {
        "xaxis":{
            "title": "<h>",
            "type": "log"
        },
        "yaxis":{
            "title": "var(h)",
            "type": "log"
        },
        "width":1000,
        "height":500
    }
)
fig.show()
fig.write_image("taylor.pdf")