In [1]:
import pandas as pd, numpy as np, matplotlib.pyplot as plt, seaborn as sns
import sklearn

In [2]:
import pickle

# Load the pickled mapping; later cells use its keys as input strings (xs)
# and its values as regression targets (Y).
# NOTE: pickle.load can execute arbitrary code -- only open trusted files.
data_fn = '../datasets/sehstr/block_18_stop6.pkl'
with open(data_fn, mode='rb') as pkl_file:
  x_to_r = pickle.load(pkl_file)

In [3]:
# Block table read from JSON. In the cells shown here only its row count is
# used (it sets the width of each one-hot vector).
json_fn = '../datasets/sehstr/block_18.json'
blocks_df = pd.read_json(path_or_buf=json_fn)

In [4]:
# Input strings are the keys of the loaded mapping; peek at the first ten.
xs = [*x_to_r.keys()]
print(xs[0:10])

['000000', '000001', '000002', '000003', '000004', '000005', '000006', '000007', '000008', '000009']


In [5]:
# Featurize

# Alphabet used to turn a character into an integer index (its position in
# this string, as looked up by symbol_ohe via str.index).
# Backslashes are spelled as explicit `\\` escapes: `\(` and `\]` are invalid
# escape sequences that Python only tolerates with a warning. The resulting
# string is character-for-character the same, so every symbol keeps its index.
# NOTE: the backslash occurs twice (positions 68 and 85); str.index() always
# resolves it to the first occurrence.
symbols = '0123456789abcdefghijklmnopqrstuvwxyz' + \
              'ABCDEFGHIJKLMNOPQRSTUVWXYZ!"#$%&\\()*+,-./:;<=>?@[\\]^_`{|}~'
# Length of each per-symbol one-hot vector: one slot per row of blocks_df.
num_blocks = blocks_df.shape[0]

import functools

@functools.cache
def symbol_ohe(symbol):
  """Return the one-hot vector for a single alphabet symbol.

  The vector has length `num_blocks` and holds 1.0 at the position of
  `symbol` within `symbols`, and 0.0 everywhere else.

  Results are memoized with `functools.cache`, so every call with the same
  symbol returns the *same* array object. The array is therefore marked
  read-only: an accidental in-place write by a caller would otherwise
  silently corrupt the cached vector for all later calls. Take a copy
  (e.g. `.copy()`, or `np.concatenate` as `featurize` does) if a mutable
  array is needed.

  Raises:
    ValueError: if `symbol` does not occur in `symbols`.
    IndexError: if the symbol's position is >= `num_blocks`.
  """
  zs = np.zeros(num_blocks)
  zs[symbols.index(symbol)] = 1.0
  zs.setflags(write=False)  # shared through the cache -> must stay immutable
  return zs

# Sanity check: '1' sits at index 1 of `symbols`, so only position 1 should be set.
print(symbol_ohe('1'))

[0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]


In [6]:
# Featurization

def featurize(x):
  """Encode string `x` as one flat vector.

  Each character is mapped to its one-hot vector via `symbol_ohe`, and the
  vectors are joined end to end in character order.
  """
  per_char_vectors = list(map(symbol_ohe, x))
  return np.concatenate(per_char_vectors)

# Sanity check on the first input string: expect exactly one 1.0 per character.
print(featurize(xs[0]))

[1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]


In [7]:
# Regression targets, taken from the mapping's values. Row i of Y pairs with
# xs[i] as long as x_to_r is a dict that is not modified between the two reads
# (keys() and values() then iterate in the same order).
Y = np.array([*x_to_r.values()])

In [8]:
from tqdm import tqdm

# Build the design matrix: row i is featurize(xs[i]).
# Rows are written straight into a preallocated array instead of collecting a
# Python list of per-row arrays and stacking it afterwards; that avoids holding
# the list and the stacked copy in memory at the same time, which matters when
# xs has tens of millions of entries.
if len(xs) == 0:
  # Nothing to featurize; same result np.array([]) gave for an empty list.
  X = np.array([])
else:
  # The first row fixes the feature width and dtype. This assumes all strings
  # featurize to the same length; a mismatch raises ValueError on assignment.
  first_row = featurize(xs[0])
  X = np.empty((len(xs), first_row.shape[0]), dtype=first_row.dtype)
  # enumerate(tqdm(...)) rather than tqdm(enumerate(...)) so the bar knows the total.
  for row_idx, x in enumerate(tqdm(xs)):
    X[row_idx] = featurize(x)

100%|██████████| 34012224/34012224 [01:58<00:00, 287838.93it/s]


In [19]:
from sklearn.ensemble import HistGradientBoostingRegressor

# Number of leading rows used for fitting. Swap in the smaller value for a
# quick experiment; len(X) fits on everything.
# N_SUBSET = 1000000
N_SUBSET = len(X)

# Fixed seed for reproducibility: with more than 10000 samples the estimator
# enables early stopping by default, and both the train/validation split and
# the subsampling used for binning are random unless random_state is set.
model = HistGradientBoostingRegressor(random_state=0)
model.fit(X[:N_SUBSET], Y[:N_SUBSET])
# NOTE: this is R^2 on the same rows the model was fitted on (a training
# score), not an estimate of performance on unseen strings.
print(model.score(X[:N_SUBSET], Y[:N_SUBSET]))

0.808403188596164


In [20]:
from scipy.stats import pearsonr

# Pearson correlation between predictions and targets on the first
# PEARSONR_SUBSET rows. The cell's last expression shows (r, p-value).
# NOTE(review): this is a prefix of the data, not a random sample. The first
# keys printed earlier are consecutive, so the prefix may cover only a narrow
# slice of the inputs -- confirm before treating r as representative.
PEARSONR_SUBSET = 1000000

subset_predictions = model.predict(X[:PEARSONR_SUBSET])
pearsonr(subset_predictions, Y[:PEARSONR_SUBSET])

(0.9013709040275963, 0.0)

In [21]:
# Persist the fitted model in the current working directory.
with open('sehstr_gbtr.pkl', mode='wb') as model_file:
  pickle.dump(model, model_file)
print('Saved to file.')

Saved to file.
