In [2]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn import metrics

In [6]:
# Training set: a headerless, space-separated file with one
# (entropy, guesses) pair per row. Column labels are attached right away.
data = pd.read_csv(
    'data/entropy_rem.dat',
    sep=' ',
    header=None,
).set_axis(['entropy', 'guesses'], axis='columns')

In [11]:
# Selecting with a one-element column list keeps the result 2-D, giving
# (n_samples, 1) arrays for both the feature and the target.
X = data[['entropy']].values
Y = data[['guesses']].values

In [None]:
# Ordinary least squares of guesses on entropy; fit() returns the estimator
# itself, so construction and fitting can be chained.
regressor = LinearRegression().fit(X, Y)

In [16]:
# Fitted slope of guesses with respect to entropy (this value is hard-coded in sk_lr).
regressor.coef_

array([[0.25066396]])

In [17]:
# Fitted intercept: predicted guesses at zero entropy (this value is hard-coded in sk_lr).
regressor.intercept_

array([1.16938445])

In [20]:
# Held-out set, stored in the same headerless, space-separated layout as
# the training file.
test_data = pd.read_csv(
    'data/test_entropy_rem.dat',
    sep=' ',
    header=None,
).set_axis(['entropy', 'guesses'], axis='columns')

In [21]:
# Same (n_samples, 1) column layout as the training arrays.
X_Test = test_data[['entropy']].values
Y_Test = test_data[['guesses']].values

In [22]:
# Score the fitted linear model on the held-out set.
pred_guesses = regressor.predict(X_Test)
mse = metrics.mean_squared_error(y_true=Y_Test, y_pred=pred_guesses)
cod = metrics.r2_score(y_true=Y_Test, y_pred=pred_guesses)
print(
    f'Mean squared error: {mse:.2f}',
    f'Coefficient of determination: {cod:.2f}',
    sep='\n',
)

Mean squared error: 0.41
Coefficient of determination: 0.60


In [27]:
from sklearn import linear_model, svm, tree, ensemble

# Candidate models, all left at their scikit-learn defaults apart from the seed.
# The tree-based estimators are randomised, so they get a fixed random_state;
# without it their scores can change from one run of the notebook to the next.
RANDOM_STATE = 0

regressors = [
    linear_model.LinearRegression(),
    linear_model.Ridge(),
    linear_model.Lasso(),
    linear_model.ElasticNet(),
    svm.SVR(),
    tree.DecisionTreeRegressor(random_state=RANDOM_STATE),
    ensemble.RandomForestRegressor(random_state=RANDOM_STATE),
    ensemble.GradientBoostingRegressor(random_state=RANDOM_STATE),
]

In [28]:
# Fit every candidate on the training set and score it on the held-out set.
#
# The loop variable is deliberately NOT called `regressor`: that name holds the
# LinearRegression fitted in an earlier cell, and rebinding it here would make
# re-running the coef_/intercept_ cells report on whichever model came last.
for model in regressors:
    # Estimators expect a 1-D target; passing the (n, 1) column vector makes
    # SVR and the ensemble models emit a DataConversionWarning.
    model.fit(X, Y.ravel())
    pred_guesses = model.predict(X_Test)
    mse = metrics.mean_squared_error(Y_Test, pred_guesses)
    cod = metrics.r2_score(Y_Test, pred_guesses)
    print(type(model).__name__)
    print(f'Mean squared error: {mse:.2f}')
    print(f'Coefficient of determination: {cod:.2f}')
    print()

LinearRegression
Mean squared error: 0.41
Coefficient of determination: 0.60

Ridge
Mean squared error: 0.41
Coefficient of determination: 0.60

Lasso
Mean squared error: 0.68
Coefficient of determination: 0.33

ElasticNet
Mean squared error: 0.52
Coefficient of determination: 0.49


  y = column_or_1d(y, warn=True)


SVR
Mean squared error: 0.41
Coefficient of determination: 0.59

DecisionTreeRegressor
Mean squared error: 0.42
Coefficient of determination: 0.58

RandomForestRegressor
Mean squared error: 0.42
Coefficient of determination: 0.58


  return fit_method(estimator, *args, **kwargs)
  y = column_or_1d(y, warn=True)


GradientBoostingRegressor
Mean squared error: 0.42
Coefficient of determination: 0.59


In [32]:
# Spot-check ln(2.906 + 2.546 * entropy), the formula used by exp_glm, at entropy = 1.584963.
np.log(2.906 + 2.546 * 1.584963)

1.9374913528012172

In [33]:
# Same formula as the previous spot-check, evaluated at entropy = 5.614710.
np.log(2.906 + 2.546 * 5.614710)

2.8449705249735047

In [37]:
# Spot-check exp(0.1318 + 0.1394 * entropy), the formula used by ln_glm, at entropy = 1.584963.
np.exp(0.1318 + 0.1394 * 1.584963)

1.422966592752472

In [60]:
def bluebrown(ent):
    """Heuristic estimate of the expected score given `ent` bits of entropy.

    The estimate is the sum of two terms:

    * A floor that assumes the puzzle is certainly solved within the next two
      guesses: with probability ``2**-ent`` the next guess is the answer
      (score 1), otherwise exactly one more guess is needed (score 2).
    * A linear correction for the uncertainty that usually remains after the
      next guess. It is a line through the origin with slope ``1.5 / 11.5``,
      chosen so the total is about 3.5 at 11.5 bits, the average score
      noted for that entropy.

    Works element-wise on float scalars and NumPy float arrays.
    """
    p_next_guess_hits = 2 ** (-ent)
    floor_score = p_next_guess_hits + 2 * (1 - p_next_guess_hits)
    uncertainty_penalty = 1.5 * ent / 11.5
    return floor_score + uncertainty_penalty

def exp_glm(ent):
    """Predict guesses as ``ln(2.906 + 2.546 * ent)``.

    The two coefficients are hard-coded; presumably they come from a GLM
    fitted outside this notebook -- TODO confirm their provenance.
    """
    intercept, slope = 2.906, 2.546
    linear_term = intercept + slope * ent
    return np.log(linear_term)

def ln_glm(ent):
    """Predict guesses as ``exp(0.1318 + 0.1394 * ent)``.

    The two coefficients are hard-coded; presumably they come from a GLM
    fitted outside this notebook -- TODO confirm their provenance.
    """
    intercept, slope = 0.1318, 0.1394
    linear_term = intercept + slope * ent
    return np.exp(linear_term)

def sk_lr(ent):
    """Straight-line prediction of guesses from entropy.

    The intercept and slope are the rounded values displayed by the
    ``regressor.intercept_`` and ``regressor.coef_`` cells of this notebook.
    """
    intercept = 1.169384
    slope = 0.250664
    return intercept + slope * ent

def np_fit(ent):
    """Predict guesses as ``1.182898 + 0.573493 * log2(ent)``.

    The coefficients are the rounded values displayed by the
    ``pfit.convert().coef`` cell of this notebook.

    Parameters
    ----------
    ent : float or array-like
        Entropy value(s). Non-positive entries (where log2 is undefined)
        are treated as ``log2 = 0``, so they map to the intercept.

    Returns
    -------
    numpy.ndarray or numpy.float64
        Predicted guesses, with the same shape as ``ent``.
    """
    # Coerce to a float array first. The `out` buffer takes its dtype from the
    # input, so an integer array would make log2 fail when writing floats into
    # it, and a plain list does not support the `ent > 0` comparison.
    ent = np.asarray(ent, dtype=float)
    log_ent = np.log2(ent, out=np.zeros_like(ent), where=(ent > 0))
    return 1.182898 + 0.573493 * log_ent

In [61]:
# Score each closed-form approximation on the held-out set with the same
# two metrics used for the fitted models.
for metric in (bluebrown, exp_glm, ln_glm, sk_lr, np_fit):
    print(metric.__name__)
    Y_approx = metric(X_Test)
    mse = metrics.mean_squared_error(y_true=Y_Test, y_pred=Y_approx)
    cod = metrics.r2_score(y_true=Y_Test, y_pred=Y_approx)
    print(
        f'Mean squared error: {mse:.2f}',
        f'Coefficient of determination: {cod:.2f}',
        sep='\n',
    )
    

bluebrown
Mean squared error: 0.39
Coefficient of determination: 0.61
exp_glm
Mean squared error: 0.40
Coefficient of determination: 0.61
ln_glm
Mean squared error: 0.47
Coefficient of determination: 0.54
sk_lr
Mean squared error: 0.41
Coefficient of determination: 0.60
np_fit
Mean squared error: 0.43
Coefficient of determination: 0.58


In [29]:
# Preview the first rows of the held-out set to check the column labels and values.
test_data.head()

Unnamed: 0,entropy,guesses
0,6.83289,3
1,1.584963,2
2,0.0,1
3,5.61471,3
4,3.321928,2


In [49]:
# 1-D views of the training columns for the NumPy polynomial fit
# (unlike X and Y, these are not reshaped into column vectors).
x, y = (data[column].values for column in ('entropy', 'guesses'))


In [53]:
from numpy.polynomial import Polynomial
# Degree-1 least-squares fit of guesses against log2(entropy).
# Entries with entropy <= 0 are left at 0 instead of producing -inf/NaN.
pfit = Polynomial.fit(
    x=np.log2(x, where=(x > 0), out=np.zeros_like(x)),
    y=y,
    deg=1,
)

In [62]:
# Polynomial.fit works in a scaled domain; convert() expresses the result in the
# unscaled variable, giving [intercept, slope] in log2(entropy) (hard-coded in np_fit).
pfit.convert().coef

array([1.18289791, 0.57349342])