In [None]:
import pandas as pd
import os
from sklearn.linear_model import LinearRegression
import numpy as np
import matplotlib.pyplot as plt

In [None]:
# Read the processed beer table; the path is relative to the notebook's
# working directory (two levels up, then data/processed/beers.csv).
beers_csv = os.path.join("..", "..", "data", "processed", "beers.csv")
BEERS = pd.read_csv(beers_csv)
# Last expression of the cell: preview the first rows.
BEERS.head()

# Regression

In [None]:
# Predictor columns for the regression, written once and reused below.
feature_cols = [
    "alcohol",
    "astringency",
    "bitter",
    "body",
    "fruits",
    "hoppy",
    "malty",
    "salty",
    "sour",
    "spices",
    "sweet",
]
# Keep only the predictors and the target, dropping rows with any missing value.
DATA = BEERS[[*feature_cols, "ave_rating"]].dropna()
X = DATA[feature_cols]
# Double brackets: y is a one-column DataFrame, not a Series.
y = DATA.loc[:, ["ave_rating"]]

In [None]:
# Fit a linear regression of ave_rating (y) on the feature columns (X).
# The trailing ';' keeps the estimator repr out of the cell output.
reg = LinearRegression()
reg.fit(X, y);

In [None]:
# Bar chart of the fitted coefficients, one bar per predictor.
# The labels come from X (the frame the model was fitted on) instead of a
# hand-copied list, so they cannot drift out of sync with reg.coef_.
fig, ax = plt.subplots()
# reg.coef_[0]: first (and only) row of the coefficient array.
ax.bar(list(X.columns), reg.coef_[0])
ax.tick_params(axis="x", labelrotation=90)
ax.set_xlabel("feature")
ax.set_ylabel("regression coefficient")
ax.set_title("Linear regression of ave_rating on beer features")
plt.show()

In [None]:
# Write one "feature,coefficient" row per predictor (no header, no index) to
# data/website_preparation/regression_coeffs.csv.
# Feature names are read from X.columns rather than a duplicated literal list:
# with zip() a stale list would silently truncate or mislabel the rows.
coeffs_csv = os.path.join("..", "..", "data", "website_preparation", "regression_coeffs.csv")
pd.DataFrame(
    {"feature": list(X.columns), "coefficient": reg.coef_[0]}
).to_csv(coeffs_csv, header=False, index=False)

# Gravity Balls

In [None]:
# Build one standardized aroma profile per meta_style and give every style a
# random starting position in the plane.
aromes = ["astringency", "bitter", "body", "fruits", "sour"]

# Fixed seed so that Restart & Run All reproduces the same start positions.
# This seeds NumPy's global generator, so later np.random.* calls in the
# notebook become deterministic as well.
np.random.seed(42)

# Exclude the "Other" and "alcohol-free" styles. The .copy() makes DATA an
# independent frame, so the column assignment below cannot trigger
# SettingWithCopyWarning.
DATA = BEERS.loc[
    ~BEERS["meta_style"].isin(["Other", "alcohol-free"]),
    ["meta_style", *aromes],
].copy()

# z-score every aroma column over the remaining beers, then average per style.
DATA[aromes] = DATA[aromes].apply(lambda x: (x - x.mean()) / x.std())
DATA = DATA.groupby("meta_style").mean()

# Uniform start coordinates in [-50, 50). 1-D arrays: one value per style.
n_styles = DATA.shape[0]
DATA["posx"] = 100 * np.random.random(size=n_styles) - 50
DATA["posy"] = 100 * np.random.random(size=n_styles) - 50

# Show the random start layout with each point labelled by its style.
plt.scatter(DATA["posx"], DATA["posy"])
for (xi, yi, ni) in zip(DATA["posx"], DATA["posy"], DATA.index):
    plt.text(xi, yi, ni, va='bottom', ha='center')
plt.show()

In [None]:
def dist(x, y, method="cos"):
    """Compare two 1-D NumPy vectors.

    Parameters
    ----------
    x, y : numpy.ndarray
        Vectors of equal length.
    method : {"cos", "euclid"}, default "cos"
        "cos"    -> x.y / (|x| * |y|), i.e. the cosine *similarity*
                    (larger means more alike; this is not a distance).
        "euclid" -> Euclidean norm of x - y.

    Returns
    -------
    float
        The value selected by ``method``.

    Raises
    ------
    ValueError
        If ``method`` is neither "cos" nor "euclid".
    """
    if method == "cos":
        return x.dot(y) / (np.linalg.norm(x) * np.linalg.norm(y))
    if method == "euclid":
        return np.linalg.norm(x - y)
    # Fail loudly instead of falling off the end and returning None.
    raise ValueError(f"unknown method {method!r}; expected 'cos' or 'euclid'")

# Symmetric matrix of pairwise Euclidean distances between the style profiles
# (rows of DATA restricted to the aroma columns).
profiles = np.array(DATA[aromes])
n_profiles = DATA.shape[0]
distance_matrix = np.zeros((n_profiles, n_profiles))
for i, profile_i in enumerate(profiles):
    # Only the upper triangle (diagonal included) is computed; each value is
    # mirrored into the lower triangle.
    for j in range(i, n_profiles):
        d = dist(profile_i, profiles[j], method="euclid")
        distance_matrix[i, j] = distance_matrix[j, i] = d

In [None]:
# Iteratively move every style under two pairwise terms until the total
# squared displacement of one sweep falls below 1e-8 (or 1000 sweeps):
#   * a push away from the other point, of magnitude repuls / r
#   * a pull towards it, proportional to the offset, with factor
#     attract / distance_matrix[i, j]
attract = 10
repuls = 1
stepsize = 0.01
skip_prob = 0.02  # each pair is left out of a force evaluation with this probability

# Dedicated seeded generator so the random pair skipping is reproducible.
rng = np.random.default_rng(42)

# Work on a plain (n, 2) array and write it back once at the end. Updating
# through DATA["posx"].iloc[i] += ... is chained assignment: it warns on older
# pandas and does not modify DATA at all when Copy-on-Write is active.
pos = np.array(DATA[["posx", "posy"]], dtype=float)
n_points = pos.shape[0]

step = []  # total squared displacement per sweep, for the convergence plot
for _ in range(1000):
    temp = 0.0
    for i in range(n_points):
        dx = np.zeros(2)
        for j in range(n_points):
            if i == j or rng.random() <= skip_prob:
                continue
            # Named `offset` (not `dist`) so the dist() function is not shadowed.
            offset = pos[j] - pos[i]
            gap = np.linalg.norm(offset)
            if gap == 0.0:
                # Coincident points: direction undefined and the pull is zero.
                continue
            dx -= repuls * offset / gap**2
            dx += attract / distance_matrix[i, j] * offset
        # Positions are updated in place, so later points in the same sweep
        # already see the moved point.
        pos[i] += stepsize * dx
        temp += stepsize * stepsize * dx.dot(dx)
    step.append(temp)
    if temp < 1e-8:
        break

DATA["posx"] = pos[:, 0]
DATA["posy"] = pos[:, 1]

# Convergence curve on a log scale.
plt.plot(step)
plt.yscale('log')
plt.show()

In [None]:
# Rescale each coordinate column to zero mean and unit standard deviation.
for coord in ("posx", "posy"):
    column = DATA[coord]
    DATA[coord] = (column - column.mean()) / column.std()

# Scatter the final layout and label every point with its index value.
plt.scatter(DATA["posx"], DATA["posy"])
for label, px, py in DATA[["posx", "posy"]].itertuples():
    plt.text(px, py, label, va='bottom', ha='center')
plt.show()

In [None]:
# Write "index,posx,posy" rows (index kept, no header line) to
# data/website_preparation/balls_pos.csv.
balls_csv = os.path.join("..", "..", "data", "website_preparation", "balls_pos.csv")
DATA[["posx", "posy"]].to_csv(balls_csv, header=False, index=True)