# Loteca

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

### Ler arquivo

In [2]:
df = pd.read_csv("BRA.csv")

### Nomes dos times

In [27]:
vocab = df['Home'].unique()

In [30]:
vocab

array(['Palmeiras', 'Sport Recife', 'Figueirense', 'Botafogo RJ',
       'Corinthians', 'Internacional', 'Ponte Preta', 'Bahia', 'Cruzeiro',
       'Vasco', 'Atletico GO', 'Flamengo RJ', 'Portuguesa', 'Nautico',
       'Atletico-MG', 'Coritiba', 'Santos', 'Sao Paulo', 'Fluminense',
       'Gremio', 'Vitoria', 'Criciuma', 'Atletico-PR', 'Goias',
       'Chapecoense-SC', 'Avai', 'Joinville', 'Santa Cruz', 'America MG',
       'Parana', 'Ceara'], dtype=object)

### Transformar em índices

In [35]:
word2idx = {word: i for i, word in enumerate(vocab)}

In [45]:
def word2vector(word):
    m = np.zeros(len(word2idx))
    m[word2idx.get(word)] = 1
    return m

In [142]:
# Transforma os dois times em um vetor com o flag do time correspondente
def prepare(H,A):
    vec = [word2vector(H), word2vector(A)]
    return np.array(vec).reshape(np.array(vec).shape[1]*2)

### Função para indexar os jogos

In [5]:
def text_to_vector2(text): 
    return word2idx.get(text, None)


### Indexar os resultados

In [174]:
X = []
y = []

for _, row in df.iterrows():
    X.append(prepare(row['Home'], row['Away']))    
    n = -row['HG'] + row['AG']
    
    if np.isnan(n):
        n = 0
    
    y.append(n)
    
# X = np.array(X).reshape(np.array(X).shape[0],np.array(X).shape[2]*2)

In [176]:
np.array(X).shape

(2579, 62)

### Separar os datasets de treinamento e teste

In [182]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, shuffle=True)

### Suport vector machine

In [178]:
from sklearn.svm import SVR
model = SVR(gamma='scale', C=1.0, epsilon=0.2)

In [169]:
model

SVR(C=1.0, cache_size=200, coef0=0.0, degree=3, epsilon=0.2, gamma='scale',
  kernel='rbf', max_iter=-1, shrinking=True, tol=0.001, verbose=False)

### Random forest

In [163]:
# from sklearn.ensemble import RandomForestRegressor
# from sklearn.datasets import make_regression
# model = RandomForestRegressor(max_depth=2, random_state=0, n_estimators=100)

### Treinar o modelo

In [183]:
model.fit(X_train, y_train)

SVR(C=1.0, cache_size=200, coef0=0.0, degree=3, epsilon=0.2, gamma='scale',
  kernel='rbf', max_iter=-1, shrinking=True, tol=0.001, verbose=False)

In [180]:
model.support_vectors_.shape

(1562, 62)

### Avaliar precisão

In [184]:
res = []
for j in range(len(X_test)):
    r = model.predict([X_test[j]])
    res.append(round(r[0],0) - y_test[j])
print(round((1 - abs(np.mean(res))) * 100,0), '%')

94.0 %


### Lista de times

In [17]:
vocab

array(['Palmeiras', 'Sport Recife', 'Figueirense', 'Botafogo RJ',
       'Corinthians', 'Internacional', 'Ponte Preta', 'Bahia', 'Cruzeiro',
       'Vasco', 'Atletico GO', 'Flamengo RJ', 'Portuguesa', 'Nautico',
       'Atletico-MG', 'Coritiba', 'Santos', 'Sao Paulo', 'Fluminense',
       'Gremio', 'Vitoria', 'Criciuma', 'Atletico-PR', 'Goias',
       'Chapecoense-SC', 'Avai', 'Joinville', 'Santa Cruz', 'America MG',
       'Parana', 'Ceara'], dtype=object)

### Realizar previsão

In [192]:
H = 'Palmeiras'
A = 'Bahia'

p = model.predict([prepare(H,A)])

if p < 0:
    print(H, 'Vence com ', round(abs(p[0]),3),' gol de diferença')
else:
    print(A, 'Vence com', round(abs(p[0]),3),' gol de diferença')

Palmeiras Vence com  0.7  gol de diferença
