In [None]:
import numpy as np
import pandas as pd
import  math
from scipy import optimize
from itertools import combinations

In [2]:
from sklearn import linear_model
from sklearn import ensemble
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, roc_curve, roc_auc_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import PolynomialFeatures

# machine learning
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC, LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import Perceptron
from sklearn.linear_model import SGDClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LinearRegression

In [3]:
groups = pd.read_csv('groupstage.csv')
data = pd.read_csv('Past_International_Matches.csv')
groups.set_index(['Country_Name'])


Unnamed: 0_level_0,Group,conf_coeff,Rank
Country_Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Qatar,A,0.85,50
Ecuador,A,1.0,44
Senegal,A,0.85,18
Netherlands,A,0.99,8
England,B,0.99,5
IR Iran,B,0.85,20
USA,B,0.85,16
Wales,B,0.99,19
Argentina,C,1.0,3
Saudi Arabia,C,0.85,51


In [4]:
groups.head()

Unnamed: 0,Country_Name,Group,conf_coeff,Rank
0,Qatar,A,0.85,50
1,Ecuador,A,1.0,44
2,Senegal,A,0.85,18
3,Netherlands,A,0.99,8
4,England,B,0.99,5


In [5]:
data['tvalue_difference'] = data['tvalue_home'] - data['tvalue_away']
data['is_won'] = (data['home_team_result'] == 'Win') * 1

In [6]:
X = data.loc[:, ['mean_coeff', 'home_team_fifa_rank', 'away_team_fifa_rank', 'importance']]
Y = data['is_won']
print(X)


       mean_coeff  home_team_fifa_rank  away_team_fifa_rank  importance
0           1.000                   59                   22         2.5
1           0.925                    8                   14         1.0
2           1.000                   35                   94         2.5
3           0.850                   65                   86         1.0
4           1.000                   67                    5         2.5
...           ...                  ...                  ...         ...
23916       0.990                  180                  153         2.5
23917       0.990                  192                  135         2.5
23918       0.925                   28                   60         1.0
23919       0.850                   23                   35         1.0
23920       0.850                   29                   32         1.0

[23921 rows x 4 columns]


In [7]:
x_train, x_test, y_train, y_test = train_test_split(
    X, Y, test_size=0.5, train_size = 0.5, shuffle = True)

In [8]:
logreg = LogisticRegression()
logreg.fit(x_train, y_train)
log_pred = logreg.predict(x_test)
acc_log = round(logreg.score(x_test, y_test) * 100, 2)
acc_log

67.76

In [9]:
m, n = X.shape

X = np.concatenate([np.ones((m, 1)), X], axis=1)

In [10]:
def sigmoid(z):
    z = np.array(z)
    g = np.zeros(z.shape)
    g = 1 / (1 + np.exp(-z))
    return g

In [11]:
# def costFunction(theta, X, Y):
    
#     m = Y.size 

#     J = 0
#     grad = np.zeros(theta.shape)

#     h = sigmoid(X.dot(theta.T))

#     J = (-1/m) * sum((Y*np.log(h)) + (1-Y)*np.log(1-h))
#     grad = (1/m) * (h - Y).dot(X)
    
#     return J, grad

In [12]:
# initial_theta = np.zeros(n+1)
# options= {'maxiter': 400}


# res = optimize.minimize(costFunction,
#                         initial_theta,
#                         (X, Y),
#                         jac=True,
#                         method='TNC',
#                         options=options)


# cost = res.fun

# theta = res.x

In [13]:
def predict(theta, X):
    m = X.shape[0] 
    p = np.zeros(m)

    h = sigmoid(X.dot(theta.T))

    p = np.round(h)
    
    return p

In [14]:
# prob = sigmoid(np.dot(X[1], theta))
# print(prob)
# p = predict(theta, X)
# print(np.mean(p == Y) * 100)

In [15]:
def costFunctionReg(theta, X, Y, lambda_):
   
    m = Y.size  

    J = 0
    grad = np.zeros(theta.shape)

    h = sigmoid(X.dot(theta.T))
    theta_ = theta
    theta_[0] = 0
    J = -(1/m)*(np.sum(Y*np.log(h) + (1-Y)*np.log(1-h))) + (lambda_/(2*m)) * (np.sum(np.square(theta_)))
    grad = (1/m) * (h - Y).dot(X)
    
    grad = (1/m)* (h-Y).dot(X) + (lambda_ / m)*(theta_)
    
    return J, grad

In [16]:
initial_theta = np.zeros(X.shape[1])

lambda_ = 1

options= {'maxfun': 100}

res = optimize.minimize(costFunctionReg,
                        initial_theta,
                        (X, Y, lambda_),
                        jac=True,
                        method='TNC',
                        options=options)

cost = res.fun
theta = res.x

p = predict(theta, X)


In [17]:
prob = sigmoid(np.dot(X[0], theta))
print(prob)
p = predict(theta, X)
print(np.mean(p == Y) * 100)

0.2796042347717337
67.45119351197692


## Group Stage set-up

In [18]:
teams_groups = groups.set_index(['Country_Name'])
teams_groups['points'] = 0
teams_groups['coefficient'] = 0
teams_groups['total_points'] = 0
teams_groups['probability'] = 0

In [21]:

for group in set(groups['Group']):
    print('====Group {}===='.format(group))
    for home, away in combinations (groups.query('Group == "{}"'.format(group)).values, 2):
        print("{} vs {} : " .format(home[0], away[0], end=''))
        row = pd.DataFrame(np.array([[np.nan, np.nan, np.nan, True, True]]))
#         teams_groups.loc[:, ['probability']] += (1-prob)
#         teams_groups.loc[:, ['probability']] += prob
        home = home[0]
        away = away[0]

        home_coeff = groups.loc[home, 'conf_coeff']
        away_coeff = groups.loc[away,'conf_coeff'] 
        margin = 0.05
        points = 0 
        if prob <= 0.5 - margin:
            print("{} wins with a probability of {:2f}".format(away, (1-prob)))
        
        

====Group A====
Qatar vs Ecuador : 


KeyError: 'Qatar'