In [139]:
import numpy as np
import pandas as pd 
import statsmodels.api as sm
import statsmodels.formula.api as smf
import seaborn as sns
from sklearn.preprocessing import scale 
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
from sklearn.metrics import roc_auc_score,roc_curve
import statsmodels.formula.api as smf
import matplotlib.pyplot as plt
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn import tree
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier

import warnings
# Installs a blanket "ignore" filter with no category/module restriction, so every
# warning raised after this point (pandas, scikit-learn, ...) is hidden.
warnings.filterwarnings('ignore')

In [145]:
# Read the match table and discard the "Unnamed: 0" column that came with the CSV.
df = pd.read_csv("new_super_league.csv").drop(columns="Unnamed: 0")
df

Unnamed: 0,week,home,away,goal1,goal2,result,home_form,away_form,home_rank,away_rank
0,6,KASIMPAŞA,GÖZTEPE,0,0,0,1.40,1.2,8.0,10
1,6,GAZİANTEP,KONYASPOR,1,0,1,0.80,1.5,17.0,11
2,6,ERZURUMSPOR,GALATASARAY,1,2,2,1.40,1.4,9.0,6
3,6,BAŞAKŞEHİR,ANTALYASPOR,5,1,1,0.80,1.6,20.0,5
4,6,MALATYASPOR,GENÇLERBİRLİĞİ,2,1,1,1.00,1.0,15.0,19
...,...,...,...,...,...,...,...,...,...,...
365,42,ANTALYASPOR,KONYASPOR,0,0,0,0.25,1.6,16.0,11
366,42,KAYSERİSPOR,FENERBAHÇE,1,2,2,1.20,2.0,17.0,3
367,42,GALATASARAY,MALATYASPOR,3,1,1,3.00,1.6,2.0,14
368,42,GÖZTEPE,BEŞİKTAŞ,1,2,2,1.00,1.8,10.0,1


In [146]:
# Overwrite "result" with the goal difference goal1 - goal2
# (positive, zero or negative instead of the original 0/1/2 coding).
df["result"] = df["goal1"].sub(df["goal2"])
df

Unnamed: 0,week,home,away,goal1,goal2,result,home_form,away_form,home_rank,away_rank
0,6,KASIMPAŞA,GÖZTEPE,0,0,0,1.40,1.2,8.0,10
1,6,GAZİANTEP,KONYASPOR,1,0,1,0.80,1.5,17.0,11
2,6,ERZURUMSPOR,GALATASARAY,1,2,-1,1.40,1.4,9.0,6
3,6,BAŞAKŞEHİR,ANTALYASPOR,5,1,4,0.80,1.6,20.0,5
4,6,MALATYASPOR,GENÇLERBİRLİĞİ,2,1,1,1.00,1.0,15.0,19
...,...,...,...,...,...,...,...,...,...,...
365,42,ANTALYASPOR,KONYASPOR,0,0,0,0.25,1.6,16.0,11
366,42,KAYSERİSPOR,FENERBAHÇE,1,2,-1,1.20,2.0,17.0,3
367,42,GALATASARAY,MALATYASPOR,3,1,2,3.00,1.6,2.0,14
368,42,GÖZTEPE,BEŞİKTAŞ,1,2,-1,1.00,1.8,10.0,1


In [147]:
# The raw goal counts are now folded into "result"; remove them from the table.
df = df.drop(columns=["goal1", "goal2"])
df

Unnamed: 0,week,home,away,result,home_form,away_form,home_rank,away_rank
0,6,KASIMPAŞA,GÖZTEPE,0,1.40,1.2,8.0,10
1,6,GAZİANTEP,KONYASPOR,1,0.80,1.5,17.0,11
2,6,ERZURUMSPOR,GALATASARAY,-1,1.40,1.4,9.0,6
3,6,BAŞAKŞEHİR,ANTALYASPOR,4,0.80,1.6,20.0,5
4,6,MALATYASPOR,GENÇLERBİRLİĞİ,1,1.00,1.0,15.0,19
...,...,...,...,...,...,...,...,...
365,42,ANTALYASPOR,KONYASPOR,0,0.25,1.6,16.0,11
366,42,KAYSERİSPOR,FENERBAHÇE,-1,1.20,2.0,17.0,3
367,42,GALATASARAY,MALATYASPOR,2,3.00,1.6,2.0,14
368,42,GÖZTEPE,BEŞİKTAŞ,-1,1.00,1.8,10.0,1


In [148]:
# Keep only rows in which every column has a value.
df = df.dropna(axis=0, how="any")
df

Unnamed: 0,week,home,away,result,home_form,away_form,home_rank,away_rank
0,6,KASIMPAŞA,GÖZTEPE,0,1.40,1.2,8.0,10
1,6,GAZİANTEP,KONYASPOR,1,0.80,1.5,17.0,11
2,6,ERZURUMSPOR,GALATASARAY,-1,1.40,1.4,9.0,6
3,6,BAŞAKŞEHİR,ANTALYASPOR,4,0.80,1.6,20.0,5
4,6,MALATYASPOR,GENÇLERBİRLİĞİ,1,1.00,1.0,15.0,19
...,...,...,...,...,...,...,...,...
365,42,ANTALYASPOR,KONYASPOR,0,0.25,1.6,16.0,11
366,42,KAYSERİSPOR,FENERBAHÇE,-1,1.20,2.0,17.0,3
367,42,GALATASARAY,MALATYASPOR,2,3.00,1.6,2.0,14
368,42,GÖZTEPE,BEŞİKTAŞ,-1,1.00,1.8,10.0,1


In [149]:
def get_forms(df, team):
    """Collect a team's form value and match week for every match it appears in.

    Parameters
    ----------
    df : pandas.DataFrame
        Match table with "home", "away", "home_form", "away_form" and "week" columns.
    team :
        Value compared against the "home" and "away" columns to select the matches.

    Returns
    -------
    tuple[list, list]
        ``(forms, weeks)`` in the row order of ``df``. ``forms[i]`` is "home_form"
        when the team is the home side of the i-th selected match, otherwise
        "away_form"; ``weeks[i]`` is that match's "week". Both lists are empty when
        the team appears in no row.
    """
    matches = df[(df["home"] == team) | (df["away"] == team)]

    # Walk the needed columns directly instead of materialising each row with
    # ``iloc``: that built a whole row Series per lookup and, on an all-numeric
    # frame, upcast the integer week to float.
    forms = [
        home_form if home == team else away_form
        for home, home_form, away_form in zip(
            matches["home"], matches["home_form"], matches["away_form"]
        )
    ]
    weeks = matches["week"].tolist()

    return forms, weeks

In [150]:
# The next statement is an inert string literal (never assigned or used); it holds
# a coarser grouping of the same teams into tiers 1-6 and 100.
"""tier_dict = {'BEŞİKTAŞ':1, 'GALATASARAY':1, 'FENERBAHÇE':1, 'TRABZONSPOR':2, 'BAŞAKŞEHİR':2, 'SİVASSPOR':3,
             'ALANYASPOR':3, 'GAZİANTEP':3, 'KASIMPAŞA':4, 'ANTALYASPOR':4, 'GÖZTEPE':4, 'GENÇLERBİRLİĞİ':5,
             'KONYASPOR':5, 'DENİZLİSPOR':5, 'RİZESPOR':5, 'MALATYASPOR':6, 'KAYSERİSPOR':6, 'ANKARAGÜCÜ':6,
             'ERZURUMSPOR':100, 'KARAGÜMRÜK':100, 'HATAYSPOR':100}"""
# Active mapping: each team name gets a distinct integer code, 1..21, following
# the order of the list below.
tier_dict = {
    team: code
    for code, team in enumerate(
        [
            'BEŞİKTAŞ', 'GALATASARAY', 'FENERBAHÇE',
            'TRABZONSPOR', 'BAŞAKŞEHİR', 'SİVASSPOR',
            'ALANYASPOR', 'GAZİANTEP', 'KASIMPAŞA',
            'ANTALYASPOR', 'GÖZTEPE', 'GENÇLERBİRLİĞİ',
            'KONYASPOR', 'DENİZLİSPOR', 'RİZESPOR',
            'MALATYASPOR', 'KAYSERİSPOR', 'ANKARAGÜCÜ',
            'ERZURUMSPOR', 'KARAGÜMRÜK', 'HATAYSPOR',
        ],
        start=1,
    )
}

# Size check: number of mapped teams vs. number of distinct home teams in the data.
(len(tier_dict), len(df["home"].unique()))

(21, 21)

In [151]:
# Replace team names in "home"/"away" with their integer codes from tier_dict.
#
# Series.map yields NaN for any value that is not a key of the dict, so a team
# missing from tier_dict -- or a second execution of this cell, when the columns
# already hold codes rather than names -- would silently blank the columns.
# Map into temporaries first and only assign once every row was translated.
home_codes = df["home"].map(tier_dict)
away_codes = df["away"].map(tier_dict)

unmapped = set(df.loc[home_codes.isna(), "home"]) | set(df.loc[away_codes.isna(), "away"])
if unmapped:
    raise ValueError(
        "Values in 'home'/'away' missing from tier_dict "
        f"(was this cell already executed?): {sorted(unmapped, key=str)}"
    )

df["home"] = home_codes
df["away"] = away_codes
df

Unnamed: 0,week,home,away,result,home_form,away_form,home_rank,away_rank
0,6,9,11,0,1.40,1.2,8.0,10
1,6,8,13,1,0.80,1.5,17.0,11
2,6,19,2,-1,1.40,1.4,9.0,6
3,6,5,10,4,0.80,1.6,20.0,5
4,6,16,12,1,1.00,1.0,15.0,19
...,...,...,...,...,...,...,...,...
365,42,10,13,0,0.25,1.6,16.0,11
366,42,17,3,-1,1.20,2.0,17.0,3
367,42,2,16,2,3.00,1.6,2.0,14
368,42,11,1,-1,1.00,1.8,10.0,1


In [152]:
# Features: every column except "result"; target: the "result" column (goal difference).
X = df.drop(columns="result")
y = df["result"]

In [153]:
from sklearn import linear_model, model_selection, metrics

# Hold out 20% of the rows for testing; random_state pins the shuffle.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

In [154]:
# Standardise the features. The scaler is fitted on the training split only and
# the same fitted transform is then applied to the test split.
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [155]:
# Shapes of the scaled feature matrices and of the matching targets.
tuple(arr.shape for arr in (X_train_scaled, X_test_scaled, y_train, y_test))

((295, 7), (74, 7), (295,), (74,))

In [156]:
from sklearn.neighbors import KNeighborsRegressor
from sklearn.model_selection import GridSearchCV,cross_val_score
from sklearn.metrics import mean_squared_error, r2_score

# Candidate neighbour counts for the grid search: 1..14.
knn_params = {"n_neighbors": np.arange(1, 15)}
# NOTE(review): the array displayed below covers 1..29, which is not the grid
# defined above (1..14) -- confirm which range was intended.
np.arange(1, 30)

array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17,
       18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29])

In [157]:
# Base KNN regressor wrapped in a 10-fold grid search over knn_params.
knn = KNeighborsRegressor()
knn_cv_model = GridSearchCV(estimator=knn, param_grid=knn_params, cv=10)

In [158]:
# Run the grid search on the scaled training split.
knn_cv_model.fit(X_train_scaled, y_train)

GridSearchCV(cv=10, estimator=KNeighborsRegressor(),
             param_grid={'n_neighbors': array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14])})

In [159]:
# Neighbour count selected by the grid search.
# NOTE(review): in the recorded run this is 14, the largest candidate in the grid,
# so the optimum may lie beyond the searched range.
knn_cv_model.best_params_["n_neighbors"]

14

In [160]:
# For k = 1..15, record the training RMSE and the 10-fold cross-validated RMSE of a
# KNN regressor fitted on the scaled training split.
RMSE, RMSE_CV = [], []

for k in range(1, 16):
    knn_model = KNeighborsRegressor(n_neighbors=k)
    knn_model.fit(X_train_scaled, y_train)

    # Error on the same rows the model was fitted on.
    y_pred = knn_model.predict(X_train_scaled)
    rmse = np.sqrt(mean_squared_error(y_train, y_pred))

    # The scorer returns negated MSE per fold: average, flip the sign, take the root.
    rmse_cv = np.sqrt(
        -cross_val_score(
            knn_model,
            X_train_scaled,
            y_train,
            cv=10,
            scoring="neg_mean_squared_error",
        ).mean()
    )

    RMSE.append(rmse)
    RMSE_CV.append(rmse_cv)
    print("k =", k, " için RMSE değeri: ",rmse, " RMSE_CV değeri: ",rmse_cv)

k = 1  için RMSE değeri:  0.0  RMSE_CV değeri:  2.1878452108562585
k = 2  için RMSE değeri:  1.0927340923619844  RMSE_CV değeri:  1.9257182568064573
k = 3  için RMSE değeri:  1.2766186505404507  RMSE_CV değeri:  1.8277518660311447
k = 4  için RMSE değeri:  1.3447953659465022  RMSE_CV değeri:  1.766586015292006
k = 5  için RMSE değeri:  1.3870588319858066  RMSE_CV değeri:  1.6918275502456255
k = 6  için RMSE değeri:  1.396255417252573  RMSE_CV değeri:  1.6466819069516712
k = 7  için RMSE değeri:  1.3999555262985943  RMSE_CV değeri:  1.6402084738450171
k = 8  için RMSE değeri:  1.4077945848810256  RMSE_CV değeri:  1.6439058260565076
k = 9  için RMSE değeri:  1.4379689706005927  RMSE_CV değeri:  1.651795725498146
k = 10  için RMSE değeri:  1.4564293462705171  RMSE_CV değeri:  1.6337992532744041
k = 11  için RMSE değeri:  1.4726340589496945  RMSE_CV değeri:  1.6206003787290715
k = 12  için RMSE değeri:  1.4752509442122876  RMSE_CV değeri:  1.6160913579186074
k = 13  için RMSE değeri:  1.47

In [161]:
# Refit a KNN regressor with the grid-search winner (the grid only varies
# n_neighbors, so best_params_ carries exactly that one setting).
knn_tuned = KNeighborsRegressor(**knn_cv_model.best_params_)
knn_tuned.fit(X_train_scaled, y_train)

KNeighborsRegressor(n_neighbors=14)

In [162]:
# Test error: RMSE of the tuned KNN regressor on the held-out test split.
np.sqrt(mean_squared_error(y_test, knn_tuned.predict(X_test_scaled)))

1.597925088694359

In [163]:
# Estimator score on the test split (for scikit-learn regressors, `score` is R^2).
knn_tuned.score(X_test_scaled, y_test)

0.023041888639189656

In [164]:
# Same score on the training split, for comparison with the test value.
knn_tuned.score(X_train_scaled, y_train)

0.22417634906926887

In [165]:
# NOTE(review): identical expression to the earlier "Test Error" cell; it recomputes
# the tuned KNN's test RMSE and yields the same value.
np.sqrt(mean_squared_error(y_test, knn_tuned.predict(X_test_scaled)))

1.597925088694359

In [166]:
# Mean R^2 of the tuned KNN over 10 cross-validation folds of the training split.
cross_val_score(knn_tuned, X_train_scaled, y_train, cv = 10, scoring = "r2").mean()

0.010674771305101537

In [167]:
# Work on a copy so the goal-difference table in df is left untouched.
df2 = df.copy()

# Collapse the goal difference into outcome classes:
#   positive -> 1, negative -> 2, anything else (i.e. 0) is kept as is.
x = [
    1 if diff > 0 else 2 if diff < 0 else diff
    for diff in df2["result"].to_list()
]

df2["result"] = x
df2

Unnamed: 0,week,home,away,result,home_form,away_form,home_rank,away_rank
0,6,9,11,0,1.40,1.2,8.0,10
1,6,8,13,1,0.80,1.5,17.0,11
2,6,19,2,2,1.40,1.4,9.0,6
3,6,5,10,1,0.80,1.6,20.0,5
4,6,16,12,1,1.00,1.0,15.0,19
...,...,...,...,...,...,...,...,...
365,42,10,13,0,0.25,1.6,16.0,11
366,42,17,3,2,1.20,2.0,17.0,3
367,42,2,16,1,3.00,1.6,2.0,14
368,42,11,1,2,1.00,1.8,10.0,1


In [168]:
# Features / class labels taken from the relabelled copy (all rows of df2).
new_x = df2.drop(columns="result")
new_y = df2["result"]

In [169]:
# Apply the scaler that was fitted on X_train to every row of new_x.
# NOTE(review): new_x is built from all of df2, so it contains the training rows
# as well as the test rows.
new_x_scaled = scaler.transform(new_x)
new_x_scaled

array([[-1.71297626, -0.36911068, -0.00450331, ..., -0.27409095,
        -0.53580556, -0.18723072],
       [-1.71297626, -0.5348453 ,  0.32761598, ...,  0.2024719 ,
         0.94603169, -0.01883724],
       [-1.71297626,  1.28823559, -1.49904015, ...,  0.04361762,
        -0.37115698, -0.86080463],
       ...,
       [ 1.64902852, -1.52925306,  0.82579493, ...,  0.36132619,
        -1.52369707,  0.4863432 ],
       [ 1.64902852, -0.03764142, -1.6650998 , ...,  0.67903476,
        -0.20650839, -1.70277203],
       [ 1.64902852, -1.19778381,  0.16155634, ..., -0.27409095,
        -1.1943999 ,  1.49670408]])

In [170]:
# Predicted goal difference from the tuned KNN for every row of new_x_scaled.
# NOTE(review): these rows include the data the model was fitted on, so anything
# computed from `pred` is not a held-out estimate.
pred = knn_tuned.predict(new_x_scaled)

In [171]:
# Turn each predicted goal difference into an outcome class, using the same coding
# as df2["result"]: > 0 -> 1, exactly 0 -> 0, otherwise -> 2.
# NOTE(review): class 0 needs the continuous prediction to equal 0 exactly, so it
# will be assigned very rarely.
pred_list = [1 if value > 0 else 0 if value == 0 else 2 for value in pred]

In [172]:
# Count positions where the predicted class equals the actual class in x, then
# report the hit count and the hit rate.
true_sum = sum(
    predicted == actual for predicted, actual in zip(pred_list, x)
)

print("True Sum: ", true_sum, "  Average: ", true_sum/len(x))

True Sum:  185   Average:  0.5013550135501355


In [173]:
# Ordinary least-squares baseline fitted on the scaled training split.
model = linear_model.LinearRegression().fit(X_train_scaled, y_train)
model

LinearRegression()

In [174]:
# Same names were already imported in the KNN grid cell; re-importing is harmless.
from sklearn.metrics import mean_squared_error, r2_score
# Training RMSE of the linear model.
np.sqrt(mean_squared_error(y_train, model.predict(X_train_scaled)))

1.5400449142221577

In [175]:
# Test RMSE of the linear model on the held-out split.
np.sqrt(mean_squared_error(y_test, model.predict(X_test_scaled)))

1.6461945177120252

In [176]:
# Estimator score (R^2 for scikit-learn regressors) on the training split, then on
# the test split.
print(model.score(X_train_scaled, y_train))
print(model.score(X_test_scaled, y_test))

0.1614643101236719
-0.03687263781684713


In [177]:
# Predicted goal difference from the linear model for every row of new_x_scaled;
# this rebinds `pred`, which previously held the KNN predictions.
# NOTE(review): the rows include the training data, so results derived from `pred`
# are not a held-out estimate.
pred = model.predict(new_x_scaled)

In [178]:
# Turn each predicted goal difference into an outcome class, using the same coding
# as df2["result"]: > 0 -> 1, exactly 0 -> 0, otherwise -> 2.
# NOTE(review): class 0 needs the continuous prediction to equal 0 exactly, so it
# will be assigned very rarely.
pred_list = [1 if value > 0 else 0 if value == 0 else 2 for value in pred]

In [179]:
# Count positions where the predicted class equals the actual class in x, then
# report the hit count and the hit rate.
true_sum = sum(
    predicted == actual for predicted, actual in zip(pred_list, x)
)

print("True Sum: ", true_sum, "  Average: ", true_sum/len(x))

True Sum:  177   Average:  0.4796747967479675
