In [1]:
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from IPython.display import display

In [2]:
data = pd.read_csv("merged.csv")
data = data.drop(columns="Unnamed: 0")
data = data.drop(columns="Date")
data = data.rename(columns={"HT Red Cards.1":"AT Red Cards"})
data = data.drop(columns=["Time"])

data.head()

Unnamed: 0,HomeTeam,AwayTeam,Home Goals,Away Goals,Winner,Home Team Shots,Away Team Shots,HT Shots on Target,AT Shots on Target,HT Fouls,AT Fouls,HT Corners,AT Corners,HT Yellow Cards,AT Yellow Cards,HT Red Cards,AT Red Cards
0,Fulham,Arsenal,0,3,A,5,13,2,6,12,12,2,3,2,2,0,0
1,Crystal Palace,Southampton,1,0,H,5,9,3,5,14,11,7,3,2,1,0,0
2,Liverpool,Leeds,4,3,H,22,6,6,3,9,6,9,0,1,0,0,0
3,West Ham,Newcastle,0,2,A,15,15,3,2,13,7,8,7,2,2,0,0
4,West Brom,Leicester,0,3,A,7,13,1,7,12,9,2,5,1,1,0,0


In [3]:
data.dtypes

HomeTeam              object
AwayTeam              object
Home Goals             int64
Away Goals             int64
Winner                object
Home Team Shots        int64
Away Team Shots        int64
HT Shots on Target     int64
AT Shots on Target     int64
HT Fouls               int64
AT Fouls               int64
HT Corners             int64
AT Corners             int64
HT Yellow Cards        int64
AT Yellow Cards        int64
HT Red Cards           int64
AT Red Cards           int64
dtype: object

In [4]:
def get_team(row):
    if row['Winner'] == 'H':
        return row['HomeTeam']
    elif row['Winner'] == 'A':
        return row['AwayTeam']
    else:
        return 0

data['WinningTeam'] = data.apply(get_team, axis=1)

In [5]:
data.head()

Unnamed: 0,HomeTeam,AwayTeam,Home Goals,Away Goals,Winner,Home Team Shots,Away Team Shots,HT Shots on Target,AT Shots on Target,HT Fouls,AT Fouls,HT Corners,AT Corners,HT Yellow Cards,AT Yellow Cards,HT Red Cards,AT Red Cards,WinningTeam
0,Fulham,Arsenal,0,3,A,5,13,2,6,12,12,2,3,2,2,0,0,Arsenal
1,Crystal Palace,Southampton,1,0,H,5,9,3,5,14,11,7,3,2,1,0,0,Crystal Palace
2,Liverpool,Leeds,4,3,H,22,6,6,3,9,6,9,0,1,0,0,0,Liverpool
3,West Ham,Newcastle,0,2,A,15,15,3,2,13,7,8,7,2,2,0,0,Newcastle
4,West Brom,Leicester,0,3,A,7,13,1,7,12,9,2,5,1,1,0,0,Leicester


In [6]:
data["Team_code"] = data["WinningTeam"].astype("category").cat.codes
WinningTeam_list = data[["WinningTeam", "Team_code"]].copy().drop_duplicates().sort_values(by=["Team_code"])


In [7]:
WinningTeam_list.head(21)

Unnamed: 0,WinningTeam,Team_code
20,0,0
0,Arsenal,1
16,Aston Villa,2
380,Brentford,3
13,Brighton,4
86,Burnley,5
6,Chelsea,6
1,Crystal Palace,7
5,Everton,8
66,Fulham,9


In [8]:
HomeTeam_dummies = pd.get_dummies(data["HomeTeam"])
AwayTeam_dummies = pd.get_dummies(data["AwayTeam"])

data = pd.concat([data, HomeTeam_dummies], axis=1)
data = pd.concat([data, AwayTeam_dummies], axis=1)

# Drop the original education column
data = data.drop(columns=["HomeTeam", "AwayTeam"])

In [9]:
data = data.drop(columns=["WinningTeam"])

In [10]:
def encode_Winner(Winner):

    if Winner == "H":
        return 1
    else:
        return 0

data["Winner"] = data["Winner"].apply(encode_Winner)

In [11]:
data.head()

Unnamed: 0,Home Goals,Away Goals,Winner,Home Team Shots,Away Team Shots,HT Shots on Target,AT Shots on Target,HT Fouls,AT Fouls,HT Corners,...,Man United,Newcastle,Norwich,Sheffield United,Southampton,Tottenham,Watford,West Brom,West Ham,Wolves
0,0,3,0,5,13,2,6,12,12,2,...,0,0,0,0,0,0,0,0,0,0
1,1,0,1,5,9,3,5,14,11,7,...,0,0,0,0,1,0,0,0,0,0
2,4,3,1,22,6,6,3,9,6,9,...,0,0,0,0,0,0,0,0,0,0
3,0,2,0,15,15,3,2,13,7,8,...,0,1,0,0,0,0,0,0,0,0
4,0,3,0,7,13,1,7,12,9,2,...,0,0,0,0,0,0,0,0,0,0


In [12]:
data.dtypes

Home Goals         int64
Away Goals         int64
Winner             int64
Home Team Shots    int64
Away Team Shots    int64
                   ...  
Tottenham          uint8
Watford            uint8
West Brom          uint8
West Ham           uint8
Wolves             uint8
Length: 62, dtype: object

In [13]:
from sklearn.preprocessing import StandardScaler

data_scaled = StandardScaler().fit_transform(data[['Home Goals',
 'Away Goals',
 'Home Team Shots',
 'Away Team Shots',
 'HT Shots on Target',
 'AT Shots on Target',
 'HT Fouls',
 'AT Fouls',
 'HT Corners',
 'AT Corners',
 'HT Yellow Cards',
 'AT Yellow Cards',
 'HT Red Cards',
 'AT Red Cards']])

data_scaled

array([[-1.08211851,  1.33384203, -1.48956917, ...,  0.33561414,
        -0.22331316, -0.25273863],
       [-0.32692102, -1.0532536 , -1.48956917, ..., -0.49522217,
        -0.22331316, -0.25273863],
       [ 1.93867145,  1.33384203,  1.54979181, ..., -1.32605849,
        -0.22331316, -0.25273863],
       ...,
       [ 1.18347396, -0.25755505,  2.80129339, ..., -1.32605849,
        -0.22331316, -0.25273863],
       [ 1.18347396,  0.53814349,  1.90736369, ..., -0.49522217,
        -0.22331316, -0.25273863],
       [-1.08211851,  2.92523912, -0.77442541, ..., -0.49522217,
        -0.22331316, -0.25273863]])

In [14]:
# Create a DataFrame of the scaled data
data_scaled = pd.DataFrame(data_scaled, columns=['Home Goals',
 'Away Goals',
 'Home Team Shots',
 'Away Team Shots',
 'HT Shots on Target',
 'AT Shots on Target',
 'HT Fouls',
 'AT Fouls',
 'HT Corners',
 'AT Corners',
 'HT Yellow Cards',
 'AT Yellow Cards',
 'HT Red Cards',
 'AT Red Cards'])

# Replace the original data with the columns of information from the scaled Data
data["Home Goals"] = data_scaled["Home Goals"]
data["Away Goals"] = data_scaled["Away Goals"]
data['Home Team Shots'] = data_scaled['Home Team Shots']
data['Home Team Shots'] = data_scaled['Home Team Shots']
data['Away Team Shots'] = data_scaled['Away Team Shots']
data['HT Shots on Target'] = data_scaled['HT Shots on Target']
data['AT Shots on Target'] = data_scaled['AT Shots on Target']
data['HT Fouls'] = data_scaled['HT Fouls']
data['AT Fouls'] = data_scaled['AT Fouls']
data['HT Corners'] = data_scaled['HT Corners']
data['AT Corners'] = data_scaled['AT Corners']
data['HT Yellow Cards'] = data_scaled['HT Yellow Cards']
data['AT Yellow Cards'] = data_scaled['AT Yellow Cards']
data['HT Red Cards'] = data_scaled['HT Red Cards']
data['AT Red Cards'] = data_scaled['AT Red Cards']

# Review the DataFrame
data.head()

Unnamed: 0,Home Goals,Away Goals,Winner,Home Team Shots,Away Team Shots,HT Shots on Target,AT Shots on Target,HT Fouls,AT Fouls,HT Corners,...,Man United,Newcastle,Norwich,Sheffield United,Southampton,Tottenham,Watford,West Brom,West Ham,Wolves
0,-1.082119,1.333842,0,-1.489569,0.283394,-1.006625,0.777471,0.390293,0.457325,-1.181299,...,0,0,0,0,0,0,0,0,0,0
1,-0.326921,-1.053254,1,-1.489569,-0.498383,-0.621217,0.365422,0.96403,0.179494,0.468439,...,0,0,0,0,1,0,0,0,0,0
2,1.938671,1.333842,1,1.549792,-1.084715,0.535007,-0.458675,-0.470314,-1.209664,1.128333,...,0,0,0,0,0,0,0,0,0,0
3,-1.082119,0.538143,0,0.29829,0.674283,-0.621217,-0.870724,0.677161,-0.931832,0.798386,...,0,1,0,0,0,0,0,0,0,0
4,-1.082119,1.333842,0,-1.131997,0.283394,-1.392032,1.18952,0.390293,-0.376169,-1.181299,...,0,0,0,0,0,0,0,0,0,0


In [16]:
y = data['Team_code']
X = data.drop(['Team_code'],1)

  X = data.drop(['Team_code'],1)


In [17]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, 
                                                    y, 
                                                    random_state=3, 
                                                    stratify=y)
X_train.shape 

(570, 61)

In [18]:
from sklearn.model_selection import cross_val_score
from sklearn.svm import SVC
from sklearn import svm

clf = svm.SVC(kernel='linear', C=1, random_state=42)
scores = cross_val_score(clf, X, y, cv=5)
scores.shape

(5,)

In [19]:
classifier = LogisticRegression(solver='lbfgs',
                                max_iter=200,
                                random_state=2)
classifier

LogisticRegression(max_iter=200, random_state=2)

In [20]:
classifier.fit(X_train, y_train)

LogisticRegression(max_iter=200, random_state=2)

In [21]:
print(f"Training Data Score: {classifier.score(X_train, y_train)}")
print(f"Testing Data Score: {classifier.score(X_test, y_test)}")

Training Data Score: 0.9333333333333333
Testing Data Score: 0.6210526315789474


In [22]:
predictions = classifier.predict(X_test)
results = pd.DataFrame({"Prediction": predictions, "Actual": y_test}).reset_index(drop=True)
results.head(10)

Unnamed: 0,Prediction,Actual
0,0,0
1,14,17
2,2,1
3,6,6
4,0,0
5,0,22
6,7,7
7,19,19
8,22,22
9,0,0


In [23]:
actual_standing = results['Actual'].value_counts()
predicted_standing = results['Prediction'].value_counts()

In [24]:
actual_standing.head(5)

0     43
13    14
12    12
1     10
6     10
Name: Actual, dtype: int64

In [25]:
predicted_standing.head(5)

0     67
19    13
1     12
13    12
10    11
Name: Prediction, dtype: int64

In [26]:
from sklearn.metrics import accuracy_score

accuracy_score(y_test, predictions)

0.6210526315789474