# RUSSIAN PREMIER LEAGUE FOOTBALL ANALYSIS

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as pls
import seaborn as sns

In [2]:
data = pd.read_csv('russian_premier_league_scores.csv')

## Data processing

GOALS

In [3]:
home_goals = []
away_goals = []
for score in data.score:
    home_goals.append(score.split()[0])
    away_goals.append(score.split()[-1])
    
data['home_goals'] = home_goals
data['away_goals'] = away_goals

data

Unnamed: 0,home,away,score,year,home_goals,away_goals
0,Зенит,Спартак М,3 : 1,2020/2021,3,1
1,Зенит,Локомотив М,6 : 1,2020/2021,6,1
2,Зенит,Рубин,1 : 2,2020/2021,1,2
3,Зенит,Сочи,3 : 1,2020/2021,3,1
4,Зенит,ЦСКА,2 : 1,2020/2021,2,1
...,...,...,...,...,...,...
6853,Динамо-Газовик,Текстильщик Км,0 : 2,1992,0,2
6854,Динамо-Газовик,Уралмаш,2 : 3,1992,2,3
6855,Динамо-Газовик,Океан,3 : 3,1992,3,3
6856,Динамо-Газовик,Факел,0 : 1,1992,0,1


YEAR

In [4]:
year_start = []
year_end = []
for year in data.year:
    if '/' in year:
        year_start.append(year.split('/')[0])
        year_end.append(year.split('/')[1])
    else:
        year_start.append(year)
        year_end.append(year)
data['year_start'] = year_start
data['year_end'] = year_end

In [5]:
data

Unnamed: 0,home,away,score,year,home_goals,away_goals,year_start,year_end
0,Зенит,Спартак М,3 : 1,2020/2021,3,1,2020,2021
1,Зенит,Локомотив М,6 : 1,2020/2021,6,1,2020,2021
2,Зенит,Рубин,1 : 2,2020/2021,1,2,2020,2021
3,Зенит,Сочи,3 : 1,2020/2021,3,1,2020,2021
4,Зенит,ЦСКА,2 : 1,2020/2021,2,1,2020,2021
...,...,...,...,...,...,...,...,...
6853,Динамо-Газовик,Текстильщик Км,0 : 2,1992,0,2,1992,1992
6854,Динамо-Газовик,Уралмаш,2 : 3,1992,2,3,1992,1992
6855,Динамо-Газовик,Океан,3 : 3,1992,3,3,1992,1992
6856,Динамо-Газовик,Факел,0 : 1,1992,0,1,1992,1992


In [6]:
data.home = pd.Categorical(data.home)

In [7]:
data['home_code'] = data.home.cat.codes

In [8]:
data.away = pd.Categorical(data.away)
data['away_code'] = data.away.cat.codes

In [9]:
data.dtypes

home          category
away          category
score           object
year            object
home_goals      object
away_goals      object
year_start      object
year_end        object
home_code         int8
away_code         int8
dtype: object

In [10]:
data.home_goals = data.home_goals.astype('int')
data.away_goals = data.away_goals.astype('int')
data.year_end = data.year_end.astype('int')
data.year_start = data.year_start.astype('int')

In [11]:
data.dtypes

home          category
away          category
score           object
year            object
home_goals       int64
away_goals       int64
year_start       int64
year_end         int64
home_code         int8
away_code         int8
dtype: object

In [12]:
data['sum_goals'] = data.home_goals + data.away_goals

2.5 more \
2.5 less \
0.5 more \
0.5 less \
W1, W2, X \
1x, x2 

In [13]:
data['more2_5'] = 0

In [14]:
data.loc[data.sum_goals >= 3, 'more2_5'] = int(1)
data.loc[data.sum_goals < 3, 'more2_5'] = int(0)
data.loc[data.sum_goals > 1.5, 'more1_5'] = int(1)
data.loc[data.sum_goals < 1.5, 'more1_5'] = int(0)
data.loc[data.sum_goals > 0.5, 'more0_5'] = int(1)
data.loc[data.sum_goals < 0.5, 'more0_5'] = int(0)
data.loc[data.sum_goals > 3.5, 'more3_5'] = int(1)
data.loc[data.sum_goals < 3.5, 'more3_5'] = int(0)
data.loc[data.sum_goals > 4.5, 'more4_5'] = int(1)
data.loc[data.sum_goals < 4.5, 'more4_5'] = int(0)
data.loc[data.sum_goals > 5.5, 'more5_5'] = int(1)
data.loc[data.sum_goals < 5.5, 'more5_5'] = int(0)

In [15]:
for i in range(6):
    data[f'more{i}_5'] = data[f'more{i}_5'].astype('int')

In [16]:
data

Unnamed: 0,home,away,score,year,home_goals,away_goals,year_start,year_end,home_code,away_code,sum_goals,more2_5,more1_5,more0_5,more3_5,more4_5,more5_5
0,Зенит,Спартак М,3 : 1,2020/2021,3,1,2020,2021,13,39,4,1,1,1,1,0,0
1,Зенит,Локомотив М,6 : 1,2020/2021,6,1,2020,2021,13,21,7,1,1,1,1,1,1
2,Зенит,Рубин,1 : 2,2020/2021,1,2,2020,2021,13,31,3,1,1,1,0,0,0
3,Зенит,Сочи,3 : 1,2020/2021,3,1,2020,2021,13,36,4,1,1,1,1,0,0
4,Зенит,ЦСКА,2 : 1,2020/2021,2,1,2020,2021,13,57,3,1,1,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6853,Динамо-Газовик,Текстильщик Км,0 : 2,1992,0,2,1992,1992,10,43,2,0,1,1,0,0,0
6854,Динамо-Газовик,Уралмаш,2 : 3,1992,2,3,1992,1992,10,53,5,1,1,1,1,1,0
6855,Динамо-Газовик,Океан,3 : 3,1992,3,3,1992,1992,10,26,6,1,1,1,1,1,1
6856,Динамо-Газовик,Факел,0 : 1,1992,0,1,1992,1992,10,55,1,0,0,1,0,0,0


In [17]:
data.more2_5.sum()

3111

In [18]:
champions = pd.DataFrame(data.year.unique())

Локомотив М == Локомотив (заменить) тоже самое со спартаком

In [19]:
champions['champion'] = [
    'Зенит', 'Локомотив М', 'Зенит', 'Локомотив М', "Спартак М", "ЦСКА", "Зенит", "ЦСКА", "ЦСКА", "Зенит", "Зенит",
    "Рубин", "Рубин", "Зенит", 'ЦСКА', "ЦСКА", "Локомотив", "ЦСКА", "Локомотив", "Спартак", "Спартак", "Спартак", "Спартак",
    "Спартак", "Спартак", "Спартак-Алания", "Спартак", "Спартак М", "Динамо"
]
champions.columns = ['year', 'champion']
champions

Unnamed: 0,year,champion
0,2020/2021,Зенит
1,2019/2020,Локомотив М
2,2018/2019,Зенит
3,2017/2018,Локомотив М
4,2016/2017,Спартак М
5,2015/2016,ЦСКА
6,2014/2015,Зенит
7,2013/2014,ЦСКА
8,2012/2013,ЦСКА
9,2011/2012,Зенит


In [182]:
champions.to_csv('champions_russian_premiere_league.csv', index=False)

In [20]:
data

Unnamed: 0,home,away,score,year,home_goals,away_goals,year_start,year_end,home_code,away_code,sum_goals,more2_5,more1_5,more0_5,more3_5,more4_5,more5_5
0,Зенит,Спартак М,3 : 1,2020/2021,3,1,2020,2021,13,39,4,1,1,1,1,0,0
1,Зенит,Локомотив М,6 : 1,2020/2021,6,1,2020,2021,13,21,7,1,1,1,1,1,1
2,Зенит,Рубин,1 : 2,2020/2021,1,2,2020,2021,13,31,3,1,1,1,0,0,0
3,Зенит,Сочи,3 : 1,2020/2021,3,1,2020,2021,13,36,4,1,1,1,1,0,0
4,Зенит,ЦСКА,2 : 1,2020/2021,2,1,2020,2021,13,57,3,1,1,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6853,Динамо-Газовик,Текстильщик Км,0 : 2,1992,0,2,1992,1992,10,43,2,0,1,1,0,0,0
6854,Динамо-Газовик,Уралмаш,2 : 3,1992,2,3,1992,1992,10,53,5,1,1,1,1,1,0
6855,Динамо-Газовик,Океан,3 : 3,1992,3,3,1992,1992,10,26,6,1,1,1,1,1,1
6856,Динамо-Газовик,Факел,0 : 1,1992,0,1,1992,1992,10,55,1,0,0,1,0,0,0


In [21]:
data['goal_diff'] = data.home_goals - data.away_goals

In [22]:
from sklearn.model_selection import train_test_split

dataset = data.copy(deep=False)

Converting data to numbers so we could use it as a features

In [23]:
dataset

Unnamed: 0,home,away,score,year,home_goals,away_goals,year_start,year_end,home_code,away_code,sum_goals,more2_5,more1_5,more0_5,more3_5,more4_5,more5_5,goal_diff
0,Зенит,Спартак М,3 : 1,2020/2021,3,1,2020,2021,13,39,4,1,1,1,1,0,0,2
1,Зенит,Локомотив М,6 : 1,2020/2021,6,1,2020,2021,13,21,7,1,1,1,1,1,1,5
2,Зенит,Рубин,1 : 2,2020/2021,1,2,2020,2021,13,31,3,1,1,1,0,0,0,-1
3,Зенит,Сочи,3 : 1,2020/2021,3,1,2020,2021,13,36,4,1,1,1,1,0,0,2
4,Зенит,ЦСКА,2 : 1,2020/2021,2,1,2020,2021,13,57,3,1,1,1,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6853,Динамо-Газовик,Текстильщик Км,0 : 2,1992,0,2,1992,1992,10,43,2,0,1,1,0,0,0,-2
6854,Динамо-Газовик,Уралмаш,2 : 3,1992,2,3,1992,1992,10,53,5,1,1,1,1,1,0,-1
6855,Динамо-Газовик,Океан,3 : 3,1992,3,3,1992,1992,10,26,6,1,1,1,1,1,1,0
6856,Динамо-Газовик,Факел,0 : 1,1992,0,1,1992,1992,10,55,1,0,0,1,0,0,0,-1


In [24]:
for i in range(6):
    dataset.drop(columns=[f'more{i}_5'], inplace = True)

In [25]:
dataset.drop(columns=['home', 'away', 'score', 'year'], inplace=True)

In [26]:
dataset.dtypes

home_goals    int64
away_goals    int64
year_start    int64
year_end      int64
home_code      int8
away_code      int8
sum_goals     int64
goal_diff     int64
dtype: object

In [28]:
X = dataset.drop(columns=['goal_diff'])
y = dataset.goal_diff
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.33, random_state=8)

In [29]:
from sklearn.ensemble import RandomForestRegressor
regr = RandomForestRegressor(n_estimators=100, criterion='mae', max_depth=7, random_state=8)
regr.fit(X_train, y_train)
from sklearn.metrics import accuracy_score, precision_score, confusion_matrix, f1_score, recall_score, mean_absolute_error

In [30]:
preds = regr.predict(X_test)

In [31]:
mean_absolute_error(y_test,preds)

0.03499558303886925

##### Understand how to know who's gonna be champion

In [32]:
print('Score train:', regr.score(X_train, y_train))
print('Score test:', regr.score(X_test, y_test))

Score train: 0.9984717143710259
Score test: 0.996551660024993


In [183]:
new_season = pd.read_excel('this_season_rpl.xlsx', header=None)

new_season.columns = ['match']

new_season = pd.DataFrame(new_season.match.str.split('–').to_list(), columns=['home', 'away'])

In [173]:
mapper = {}
for i in range(len(data.home)):
    mapper[data.home[i]] = data.home_code[i]
    

In [186]:
new_season.to_csv('rpl_this_season_matches.csv', index=False)

In [174]:
home = new_season.home
away = new_season.away

In [175]:
new_season.home = new_season.home.apply(lambda x: x[:-1])
new_season.away = new_season.away.apply(lambda x: x[1:])
new_season.home[new_season.home == 'Арсенал Т'] = "Арсенал"
new_season.away[new_season.away == 'Арсенал Т'] = "Арсенал"
new_season['home_codes'] = new_season.home.map(mapper)
new_season['away_codes'] = new_season.away.map(mapper)

In [176]:
new_season

Unnamed: 0,home,away,home_codes,away_codes
0,Ростов,Динамо М,28.0,8.0
1,Химки,Зенит,56.0,13.0
2,Локомотив М,Арсенал,21.0,3.0
3,Рубин,Спартак М,31.0,39.0
4,Урал,Краснодар,51.0,16.0
...,...,...,...,...
235,Рубин,Уфа,31.0,54.0
236,Крылья Советов,Локомотив М,17.0,21.0
237,Динамо М,Сочи,8.0,36.0
238,Краснодар,Ахмат,16.0,5.0


In [143]:
new_season.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 240 entries, 0 to 239
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   home    210 non-null    float64
 1   away    225 non-null    float64
dtypes: float64(2)
memory usage: 3.9 KB


In [147]:
new_season[new_season.isna() == True].home

0     NaN
1     NaN
2     NaN
3     NaN
4     NaN
       ..
235   NaN
236   NaN
237   NaN
238   NaN
239   NaN
Name: home, Length: 240, dtype: float64

In [160]:
new_season.isna().sum(axis=0)

home    30
away    15
dtype: int64