In [1]:
import matplotlib
import pandas as pd
import requests
import io
import numpy as np
import matplotlib.pyplot as plt
import math
import datetime
import re
from datetime import datetime


%matplotlib inline
    


%matplotlib inline
# Location of the raw results CSV.
url1 = "https://raw.githubusercontent.com/docju/datasciencecapstone/main/results.csv"

# Fail fast: a timeout stops the cell hanging forever on a dead connection, and
# raise_for_status() stops an HTTP error page from being parsed as if it were CSV.
response1 = requests.get(url1, timeout=30)
response1.raise_for_status()
download1 = response1.content

# The file is decoded as ISO-8859-1 before parsing.
df = pd.read_csv(io.StringIO(download1.decode('ISO-8859-1')))
df['date'] = pd.to_datetime(df['DateTime'])
# outcome = FTHG - FTAG (presumably full-time home goals minus away goals, so a
# positive value would be a home win -- confirm against the data dictionary).
df['outcome'] = df['FTHG'] - df['FTAG']

# Keep only the modelling columns and leave out seasons 2020-21 and 2021-22.
# A single .loc call selects rows and columns together (no chained indexing), and
# .copy() makes all_results independent of df.
all_results = df.loc[
    ~df['Season'].isin(['2020-21', '2021-22']),
    ['Season', 'date', 'HomeTeam', 'AwayTeam', 'FTHG', 'FTAG', 'outcome'],
].copy()

In [2]:
def split_season_train_test(season, test, input_df=None):
    '''
    Split the rows of one season into a leading training part and a trailing test part.

    The rows are taken in the order they appear in input_df; the last `test` rows of
    the season become the test sample and everything before them the training sample.

    INPUT:
    season - (string) value of the 'Season' column to select
    test - (int) number of trailing rows that constitute the test data frame; if it
           exceeds the number of rows in the season, every row goes to the test
           sample and the training sample is empty
    input_df - (dataframe) results of all seasons, must have a 'Season' column;
               when omitted, the notebook-level `all_results` frame is used

    OUTPUT:
    season_train - (dataframe) a copy containing only the non-test rows
    season_test - (dataframe) a copy containing the test rows

    RAISES:
    ValueError - if test is negative
    '''
    if input_df is None:
        # Look the default up at call time rather than at definition time, so that
        # re-running the data-loading cell is picked up without redefining this function.
        input_df = all_results

    if test < 0:
        raise ValueError("test must be a non-negative number of rows, got %r" % (test,))

    # Filter once and reuse the result for both halves.
    season_rows = input_df[input_df['Season'] == season]
    n_rows = season_rows.shape[0]

    # Clamp so head()/tail() never receive a negative count; a negative count makes
    # them return "all but n" rows, which would let train and test overlap.
    n_test = min(test, n_rows)
    n_train = n_rows - n_test

    season_train = season_rows.head(n_train).copy()
    season_test = season_rows.tail(n_test).copy()

    return season_train, season_test

# Hold out the last 92 rows of the 2007-08 season as the test sample and print them.
train, test = split_season_train_test(season='2007-08', test=92)
print(test)

       Season       date       HomeTeam       AwayTeam  FTHG  FTAG  outcome
5772  2007-08 2008-03-15        Arsenal  Middlesbrough     1     1        0
5773  2007-08 2008-03-15          Derby     Man United     0     1       -1
5774  2007-08 2008-03-15      Liverpool        Reading     2     1        1
5775  2007-08 2008-03-15     Portsmouth    Aston Villa     2     0        2
5776  2007-08 2008-03-15     Sunderland        Chelsea     0     1       -1
5777  2007-08 2008-03-15       West Ham      Blackburn     2     1        1
5778  2007-08 2008-03-16         Fulham        Everton     1     0        1
5779  2007-08 2008-03-16       Man City      Tottenham     2     1        1
5780  2007-08 2008-03-16          Wigan         Bolton     1     0        1
5781  2007-08 2008-03-17     Birmingham      Newcastle     1     1        0
5782  2007-08 2008-03-19     Man United         Bolton     2     0        2
5783  2007-08 2008-03-19      Tottenham        Chelsea     4     4        0
5784  2007-0

In [5]:
def _outcome_matrix(results, teams):
    '''
    Pivot match results into a home-team (rows) by away-team (columns) outcome matrix.

    INPUT:
    results - (dataframe) must have 'HomeTeam', 'AwayTeam' and 'outcome' columns
    teams - (list) team names used, in this order, for both the rows and the columns

    OUTPUT:
    (numpy matrix) len(teams) x len(teams); NaN where results has no row for the pair
    '''
    # If a home/away pair occurs more than once, the largest outcome is kept.
    grid = results.groupby(['HomeTeam', 'AwayTeam'])['outcome'].max().unstack()
    # Reindex onto the shared team list so every matrix built here has identical
    # shape and ordering, even when a team is missing as home or away side.
    return np.asmatrix(grid.reindex(index=teams, columns=teams).to_numpy())


# covid_season_pre_break / covid_season_post_break come from cells not shown here.
df2=covid_season_pre_break.copy()
df3=covid_season_post_break.copy()

# One sorted team list for both matrices, so that cell (i, j) refers to the same
# fixture in `output` and `output_post` when they are compared element-wise.
teams = sorted(
    pd.concat([df2['HomeTeam'], df2['AwayTeam'], df3['HomeTeam'], df3['AwayTeam']])
    .dropna()
    .unique()
)

output = _outcome_matrix(df2, teams)
output_post = _outcome_matrix(df3, teams)



In [172]:
def FunkSVD(ratings_mat, latent_features=4, learning_rate=0.0001, iters=500, seed=None):
    '''
    Factorise a partially observed matrix using a basic form of FunkSVD
    (stochastic gradient descent on the squared error, no regularization).

    In this notebook the rows are home teams, the columns are away teams and the
    values are match outcomes, hence the names of the returned matrices.

    INPUT:
    ratings_mat - (2-D array-like: numpy array, numpy matrix, ...) matrix to factorise;
                  NaN marks cells without an observation, which are skipped
    latent_features - (int) the number of latent features used
    learning_rate - (float) the learning rate
    iters - (int) the number of iterations (full passes over the observed cells)
    seed - (int or None) if given, the initial factors are drawn from a private
           np.random.RandomState(seed) so the result is reproducible; if None the
           global np.random state is used, as before

    OUTPUT:
    home_mat - (numpy array) a row by latent feature matrix
    away_mat - (numpy array) a latent feature by column matrix

    Progress (mean squared error over the observed cells) is printed for the first,
    second and last iteration.
    '''

    # Work on a plain float ndarray so scalar indexing and the NaN mask behave the
    # same whether the caller passed an ndarray, an np.matrix or nested lists.
    ratings = np.asarray(ratings_mat, dtype=float)
    n_rows = ratings.shape[0]
    n_cols = ratings.shape[1]

    # (row, column) positions of the observed cells in row-major order, computed
    # once instead of re-testing every cell on every iteration.
    observed_cells = np.argwhere(~np.isnan(ratings))
    num_ratings = len(observed_cells)

    # initialize the two factor matrices with random values in [0, 1)
    rand = np.random.rand if seed is None else np.random.RandomState(seed).rand
    home_mat = rand(n_rows, latent_features)
    away_mat = rand(latent_features, n_cols)

    # initialize sse at 0 for first iteration
    sse_accum = 0

    # header for running results
    print("Optimizaiton Statistics")
    print("Iterations | Mean Squared Error ")

    for iteration in range(iters):

        # restart the error accumulator for this pass
        sse_accum = 0

        for i, j in observed_cells:

            # error = actual value minus the dot product of the two latent vectors
            diff = ratings[i, j] - np.dot(home_mat[i, :], away_mat[:, j])

            # keep track of the sum of squared errors for the matrix
            sse_accum += diff**2

            # Move both latent vectors along the gradient. The away update
            # deliberately reads the already-updated home row: that is what the
            # original per-feature loop did, so the numbers are unchanged.
            home_mat[i, :] += learning_rate * (2 * diff * away_mat[:, j])
            away_mat[:, j] += learning_rate * (2 * diff * home_mat[i, :])

        # print results for iteration
        if iteration in {0, 1, iters-1}:
            # With no observed cell there is nothing to average; report NaN
            # instead of raising ZeroDivisionError.
            mse = sse_accum / num_ratings if num_ratings else float('nan')
            print("%d \t\t %f" % (iteration+1, mse))

    return home_mat, away_mat

In [180]:
# FunkSVD draws its initial factors from NumPy's global random state; seeding it
# here makes the fitted matrices (and the printed errors) repeatable on a fresh
# Restart & Run All.
np.random.seed(42)
home_mat,away_mat=FunkSVD(output,latent_features=10,learning_rate=0.005, iters=500)

Optimizaiton Statistics
Iterations | Mean Squared Error 
1 		 6.504899
2 		 4.163364
500 		 0.004662


In [187]:
# Reconstructed outcome matrix from the two fitted factors.
predicted_outcomes = np.dot(home_mat, away_mat)
# Residuals against output_post; cells that are NaN in output_post stay NaN.
print(predicted_outcomes - output_post)


[[             nan              nan              nan              nan
               nan              nan              nan              nan
   -2.02395742e+00  -2.15967040e+00              nan              nan
               nan  -1.66509454e+00              nan              nan
               nan  -2.28035438e+00              nan              nan]
 [ -3.22473242e+00              nan              nan              nan
               nan   7.29416133e-01  -1.59648762e+00              nan
               nan              nan              nan   2.22242431e+00
               nan              nan  -6.25244831e-01              nan
               nan              nan              nan   3.43193038e-01]
 [             nan              nan              nan              nan
               nan              nan   7.89430244e+00              nan
   -1.30497962e+01              nan              nan              nan
    1.29201326e+01              nan              nan  -3.47757969e+00
    3.41390950e+00