In [102]:
# Import necessary packages

import pandas as pd
import numpy as np

from sklearn.compose import make_column_transformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from category_encoders import TargetEncoder
from scipy.stats import loguniform

from sklearn.model_selection import RandomizedSearchCV
from sklearn.linear_model import Ridge

from sklearn.metrics import mean_squared_error


from sklearn.preprocessing import Normalizer

In [103]:
# Silence warnings

import warnings
warnings.filterwarnings('ignore')

In [104]:
# Read the development and test splits from CSV files in the working directory

df_dev, df_test = pd.read_csv("dev.csv"), pd.read_csv("test.csv")

# Report the (rows, columns) shape of each split: one print call, one item per line
print(
    "Dataset Shapes \n",
    "- Development Set: {}".format(df_dev.shape),
    "- Test Set: {}".format(df_test.shape),
    sep="\n",
)

Dataset Shapes 

- Development Set: (77930, 28)
- Test Set: (19483, 28)


In [105]:
# Separate the predictor columns from the regression target ("total_points")

dev_x = df_dev.drop(columns=["total_points"])
dev_y = df_dev["total_points"]

test_x = df_test.drop(columns=["total_points"])
test_y = df_test["total_points"]

# Linear Regression

In [106]:
# Feature preprocessing: standardize the numeric columns, one-hot encode the categorical one

# Columns that are passed through StandardScaler
num_features = [
    'assists', 'bonus', 'bps', 'clean_sheets', 'creativity', 'element',
    'goals_conceded', 'goals_scored', 'ict_index', 'influence', 'minutes',
    'own_goals', 'penalties_missed', 'penalties_saved', 'red_cards', 'saves',
    'selected', 'team_a_score', 'team_h_score', 'threat', 'transfers_balance',
    'transfers_in', 'transfers_out', 'value', 'was_home', 'yellow_cards',
]

# Columns that are one-hot encoded (Ridge needs numeric inputs)
cat_features = ['position']

# Print the distinct position labels of each split so they can be compared by eye;
# a label present in test but absent from dev would be unknown to the encoder
print("Positions in development: {}".format(df_dev['position'].unique()))
print("Positions in test: {}".format(df_test['position'].unique()))

# Column-wise preprocessing. OneHotEncoder is left at its default handle_unknown
# setting, which relies on both splits containing the same position labels.
ohe_preprocess = make_column_transformer(
    (StandardScaler(), num_features),
    (OneHotEncoder(), cat_features),
)

Positions in development: ['MID' 'FWD' 'DEF' 'GK']
Positions in test: ['MID' 'DEF' 'FWD' 'GK']


In [118]:
# Use a randomized search to find a promising Ridge regularization strength

# Search space: alpha is sampled log-uniformly between 1e-5 and 100
search_space = {'alpha': loguniform(1e-5, 100)}

# Pipeline: column preprocessing followed by the randomized search over Ridge.
# NOTE(review): the preprocessing step is fitted once on all of dev_x before the
# search splits the data into folds, so scaler statistics are shared across folds.
# Wrapping a (preprocess, Ridge) pipeline inside the search would avoid that.
ohe_pipe = make_pipeline(
    ohe_preprocess,
    RandomizedSearchCV(
        Ridge(),
        search_space,
        n_iter=100,
        scoring='neg_mean_squared_error',
        random_state=42,  # fixed seed so the sampled alphas are reproducible across runs
    ),
)

# Fit the preprocessing and run the search on the development set
ohe_pipe.fit(dev_x, dev_y)

# Get the best estimator from the random search (second step of the pipeline)
model = ohe_pipe[1].best_estimator_

# Mean absolute error on the test set, predicted through the fitted pipeline.
# The original line had the closing quote misplaced (the .format call was inside
# the string and the print was never closed) and referenced an undefined `tmp_pipe`.
print("Mean absolute error: {}".format(abs(test_y - ohe_pipe.predict(test_x)).mean()))