In [1]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.linear_model import LinearRegression
from sklearn.cross_validation import train_test_split
from sklearn.metrics import mean_squared_error

In [2]:
# Read the season 1 dataset from the generated CSV output
season_1_csv = "../testing/output/season_1_dataset_1487386080.csv"
results = pd.read_csv(season_1_csv)

In [3]:
# Pairwise column correlations, keeping only the column for home_points
corr_analysis = results.corr().loc[:, "home_points"]
print(corr_analysis)

home_team_id     -0.049158
home_team_lp      0.264107
home_team_form    0.158498
away_team_id     -0.007858
away_team_lp     -0.238432
away_team_form   -0.194600
home_points       1.000000
Name: home_points, dtype: float64


In [4]:
# Observation from the correlations above: home_points correlates positively
# with the home team's lp and form, and negatively with the away team's.
# In other words, the higher a team's lp and form coefficient, the more
# likely that team is to take more points.

# Column names that are kept out of the model inputs (the target included).
# NOTE(review): of the columns listed in the correlation output this leaves
# only home_team_id and away_team_id as inputs, even though lp/form were just
# noted as the correlated ones -- confirm this exclusion is intended.
excluded = ["home_team_lp", "home_team_form", "away_team_lp", "away_team_form", "home_points"]

# Every other dataframe column, in its original order
columns = [name for name in results.columns.tolist() if name not in excluded]

In [5]:
# Name of the column the models are trained to predict
target = "home_points"

In [6]:
# Training set: a random 80% of the rows. The fixed random_state makes the
# draw repeatable between runs.
train = results.sample(frac=0.8, random_state=1)

# Testing set: whatever is left once the rows drawn for training are
# dropped by index label.
test = results.drop(train.index)

In [7]:
# Linear regression estimator with its default settings
model = LinearRegression()

# Train on the selected feature columns against the target column
model.fit(X=train[columns], y=train[target])

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [8]:
# Run the fitted linear model over the feature columns of the test rows
predictions = model.predict(X=test[columns])

In [9]:
# Mean squared error of the linear model's predictions on the test set.
# mean_squared_error expects the true values first and the predictions
# second; the value is the same either way here, but the order now matches
# the documented signature.
mse = mean_squared_error(test[target], predictions)

print(mse)

1.95515654973


In [10]:
# Model seems to be ok-ish - the aim is to minimise the mean squared error calculated above

# Now let's try Random Forest (Non-Linear Model)
from sklearn.ensemble import RandomForestRegressor

# 100 trees, at least 10 samples per leaf; the fixed random_state keeps the
# fit repeatable between runs
model_RF = RandomForestRegressor(n_estimators=100, min_samples_leaf=10, random_state=1)

# Fit the model to the same training rows and feature columns as the linear model
model_RF.fit(train[columns], train[target])

# Make predictions on the test rows.
# Note: this rebinds `predictions`, replacing the linear model's output.
predictions = model_RF.predict(test[columns])

# Compute the error - true values first, predictions second, as the
# mean_squared_error signature expects. Left as the last expression so the
# notebook displays it.
mean_squared_error(test[target], predictions)

1.8613240712777734

In [11]:
# Load in Dataset for next season.
# Note: this rebinds `results`; re-running the earlier cells after this one
# would make them operate on the season 2 data.
results = pd.read_csv("../testing/output/season_2_dataset_1487386080.csv")

# Get all the columns from the dataframe
columns = results.columns.tolist()

# Filter the columns to remove ones we don't want - home_points is left out of
# the inputs so the predictions can be checked against it manually
columns = [c for c in columns if c not in ["home_team_lp", "home_team_form", "away_team_lp", "away_team_form", "home_points"]]

# Predict with the random forest fitted earlier in the notebook
predictions_S2 = model_RF.predict(results[columns])

# Print the actual home_points next to the predicted value for each game.
# Pairing the column with the predictions avoids building a full row
# Series per game just to read one value.
for game_number, (actual, predicted) in enumerate(zip(results["home_points"], predictions_S2), start=1):
    print('Game '+str(game_number)+':')
    print(int(actual))
    print(predicted)
    print("\r\n")

Game 1:
0
1.0155289388


Game 2:
3
2.01229264956


Game 3:
3
2.42837260392


Game 4:
1
1.28086745209


Game 5:
3
1.44709212591


Game 6:
3
0.925772081413


Game 7:
3
2.17549655257


Game 8:
0
1.18849096374


Game 9:
0
1.43252316039


Game 10:
3
1.74380821661


Game 11:
0
0.983035076094


Game 12:
1
2.37380612181


Game 13:
3
2.23791318789


Game 14:
3
0.935395657392


Game 15:
0
1.61348007579


Game 16:
0
1.20986822002


Game 17:
1
1.93778587012


Game 18:
0
1.59295175259


Game 19:
1
1.19477599253


Game 20:
1
1.12773655474


Game 21:
3
0.970519219378


Game 22:
0
1.26676241164


Game 23:
3
2.44981810694


Game 24:
3
1.50105688353


Game 25:
0
1.44709212591


Game 26:
1
1.5009596995


Game 27:
3
1.7512996426


Game 28:
3
1.34094394497


Game 29:
3
0.972383249736


Game 30:
3
1.80773314645


Game 31:
3
1.82333604074


Game 32:
3
2.42837260392


Game 33:
3
2.09848090117


Game 34:
3
2.07254424963


Game 35:
1
0.983035076094


Game 36:
1
1.83061348351
