In [3]:
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from sklearn.model_selection import train_test_split

In [13]:
# Define Linear Regression Model
class LinearRegression(nn.Module):
    def __init__(self, input_size):
        super(LinearRegression, self).__init__()
        self.linear = nn.Linear(input_size, 1)  # One output for regression

    def forward(self, x):
        return self.linear(x)

In [4]:
# Step 1: Load the CSV file
df = pd.read_csv('data/merged_data_with_whoscored.csv')

In [5]:
# Step 1b: Delete irrelevant Columns
columns_to_delete = ['Player_URL', 'Player_x', 'Player1', 'Player_URL2', 'Player3', 'Player4', 'Player5', 'Apps', 'Goals', 'Assists', 'Yel', 'SpG', 'PS', 'AerialsWon', 'MotM', 'Red', 'Unnamed: 0.1', 'Player_y', 'Squad', 'Age', 'Born', '90s', 'Based', 'Position', 'player_code', 'Unnamed: 0', 'player_id', 'name', 'country_of_birth', 'date_of_birth', 'sub_position', 'foot', 'height_in_cm', 'contract_expiration_date', 'date', 'market_value_in_eur']

for column in columns_to_delete:
    del df[column]

In [6]:
df.head()

Unnamed: 0,Mins,Rating,Gls,Sh,SoT,SoT%,Sh/90,SoT/90,G/Sh,G/SoT,...,Long Cmp%,Ast,xAG,xA,A-xAG,KP,1/3,PPA,CrsPA,PrgP
0,2882,7.45,8,56,19,33.9,1.76,0.6,0.14,0.42,...,42.9,21,12.7,10.3,8.3,96,87,71,9,157.0
1,2670,7.38,9,57,20,35.1,1.93,0.68,0.05,0.15,...,53.5,7,7.8,7.3,-0.8,78,196,46,7,232.0
2,3034,7.28,10,58,21,36.2,1.72,0.62,0.14,0.38,...,80.8,5,5.7,6.1,-0.7,57,251,40,2,294.0
3,2367,7.28,14,64,37,57.8,2.45,1.42,0.22,0.38,...,47.5,12,6.1,6.6,5.9,42,41,41,11,82.0
4,2616,7.26,7,32,17,53.1,1.1,0.59,0.22,0.41,...,57.5,5,3.4,3.0,1.6,28,189,35,9,238.0


In [7]:
selected_features = []

In [8]:
predictors = df.drop(columns=['Rating'])  # Exclude the target column
target = df['Rating']

In [9]:
predictors.head()

Unnamed: 0,Mins,Gls,Sh,SoT,SoT%,Sh/90,SoT/90,G/Sh,G/SoT,Dist,...,Long Cmp%,Ast,xAG,xA,A-xAG,KP,1/3,PPA,CrsPA,PrgP
0,2882,8,56,19,33.9,1.76,0.6,0.14,0.42,18.7,...,42.9,21,12.7,10.3,8.3,96,87,71,9,157.0
1,2670,9,57,20,35.1,1.93,0.68,0.05,0.15,25.7,...,53.5,7,7.8,7.3,-0.8,78,196,46,7,232.0
2,3034,10,58,21,36.2,1.72,0.62,0.14,0.38,13.9,...,80.8,5,5.7,6.1,-0.7,57,251,40,2,294.0
3,2367,14,64,37,57.8,2.45,1.42,0.22,0.38,13.7,...,47.5,12,6.1,6.6,5.9,42,41,41,11,82.0
4,2616,7,32,17,53.1,1.1,0.59,0.22,0.41,12.3,...,57.5,5,3.4,3.0,1.6,28,189,35,9,238.0


In [10]:
target.head()

0    7.45
1    7.38
2    7.28
3    7.28
4    7.26
Name: Rating, dtype: float64

In [11]:
# Step 4: Define the function to evaluate the performance of the model with added feature
def evaluate_model(model, inputs, targets):
    with torch.no_grad():
        outputs = model(inputs)
        loss = criterion(outputs, targets.view(-1, 1))
    return loss.item()

In [14]:
# Perform forward selection
while len(selected_features) < len(predictors.columns):
    best_feature = None
    best_loss = float('inf')
    
    for feature in predictors.columns:
        if feature not in selected_features:
            selected_features.append(feature)
            # Select columns based on selected features
            selected_data = df[selected_features]
            inputs = torch.tensor(selected_data.values, dtype=torch.float32)
            targets = torch.tensor(target.values, dtype=torch.float32)

            # Split data into training and validation sets
            X_train, X_val, y_train, y_val = train_test_split(inputs, targets, test_size=0.2, random_state=42)

            # Define Linear Regression Model
            model = LinearRegression(input_size=len(selected_features))

            # Define Loss Function and Optimizer
            criterion = nn.MSELoss()
            optimizer = optim.SGD(model.parameters(), lr=0.01)

            # Train the Model
            num_epochs = 100
            for epoch in range(num_epochs):
                optimizer.zero_grad()
                outputs = model(X_train)
                loss = criterion(outputs, y_train.view(-1, 1))
                loss.backward()
                optimizer.step()

            # Evaluate model performance
            val_loss = evaluate_model(model, X_val, y_val)
            print(f"Feature: {feature}, Validation Loss: {val_loss:.4f}")

            # If adding this feature improves the performance, update best_feature and best_loss
            if val_loss < best_loss:
                best_loss = val_loss
                best_feature = feature

            selected_features.remove(feature)  # Remove the feature for the next iteration
    
    # Add the best feature to the selected features list
    selected_features.append(best_feature)
    print(f"Added feature: {best_feature}, Validation Loss: {best_loss:.4f}")

print("Selected features:", selected_features)

Feature: Gls, Validation Loss: nan
Feature: Sh, Validation Loss: nan
Feature: SoT, Validation Loss: nan
Feature: SoT%, Validation Loss: nan
Feature: Sh/90, Validation Loss: nan
Feature: SoT/90, Validation Loss: nan
Feature: G/Sh, Validation Loss: nan
Feature: G/SoT, Validation Loss: nan
Feature: Dist, Validation Loss: nan
Feature: FK, Validation Loss: nan
Feature: PK, Validation Loss: nan
Feature: PKatt, Validation Loss: nan
Feature: xG, Validation Loss: nan
Feature: npxG, Validation Loss: nan
Feature: npxG/Sh, Validation Loss: nan
Feature: G-xG, Validation Loss: nan
Feature: np:G-xG, Validation Loss: nan
Feature: Tkl, Validation Loss: nan
Feature: TklW, Validation Loss: nan
Feature: Def 3rd, Validation Loss: nan
Feature: Mid 3rd, Validation Loss: nan
Feature: Att 3rd, Validation Loss: nan
Feature: Tkl.1, Validation Loss: nan
Feature: Att, Validation Loss: nan
Feature: Tkl%, Validation Loss: nan
Feature: Lost, Validation Loss: nan
Feature: Blocks, Validation Loss: nan
Feature: Blocks S

KeyError: '[None] not in index'