In [1]:
import pandas as pd

# Load the dataset
data = pd.read_csv('dataset.csv')

# Display the first few rows of the dataset
data.head()

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
import numpy as np

# Copy data for preprocessing
data_preprocessed = data.copy()

# Convert categorical variables to numerical using LabelEncoder
label_encoders = {}
categorical_columns = ['name', 'based_in', 'birth_city', 'nation_of_birth', 'birth_region', 'nationality', 'position', 'group', 'club', 'division', 'division_tier', 'second_nationality', 'is_top_4_tier', 'train_or_test']

for col in categorical_columns:
    le = LabelEncoder()
    data_preprocessed[col] = le.fit_transform(data_preprocessed[col].astype(str))
    label_encoders[col] = le

# Handle missing values - fill NaNs with a placeholder (e.g., -1) or the median for numerical columns
data_preprocessed.fillna(-1, inplace=True)

# Convert date_of_birth to age (assuming the current year is 2023)
data_preprocessed['date_of_birth'] = pd.to_datetime(data_preprocessed['date_of_birth'], errors='coerce')
data_preprocessed['age'] = 2023 - data_preprocessed['date_of_birth'].dt.year

# Drop the original date_of_birth column
data_preprocessed.drop(columns=['date_of_birth'], inplace=True)

# Convert heights to a consistent unit if necessary and drop redundant columns
data_preprocessed['height_(cm)'] = data_preprocessed['height_(cm)'].astype(float)
data_preprocessed['weight_(kg)'] = data_preprocessed['weight_(kg)'].astype(float)

# Extract features and targets
features = data_preprocessed.drop(columns=['goals', 'appearances', 'tier_quality', 'name'])
target_goals = data_preprocessed['goals'].astype(int)
target_appearances = data_preprocessed['appearances'].astype(int)
target_tier_quality = data_preprocessed['tier_quality']

# Split the dataset into training and testing sets
X_train_goals, X_test_goals, y_train_goals, y_test_goals = train_test_split(features, target_goals, test_size=0.2, random_state=42)
X_train_appearances, X_test_appearances, y_train_appearances, y_test_appearances = train_test_split(features, target_appearances, test_size=0.2, random_state=42)
X_train_tier_quality, X_test_tier_quality, y_train_tier_quality, y_test_tier_quality = train_test_split(features, target_tier_quality, test_size=0.2, random_state=42)

# Display the shapes of the training and testing sets
(X_train_goals.shape, X_test_goals.shape, y_train_goals.shape, y_test_goals.shape,
 X_train_appearances.shape, X_test_appearances.shape, y_train_appearances.shape, y_test_appearances.shape,
 X_train_tier_quality.shape, X_test_tier_quality.shape, y_train_tier_quality.shape, y_test_tier_quality.shape)


  data_preprocessed['date_of_birth'] = pd.to_datetime(data_preprocessed['date_of_birth'], errors='coerce')


((3816, 21),
 (955, 21),
 (3816,),
 (955,),
 (3816, 21),
 (955, 21),
 (3816,),
 (955,),
 (3816, 21),
 (955, 21),
 (3816,),
 (955,))

In [2]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.metrics import mean_squared_error, accuracy_score

# Load the dataset
data = pd.read_csv('dataset.csv')

# Display the first few rows of the dataset
print(data.head())

# Print the column names to verify
print("Columns in the dataset:", data.columns)

# Copy data for preprocessing
data_preprocessed = data.copy()

# Convert categorical variables to numerical using LabelEncoder
label_encoders = {}
categorical_columns = ['name', 'based_in', 'birth_city', 'nation_of_birth', 'birth_region', 'nationality', 'position', 'group', 'club', 'division', 'division_tier', 'second_nationality', 'is_top_4_tier', 'train_or_test']

# Verify that all categorical columns exist in the data
for col in categorical_columns:
    if col not in data_preprocessed.columns:
        print(f"Column '{col}' not found in the dataset.")
        categorical_columns.remove(col)

# Apply LabelEncoder to categorical columns
for col in categorical_columns:
    le = LabelEncoder()
    data_preprocessed[col] = le.fit_transform(data_preprocessed[col].astype(str))
    label_encoders[col] = le

# Handle missing values - fill NaNs with a placeholder (e.g., -1) or the median for numerical columns
data_preprocessed.fillna(-1, inplace=True)

# Convert date_of_birth to age (assuming the current year is 2023)
data_preprocessed['date_of_birth'] = pd.to_datetime(data_preprocessed['date_of_birth'], errors='coerce')
data_preprocessed['age'] = 2023 - data_preprocessed['date_of_birth'].dt.year

# Drop the original date_of_birth column
data_preprocessed.drop(columns=['date_of_birth'], inplace=True)

# Verify that all features are numeric
for column in data_preprocessed.columns:
    if data_preprocessed[column].dtype == 'object':
        print(f"Column {column} is not numeric.")

# Extract relevant features and targets
features = data_preprocessed[['name', 'position', 'group', 'age', 'second_nationality', 'height_(cm)', 'weight_(kg)', 'club', 'division', 'division_tier', 'is_top_4_tier']]
target_goals = data_preprocessed['goals'].astype(int)
target_appearances = data_preprocessed['appearances'].astype(int)
target_tier_quality = data_preprocessed['tier_quality']

# Split the dataset based on the 'train_or_test' column
X_train = features[data_preprocessed['train_or_test'] == 0]
y_train_goals = target_goals[data_preprocessed['train_or_test'] == 0]
y_train_appearances = target_appearances[data_preprocessed['train_or_test'] == 0]
y_train_tier_quality = target_tier_quality[data_preprocessed['train_or_test'] == 0]

X_test = features[data_preprocessed['train_or_test'] == 1]
y_test_goals = target_goals[data_preprocessed['train_or_test'] == 1]
y_test_appearances = target_appearances[data_preprocessed['train_or_test'] == 1]
y_test_tier_quality = target_tier_quality[data_preprocessed['train_or_test'] == 1]

# Train Random Forest model for goals prediction
rf_goals = RandomForestRegressor(n_estimators=100, random_state=42)
rf_goals.fit(X_train.drop(columns=['name', 'position', 'group']), y_train_goals)

# Train Random Forest model for appearances prediction
rf_appearances = RandomForestRegressor(n_estimators=100, random_state=42)
rf_appearances.fit(X_train.drop(columns=['name', 'position', 'group']), y_train_appearances)

# Train Random Forest model for tier quality prediction
rf_tier_quality = RandomForestClassifier(n_estimators=100, random_state=42)
rf_tier_quality.fit(X_train.drop(columns=['name', 'position', 'group']), y_train_tier_quality)

# Make predictions on the testing set
y_pred_goals = rf_goals.predict(X_test.drop(columns=['name', 'position', 'group']))
y_pred_appearances = rf_appearances.predict(X_test.drop(columns=['name', 'position', 'group']))
y_pred_tier_quality = rf_tier_quality.predict(X_test.drop(columns=['name', 'position', 'group']))

# Evaluate the models
mse_goals = mean_squared_error(y_test_goals, y_pred_goals)
mse_appearances = mean_squared_error(y_test_appearances, y_pred_appearances)
accuracy_tier_quality = accuracy_score(y_test_tier_quality, y_pred_tier_quality)

print(f'MSE Goals: {mse_goals}')
print(f'MSE Appearances: {mse_appearances}')
print(f'Accuracy Tier Quality: {accuracy_tier_quality}')

# Create a DataFrame for predictions on the test set with the specified columns
predictions = pd.DataFrame({
    'Player name': X_test['name'],
    'Position': X_test['position'],
    'Group': X_test['group'],
    'Predicted appearances': y_pred_appearances,
    'Predicted goals': y_pred_goals,
    'Predicted tier quality': y_pred_tier_quality
})

# Display the first few rows of the predictions DataFrame
print(predictions.head())

# Save predictions to a new CSV file
predictions.to_csv('predictions.csv', index=False)
print("Predictions have been saved to 'predictions.csv'.")


                name position       group         club  \
0      Kady Kennelly    D (C)    Defender          NaN   
1      Kieran Graham   AM (C)     Forward     Barnsley   
2   Lennon Patterson    D (L)    Defender       Wolves   
3  Ashton McWilliams    D (R)    Defender  Aston Villa   
4      Divine Mukasa    M (C)  Midfielder     Man City   

                    division based_in division_tier  tier_quality  \
0                        NaN      ENG          NONE             7   
1  English Football League 1      ENG          ENG3             4   
2   English Premier Division      ENG          ENG1             1   
3   English Premier Division      ENG          ENG1             1   
4   English Premier Division      ENG          ENG1             1   

  date_of_birth  birth_month  ...  nation_of_birth  birth_region  nationality  \
0    23/09/2007            9  ...              ENG  UK & Ireland          ENG   
1    31/08/2007            8  ...              ENG  UK & Ireland          

  data_preprocessed['date_of_birth'] = pd.to_datetime(data_preprocessed['date_of_birth'], errors='coerce')


MSE Goals: 1401.1391872956563
MSE Appearances: 50985.34283045306
Accuracy Tier Quality: 0.9631013545072397
      Player name  Position  Group  Predicted appearances  Predicted goals  \
1137         3368        70      2                    0.0              0.0   
1268         4691       108      1                    0.0              0.0   
1528         1027        70      2                    0.0              0.0   
1538         1392       108      1                    0.0              0.0   
1573          993        70      2                    0.0              0.0   

      Predicted tier quality  
1137                       5  
1268                       3  
1528                       3  
1538                       1  
1573                       3  
Predictions have been saved to 'predictions.csv'.
