In [1]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import joblib
from sklearn.ensemble import VotingRegressor

1.5.0


In [None]:
# Load the dataset using a raw string or forward slashes
legacy_player_data = pd.read_csv(r"C:\Users\grise\Downloads\male_players (legacy).csv"

In [None]:
# Display first few rows of the dataset
print(legacy_player_data.head())

# Drop columns with more than 30% missing values and irrelevant columns
drop_threshold = 0.3 * legacy_player_data.shape[0]
legacy_player_data = legacy_player_data.dropna(thresh=drop_threshold, axis=1)
legacy_player_data.drop(columns=['value_eur', 'wage_eur', 'release_clause_eur', 'international_reputation'], inplace=True)


In [3]:
# Select numeric columns for feature selection
numeric_features = legacy_player_data.select_dtypes(include=np.number)

# Impute missing values using mean strategy
mean_imputer = SimpleImputer(strategy='mean')
imputed_numeric_features = pd.DataFrame(mean_imputer.fit_transform(numeric_features), columns=numeric_features.columns, index=numeric_features.index)

# Define target and features
target_variable = imputed_numeric_features['overall']
input_features = imputed_numeric_features.drop(columns=['overall'])

# Select top 13 features using SelectKBest
feature_selector = SelectKBest(f_regression, k=13)
top_features = feature_selector.fit_transform(input_features, target_variable)
selected_feature_columns = input_features.columns[feature_selector.get_support()]

# Create DataFrame with selected features
selected_features_df = pd.DataFrame(top_features, columns=selected_feature_columns, index=input_features.index)

# Print the shape and selected features
print("Shape of selected features:", selected_features_df.shape)
print("Selected features:", selected_feature_columns.tolist())


Shape of selected features: (161583, 13)
Selected features: ['potential', 'age', 'shooting', 'passing', 'dribbling', 'physic', 'attacking_short_passing', 'skill_long_passing', 'skill_ball_control', 'movement_reactions', 'power_shot_power', 'mentality_vision', 'mentality_composure']


In [5]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(selected_features_df, target_variable, test_size=0.2, random_state=42)

# Train Random Forest Regressor
from sklearn.ensemble import RandomForestRegressor
random_forest_model = RandomForestRegressor(n_estimators=100, max_depth=7, random_state=42)
random_forest_model.fit(X_train, y_train)

# Train Decision Tree Regressor
from sklearn.tree import DecisionTreeRegressor
decision_tree_model = DecisionTreeRegressor(max_depth=5, min_samples_split=5)
decision_tree_model.fit(X_train, y_train)

# Train Linear Regression
from sklearn.linear_model import LinearRegression
linear_regression_model = LinearRegression()
linear_regression_model.fit(X_train, y_train)


In [7]:
# Evaluate models on the test set
trained_models = {
    'RandomForest': random_forest_model, 
    'DecisionTree': decision_tree_model, 
    'LinearRegression': linear_regression_model
}

for model_name, model in trained_models.items():
    y_pred = model.predict(X_test)
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    mae = mean_absolute_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    print(f"{model_name} - RMSE: {rmse:.4f}, MAE: {mae:.4f}, R2: {r2:.4f}")


RandomForest - RMSE: 1.9739, MAE: 1.3853, R2: 0.9214
DecisionTree - RMSE: 2.7605, MAE: 2.0483, R2: 0.8463
LinearRegression - RMSE: 2.0756, MAE: 1.6091, R2: 0.9131


In [21]:
#Importing FIFA 2022 dataset
data_2022 = pd.read_csv(r'C:\Users\grise\Downloads\players_22.csv')
numeric_2022_data = data_2022.select_dtypes(include=[np.number]).dropna(axis=1)
numeric_2022_data.columns





  data_2022 = pd.read_csv(r'C:\Users\grise\Downloads\players_22.csv')


Index(['sofifa_id', 'overall', 'potential', 'age', 'height_cm', 'weight_kg',
       'nationality_id', 'weak_foot', 'skill_moves',
       'international_reputation', 'attacking_crossing', 'attacking_finishing',
       'attacking_heading_accuracy', 'attacking_short_passing',
       'attacking_volleys', 'skill_dribbling', 'skill_curve',
       'skill_fk_accuracy', 'skill_long_passing', 'skill_ball_control',
       'movement_acceleration', 'movement_sprint_speed', 'movement_agility',
       'movement_reactions', 'movement_balance', 'power_shot_power',
       'power_jumping', 'power_stamina', 'power_strength', 'power_long_shots',
       'mentality_aggression', 'mentality_interceptions',
       'mentality_positioning', 'mentality_vision', 'mentality_penalties',
       'mentality_composure', 'defending_marking_awareness',
       'defending_standing_tackle', 'defending_sliding_tackle',
       'goalkeeping_diving', 'goalkeeping_handling', 'goalkeeping_kicking',
       'goalkeeping_positioning',

0        93
1        92
2        91
3        91
4        91
         ..
19234    47
19235    47
19236    47
19237    47
19238    47
Name: overall, Length: 19239, dtype: int64

In [29]:
# Assuming you have defined selected_features_df and target_variable earlier

# Ensure X and y have the same index
common_index = selected_features_df.index.intersection(target_variable.index)

# Define available_features based on your data
#available_features = selected_features_df.columns.tolist()  # Adjust this based on your actual feature selection process
available_features = []
for items in selected_features_df:
    if items in numeric_2022_data:
        available_features.append(items)

y_test_2022 = numeric_2022_data['overall']
# Update X_train and y_train to use only the common indices
X_train_updated = selected_features_df.loc[common_index, available_features]
y_train_updated = target_variable.loc[common_index]

print(f"Updated X shape: {X_train_updated.shape}")
print(f"Updated y shape: {y_train_updated.shape}")

# Retrain the models with the updated feature set and aligned y
for model_name, model in trained_models.items():
    model.fit(X_train_updated, y_train_updated)

# Assuming you have defined numeric_2022_data and y_test_2022 earlier for evaluation

# Prepare the test data with only the available features
X_test_2022 = numeric_2022_data[available_features]

# Now evaluate the models on the new season data
for model_name, model in trained_models.items():
    y_pred_2022 = model.predict(X_test_2022)
    rmse_2022 = np.sqrt(mean_squared_error(y_test_2022, y_pred_2022))
    mae_2022 = mean_absolute_error(y_test_2022, y_pred_2022)
    r2_2022 = r2_score(y_test_2022, y_pred_2022)
    print(f"{model_name} on new season data - RMSE: {rmse_2022:.4f}, MAE: {mae_2022:.4f}, R2: {r2_2022:.4f}")


Updated X shape: (161583, 9)
Updated y shape: (161583,)
RandomForest on new season data - RMSE: 1.8260, MAE: 1.2970, R2: 0.9296
DecisionTree on new season data - RMSE: 2.6552, MAE: 2.0325, R2: 0.8511
LinearRegression on new season data - RMSE: 2.2115, MAE: 1.6932, R2: 0.8967


In [68]:
available_features

['potential',
 'age',
 'attacking_short_passing',
 'skill_long_passing',
 'skill_ball_control',
 'movement_reactions',
 'power_shot_power',
 'mentality_vision',
 'mentality_composure']

In [64]:
# Print the features in X_train and X_test_2022
print("X_train features:", X_train.columns.tolist())
print("X_test_2022 features:", X_test_2022.columns.tolist())

# Find the common features
common_features = list(set(X_train.columns) & set(X_test_2022.columns))
print("Common features:", common_features)

# Reorder and select only common features for both datasets
X_train_aligned = X_train[common_features]
X_test_2022_aligned = X_test_2022[common_features]


# Retrain individual models
random_forest_model.fit(X_train_aligned, y_train)
decision_tree_model.fit(X_train_aligned, y_train)
linear_regression_model.fit(X_train_aligned, y_train)

# Retrain ensemble model
ensemble_model = VotingRegressor(estimators=[
    ('RandomForest', random_forest_model),
    ('DecisionTree', decision_tree_model),
    ('LinearRegression', linear_regression_model)
])
ensemble_model.fit(X_train_aligned, y_train)

# Predict on validation set
ensemble_predictions = ensemble_model.predict(X_test_2022_aligned)

y_test = y_test[:len(ensemble_predictions)]

# Evaluate ensemble model on validation set
ensemble_rmse = np.sqrt(mean_squared_error(y_test, ensemble_predictions))
ensemble_mae = mean_absolute_error(y_test, ensemble_predictions)
ensemble_r2 = r2_score(y_test, ensemble_predictions)

print("Ensemble Model on Validation Set:")
print(f"RMSE: {ensemble_rmse:.4f}, MAE: {ensemble_mae:.4f}, R2: {ensemble_r2:.4f}")

# Predict on new season data
ensemble_predictions_2022 = ensemble_model.predict(X_test_2022_aligned)

ensemble_rmse_2022 = np.sqrt(mean_squared_error(y_test_2022, ensemble_predictions_2022))
ensemble_mae_2022 = mean_absolute_error(y_test_2022, ensemble_predictions_2022)
ensemble_r2_2022 = r2_score(y_test_2022, ensemble_predictions_2022)

print("Ensemble Model on New Season Data:")
print(f"RMSE: {ensemble_rmse_2022:.4f}, MAE: {ensemble_mae_2022:.4f}, R2: {ensemble_r2_2022:.4f}")
# Save the best model using joblib
joblib.dump(ensemble_model, 'fifa_overall_rating_predictor.pkl')

X_train features: ['potential', 'age', 'shooting', 'passing', 'dribbling', 'physic', 'attacking_short_passing', 'skill_long_passing', 'skill_ball_control', 'movement_reactions', 'power_shot_power', 'mentality_vision', 'mentality_composure']
X_test_2022 features: ['potential', 'age', 'attacking_short_passing', 'skill_long_passing', 'skill_ball_control', 'movement_reactions', 'power_shot_power', 'mentality_vision', 'mentality_composure']
Common features: ['age', 'power_shot_power', 'skill_long_passing', 'mentality_composure', 'attacking_short_passing', 'mentality_vision', 'potential', 'movement_reactions', 'skill_ball_control']
Ensemble Model on Validation Set:
RMSE: 9.4208, MAE: 7.4985, R2: -0.8030
Ensemble Model on New Season Data:
RMSE: 1.8424, MAE: 1.3535, R2: 0.9283


['fifa_overall_rating_predictor.pkl']

In [None]:
# Define preprocessing function
def preprocess_input_data(input_data):
    input_df = pd.DataFrame([input_data])
    for feature in common_columns:
        if feature not in input_df.columns:
            input_df[feature] = 0
    return input_df[common_columns]

# Define prediction function
def predict_player_rating(input_data):
    processed_input = preprocess_input_data(input_data)
    return ensemble_model.predict(processed_input)[0]


In [None]:
# Uncomment and run the following code in a Streamlit app

# import streamlit as st

# def main():
#     st.title("FIFA Player Rating Predictor")
#     user_input = {
#         'feature1': st.number_input('Feature 1'),
#         'feature2': st.number_input('Feature 2'),
#         # Add more features as needed
#     }
#     if st.button('Predict'):
#         result = predict_player_rating(user_input)
#         st.write(f"Predicted Overall Rating: {result:.2f}")

# if __name__ == "__main__":
#     main()


In [None]:
streamlit run app.py