Project Objective
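Use labeled player data to train a supervised multiclass classification model that predicts a player's club position (e.g. LCB, RB, ST) from their numeric attributes. The model is trained on FIFA 21 data and evaluated on FIFA 22 data; substitutes, reserves and goalkeepers are excluded.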

Imports and Data Loading

In [38]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score

# Step 2: Load Data
# FIFA 21 is the training season, FIFA 22 the held-out evaluation season.
TRAIN_CSV = 'fifa_21.csv'
TEST_CSV = 'fifa_22.csv'

df_train = pd.read_csv(TRAIN_CSV)
df_test = pd.read_csv(TEST_CSV)


Preprocessing

In [39]:
# Step 3: Define target and features
target = 'club_position'

# Model inputs, grouped by attribute family. The groups are flattened in the
# order listed, so the resulting column order is fixed.
_FEATURE_GROUPS = (
    # headline ratings
    ('pace', 'shooting', 'passing', 'dribbling', 'defending', 'physic'),
    # attacking
    ('attacking_crossing', 'attacking_finishing', 'attacking_heading_accuracy',
     'attacking_short_passing', 'attacking_volleys'),
    # skill
    ('skill_dribbling', 'skill_curve', 'skill_fk_accuracy',
     'skill_long_passing', 'skill_ball_control'),
    # movement
    ('movement_acceleration', 'movement_sprint_speed', 'movement_agility',
     'movement_reactions', 'movement_balance'),
    # power
    ('power_shot_power', 'power_jumping', 'power_stamina',
     'power_strength', 'power_long_shots'),
    # mentality
    ('mentality_aggression', 'mentality_interceptions', 'mentality_positioning',
     'mentality_vision', 'mentality_penalties', 'mentality_composure'),
    # defending
    ('defending_marking_awareness', 'defending_standing_tackle',
     'defending_sliding_tackle'),
)
numeric_features = [column for group in _FEATURE_GROUPS for column in group]

# Step 4: Filter Function to Apply on Both
def clean_df(df, target_col=None, feature_cols=None):
    """Return the rows of ``df`` that are usable for position modelling.

    A row is kept when its position label is present, is not a bench slot
    ('SUB' / 'RES'), does not start with 'GK', and every feature column is
    non-null.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw player table. Not modified.
    target_col : str, optional
        Name of the position-label column. Defaults to the notebook-level
        ``target``.
    feature_cols : list of str, optional
        Columns that must be non-null. Defaults to the notebook-level
        ``numeric_features``.

    Returns
    -------
    pandas.DataFrame
        An independent copy of the surviving rows (original index labels are
        kept), so callers can add columns to it without writing into a slice
        of ``df``.
    """
    if target_col is None:
        target_col = target
    if feature_cols is None:
        feature_cols = numeric_features

    positions = df[target_col]
    keep = (
        positions.notna()
        & ~positions.isin(['SUB', 'RES'])
        # na=False: missing labels count as "not GK" here; they are already
        # excluded by the notna() term above.
        & ~positions.str.startswith('GK', na=False)
    )
    # Explicit copy so later column assignment on the result is unambiguous.
    return df[keep].dropna(subset=feature_cols).copy()

# Apply the same row filter to both seasons.
df_train = clean_df(df_train)
df_test = clean_df(df_test)

# Step 5: Encode target using FIFA 21 only
# .assign() builds a new frame instead of writing a column into a filtered
# slice, which is what triggers pandas' SettingWithCopyWarning.
le = LabelEncoder()
df_train = df_train.assign(position_encoded=le.fit_transform(df_train[target]))

# Only use test rows with club positions also seen in training set;
# le.transform() raises on labels it was not fitted on.
valid_positions = set(df_train[target].unique())
df_test = df_test[df_test[target].isin(valid_positions)]

# Step 6: Align encoding in test set
df_test = df_test.assign(position_encoded=le.transform(df_test[target]))

# Step 7: Prepare X and y
X_train = df_train[numeric_features]
y_train = df_train['position_encoded']
X_test = df_test[numeric_features]
y_test = df_test['position_encoded']


Model Training

In [41]:
# Step 8: Train Model
# fit() returns the estimator itself, so construction and training chain.
model = RandomForestClassifier(n_estimators=100, random_state=42).fit(X_train, y_train)

# Step 9: Predict and Evaluate
y_pred = model.predict(X_test)

# Report only the classes that actually occur in the test labels, using
# their original position names.
labels_in_test = sorted(np.unique(y_test))
target_names = le.inverse_transform(labels_in_test)

report = classification_report(
    y_test,
    y_pred,
    labels=labels_in_test,
    target_names=target_names,
)
accuracy = accuracy_score(y_test, y_pred)

print(report)
print("Accuracy:", accuracy)

              precision    recall  f1-score   support

         CAM       0.44      0.35      0.39        23
          CB       0.43      0.27      0.33        11
         CDM       0.56      0.38      0.45        13
          CF       1.00      0.67      0.80         3
          CM       0.00      0.00      0.00         4
         LAM       1.00      1.00      1.00         1
          LB       0.53      0.69      0.60        39
         LCB       0.54      0.58      0.56        50
         LCM       0.24      0.41      0.31        29
         LDM       0.62      0.25      0.36        20
          LF       0.00      0.00      0.00         4
          LM       0.45      0.29      0.36        34
          LS       0.45      0.36      0.40        14
          LW       0.46      0.55      0.50        11
         LWB       0.00      0.00      0.00         4
         RAM       0.00      0.00      0.00         1
          RB       0.60      0.67      0.63        39
         RCB       0.54    

Player Search (Partial Name Match) + Prediction Function

In [42]:
# Use df_test for prediction, since it's the 2022 data.
# search_player_prediction() reads this module-level name, so it must be
# bound before the function is called. This is an alias of df_test, not a copy.
df = df_test  # Make sure df points to test data

# Predict Function by Partial Name Match
def search_player_prediction(name_fragment, players=None):
    """Print actual vs. predicted club position for players whose name matches.

    Parameters
    ----------
    name_fragment : str
        Text to look for inside ``short_name``. Matched case-insensitively
        and literally: regex metacharacters such as ``(`` or ``.`` have no
        special meaning.
    players : pandas.DataFrame, optional
        Frame to search. Needs ``short_name``, ``club_position`` and every
        column in ``numeric_features``. Defaults to the notebook-level ``df``.

    Returns
    -------
    None
        Results are printed, one entry per matching player. Players with a
        missing feature value are reported and skipped.
    """
    if players is None:
        players = df

    # regex=False: the fragment is plain text, not a pattern.
    # na=False: rows without a name are non-matches rather than NaN entries
    # in the boolean mask (which boolean indexing would reject).
    is_match = players['short_name'].str.lower().str.contains(
        name_fragment.lower(), regex=False, na=False
    )
    matched = players[is_match]

    if matched.empty:
        print("No player found with that name fragment.")
        return

    for _, player in matched.iterrows():
        attributes = player[numeric_features]

        # Ensure the player row has all required features
        if attributes.isna().any():
            print(f"\nSkipping {player['short_name']} due to missing data.")
            continue

        try:
            # One-row DataFrame that keeps the feature column names, matching
            # how the model was fitted (avoids sklearn's feature-name warning).
            input_features = attributes.astype(float).to_frame().T
            predicted_position = le.inverse_transform(model.predict(input_features))
            print(f"\nPlayer: {player['short_name']}")
            print(f"   - Actual Position: {player['club_position']}")
            print(f"   - Predicted Position: {predicted_position[0]}")
        except Exception as e:
            # Best-effort display loop: report the failure for this player
            # and carry on with the remaining matches.
            print(f"\nError with {player['short_name']}: {e}")

 

Prediction:

In [43]:

# Run the lookup for a handful of well-known players.
for fragment in ("Saka", "messi", "rice", "ronaldo"):
    search_player_prediction(fragment)


Player: A. Wan-Bissaka
   - Actual Position: RB
   - Predicted Position: RB

Player: B. Saka
   - Actual Position: LM
   - Predicted Position: LB

Player: L. Messi
   - Actual Position: RW
   - Predicted Position: CAM

Player: D. Rice
   - Actual Position: LDM
   - Predicted Position: CDM

Player: Cristiano Ronaldo
   - Actual Position: ST
   - Predicted Position: LS
