In [6]:
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, GridSearchCV, learning_curve, validation_curve
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix, mean_squared_error, accuracy_score
import joblib

# Seed NumPy's global random state so repeated runs are reproducible
np.random.seed(42)

# Output folders for figures and serialized models; created if missing
graphs_dir, models_dir = 'graphs_knn', 'models_knn'
for output_dir in (graphs_dir, models_dir):
    os.makedirs(output_dir, exist_ok=True)

# Read the preprocessed player table; the first CSV column becomes the index
file_path = 'fifa_players_processed.csv'
data = pd.read_csv(file_path, index_col=0)

# Columns held back from training and exported separately for post-analysis
post_analysis_columns = [
    'player_id', 'overall', 'potential', 'club_team_id', 'league_name',
    'nationality_name',
    'st', 'lw', 'cf', 'rw', 'cam', 'lm', 'cm', 'rm',
    'lwb', 'cdm', 'rwb', 'lb', 'cb', 'rb', 'gk',
    'alternative_positions',
]

# Split the frame: held-back columns on one side, everything else on the other
post_analysis_data = data.loc[:, post_analysis_columns]
data_cleaned = data.drop(columns=post_analysis_columns)

# The target column must be fully populated before it can be encoded
assert not data_cleaned['first_position'].isnull().any(), "There are NaNs in the 'first_position' column."

# Encode the 'first_position' strings as integer class ids, and keep the
# class names for labelling the confusion matrix later
label_encoder = LabelEncoder()
label_encoder.fit(data_cleaned['first_position'])
y_labels = label_encoder.transform(data_cleaned['first_position'])
label_classes = label_encoder.classes_

# Feature matrix: the cleaned frame without the raw target column
X = data_cleaned.drop(columns=['first_position'])

# Training table: a copy of the features with the encoded target appended
data_for_training = X.assign(first_position=y_labels)

# Write both tables to CSV (the index is not written)
post_analysis_file_path = 'fifa_post_analysis.csv'
training_file_path = 'fifa_training_data.csv'
post_analysis_data.to_csv(post_analysis_file_path, index=False)
data_for_training.to_csv(training_file_path, index=False)

# Preview the training table
print("\nFirst few rows of the training dataset:")
print(data_for_training.head())

# Load training data (the encoded table written to 'fifa_training_data.csv')
data_path = 'fifa_training_data.csv'
data = pd.read_csv(data_path)
X = data.drop('first_position', axis=1)
y = data['first_position']

# Split the data into train and test sets (30% held out, fixed seed)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Standardize the data: the scaler is fitted on the training split only and
# then applied unchanged to the test split
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# NOTE(review): the scaler is fitted on all of X_train before cross-validation,
# so every CV validation fold has already contributed to the scaling
# statistics. Wrapping scaler + KNN in a Pipeline would avoid that, but it
# would change what best_estimator_ is for the code that uses it afterwards.

# Define the parameter grid for KNN
param_grid = {
    'n_neighbors': [5, 10]
}

# Initialize the KNN model
knn = KNeighborsClassifier()

# Setup GridSearchCV (5-fold CV, accuracy, all available cores)
grid_search = GridSearchCV(knn, param_grid, cv=5, scoring='accuracy', n_jobs=-1, verbose=2)

# Fit the model
grid_search.fit(X_train, y_train)

# Save the best model
best_model_path = os.path.join(models_dir, 'best_model_knn.joblib')
joblib.dump(grid_search.best_estimator_, best_model_path)

# The saved estimator was trained on standardized features and predicts
# integer class ids. Persist the fitted scaler and label encoder next to it so
# the model can be applied to raw features and its output decoded later.
scaler_path = os.path.join(models_dir, 'scaler_knn.joblib')
joblib.dump(scaler, scaler_path)
label_encoder_path = os.path.join(models_dir, 'label_encoder_knn.joblib')
joblib.dump(label_encoder, label_encoder_path)

# Generate learning curves and validation curves for each combination of hyperparameters.
#
# validation_curve sweeps n_neighbors itself (overriding whatever value clf was
# built with), so its scores depend only on the *other* hyperparameters in
# `params`. The sweep is cached on those, so it is not recomputed for every
# n_neighbors value in the grid.
param_range = param_grid['n_neighbors']
validation_scores_cache = {}

# Plain counter rather than enumerate(): its value after the loop is reused in
# the confusion-matrix file name further down.
suffix_count = 1
for params in grid_search.cv_results_['params']:
    n_neighbors = params["n_neighbors"]
    clf = KNeighborsClassifier(**params)
    # learning_curve / validation_curve clone the estimator, so this fit does
    # not affect the curves; it only leaves `clf` itself fitted after the loop.
    clf.fit(X_train, y_train)

    # Learning curve (accuracy is also the default score for classifiers; it is
    # spelled out here to match the validation curve and the grid search)
    train_sizes, train_scores, test_scores = learning_curve(
        clf, X_train, y_train, cv=5, scoring='accuracy', n_jobs=-1)
    train_scores_mean = np.mean(train_scores, axis=1)
    test_scores_mean = np.mean(test_scores, axis=1)

    fig, ax = plt.subplots()
    ax.plot(train_sizes, train_scores_mean, 'o-', color='r', label='Training score')
    ax.plot(train_sizes, test_scores_mean, 'o-', color='g', label='Cross-validation score')
    ax.set_title(f'Learning Curve for KNN (n_neighbors={n_neighbors})')
    ax.set_xlabel('Training examples')
    ax.set_ylabel('Score')
    ax.legend(loc='best')
    ax.grid()
    learning_curve_path = os.path.join(graphs_dir, f'learning_curve_knn_{n_neighbors}_suffix{suffix_count}.png')
    fig.savefig(learning_curve_path)
    plt.close(fig)

    # Validation curve for n_neighbors: reuse the sweep if it was already run
    # for the same setting of all other hyperparameters
    cache_key = tuple(sorted(
        (name, repr(value)) for name, value in params.items() if name != 'n_neighbors'))
    if cache_key not in validation_scores_cache:
        validation_scores_cache[cache_key] = validation_curve(
            clf, X_train, y_train, param_name="n_neighbors", param_range=param_range,
            cv=5, scoring="accuracy", n_jobs=-1)
    train_scores, test_scores = validation_scores_cache[cache_key]

    train_scores_mean = np.mean(train_scores, axis=1)
    test_scores_mean = np.mean(test_scores, axis=1)

    fig, ax = plt.subplots()
    ax.plot(param_range, train_scores_mean, 'o-', color='r', label='Training score')
    ax.plot(param_range, test_scores_mean, 'o-', color='g', label='Validation score')
    # Mark the setting named in the title; without it every plot of the same
    # sweep would be indistinguishable apart from its title
    ax.axvline(n_neighbors, color='gray', linestyle='--', label=f'n_neighbors={n_neighbors}')
    ax.set_title(f'Validation Curve for KNN (n_neighbors={n_neighbors})')
    ax.set_xlabel('n_neighbors')
    ax.set_ylabel('Score')
    ax.legend(loc='best')
    ax.grid()
    validation_curve_path = os.path.join(graphs_dir, f'validation_curve_knn_{n_neighbors}_suffix{suffix_count}.png')
    fig.savefig(validation_curve_path)
    plt.close(fig)

    suffix_count += 1

# Evaluate on test data
y_pred = grid_search.best_estimator_.predict(X_test)

# Confusion Matrix
# Every encoded class id is passed explicitly so the matrix always has one
# row/column per entry of label_classes, even if some class does not occur in
# y_test or y_pred; otherwise the tick labels below could be misaligned.
class_ids = np.arange(len(label_classes))
conf_matrix = confusion_matrix(y_test, y_pred, labels=class_ids)
plt.figure(figsize=(10, 7))
sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues', xticklabels=label_classes, yticklabels=label_classes)
plt.title('Confusion Matrix for Best KNN Model')
plt.xlabel('Predicted')
plt.ylabel('True')
# suffix_count is whatever the curve-plotting loop above left it at
conf_matrix_path = os.path.join(graphs_dir, f'confusion_matrix_best_knn_suffix{suffix_count}.png')
plt.savefig(conf_matrix_path)
plt.close()

# Test accuracy
# The targets are label-encoded position names, i.e. nominal class ids with no
# numeric meaning, so a squared error between ids says nothing about model
# quality. Accuracy is reported instead, matching the grid-search scoring.
accuracy = accuracy_score(y_test, y_pred)
print(f'Test accuracy: {accuracy:.4f}')

print("All tasks completed successfully. Models and graphs are saved in their respective directories.")



First few rows of the training dataset:
     value_eur  wage_eur  age  height_cm  weight_kg  club_jersey_number  \
0  181500000.0  230000.0   24        182         75                 7.0   
1  185000000.0  340000.0   22        195         94                 9.0   
2  103000000.0  350000.0   32        181         75                17.0   
3   41000000.0   23000.0   36        169         67                10.0   
4   51000000.0   95000.0   35        185         81                 9.0   

   weak_foot  skill_moves  attacking_crossing  attacking_finishing  ...  \
0          4            5                  78                   94  ...   
1          3            3                  47                   96  ...   
2          5            4                  95                   85  ...   
3          4            4                  83                   89  ...   
4          4            4                  75                   91  ...   

   body_type_Lean (170-185)  body_type_Lean (185+)  body_

In [7]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import confusion_matrix

# Percentage version of the best model's confusion matrix.
#
# This uses `conf_matrix` and `label_classes` as left in the kernel by the
# evaluation cell, instead of a hand-copied snapshot of the numbers, so the
# figure always reflects the model that was actually evaluated. The cell must
# therefore run after the evaluation cell (it already relies on that cell's
# `import os`).
conf_matrix = np.asarray(conf_matrix)
label_classes = list(label_classes)

# Normalize the confusion matrix by row (i.e. by the number of samples in each class).
# A class with no true samples has a row total of 0; its row is left at 0%
# rather than dividing by zero and producing NaN cells.
row_totals = conf_matrix.sum(axis=1, keepdims=True)
conf_matrix_normalized = np.divide(
    conf_matrix,
    row_totals,
    out=np.zeros(conf_matrix.shape, dtype=float),
    where=row_totals != 0,
) * 100

# Plot the normalized confusion matrix
plt.figure(figsize=(10, 7))
sns.heatmap(conf_matrix_normalized, annot=True, fmt='.2f', cmap='Blues', xticklabels=label_classes, yticklabels=label_classes)
plt.title('Confusion Matrix for Best KNN Model (Percentage)')
plt.xlabel('Predicted')
plt.ylabel('True')
conf_matrix_path = os.path.join('graphs_knn', 'confusion_matrix_best_knn_percentage.png')
plt.savefig(conf_matrix_path)
plt.close()

print("Confusion matrix with percentages saved successfully.")


Confusion matrix with percentages saved successfully.
