Neural Networks


In [1]:
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, KFold
from sklearn.preprocessing import StandardScaler, LabelEncoder
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.callbacks import History, EarlyStopping
from tensorflow.keras.utils import to_categorical
from sklearn.metrics import confusion_matrix, mean_squared_error
from tensorflow.keras.optimizers import Adam
import tensorflow as tf
import random

# Set seeds for reproducibility
np.random.seed(42)
random.seed(42)
tf.random.set_seed(42)

# Create directories if they don't exist
graphs_dir = 'graphs'
models_dir = 'models'
os.makedirs(graphs_dir, exist_ok=True)
os.makedirs(models_dir, exist_ok=True)

# Load the dataset
file_path = 'fifa_players_processed.csv'
data = pd.read_csv(file_path, index_col=0)

# List of columns to save for post-analysis
post_analysis_columns = [
    'player_id', 'overall', 'potential', 'club_team_id', 'league_name', 
    'nationality_name', 'st', 'lw', 'cf', 'rw', 'cam', 'lm', 'cm', 'rm', 
    'lwb', 'cdm', 'rwb', 'lb', 'cb', 'rb', 'gk', 'alternative_positions'
]

# Separate the post-analysis columns
post_analysis_data = data[post_analysis_columns]

# Drop the post-analysis columns from the main dataset
data_cleaned = data.drop(columns=post_analysis_columns)

# Ensure there are no NaNs in 'first_position'
assert data_cleaned['first_position'].isnull().sum() == 0, "There are NaNs in the 'first_position' column before encoding."

# Encode the target feature 'first_position' as categorical labels
label_encoder = LabelEncoder()
y_labels = label_encoder.fit_transform(data_cleaned['first_position'])

# Save the label encoder classes for later use in confusion matrix
label_classes = label_encoder.classes_

# Drop the original 'first_position' column from the cleaned data
X = data_cleaned.drop(columns=['first_position'])

# Combine the cleaned data with the target labels
data_for_training = X.copy()
data_for_training['first_position'] = y_labels

# Save the post-analysis dataset to a CSV file
post_analysis_file_path = 'fifa_post_analysis.csv'
post_analysis_data.to_csv(post_analysis_file_path, index=False)

# Save the training dataset to a CSV file
training_file_path = 'fifa_training_data.csv'
data_for_training.to_csv(training_file_path, index=False)

# Display the first few rows of the training dataset
print("\nFirst few rows of the training dataset:")
print(data_for_training.head())

# Display the first few rows of the post-analysis dataset
print("\nFirst few rows of the post-analysis dataset:")
print(post_analysis_data.head())

# Load training data
data_path = 'fifa_training_data.csv'
data = pd.read_csv(data_path)
X = data.drop('first_position', axis=1)
y = data['first_position']

# Convert target variable to categorical
y_categorical = to_categorical(y)

# Split the data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y_categorical, test_size=0.3, random_state=42)

# Standardize the data
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Define the model
def create_model(activation='relu', learning_rate=0.001, dropout_rate=None):
    model = Sequential()
    model.add(Dense(64, input_dim=X_train.shape[1], activation=activation))
    if dropout_rate:
        model.add(Dropout(dropout_rate))
    model.add(Dense(64, activation=activation))
    if dropout_rate:
        model.add(Dropout(dropout_rate))
    model.add(Dense(y_train.shape[1], activation='softmax'))
    model.compile(loss='categorical_crossentropy', optimizer=Adam(learning_rate=learning_rate), metrics=['accuracy'])
    return model

# Cross-validation settings
kf = KFold(n_splits=5, shuffle=True, random_state=42)
activation_functions = ['relu', 'tanh']
learning_rates = [0.1, 0.01, 0.001]
dropout_rate = None  # No dropout for this test to avoid regularization issues
history_logs = []
global_suffix_count = 1  # Global suffix count

for activation in activation_functions:
    for learning_rate in learning_rates:
        fold_no = 1
        best_val_accuracy = 0
        best_epoch = 0
        train_accuracies = []
        val_accuracies = []
        train_losses = []
        val_losses = []

        for train_index, val_index in kf.split(X_train):
            X_train_fold, X_val_fold = X_train[train_index], X_train[val_index]
            y_train_fold, y_val_fold = y_train[train_index], y_train[val_index]

            model = create_model(activation=activation, learning_rate=learning_rate, dropout_rate=dropout_rate)
            history = model.fit(X_train_fold, y_train_fold, epochs=100, batch_size=32, validation_data=(X_val_fold, y_val_fold), verbose=0, callbacks=[EarlyStopping(monitor='val_loss', patience=10)])

            for epoch, val_acc in enumerate(history.history['val_accuracy']):
                if val_acc > best_val_accuracy:
                    best_val_accuracy = val_acc
                    best_epoch = epoch + 1

            train_accuracies.extend(history.history['accuracy'])
            val_accuracies.extend(history.history['val_accuracy'])
            train_losses.extend(history.history['loss'])
            val_losses.extend(history.history['val_loss'])

            fold_no += 1

        # Save combined learning and loss curves
        plt.figure()
        plt.plot(train_accuracies, 'o-', color='r', label='Training accuracy')
        plt.plot(val_accuracies, 'o-', color='g', label='Validation accuracy')
        plt.title(f'Learning Curve for activation={activation}, learning_rate={learning_rate}')
        plt.xlabel('Epochs')
        plt.ylabel('Accuracy')
        plt.legend(loc='best')
        plt.grid()
        learning_curve_path = os.path.join(graphs_dir, f'learning_curve_{activation}_{learning_rate}_{global_suffix_count}.png')
        plt.savefig(learning_curve_path)
        plt.close()
        global_suffix_count += 1

        plt.figure()
        plt.plot(train_losses, 'o-', color='r', label='Training loss')
        plt.plot(val_losses, 'o-', color='g', label='Validation loss')
        plt.title(f'Loss Curve for activation={activation}, learning_rate={learning_rate}')
        plt.xlabel('Epochs')
        plt.ylabel('Loss')
        plt.legend(loc='best')
        plt.grid()
        loss_curve_path = os.path.join(graphs_dir, f'loss_curve_{activation}_{learning_rate}_{global_suffix_count}.png')
        plt.savefig(loss_curve_path)
        plt.close()
        global_suffix_count += 1

        # Collect history logs
        history_logs.append((activation, learning_rate, best_val_accuracy, best_epoch))

        # Save model
        model_path = os.path.join(models_dir, f'model_{activation}_{learning_rate}.h5')
        model.save(model_path)

# Validation curves
suffix_count = global_suffix_count
for activation in activation_functions:
    train_scores = []
    val_scores = []

    for learning_rate in learning_rates:
        model_path = os.path.join(models_dir, f'model_{activation}_{learning_rate}.h5')
        model = create_model(activation=activation, learning_rate=learning_rate)
        model.load_weights(model_path)

        history = model.fit(X_train, y_train, epochs=1, batch_size=32, validation_data=(X_test, y_test), verbose=0)
        train_scores.append(history.history['accuracy'][-1])
        val_scores.append(history.history['val_accuracy'][-1])

    plt.figure()
    plt.plot(learning_rates, train_scores, 'o-', color='r', label='Training score')
    plt.plot(learning_rates, val_scores, 'o-', color='g', label='Validation score')
    plt.title(f'Validation Curve for Learning Rate (activation={activation})')
    plt.xlabel('Learning rate')
    plt.ylabel('Score')
    plt.xscale('log')
    plt.legend(loc='best')
    plt.grid()
    validation_curve_path = os.path.join(graphs_dir, f'validation_curve_lr_{activation}_{suffix_count}.png')
    plt.savefig(validation_curve_path)
    plt.close()

    suffix_count += 1

# Find the best model
best_model_info = max(history_logs, key=lambda x: x[2])
best_activation, best_learning_rate, best_val_accuracy, best_epoch = best_model_info

best_model_path = os.path.join(models_dir, f'model_{best_activation}_{best_learning_rate}.h5')
best_model = create_model(activation=best_activation, learning_rate=best_learning_rate)
best_model.load_weights(best_model_path)

# Evaluate on test data
y_pred = best_model.predict(X_test)
y_pred_classes = np.argmax(y_pred, axis=1)
y_true = np.argmax(y_test, axis=1)

# Map the numerical labels back to original positions
y_pred_labels = label_encoder.inverse_transform(y_pred_classes)
y_true_labels = label_encoder.inverse_transform(y_true)

# Confusion Matrix
# Confusion Matrix
conf_matrix = confusion_matrix(y_true_labels, y_pred_labels, labels=label_classes)
plt.figure(figsize=(10, 7))
sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues', xticklabels=label_classes, yticklabels=label_classes)
plt.title('Confusion Matrix for Best Model')
plt.xlabel('Predicted')
plt.ylabel('True')
conf_matrix_path = os.path.join(graphs_dir, f'confusion_matrix_best_model_{suffix_count}.png')
plt.savefig(conf_matrix_path)
plt.close()

# Mean Squared Error
mse = mean_squared_error(y_true, y_pred_classes)
print(f'Mean Squared Error: {mse}')

print("All tasks completed successfully. Models and graphs are saved in their respective directories.")



First few rows of the training dataset:
     value_eur  wage_eur  age  height_cm  weight_kg  club_jersey_number  \
0  181500000.0  230000.0   24        182         75                 7.0   
1  185000000.0  340000.0   22        195         94                 9.0   
2  103000000.0  350000.0   32        181         75                17.0   
3   41000000.0   23000.0   36        169         67                10.0   
4   51000000.0   95000.0   35        185         81                 9.0   

   weak_foot  skill_moves  attacking_crossing  attacking_finishing  ...  \
0          4            5                  78                   94  ...   
1          3            3                  47                   96  ...   
2          5            4                  95                   85  ...   
3          4            4                  83                   89  ...   
4          4            4                  75                   91  ...   

   body_type_Lean (170-185)  body_type_Lean (185+)  body_

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m172/172[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 555us/step
Mean Squared Error: 8.109041095890412
All tasks completed successfully. Models and graphs are saved in their respective directories.


In [3]:
# Load the dataset
file_path = 'fifa_players_processed.csv'
data = pd.read_csv(file_path, index_col=0)

# Count the number of each first_position in the dataset
position_counts = data['first_position'].value_counts()

# Create a DataFrame to display the counts
position_counts_df = position_counts.reset_index()
position_counts_df.columns = ['Position', 'Count']

# Display the DataFrame
print(position_counts_df)

   Position  Count
0        CB   3244
1        ST   2446
2        CM   2189
3        GK   2033
4       CDM   1511
5        RB   1179
6        LB   1155
7       CAM   1043
8        LM    954
9        RM    939
10       RW    451
11       LW    411
12      RWB    300
13      LWB    265
14       CF    130


In [4]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import confusion_matrix
import os

# Your confusion matrix (assuming it is stored in conf_matrix)
conf_matrix = np.array([
    [187, 0, 0, 1, 54, 0, 1, 24, 7, 0, 0, 21, 3, 0, 16],
    [0, 835, 50, 0, 1, 0, 14, 0, 0, 0, 22, 0, 0, 0, 0],
    [0, 36, 292, 0, 81, 0, 7, 0, 0, 0, 5, 2, 0, 0, 0],
    [20, 0, 0, 3, 1, 0, 0, 5, 7, 0, 0, 4, 1, 0, 10],
    [57, 2, 66, 0, 513, 0, 6, 7, 1, 0, 6, 16, 0, 0, 1],
    [0, 0, 0, 0, 0, 611, 0, 0, 0, 0, 0, 0, 0, 0, 0],
    [1, 17, 5, 0, 12, 0, 295, 4, 0, 10, 22, 3, 0, 0, 0],
    [43, 0, 2, 0, 13, 0, 14, 98, 23, 3, 3, 53, 11, 0, 33],
    [17, 0, 0, 0, 3, 0, 1, 27, 26, 1, 0, 30, 8, 0, 21],
    [0, 1, 0, 0, 6, 0, 66, 4, 0, 2, 5, 1, 0, 0, 0],
    [0, 25, 14, 0, 13, 0, 2, 2, 0, 0, 289, 11, 0, 5, 0],
    [33, 0, 1, 0, 17, 0, 0, 61, 17, 1, 9, 86, 23, 3, 32],
    [15, 0, 0, 0, 2, 0, 1, 37, 12, 0, 0, 42, 22, 0, 18],
    [1, 1, 3, 0, 5, 0, 1, 1, 0, 1, 72, 12, 0, 2, 0],
    [12, 0, 0, 0, 2, 0, 17, 3, 0, 0, 6, 7, 0, 0, 656]
])

# Class labels (assuming they are stored in label_classes)
label_classes = ['CAM', 'CB', 'CDM', 'CF', 'CM', 'GK', 'LB', 'LM', 'LW', 'LWB', 'RB', 'RM', 'RW', 'RWB', 'ST']

# Normalize the confusion matrix by row (i.e by the number of samples in each class)
conf_matrix_normalized = conf_matrix.astype('float') / conf_matrix.sum(axis=1)[:, np.newaxis] * 100

# Plot the normalized confusion matrix
plt.figure(figsize=(10, 7))
sns.heatmap(conf_matrix_normalized, annot=True, fmt='.2f', cmap='Blues', xticklabels=label_classes, yticklabels=label_classes)
plt.title('Confusion Matrix for Best Model (Percentage)')
plt.xlabel('Predicted')
plt.ylabel('True')
conf_matrix_path = os.path.join('graphs', f'confusion_matrix_best_model_percentage.png')
plt.savefig(conf_matrix_path)
plt.close()

print("Confusion matrix with percentages saved successfully.")


Confusion matrix with percentages saved successfully.
