In [42]:
from google.colab import files
import io
import pandas as pd

# Upload a CSV through the Colab file picker. The result is indexed by
# filename below and each entry is wrapped in BytesIO, i.e. it maps
# uploaded filename -> raw file bytes.
uploaded = files.upload()
if not uploaded:
    # An empty result (e.g. the picker was dismissed) would otherwise surface
    # as a bare StopIteration from next(iter(...)) with no explanation.
    raise ValueError("No file was uploaded; re-run this cell and choose a CSV file.")

# Only the first uploaded file is used; any additional files are ignored.
filename = next(iter(uploaded))
file_content = uploaded[filename]

# Parse the bytes into the DataFrame that later cells refer to as `data`.
# The upload dict lives under its own name so `data` is only ever a DataFrame.
data = pd.read_csv(io.BytesIO(file_content))


Saving pre_model_data.csv to pre_model_data (6).csv


In [44]:
# Per-column null counts and per-column dtypes of the loaded frame
missing_data, data_types = data.isna().sum(), data.dtypes

# Last expression of the cell: display both summaries together as a tuple
(missing_data, data_types)


(player_id                   0
 Year                        0
 matches_appearances         0
 yellow_permatch             0
 red_permatch                0
 goals_permatch              0
 assists_permatch            0
 min_permatch                0
 name                        0
 country_of_birth          479
 country_of_citizenship    424
 city_of_birth              55
 Year_birth                  0
 position                    0
 foot                        6
 height_in_cm                6
 mat                         0
 win_percentage              0
 major_league                0
 Age                         0
 market_value_in_eur         0
 value_increase              0
 matches                     0
 club_changed                0
 type_ratio                  0
 ever_captain                0
 dtype: int64,
 player_id                   int64
 Year                        int64
 matches_appearances         int64
 yellow_permatch           float64
 red_permatch              float64
 goa

In [45]:
# Birthplace / citizenship columns get the placeholder 'Unknown' where empty.
# `data` is modified in place, as before.
location_fill = dict.fromkeys(
    ['country_of_birth', 'country_of_citizenship', 'city_of_birth'], 'Unknown'
)
data.fillna(value=location_fill, inplace=True)

# Each listed column is expanded into one indicator column per distinct value.
# NOTE(review): `name` is in this list, so every distinct player name becomes
# its own column — confirm that is wanted as a model feature.
categorical_columns = [
    'name', 'country_of_birth', 'country_of_citizenship',
    'city_of_birth', 'position', 'foot',
]
data_encoded = pd.get_dummies(data, columns=categorical_columns)

# Show the encoded frame's shape and its column index
(data_encoded.shape, data_encoded.columns)


((11587, 2700),
 Index(['player_id', 'Year', 'matches_appearances', 'yellow_permatch',
        'red_permatch', 'goals_permatch', 'assists_permatch', 'min_permatch',
        'Year_birth', 'height_in_cm',
        ...
        'city_of_birth_Żnin', 'city_of_birth_Žiar nad Hronom',
        'city_of_birth_Žilina', 'position_Attack', 'position_Defender',
        'position_Goalkeeper', 'position_Midfield', 'foot_both', 'foot_left',
        'foot_right'],
       dtype='object', length=2700))

In [46]:
# Order rows by player and season so each diff compares a row with the same
# player's previous row.
data_encoded = data_encoded.sort_values(by=['player_id', 'Year'])

# Change in market value relative to the player's previous row; NaN on each
# player's first row.
# NOTE(review): this assumes a player's rows are consecutive seasons — a gap
# in `Year` is not detected here; confirm against the data.
data_encoded['market_value_change'] = (
    data_encoded.groupby('player_id')['market_value_in_eur'].diff()
)

# Define the target variable 'market_value_trend' from the sign of the change:
#   market_value_change > 0  -> 'Rise'
#   market_value_change < 0  -> 'Fall'
#   market_value_change == 0 -> 'Same'
# The previous pd.cut(bins=[-inf, -1, 1, inf]) used right-closed bins, so any
# change in (-1, 1] — including an increase of exactly 1 — was labelled 'Same',
# which did not match the rule above.
change = data_encoded['market_value_change']
trend = (
    pd.Series('Same', index=change.index, dtype=object)
    .mask(change > 0, 'Rise')
    .mask(change < 0, 'Fall')
    .where(change.notna())  # stay NaN where there is no previous row
)
# Keep the ordered categorical dtype (Fall < Same < Rise) that pd.cut produced.
data_encoded['market_value_trend'] = trend.astype(
    pd.CategoricalDtype(['Fall', 'Same', 'Rise'], ordered=True)
)

# Drop each player's first row, where no change could be computed.
# Re-running this cell on its own output would drop one more row per player
# (the new first row gets a NaN diff), so re-run the encoding cell first.
data_encoded = data_encoded.dropna(subset=['market_value_trend'])

# Display the class counts of the new target variable
data_encoded['market_value_trend'].value_counts()


market_value_trend
Fall    5249
Rise    3543
Same    1332
Name: count, dtype: int64

In [47]:
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

# Features: everything except the identifiers, the raw market value and the
# two columns derived from it for the target. Target: the trend label.
# NOTE(review): `value_increase` remains in X; if it encodes the same
# year-over-year change as the target it leaks the label — confirm.
# NOTE(review): rows with a missing `height_in_cm` are still present here;
# StandardScaler passes NaNs through, but many estimators reject them.
non_feature_columns = ['player_id', 'Year', 'market_value_in_eur', 'market_value_change', 'market_value_trend']
X = data_encoded.drop(columns=non_feature_columns)
y = data_encoded['market_value_trend']

# Split first so the test rows never influence the scaling statistics.
# Same random_state/stratify as before, so the same rows land in each part.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Fit the scaler on the training rows only, then apply it to both parts.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

# Full matrix under its original name, now scaled with train-only statistics.
X_scaled = scaler.transform(X)

# Check the shape of the training and testing data
X_train.shape, X_test.shape, y_train.shape, y_test.shape


((8099, 2697), (2025, 2697), (8099,), (2025,))

In [51]:
# SVC and classification_report are used below but were not imported by any
# earlier cell, so a fresh kernel failed here with NameError.
from sklearn.metrics import classification_report
from sklearn.svm import SVC

# Drop every row that still contains a NaN in any column.
data_encoded_clean = data_encoded.dropna()

# Select features and target from the cleaned dataset.
# NOTE(review): `value_increase` remains a feature; if it encodes the same
# year-over-year change as the target it leaks the label — confirm.
X_clean = data_encoded_clean.drop(['player_id', 'Year', 'market_value_in_eur', 'market_value_change', 'market_value_trend'], axis=1)
y_clean = data_encoded_clean['market_value_trend']

# Split before scaling so the test rows do not contribute to the mean/std.
# `train_test_split` and `scaler` come from the earlier split-and-scale cell.
X_train_clean_raw, X_test_clean_raw, y_train_clean, y_test_clean = train_test_split(
    X_clean, y_clean, test_size=0.2, random_state=42, stratify=y_clean
)

# Refit the shared `scaler` on the training rows only, then apply it to both parts.
X_train_clean = scaler.fit_transform(X_train_clean_raw)
X_test_clean = scaler.transform(X_test_clean_raw)

# Full cleaned matrix under its original name, scaled with train-only statistics.
X_clean_scaled = scaler.transform(X_clean)

# Train a linear-kernel SVM on the cleaned training data.
svm_model_clean = SVC(kernel='linear', probability=True, random_state=42)
svm_model_clean.fit(X_train_clean, y_train_clean)

# Predict on the cleaned test data and build the per-class report.
y_pred_clean = svm_model_clean.predict(X_test_clean)
report_clean = classification_report(y_test_clean, y_pred_clean)

report_clean


'              precision    recall  f1-score   support\n\n        Fall       0.82      0.89      0.85      1050\n        Rise       1.00      1.00      1.00       708\n        Same       0.33      0.22      0.26       266\n\n    accuracy                           0.84      2024\n   macro avg       0.72      0.70      0.70      2024\nweighted avg       0.82      0.84      0.83      2024\n'

In [54]:
# Print the report string so its embedded newlines render as an aligned table
# (the bare expression in the previous cell shows the escaped repr instead).
print(report_clean)

              precision    recall  f1-score   support

        Fall       0.82      0.89      0.85      1050
        Rise       1.00      1.00      1.00       708
        Same       0.33      0.22      0.26       266

    accuracy                           0.84      2024
   macro avg       0.72      0.70      0.70      2024
weighted avg       0.82      0.84      0.83      2024



In [57]:
import pickle

from google.colab import drive

# Make Google Drive available under /content/drive.
drive.mount('/content/drive')

# The estimator trained in this notebook is the scikit-learn SVC held in
# `svm_model_clean`. It has no Keras-style `.save()` method, and no `model`
# variable is defined by the visible cells, so it is serialized with pickle.
# NOTE(review): new inputs must be scaled with the same fitted `scaler` before
# predicting — consider persisting it alongside the model.
MODEL_PATH = '/content/drive/MyDrive/Colab Notebooks/SVM_football.pkl'
with open(MODEL_PATH, 'wb') as model_file:
    pickle.dump(svm_model_clean, model_file)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import pickle

# Load the pickled scikit-learn SVM back from Drive (Drive must be mounted and
# scikit-learn importable). The path is a quoted string; the previous unquoted
# path was a SyntaxError, and Keras `load_model` cannot read an sklearn estimator.
# Unpickling executes code stored in the file — only load files you created.
MODEL_PATH = '/content/drive/MyDrive/Colab Notebooks/SVM_football.pkl'
with open(MODEL_PATH, 'rb') as model_file:
    model = pickle.load(model_file)