In [4]:
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

# Load the CSV file
file_path = r"premier league players.csv"
df = pd.read_csv(file_path)

# Add additional features and handle NaNs
df['Goals_per_Game'] = df['Goals'] / df['Appearances']
df['Assists_per_Game'] = df['Assists'] / df['Appearances']
df = df.fillna(0)  # Replace NaNs with 0

# Classify players with very few appearances as 'Inactive'
threshold = 5
df['Position'] = df.apply(lambda row: 'Inactive' if row['Appearances'] < threshold else row['Position'], axis=1)

# Select numeric features and target
numeric_features = ['Goals', 'Assists', 'Appearances', 'Tackles', 'Interceptions', 'Passes', 'Clearances',
                    'Goals_per_Game', 'Assists_per_Game', 'Shots', 'Shots on target', 'Shooting accuracy %',
                    'Goals conceded', 'Saves', 'Clean sheets', 'Tackle success %', 'Duels won', 'Aerial battles won',
                    'Crosses', 'Cross accuracy %', 'Headed goals', 'Goals with right foot', 'Goals with left foot',
                    'Penalties scored', 'Freekicks scored', 'Big chances missed', 'Duels lost', 'Aerial battles lost',
                    'Successful 50/50s']

features = df[numeric_features].select_dtypes(include=['float64', 'int64'])
target = df['Position']

# Handle missing values for numeric columns only
features = features.fillna(features.mean())

# Encode categorical target labels
label_encoder = LabelEncoder()
target_encoded = label_encoder.fit_transform(target)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(features, target_encoded, test_size=0.2, random_state=42)

# Normalize/scale the features
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Initialize the Decision Tree model
decision_tree = DecisionTreeClassifier(criterion='entropy', max_depth = 5)

# Fit the model to the training data
decision_tree.fit(X_train, y_train)

# Calculate Training Accuracy
y_train_pred = decision_tree.predict(X_train)
training_accuracy = accuracy_score(y_train, y_train_pred)
print(f'Training accuracy: {training_accuracy}')

# Perform 5-fold cross-validation and calculate the average score
cv_scores = cross_val_score(decision_tree, X_train, y_train, cv=5, scoring='accuracy')
print(f'Cross-validation scores: {cv_scores}')
print(f'Average cross-validation score: {cv_scores.mean()}')

# Calculate Testing Accuracy for comparison
y_test_pred = decision_tree.predict(X_test)
testing_accuracy = accuracy_score(y_test, y_test_pred)
print(f'Testing accuracy: {testing_accuracy}')


Training accuracy: 0.9868421052631579
Cross-validation scores: [1.         0.92307692 0.95604396 0.95604396 0.9010989 ]
Average cross-validation score: 0.9472527472527472
Testing accuracy: 0.9652173913043478
