In [1]:
import pandas as pd

In [2]:
from sklearn.preprocessing import LabelEncoder
from sklearn.impute import SimpleImputer

def preprocess_data(file_path, target='overall', n_features=15):
    """Load a players CSV and return the features most correlated with the target.

    Pipeline:
      1. Read the CSV and drop identifier/URL columns that carry no signal.
      2. Median-impute the numeric (float64/int64) columns.
      3. Label-encode every object-dtype column (values are cast to ``str`` first,
         so missing values become the category ``"nan"``).
      4. Rank all features by absolute Pearson correlation with ``target`` and
         keep the ``n_features`` strongest ones.

    Parameters
    ----------
    file_path : str or path-like
        Location of the CSV file to load.
    target : str, default 'overall'
        Name of the numeric column to predict.
    n_features : int, default 15
        Maximum number of features to keep.

    Returns
    -------
    X : pandas.DataFrame
        The selected feature columns, ordered from strongest to weakest
        absolute correlation with the target.
    y : pandas.Series
        The (median-imputed) target column.

    Raises
    ------
    KeyError
        If ``target`` is not a numeric column with at least one value.
    ValueError
        If ``n_features`` is smaller than 1.
    """
    if n_features < 1:
        raise ValueError(f"n_features must be >= 1, got {n_features}")

    # low_memory=False infers each column's dtype from the whole file instead of
    # chunk by chunk, which avoids the mixed-dtype warning and inconsistent
    # string forms (e.g. "5" vs "5.0") inside one object column.
    data = pd.read_csv(file_path, low_memory=False)

    # Drop the irrelevant columns
    irrelevant_columns = ['player_url', 'long_name', 'dob', 'club_logo_url', 'nation_logo_url']
    data = data.drop(columns=irrelevant_columns, errors='ignore')

    # SimpleImputer discards columns that are entirely NaN, which would make the
    # imputed array narrower than the column list; remove them up front.
    numerical_cols = (
        data.select_dtypes(include=['float64', 'int64'])
        .dropna(axis=1, how='all')
    )
    if target not in numerical_cols.columns:
        raise KeyError(f"target column {target!r} is missing, non-numeric, or empty")

    # Impute the missing numeric values with the column median
    imputer = SimpleImputer(strategy="median")
    data_tr = pd.DataFrame(
        imputer.fit_transform(numerical_cols),
        columns=numerical_cols.columns,
    )

    # Label-encode the categorical columns. The frame is built in one go rather
    # than column by column to avoid repeated insertion into a growing DataFrame.
    # NOTE(review): label codes follow the sorted order of the string values,
    # which is not necessarily a meaningful numeric order — confirm this is
    # acceptable for rating-like text columns.
    categorical_names = data.select_dtypes(include=['object']).columns
    encoded_categorical = pd.DataFrame(
        {
            col: LabelEncoder().fit_transform(data[col].astype(str))
            for col in categorical_names
        },
        index=data_tr.index,
    )

    # Combine encoded categorical features with numerical features
    X = pd.concat([data_tr, encoded_categorical], axis=1)
    y = data_tr[target]

    # Rank candidate features by |corr| with the target. The target is removed
    # by name (not by position), and constant columns (NaN correlation) are
    # skipped because they carry no information.
    strength = X.drop(columns=[target]).corrwith(y).abs().dropna()
    top_features = strength.sort_values(ascending=False).head(n_features).index.tolist()

    return X[top_features], y
    

In [3]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
##This is a function that uses random forest regressor to train the model
def randForestRegressor(X, y):
    """Train a 10-tree random forest on an 80/20 split and score the held-out 20%.

    Returns a ``(mse, r2)`` tuple computed on the test portion. Both the split
    and the forest use ``random_state=42``.
    """
    features_train, features_test, target_train, target_test = train_test_split(
        X, y, test_size=0.2, random_state=42
    )

    # fit() returns the estimator itself, so construction and training can be chained.
    forest = RandomForestRegressor(n_estimators=10, random_state=42).fit(
        features_train, target_train
    )
    predictions = forest.predict(features_test)

    return (
        mean_squared_error(target_test, predictions),
        r2_score(target_test, predictions),
    )

In [4]:
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
def gradientboostingRegressor(X, y):
    """Train a small gradient-boosting regressor and score it on a held-out split.

    The data is split 80/20 with ``random_state=42``; the model uses 10 boosting
    stages of depth-2 trees.

    Returns
    -------
    (mse, r2) : tuple of float
        Mean squared error and R^2 on the 20% test portion.
    """
    # Split the data
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Seed the estimator as well, like the other evaluation helpers, so repeated
    # runs of the notebook report the same scores.
    gbrt = GradientBoostingRegressor(max_depth=2, n_estimators=10, random_state=42)
    gbrt.fit(X_train, y_train)

    y_pred = gbrt.predict(X_test)
    mse = mean_squared_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    return mse, r2
    

In [5]:
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error, r2_score

## This is a function that uses a decision tree regressor to train the model
def decisionTreeRegressor(X, y):
    """Fit a single decision tree on an 80/20 split and evaluate it on the test part.

    Returns a ``(mse, r2)`` tuple for the held-out 20%. The split and the tree
    are both seeded with ``random_state=42``.
    """
    train_X, test_X, train_y, test_y = train_test_split(
        X, y, test_size=0.2, random_state=42
    )

    tree = DecisionTreeRegressor(random_state=42)
    tree.fit(train_X, train_y)

    predicted = tree.predict(test_X)
    scores = (
        mean_squared_error(test_y, predicted),
        r2_score(test_y, predicted),
    )
    return scores

In [6]:
# Build the feature matrix X and target y from the legacy players file.
# NOTE(review): the path is absolute and machine-specific, so this cell only
# runs where that exact file exists.
X, y = preprocess_data("C:\\Users\\Edward Ofosu Mensah\\Downloads\\male_players (legacy).csv")
# Baseline score: random forest on an 80/20 split. X and y stay bound for the
# comparison cells that follow.
mse, r2 =  randForestRegressor(X, y)
print(f"Mean squared error: {mse}")
print(f"R^2 score: {r2}")

  data = pd.read_csv(file_path)


Mean squared error: 3.1467304266244462
R^2 score: 0.9365111811372516


In [7]:
# Score gradient boosting on the same X, y for comparison; overwrites mse and r2.
mse, r2 =  gradientboostingRegressor(X, y)
print(f"Mean squared error: {mse}")
print(f"R^2 score: {r2}")

Mean squared error: 17.058098759559172
R^2 score: 0.6558337081799929


In [8]:
# Score a single decision tree on the same X, y for comparison; overwrites mse and r2.
mse, r2 = decisionTreeRegressor(X, y)
print(f"Mean squared error: {mse}")
print(f"R^2 score: {r2}")

Mean squared error: 5.835740047377885
R^2 score: 0.8822573933682945


In [9]:
from sklearn.model_selection import GridSearchCV

##Tune the randomForest
def tune_random_forest(X, y):
    """Grid-search random-forest hyperparameters and score the best model.

    The data is split 80/20 (``random_state=42``). A 5-fold cross-validated grid
    search over ``n_estimators`` and ``max_features`` runs on the training part,
    selecting by negative mean squared error; the best estimator is then
    evaluated on the held-out 20%. The chosen parameters are printed.

    Returns
    -------
    (mse, r2) : tuple of float
        Mean squared error and R^2 of the best estimator on the test portion.
    """
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    param_grid = [
        {'n_estimators': [10, 40],
        'max_features': [4, 6]}
    ]
    # Seed the forest so the selected parameters and reported scores are the
    # same on every run, matching the other evaluation helpers.
    rf = RandomForestRegressor(random_state=42)
    grid_search = GridSearchCV(rf, param_grid, cv=5, scoring='neg_mean_squared_error')
    grid_search.fit(X_train, y_train)

    best_rf = grid_search.best_estimator_
    y_pred = best_rf.predict(X_test)
    mse = mean_squared_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)

    print("Best Random Forest parameters:", grid_search.best_params_)
    return mse, r2

In [10]:
# Run the grid search on the current X, y and report the tuned model's test scores.
mse, r2 =  tune_random_forest(X, y)
print(f"Mean squared error: {mse}")
print(f"R^2 score: {r2}")

Best Random Forest parameters: {'max_features': 4, 'n_estimators': 40}
Mean squared error: 2.529910537302928
R^2 score: 0.9489562148435813


In [11]:
### The Random Forest Regressor gave the best R2 score of the models tried, so I will stick with it and move on to the players_22 data.
# NOTE: this rebinds X and y, replacing the features built from the legacy file.

X, y = preprocess_data("C:\\Users\\Edward Ofosu Mensah\\Downloads\\players_22-1.csv")


  data = pd.read_csv(file_path)


In [12]:
# Display the selected feature matrix currently bound to X.
X

Unnamed: 0,movement_reactions,mentality_composure,passing,cm,lcm,rcm,potential,rm,lm,ram,cam,lam,dribbling,lf,cf
0,94.0,96.0,91.0,210,210,210,93.0,234,234,251,251,251,95.0,131,131
1,93.0,88.0,79.0,186,186,186,92.0,224,224,245,245,245,86.0,128,128
2,94.0,95.0,80.0,177,177,177,91.0,230,230,245,245,245,88.0,129,129
3,89.0,93.0,86.0,195,195,195,91.0,232,232,249,249,249,94.0,128,128
4,91.0,89.0,93.0,211,211,211,91.0,232,232,249,249,249,88.0,127,127
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19234,53.0,37.0,46.0,48,48,48,52.0,46,46,47,47,47,48.0,30,30
19235,49.0,47.0,50.0,49,49,49,59.0,49,49,48,48,48,46.0,31,31
19236,46.0,36.0,45.0,49,49,49,55.0,47,47,48,48,48,49.0,31,31
19237,48.0,47.0,36.0,35,35,35,60.0,43,43,46,46,46,48.0,32,32


In [13]:
# Display the target series currently bound to y.
y

0        93.0
1        92.0
2        91.0
3        91.0
4        91.0
         ... 
19234    47.0
19235    47.0
19236    47.0
19237    47.0
19238    47.0
Name: overall, Length: 19239, dtype: float64

In [19]:
## Create the final random forest model (this is the estimator that gets fitted and pickled).
# Seeded so that retraining produces the same model on every run.
best_model = RandomForestRegressor(random_state=42)

In [20]:
# Train on all of the current X, y (no hold-out split is made here).
best_model.fit(X, y)

In [16]:
mse, r2_score = randForestRegressor(X, y)

In [17]:
print(f"MSE {mse}, RS_SCORE {r2_score}")

MSE 4.104714137214137, RS_SCORE 0.912288923851465


In [18]:
##create a pickle file using serialisation
import pickle

# The context manager closes the file even if pickle.dump raises, so a failed
# dump does not leave an open handle behind.
with open("regression.pkl", "wb") as pickle_out:
    pickle.dump(best_model, pickle_out)