In [1]:
import pandas as pd
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Never truncate wide DataFrames when they are displayed.
pd.set_option("display.max_columns", None)

# Load the field-player (fp) and goalkeeper (gk) tables; the first CSV column
# is used as the row index.
df_fp = pd.read_csv("data/fp_data_final.csv", index_col=0)
df_gk = pd.read_csv("data/gk_data_final.csv", index_col=0)

In [3]:
# Random seed shared by every sampling step and estimator in this notebook,
# so that splits and model fits are reproducible.
rseed = 42

In [4]:
# Suppress scientific notation: NumPy prints fixed-point numbers and pandas
# renders floats with two decimals.
np.set_printoptions(suppress=True)
pd.set_option("display.float_format", "{:.2f}".format)

### Prepare datasets for prediction | general

In [5]:
# Drop the columns considered irrelevant for the prediction.
# The same columns are removed from both the field-player and goalkeeper frames.
irrelevant_columns = [
    "player_name",
    "long_name",
    "year_of_birth",
    "height_cm",
    "weight_kg",
    "nationality",
    "club",
]

df_fp = df_fp.drop(columns=irrelevant_columns)
df_gk = df_gk.drop(columns=irrelevant_columns)

### Vorgehensweise
1. Für finalen Test --> Test-Set wegpacken
2. Mit train_set train-test-split und predicten
3. Final mit test_set_final predicten

In [6]:
# 1. Put the final test set aside: a random 80 % of the rows become the
#    development set, the remaining 20 % are held out for the final check.
#    The complete hold-out frame (features + target) is kept, because the
#    final prediction needs the features and not only the target values.

# fieldplayers
train_set_fp = df_fp.sample(frac=0.80, random_state=rseed)
test_set_final_fp = df_fp.drop(train_set_fp.index)  # rows not sampled above
y_final_fp = test_set_final_fp["market_value_in_euro"]

# goal keepers
train_set_gk = df_gk.sample(frac=0.80, random_state=rseed)
test_set_final_gk = df_gk.drop(train_set_gk.index)  # rows not sampled above
y_final_gk = test_set_final_gk["market_value_in_euro"]

print("Fieldplayers: ")
print(train_set_fp.shape)
print(y_final_fp.shape)
print("-----------------------")
print("Goalkeepers: ")
print(train_set_gk.shape)
print(y_final_gk.shape)

Fieldplayers: 
(8598, 54)
(2149,)
-----------------------
Goalkeepers: 
(1110, 37)
(277,)


# define pipeline

### Regressors to be used:
1. Linear Regression
2. Stochastic Gradient Descent
3. Decision Trees
4. Random Forest
5. AdaBoost
6. Gradient Tree Boosting
7. XGBoost


In [9]:
def algo_pipeline(train_set):
    """Fit several regressors on ``train_set`` and draw a residuals plot for each.

    The frame is split 80/20 into a training and a validation part. int64 /
    float64 columns are standardised and object columns are one-hot encoded
    inside a scikit-learn ``Pipeline``, so the preprocessing is fitted on the
    training part only. Every regressor is fitted in such a pipeline and then
    visualised with yellowbrick's ``residuals_plot``.

    Parameters
    ----------
    train_set : pandas.DataFrame
        Development data holding the target column ``market_value_in_euro``
        together with the feature columns.

    Returns
    -------
    None
        The function only produces plots.

    Notes
    -----
    * Reads the module-level ``rseed`` for the split and for every regressor.
    * Columns whose dtype is neither int64/float64 nor object are not assigned
      to a transformer and are therefore dropped by the ``ColumnTransformer``
      (its default is ``remainder="drop"``).
    """
    # Resolve every third-party import up front: a missing package (e.g.
    # yellowbrick) then fails immediately instead of after the first model
    # has already been trained.
    import xgboost as xgb
    from sklearn import linear_model, tree
    from sklearn.compose import ColumnTransformer
    from sklearn.ensemble import (
        AdaBoostRegressor,
        GradientBoostingRegressor,
        RandomForestRegressor,
    )
    from sklearn.model_selection import train_test_split
    from sklearn.pipeline import Pipeline
    from sklearn.preprocessing import OneHotEncoder, StandardScaler
    from yellowbrick.regressor import residuals_plot

    target = "market_value_in_euro"

    ###################################
    # 2.1. train-test-split with remaining 80% of data
    ###################################
    X = train_set.drop(columns=[target])
    y = train_set[target]

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=rseed
    )

    ###################################
    # 2.2. / 2.3. Preprocessing: standard scaler for numeric columns,
    # one-hot encoder for categorical columns
    ###################################
    numeric_features = X.select_dtypes(include=['int64', 'float64']).columns
    categorical_features = X.select_dtypes(include=['object']).columns

    preprocessor = ColumnTransformer(
        transformers=[
            ('num', StandardScaler(), numeric_features),
            # categories unseen during fit are encoded as all-zero rows
            ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features),
        ]
    )

    ###################################
    # 2.4. Fit algorithms
    ###################################
    regressors = [
        linear_model.LinearRegression(),
        linear_model.SGDRegressor(random_state=rseed),
        tree.DecisionTreeRegressor(random_state=rseed),
        RandomForestRegressor(random_state=rseed),
        AdaBoostRegressor(random_state=rseed),
        GradientBoostingRegressor(random_state=rseed),
        xgb.XGBRegressor(random_state=rseed),
    ]

    for regressor in regressors:
        pipe = Pipeline(steps=[('preprocessor', preprocessor),
                               ('regressor', regressor)])
        pipe.fit(X_train, y_train)

        ###################################
        # 2.5. Residuals plot
        ###################################
        # Hand the whole pipeline to yellowbrick, not the bare regressor:
        # X_train / X_test are still raw (unscaled, with object columns), so
        # they must pass through the preprocessor before reaching the model.
        # The explicit title keeps the regressor's name in the plot, which
        # would otherwise read "Pipeline" for every model.
        residuals_plot(
            pipe, X_train, y_train, X_test, y_test,
            title=f"Residuals for {type(regressor).__name__} Model",
        )

In [10]:
# Compare all regressors on the field-player development set.
algo_pipeline(train_set_fp)

ModuleNotFoundError: No module named 'yellowbrick'

In [None]:
        ###################################
        # 2.5. Test model with following metrics
        # a. MSE
        # b. RMSE
        # c. MAE
        # d. MAPE
        # e. R²
        # f. Adjusted R²
        #
        # NOTE(review): this fragment is indented like the body of the
        # `for regressor in regressors:` loop of algo_pipeline and expects
        # `y_test`, `y_pred`, `regressor` and `train_set` to be in scope;
        # it cannot be executed as a stand-alone cell.
        ###################################

        from sklearn.metrics import mean_squared_error
        from sklearn.metrics import mean_absolute_error
        from sklearn.metrics import r2_score

        import math

        # Error metrics are rounded up to whole euros for display.
        mse = math.ceil(mean_squared_error(y_test, y_pred))
        rmse = math.ceil(math.sqrt(mse))
        mae = math.ceil(mean_absolute_error(y_test, y_pred))
        # assumes y_test contains no zeros (division by the true value) — TODO confirm
        mape = round((np.mean(np.abs((y_test - y_pred) / y_test)) * 100),2)

        r2_exact = r2_score(y_test, y_pred)
        r_squared = round(r2_exact,4)

        # Adjusted R² must use the sample the R² was computed on (the
        # validation rows) and the number of predictors, i.e. all columns of
        # train_set except the target. It is derived from the unrounded R².
        n = len(y_test)
        p = train_set.shape[1] - 1
        if n - p - 1 > 0:
            adjusted_r_squared = round((1-(1-r2_exact)*(n-1)/(n-p-1)),4)
        else:
            # not defined when there are too few samples for the predictors
            adjusted_r_squared = float("nan")

        ###################################
        # 2.6. Print results
        ###################################

        from prettytable import PrettyTable
        ptable = PrettyTable()

        ptable.field_names = ["Metric", "Score"]
        ptable.add_row(["MSE",f"{mse:0,}"])
        ptable.add_row(["RMSE",f"{rmse:0,}"])
        ptable.add_row(["MAE",f"{mae:0,}"])
        ptable.add_row(["MAPE",f"{mape:0,}"])
        ptable.add_row(["r_squared",r_squared])
        ptable.add_row(["adjusted_r_squared",adjusted_r_squared])

        print("----------------------------------------------------------------------------------------------------------")
        print("------------",regressor,"------------")
        print("----------------------------------------------------------------------------------------------------------")
        print (ptable)