In [2]:
import pandas as pd
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns

In [15]:
df_fp = pd.read_csv("data/fp_data_final.csv", index_col=0)
df_gk = pd.read_csv("data/gk_data_final.csv", index_col=0)
pd.set_option('display.max_columns', None)

In [5]:
# set random seed for all algos
rseed = 42

### Prepare datasets for prediction | general

In [17]:
# drop irrelevant rows

df_fp = df_fp.drop(["player_name","long_name","year_of_birth","height_cm","weight_kg","nationality","club"],axis=1)

df_gk = df_gk.drop(["player_name","long_name","year_of_birth","height_cm","weight_kg","nationality","club"],axis=1)

### Vorgehensweise
1. Für finalen Test --> Test-Set wegpacken
2. Mit train_set train-test-split und predicten
3. Final mit test_set_final predicten

In [5]:
# 1. put final test set aside

# fieldplayers
train_set_fp = df_fp.sample(frac=0.80, random_state=rseed)
y_final_fp = df_fp.drop(train_set_fp.index)
y_final_fp = y_final_fp["market_value_in_euro"]

# goal keepers
train_set_gk = df_gk.sample(frac=0.80, random_state=rseed)
y_final_gk = df_gk.drop(train_set_gk.index)
y_final_gk = y_final_gk["market_value_in_euro"]

print("Fieldplayers: ")
print(train_set_fp.shape)
print(y_final_fp.shape)
print("-----------------------")
print("Goalkeepers: ")
print(train_set_gk.shape)
print(y_final_gk.shape)

Fieldplayers: 
(8598, 54)
(2149,)
-----------------------
Goalkeepers: 
(1110, 37)
(277,)


# define first pipeline

In [6]:
def lr_pipeline (train_set):

    ###################################
    # 2.1. train-test-split with remaining 80% of data
    ###################################

    from sklearn.model_selection import train_test_split

    X = train_set.drop(["market_value_in_euro"],axis=1)

    y = train_set["market_value_in_euro"]

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=rseed)

    ###################################
    # 2.2. Build preprocessing pipeline
    ###################################

    from sklearn.pipeline import Pipeline
    from sklearn.preprocessing import StandardScaler, OneHotEncoder

    numeric_transformer = Pipeline(steps=[
        ('scaler', StandardScaler())])

    categorical_transformer = Pipeline(steps=[
        ('onehot', OneHotEncoder(handle_unknown='ignore'))])

    ###################################
    # 2.3. Apply column transformer
    ###################################

    from sklearn.compose import ColumnTransformer

    numeric_features = train_set.select_dtypes(include=['int64', 'float64']).drop(["market_value_in_euro"],axis=1).columns
    categorical_features = train_set.select_dtypes(include=['object']).columns

    preprocessor = ColumnTransformer(
        transformers=[
            ('num', numeric_transformer, numeric_features),
            ('cat', categorical_transformer, categorical_features)])

    ###################################
    # 2.4. Fit algorithm - in this case try a simple linear regression
    ###################################

    from sklearn.linear_model import LinearRegression

    lr = Pipeline(steps=[('preprocessor', preprocessor),
                          ('regressor', LinearRegression())])

    lr.fit(X_train, y_train)

    y_pred = lr.predict(X_test)

    ###################################
    # 2.5. Test model
    ###################################

    from sklearn.metrics import mean_squared_error
    from math import sqrt
    from sklearn.metrics import r2_score

    rms = sqrt(mean_squared_error(y_test, y_pred))
    print(rms)
    print(r2_score(y_test, y_pred))

In [7]:
lr_pipeline(train_set_fp)

2475134.913365766
0.5308382826651521


In [8]:
lr_pipeline(train_set_gk)

1915079.6643769485
0.33845337473766324


### Regressors to be used:
1. Linear Regression
6. Stochastic Gradient Descent
7. Decision Trees
8. Random Forest
9. AdaBoost
10. Gradient Tree Boosting
11. XGBoost


In [33]:
def pipeline_several_models(train_set, rseed):
    
    
    ###################################
    # 2.1. train-test-split with remaining 80% of data
    ###################################

    from sklearn.model_selection import train_test_split

    X = train_set.drop(["market_value_in_euro"],axis=1)

    y = train_set["market_value_in_euro"]

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=rseed)

    ###################################
    # 2.2. Build preprocessing pipeline
    ###################################

    from sklearn.pipeline import Pipeline
    from sklearn.preprocessing import StandardScaler, OneHotEncoder

    numeric_transformer = Pipeline(steps=[
        ('scaler', StandardScaler())])

    categorical_transformer = Pipeline(steps=[
        ('onehot', OneHotEncoder(handle_unknown='ignore'))])

    ###################################
    # 2.3. Apply column transformer
    ###################################

    from sklearn.compose import ColumnTransformer

    numeric_features = train_set.select_dtypes(include=['int64', 'float64']).drop(["market_value_in_euro"],axis=1).columns
    categorical_features = train_set.select_dtypes(include=['object']).columns

    preprocessor = ColumnTransformer(
        transformers=[
            ('num', numeric_transformer, numeric_features),
            ('cat', categorical_transformer, categorical_features)])

    ###################################
    # 2.4. Fit algorithms
    ###################################

    from sklearn import linear_model
    from sklearn import svm
    from sklearn import tree
    from sklearn.ensemble import RandomForestRegressor
    from sklearn.ensemble import AdaBoostRegressor
    from sklearn.ensemble import GradientBoostingRegressor
    import xgboost as xgb
    

    regressors = [
    linear_model.LinearRegression(),
    linear_model.SGDRegressor(random_state=rseed),
    tree.DecisionTreeRegressor(random_state=rseed),
    RandomForestRegressor(random_state=rseed),
    AdaBoostRegressor(random_state=rseed),
    GradientBoostingRegressor(random_state=rseed),
    xgb.XGBRegressor(random_state=rseed)
    ]
    
    for regressor in regressors:
        pipe = Pipeline(steps=[('preprocessor', preprocessor),
                          ('regressor', regressor)])
        pipe.fit(X_train, y_train)
        
        y_pred = pipe.predict(X_test)
        
        ###################################
        # 2.5. Feature importances
        ##################################

        import eli5
    
        onehot_columns = list(pipe.named_steps['preprocessor'].named_transformers_['cat'].named_steps['onehot'].get_feature_names(input_features=categorical_features))
        numeric_features_list = list(numeric_features)
        numeric_features_list.extend(onehot_columns)
    
        weights = eli5.explain_weights(pipe.named_steps['regressor'], top=100, feature_names=numeric_features_list)

        ###################################
        # 2.6. Test models
        ###################################
        
        from sklearn.metrics import mean_squared_error
        from math import sqrt
        from sklearn.metrics import r2_score
        
        print(regressor)
        print("--------")
        print("R-squared: %.3f" % r2_score(y_test, y_pred))
        print("RMSE score: %.3f" % sqrt(mean_squared_error(y_test, y_pred)))
        print("-----------------------------------------------")
        print("-----------------------------------------------")
        print("-----------------------------------------------")

In [34]:
pipeline_several_models(train_set_fp, rseed)

ValueError: Shape of passed values is (79, 1), indices imply (19, 1)

In [11]:
pipeline_several_models(train_set_gk, rseed)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)
--------
R-squared: 0.338
RMSE score: 1915079.664
-----------------------------------------------
-----------------------------------------------
-----------------------------------------------
SGDRegressor(alpha=0.0001, average=False, early_stopping=False, epsilon=0.1,
             eta0=0.01, fit_intercept=True, l1_ratio=0.15,
             learning_rate='invscaling', loss='squared_loss', max_iter=1000,
             n_iter_no_change=5, penalty='l2', power_t=0.25, random_state=42,
             shuffle=True, tol=0.001, validation_fraction=0.1, verbose=0,
             warm_start=False)
--------
R-squared: 0.370
RMSE score: 1869407.323
-----------------------------------------------
-----------------------------------------------
-----------------------------------------------
DecisionTreeRegressor(ccp_alpha=0.0, criterion='mse', max_depth=None,
                      max_features=None, max_leaf_nodes=None,
    

  if getattr(data, 'base', None) is not None and \


XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, gamma=0,
             importance_type='gain', learning_rate=0.1, max_delta_step=0,
             max_depth=3, min_child_weight=1, missing=None, n_estimators=100,
             n_jobs=1, nthread=None, objective='reg:linear', random_state=42,
             reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
             silent=None, subsample=1, verbosity=1)
--------
R-squared: 0.615
RMSE score: 1461499.669
-----------------------------------------------
-----------------------------------------------
-----------------------------------------------


# Experimental

In [12]:
df_fp = pd.read_csv("data/fp_data_final.csv", index_col=0)
df_gk = pd.read_csv("data/gk_data_final.csv", index_col=0)
pd.set_option('display.max_columns', None)

df_fp = df_fp.drop(["player_name","long_name","year_of_birth","height_cm","weight_kg","nationality","club"],axis=1)

df_gk = df_gk.drop(["player_name","long_name","year_of_birth","height_cm","weight_kg","nationality","club"],axis=1)

df_fp.head()

Unnamed: 0,market_value_in_euro,player_age,geographical_continent,main_position,positional_flexibility,league,division,current_national_player,national_team_appearances,overall,preferred_foot,weak_foot,attacking_work_rate,defensive_work_rate,skill_moves,pace,shooting,passing,dribbling,defending,physic,attacking_crossing,attacking_finishing,attacking_heading_accuracy,attacking_short_passing,attacking_volleys,skill_dribbling,skill_curve,skill_fk_accuracy,skill_long_passing,skill_ball_control,movement_acceleration,movement_sprint_speed,movement_agility,movement_reactions,movement_balance,power_shot_power,power_jumping,power_stamina,power_strength,power_long_shots,mentality_aggression,mentality_interceptions,mentality_positioning,mentality_vision,mentality_penalties,mentality_composure,defending_marking,defending_standing_tackle,defending_sliding_tackle,attack_positions_avg_score,midfield_attack_positions_avg_score,idfield_defense_positions_avg_score,defense_positions_avg_score
0,12000000.0,34,Europe,Centre Midfield,Low,LaLiga (ESP),1,True,127,90,Right,4 Stars,High,High,4 Star Moves,74,76,89,89,72,66,86,72,55,92,76,87,85,78,88,92,77,71,92,89,93,79,68,85,58,82,62,82,79,91,82,92,68,76,71,81,85,84,76
1,4000000.0,35,Europe,Centre Back,Low,Serie A (ITA),1,False,103,89,Left,3 Stars,Medium,High,2 Star Moves,68,46,58,60,90,82,54,33,83,65,45,59,60,31,65,61,61,73,57,82,57,78,89,59,89,49,91,88,28,50,50,84,94,91,89,56,56,67,79
2,14500000.0,34,Europe,Centre Back,Low,LaLiga (ESP),1,True,170,89,Right,3 Stars,High,Medium,3 Star Moves,72,68,75,73,87,85,66,63,92,80,69,65,74,72,83,83,74,71,78,87,66,79,93,80,85,62,90,88,67,71,86,84,85,87,90,73,73,79,83
3,8000000.0,34,South America,Centre Back,Low,Serie A (ITA),1,True,135,88,Right,3 Stars,Medium,High,2 Star Moves,60,48,64,61,89,83,52,42,88,79,47,53,49,49,70,74,61,60,57,81,53,67,90,65,89,43,88,87,48,52,50,82,90,89,87,60,61,71,80
4,20000000.0,33,South America,Striker,Low,Ligue 1 (FRA),1,True,116,88,Right,4 Stars,High,High,3 Star Moves,75,86,72,79,55,83,70,88,89,78,90,79,77,76,52,81,74,76,74,91,59,88,88,91,79,79,84,54,93,77,85,80,57,48,39,83,79,71,66


In [14]:
df_fp = df_fp.drop(["overall"], axis = 1) 
df_fp = df_fp.drop(df_fp.loc[:, 'attacking_crossing':'defense_positions_avg_score'].columns, axis = 1) 
df_fp.head()

Unnamed: 0,market_value_in_euro,player_age,geographical_continent,main_position,positional_flexibility,league,division,current_national_player,national_team_appearances,preferred_foot,weak_foot,attacking_work_rate,defensive_work_rate,skill_moves,pace,shooting,passing,dribbling,defending,physic
0,12000000.0,34,Europe,Centre Midfield,Low,LaLiga (ESP),1,True,127,Right,4 Stars,High,High,4 Star Moves,74,76,89,89,72,66
1,4000000.0,35,Europe,Centre Back,Low,Serie A (ITA),1,False,103,Left,3 Stars,Medium,High,2 Star Moves,68,46,58,60,90,82
2,14500000.0,34,Europe,Centre Back,Low,LaLiga (ESP),1,True,170,Right,3 Stars,High,Medium,3 Star Moves,72,68,75,73,87,85
3,8000000.0,34,South America,Centre Back,Low,Serie A (ITA),1,True,135,Right,3 Stars,Medium,High,2 Star Moves,60,48,64,61,89,83
4,20000000.0,33,South America,Striker,Low,Ligue 1 (FRA),1,True,116,Right,4 Stars,High,High,3 Star Moves,75,86,72,79,55,83


In [15]:
# fieldplayers
train_set_fp = df_fp.sample(frac=0.80, random_state=rseed)
y_final_fp = df_fp.drop(train_set_fp.index)
y_final_fp = y_final_fp["market_value_in_euro"]

# goal keepers
train_set_gk = df_gk.sample(frac=0.80, random_state=rseed)
y_final_gk = df_gk.drop(train_set_gk.index)
y_final_gk = y_final_gk["market_value_in_euro"]

print("Fieldplayers: ")
print(train_set_fp.shape)
print(y_final_fp.shape)
print("-----------------------")
print("Goalkeepers: ")
print(train_set_gk.shape)
print(y_final_gk.shape)

Fieldplayers: 
(8598, 20)
(2149,)
-----------------------
Goalkeepers: 
(1110, 37)
(277,)


##### Feature importance

In [27]:
###################################
    # 2.1. train-test-split with remaining 80% of data
    ###################################

    from sklearn.model_selection import train_test_split

    X = train_set_fp.drop(["market_value_in_euro"],axis=1)

    y = train_set_fp["market_value_in_euro"]

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=rseed)

    ###################################
    # 2.2. Build preprocessing pipeline
    ###################################

    from sklearn.pipeline import Pipeline
    from sklearn.preprocessing import StandardScaler, OneHotEncoder

    numeric_transformer = Pipeline(steps=[
        ('scaler', StandardScaler())])

    categorical_transformer = Pipeline(steps=[
        ('onehot', OneHotEncoder(handle_unknown='ignore'))])

    ###################################
    # 2.3. Apply column transformer
    ###################################

    from sklearn.compose import ColumnTransformer

    numeric_features = train_set_fp.select_dtypes(include=['int64', 'float64']).drop(["market_value_in_euro"],axis=1).columns
    categorical_features = train_set_fp.select_dtypes(include=['object']).columns

    preprocessor = ColumnTransformer(
        transformers=[
            ('num', numeric_transformer, numeric_features),
            ('cat', categorical_transformer, categorical_features)])

    ###################################
    # 2.4. Fit algorithm - in this case try a simple linear regression
    ###################################

    from sklearn.ensemble import RandomForestRegressor
    import xgboost as xgb

    pipe = Pipeline(steps=[('preprocessor', preprocessor),
                          ('regressor', RandomForestRegressor(random_state=rseed))])

    pipe.fit(X_train, y_train)

    y_pred = pipe.predict(X_test)

    
    ###################################
    # 2.5. Feature importance
    ###################################
    
    import eli5
    
    onehot_columns = list(pipe.named_steps['preprocessor'].named_transformers_['cat'].named_steps['onehot'].get_feature_names(input_features=categorical_features))
    numeric_features_list = list(numeric_features)
    numeric_features_list.extend(onehot_columns)
    
    weights = eli5.show_weights(pipe.named_steps['regressor'], top=50, feature_names=numeric_features_list)
    
    ###################################
    # 2.6. Test model
    ###################################

    from sklearn.metrics import mean_squared_error
    from math import sqrt
    from sklearn.metrics import r2_score

    rms = sqrt(mean_squared_error(y_test, y_pred))
    print(rms)
    print(r2_score(y_test, y_pred))
    weights
    
    

2196757.0212540072
0.6304366789351225


Weight,Feature
0.2762  ± 0.0458,dribbling
0.1735  ± 0.0405,defending
0.1443  ± 0.0232,player_age
0.0645  ± 0.0415,league_Premier League (ENG)
0.0578  ± 0.0304,physic
0.0535  ± 0.0263,shooting
0.0520  ± 0.0278,passing
0.0380  ± 0.0200,pace
0.0246  ± 0.0150,national_team_appearances
0.0144  ± 0.0120,league_Serie A (ITA)
