In [None]:
# Data Science Challenge

# A football team is rebuilding its squad, and one of the planned actions is
# the acquisition of a goalkeeper. The team needs you to indicate the market value
# of this goalkeeper so that they can make a good deal. Using the provided dataset
# of football players, build at least three models that can help
# the team negotiate for the player and, finally, choose the best model.
# Assume the budget available for this purchase is up to 1 million euros.

import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()

# Read the FIFA players CSV into a DataFrame and preview its first ten rows
df = pd.read_csv(filepath_or_buffer="../input/fifa-dataset/data.csv")
df.head(n=10)

In [None]:
# First impressions
#df.head()
#df.info()

In [None]:
# Dataset dimensionality as a (rows, columns) tuple
df.shape

In [None]:
# Target definition: the club wants to buy a goalkeeper and needs his market
# price (the "Value" column), so the analysis is restricted to the rows whose
# Position is "GK".
df_gk = df.loc[df["Position"] == "GK"]

# Number of goalkeepers in the filtered frame
df_gk.shape[0]

In [None]:
# DataFrame information
#df_gk.info()
## As ".info()" shows, our data is not fully populated.
#  As expected, the columns that do not apply to goalkeepers are empty, that is,
# the columns at positions 28 to 53

In [None]:
# Remove the columns at positions 28..53, which the earlier .info() inspection
# reported as empty for goalkeepers. The labels are passed explicitly.
gk_dataset = df_gk.drop(columns=df_gk.columns[28:54])
# (gk_dataset.info() can be used to confirm the columns are gone)

In [None]:
# Keep the goalkeepers' names in a plain Python list
names = gk_dataset["Name"].tolist()

In [None]:
# Heights are stored as strings and need to be converted to numbers.
# Weights also contain strings; we need to fix that as well.
# Let's fix Heights by writing a function that converts them to meters. This method
# seems, in theory, the easiest for now.
def ConvertHeights(varh):
    """Convert a feet'inches height string (e.g. "6'2") to metres.

    Parameters
    ----------
    varh : str or any
        Height written as ``<feet>'<inches>``. A trailing ``"`` after the
        inches and surrounding whitespace are tolerated; a missing inches part
        counts as 0 inches.

    Returns
    -------
    float
        The height in metres for string input. Any non-string input (for
        example the NaN pandas uses for a missing cell) yields NaN, so that a
        later ``fillna`` can impute it. Returning a fixed placeholder such as
        1 would silently inject 1-metre players and hide the missing value.

    Raises
    ------
    ValueError
        If the feet or inches part of a string is not an integer.
    """
    if not isinstance(varh, str):
        return float("nan")

    feet_text, _, inches_text = varh.partition("'")
    inches_text = inches_text.replace('"', "").strip()

    feet = int(feet_text)
    inches = int(inches_text) if inches_text else 0

    # 12 inches per foot, 0.0254 metres per inch
    return (12*feet+inches)*0.0254

# Convert every height with ConvertHeights, then replace any NaN that remains
# with the column mean. The filled column is assigned back to the frame:
# calling fillna(inplace=True) on a column selection acts on an intermediate
# object and is not guaranteed to update gk_dataset.
gk_dataset["Height"] = gk_dataset["Height"].apply(ConvertHeights)
gk_dataset["Height"] = gk_dataset["Height"].fillna(gk_dataset["Height"].mean())

In [None]:
# Now, the weight
def WeightConvert(varw):
    """Strip the "lbs" unit from a weight string.

    For string input the text with every "lbs" occurrence removed is returned
    (still a string; the caller casts it to float). Any other input yields
    None.
    """
    return varw.replace("lbs", "") if isinstance(varw, str) else None

# Strip the unit from every weight and cast to float (non-string cells become NaN),
# then impute the missing weights with the column mean. The result is assigned
# back because fillna(inplace=True) on a column selection is not guaranteed to
# update gk_dataset.
gk_dataset["Weight"] = gk_dataset["Weight"].apply(WeightConvert).astype("float")
gk_dataset["Weight"] = gk_dataset["Weight"].fillna(gk_dataset["Weight"].mean())

In [None]:
## Now, let's convert Value, Wage and Release Clause
# Define a function that strips the euro sign ("€") from string values and converts them to numbers

def V_W_convertion(value):
    """Convert a money string such as "€1.5M" or "€500K" to a float in euros.

    Parameters
    ----------
    value : str or any
        Amount with an optional leading "€" and an optional "M" (millions) or
        "K" (thousands) suffix.

    Returns
    -------
    float or None
        The amount in euro units for string input; None for any non-string
        input (for example a NaN cell), which pandas treats as missing.

    Raises
    ------
    ValueError
        If the numeric part of the string cannot be parsed as a float.
    """
    if not isinstance(value, str):
        return None

    amount = value.replace("€", "")
    if "M" in amount:
        # "M" means millions: the factor is 1,000,000 (not 100,000)
        return float(amount.replace("M", "")) * 1000000
    if "K" in amount:
        return float(amount.replace("K", "")) * 1000
    return float(amount)

# Apply the converter to Value, Wage and Release Clause. The function is
# passed directly; wrapping it in a lambda adds nothing.
gk_dataset["Value"] = gk_dataset["Value"].apply(V_W_convertion)
gk_dataset["Wage"] = gk_dataset["Wage"].apply(V_W_convertion)
gk_dataset["Release Clause"] = gk_dataset["Release Clause"].apply(V_W_convertion)

# Release Clause contains missing values: fill them with the column mean.
# The filled column is assigned back because fillna(inplace=True) on a column
# selection is not guaranteed to update gk_dataset.
gk_dataset["Release Clause"] = gk_dataset["Release Clause"].fillna(gk_dataset["Release Clause"].mean())

# Show the converted market values
gk_dataset["Value"]

In [None]:
# Remove columns that are not used in the rest of the analysis
gk_dataset.drop(
    columns=[
        "ID",
        "Photo",
        "Flag",
        "Club Logo",
        "Real Face",
        "Jersey Number",
        "Loaned From",
    ],
    inplace=True,
)

In [None]:
#
# Second pass: remove the remaining descriptive and contract-related columns
gk_dataset.drop(
    columns=[
        "Name",
        "Nationality",
        "Club",
        "Wage",
        "Preferred Foot",
        "Position",
        "Joined",
        "Contract Valid Until",
        "Release Clause",
    ],
    inplace=True,
)

In [None]:
# Now we have the columns we want; print the column names, dtypes and non-null counts
gk_dataset.info()

In [None]:
# Map the categorical "Body Type" column to integer codes
from sklearn.preprocessing import LabelEncoder

# A single encoder instance; later cells call fit_transform on it again for other columns
label_encoder = LabelEncoder()
gk_dataset["Body Type"] = label_encoder.fit_transform(gk_dataset["Body Type"])

In [None]:
# "work_rate" presents two strings into one cell. We need to spit them
# Split each "Work Rate" cell on "/" into a two-column frame
# (n=1 limits the split to the first "/"; expand=True returns a DataFrame)
work_rate_aux = gk_dataset["Work Rate"].str.split("/", n=1, expand=True)
# Display the split result
work_rate_aux

In [None]:
# Store the two halves of "Work Rate" as separate columns
gk_dataset["Work Rate 1"] = work_rate_aux[0]
gk_dataset["Work Rate 2"] = work_rate_aux[1]

# Map each half to integer codes (fit_transform re-fits the shared encoder on every call)
gk_dataset["Work Rate 1"] = label_encoder.fit_transform(gk_dataset["Work Rate 1"])
gk_dataset["Work Rate 2"] = label_encoder.fit_transform(gk_dataset["Work Rate 2"])

# The combined column is no longer needed
gk_dataset.drop(["Work Rate"], axis=1, inplace=True)

# Display the "Skill Moves" column
gk_dataset["Skill Moves"]

In [None]:
# Pairwise correlation matrix of the columns of gk_dataset
corr_mat = gk_dataset.corr()
# Display the matrix
corr_mat

In [None]:

# Correlation with Value, ranked from the most positive downwards (first 50 rows)
corr_mat[["Value"]].sort_values(by="Value", ascending=False).head(50)

In [None]:
# Correlation with Value, ranked from the most negative upwards (first 7 rows)
corr_mat[["Value"]].sort_values(by="Value", ascending=True).head(7)

In [None]:
# Candidate feature matrix: every remaining column except the target ("Value")
inputs = gk_dataset.loc[:, [
    'Age', 'Overall', 'Potential', 'Special',
    'International Reputation', 'Weak Foot', 'Skill Moves', 'Body Type',
    'Height', 'Weight',
    'Crossing', 'Finishing', 'HeadingAccuracy', 'ShortPassing', 'Volleys',
    'Dribbling', 'Curve', 'FKAccuracy', 'LongPassing', 'BallControl',
    'Acceleration', 'SprintSpeed', 'Agility', 'Reactions', 'Balance',
    'ShotPower', 'Jumping', 'Stamina', 'Strength', 'LongShots',
    'Aggression', 'Interceptions', 'Positioning', 'Vision', 'Penalties',
    'Composure', 'Marking', 'StandingTackle', 'SlidingTackle',
    'GKDiving', 'GKHandling', 'GKKicking', 'GKPositioning', 'GKReflexes',
    'Work Rate 1', 'Work Rate 2',
]]

In [None]:
# Both features and a target are available, so this is supervised learning.
# The next decision is regression versus classification:
# regression predicts continuous values, classification predicts discrete ones.
# With that in mind, inspect the target. log1p is applied because the raw
# values are large and skewed.
target = np.log1p(gk_dataset["Value"]) # high values - skewness
# Bare expression: not displayed, because it is not the cell's last statement
target
# Histogram of the log-transformed target (no density curve)
# NOTE(review): distplot is deprecated in recent seaborn releases — confirm the installed version still provides it
sns.distplot(target,kde=False)

In [None]:
## The histogram shows the bulk of the log-values between 7 and 17,
# so only rows whose log-value is above 7 are kept; the rest are treated as outliers.
target = target[target > 7]
target_index = target.index

# Keep the feature rows that match the retained targets
inputs = inputs.loc[target_index]
print(inputs.shape, target.shape)

In [None]:
# We have 46 features, let's reduce them 
from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import Lasso
from sklearn.preprocessing import LabelEncoder,MinMaxScaler
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt

# Rescale the features with MinMaxScaler before the Lasso-based feature selection
inputs_scaled = MinMaxScaler().fit_transform(inputs)
# SelectFromModel keeps features according to the fitted Lasso (no explicit threshold is passed)
f_selection = SelectFromModel(Lasso(alpha=0.0001, random_state=41))
# NOTE(review): the selector is fitted on all rows, before the train/test split that follows — confirm this is intended
f_selection.fit(inputs_scaled, target)

In [None]:



f_selection.get_support() # boolean mask over the input columns (True = feature selected), not integer indices

In [None]:
# Use the selector's support mask to pick the names of the selected columns
selected_features = inputs.columns[(f_selection.get_support())]
# Display the selected column names
selected_features

In [None]:
# Compare the number of candidate features with the number that survived selection
n_total = inputs.shape[1]
n_selected = len(selected_features)
print(f"Total Features: {n_total}")
print(f"Selected Features: {n_selected}")

In [None]:
## Build the modelling data
# First, split the selected features and the target into train and test sets
from sklearn.model_selection import train_test_split
# Restrict the feature matrix to the columns chosen by the selector
x_model = inputs[selected_features]
x_model.shape # sanity check; not displayed because it is not the cell's last statement
# Hold out 30% of the rows for testing; random_state fixes the shuffle
xtrain,xtest,ytrain,ytest = train_test_split(x_model,target,test_size=0.3,random_state=41)
print(xtrain.shape,xtest.shape,ytrain.shape,ytest.shape)

In [None]:
## Let's create a function that retuns some informations
from sklearn.metrics import mean_squared_error, explained_variance_score, r2_score

def plot_analisys(y,ypred,figsize=(10,4), title=""):
    """Plot predicted-vs-observed values next to a histogram of the residuals.

    Parameters
    ----------
    y : array-like
        Observed target values.
    ypred : array-like
        Predicted values, element-wise aligned with ``y``.
    figsize : tuple, optional
        Figure size forwarded to ``plt.subplots``.
    title : str, optional
        Figure-level title; no title is set when it is empty.

    Returns
    -------
    None
        The figure is drawn on the current matplotlib backend and not returned.

    Notes
    -----
    The labels containing LaTeX use raw strings: sequences such as ``\\h`` or
    ``\\m`` are invalid escapes in ordinary string literals and trigger
    warnings on newer Python versions. The rendered text is unchanged.
    """
    fig, (ax_scatter, ax_hist) = plt.subplots(1, 2, figsize=figsize)

    # Left panel: predictions against observations
    ax_scatter.scatter(y, ypred)

    # Identity line spanning the joint range of both series
    lower = min(np.min(y), np.min(ypred))
    upper = max(np.max(y), np.max(ypred))
    ax_scatter.plot([lower, upper], [lower, upper], c="red")

    ax_scatter.set_xlabel("$y$")
    ax_scatter.set_ylabel(r"$\hat{y}$")

    # Summary statistics shown in the left panel's title
    rmse = np.sqrt(mean_squared_error(y, ypred))
    evs = explained_variance_score(y, ypred)
    r2 = r2_score(y, ypred)
    ax_scatter.set_title("rmse = {:.2f}, evs = {:.2f}, r2 = {:,.2f}".format(rmse, evs, r2))

    # Right panel: residual distribution (computed once and reused)
    residuals = y - ypred
    ax_hist.hist(residuals, bins=50)
    avg = np.mean(residuals)
    std = np.std(residuals)
    ax_hist.set_xlabel(r"$y - \hat{y}$")
    ax_hist.set_title(r"Histogram predictor error, $\mu$ = {:.2f}, $\sigma$= {:.2f}".format(avg, std))

    if title != "":
        fig.suptitle(title)

In [None]:
# Creating a pipeline
from sklearn.pipeline import Pipeline
from sklearn.linear_model import Lasso,LinearRegression, Ridge,LassoCV, RidgeCV
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn import linear_model

# Standardise the features, expand them to degree-2 polynomial terms,
# then fit a Lasso regression on the expanded features
model = Pipeline([
    ("standard_Scaler", StandardScaler()),
    ("poly", PolynomialFeatures(degree=2)),
    ("lin_reg", Lasso(alpha=0.01)),
])
model.fit(xtrain, ytrain)

# Fit quality on the rows the model was trained on
ytrain_pred = model.predict(xtrain)
plot_analisys(ytrain, ytrain_pred, title="Polynomial model - Training set")

# Fit quality on the held-out rows
ytest_pred = model.predict(xtest)
plot_analisys(ytest, ytest_pred, title="Polynomial model - Test set")

In [None]:
# Set up the three linear models compared in this cell:
# plain linear regression, ridge and lasso

lm = LinearRegression()

# Standardised copies of the train and test features; the scaler is fitted on
# the training rows only and then applied to the test rows
scaler = StandardScaler()

X_train_scaled = scaler.fit_transform(xtrain.values)
X_test_scaled = scaler.transform(xtest.values)

# NOTE(review): the `normalize` argument was removed from Ridge/Lasso in newer
# scikit-learn releases — confirm the installed version still accepts it
lm_reg = Ridge(alpha=0.05, normalize=True)

lm_lasso = Lasso(alpha=0.05, normalize=True)



# Linear regression is fitted on the unscaled training features.
# The "all data" score includes the training rows.
lm.fit(xtrain, ytrain)
print(f'Linear Regression for all data R^2: {lm.score(x_model, target):.3f}')
print(f'Linear Regression for test data R^2: {lm.score(xtest, ytest):.3f}')
print("")

# Ridge is fitted and scored on the standardised features
lm_reg.fit(X_train_scaled, ytrain)
print(f'Ridge Regression for test data R^2: {lm_reg.score(X_test_scaled, ytest):.3f}')

print("")


# Lasso is fitted and scored on the unscaled features (unlike ridge above)
# NOTE(review): confirm the scaled/unscaled difference between ridge and lasso is intended
lm_lasso.fit(xtrain,ytrain)
print(f'Lasso Regression for test data R^2: {lm_lasso.score(xtest, ytest):.6f}')

print("")

In [None]:
import statsmodels.api as sm
import statsmodels.formula.api as smf
# OLS on the raw (not log-transformed) market value, using a hand-picked subset of columns
X1 = gk_dataset.loc[:, ['Overall', 'Potential', 'International Reputation', 'Reactions',
                        'GKDiving', 'GKHandling', 'GKKicking', 'GKPositioning', 'GKReflexes']]
y1 = gk_dataset.loc[:, "Value"]

# sm.OLS takes the response and design matrix directly; `data=` belongs to the
# formula interface and is not an argument of OLS, so it is no longer passed.
# NOTE(review): no constant column is added to X1, so the model is fitted
# without an intercept — confirm this is intended.
player_model1 = sm.OLS(y1, X1)

results1 = player_model1.fit()

print(results1.summary())