In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.preprocessing import scale
from sklearn.decomposition import PCA
from sklearn import cross_validation
from sklearn.linear_model import LinearRegression

import seaborn as sns
sns.set_style('darkgrid')

%matplotlib inline



In [2]:
def make_dataset(path='Players_Combined_v3.csv'):
    """Return a pandas DataFrame with the cleaned player data set.

    Parameters
    ----------
    path : str, optional
        Location of the combined players CSV. Defaults to
        'Players_Combined_v3.csv' in the current working directory.

    Returns
    -------
    pandas.DataFrame
        Rows sorted by ``season`` with lower-cased column names, restricted to
        rows where both ``market_val`` and ``market_val_prev`` are positive.
        Added columns: ``games``, ``*_per_90`` rates, ``log_market_val``,
        ``log_market_val_prev``, ``market_val_change`` and
        ``high_team_net_worth``.

    Notes
    -----
    Every rate is divided by ``mins``; rows with ``mins == 0`` are not
    filtered here and therefore produce inf/NaN in the rate columns.
    """
    data = pd.read_csv(path)
    # Sorting happens before the header is lower-cased, so the file must
    # already contain a column spelled exactly 'season'.
    data = data.sort_values('season')

    # Make all columns lowercase
    data.columns = [i.lower() for i in data.columns]

    # Normalize season totals to a per-90-minute basis
    data['games'] = data['app_start'] + data['app_sub']
    for stat in ('goals', 'assists', 'yel', 'red', 'owng'):
        data[stat + '_per_90'] = data[stat] / data['mins'] * 90

    per_games_cols = ['shots_per_game', 'aerialswon', 'tackles', 'inter', 'fouls', 'offside_won', 'clear',
                      'drb_past', 'blocks', 'keyp', 'drb', 'fouled', 'offside', 'dispossessed', 'bad_control',
                      'passes_per_game', 'crosses', 'long_ball', 'through_ball']

    # Per-game averages -> season total (x games) -> per-90 rate.
    # Columns are overwritten in place, so e.g. 'shots_per_game' holds a
    # per-90 value afterwards even though only 'passes_per_game' is renamed.
    for col in per_games_cols:
        data[col] = data[col] * data['games'] / data['mins'] * 90

    data = data.rename(columns={'passes_per_game': 'passes'})

    # Drop zero values. The explicit copy makes the filtered frame independent
    # of the original so the column assignments below do not raise
    # SettingWithCopyWarning.
    data = data[(data['market_val'] > 0) & (data['market_val_prev'] > 0)].copy()
    data['log_market_val'] = np.log(data['market_val'] + 1.)
    data['log_market_val_prev'] = np.log(data['market_val_prev'] + 1.)
    data['market_val_change'] = data['market_val'] - data['market_val_prev']
    data['high_team_net_worth'] = data['team_name'].isin(['real madrid', 'espanyol barcelona', 'fc barcelona', 'fc valencia', 'atletico madrid'])
    return data

In [3]:
# Load and preprocess the player data via make_dataset() defined above
df = make_dataset()

In [14]:
# Drop most categorical attributes, the raw (non-per-90) counts, and the
# log / derived market-value columns
df1 = df.drop(
    labels=[
        # categorical / descriptive fields
        'name', 'nationality', 'member_since', 'team_before',
        'contract_expir', 'team_name', 'birthday',
        'team', 'position', 'is_match',
        # raw counts that have per-90 counterparts
        'goals', 'assists', 'yel', 'red', 'owng',
        # log and derived market-value columns
        'log_market_val', 'log_market_val_prev',
        'market_val_change', 'high_team_net_worth',
    ],
    axis=1,
)

In [15]:
# PCA needs numeric input, so pull out whatever object/bool columns are left
# and inspect them
cat_df = (
    df1
    .select_dtypes(include=['object', 'bool'])
    .copy()
)
cat_df.head(5)

Unnamed: 0,preferred_foot,pos_type
2039,right,Forward
1867,both,Midfielder
1866,left,Midfielder
1865,right,Defender
1864,right,Defender


In [16]:
# Show every row that has a missing value in any categorical column
cat_df.loc[cat_df.isnull().any(axis=1)]

Unnamed: 0,preferred_foot,pos_type
2008,,Defender
1740,,Defender
1089,,Defender
913,,Midfielder
625,,Midfielder
817,,Defender
314,,Midfielder
521,,Midfielder
481,,Midfielder
236,,Midfielder


In [17]:
# Frequency of each preferred_foot value; the most common one is used to
# impute the missing entries
cat_df["preferred_foot"].value_counts(dropna=True)

right    1248
left      444
both      128
Name: preferred_foot, dtype: int64

In [18]:
# Replace missing preferred_foot entries with "right"
cat_df = cat_df.assign(preferred_foot=cat_df["preferred_foot"].fillna("right"))

In [19]:
# One-hot encode both categorical columns; the dict maps each source column
# to the prefix used for its indicator columns
encod_df = pd.get_dummies(
    cat_df,
    columns=["preferred_foot", "pos_type"],
    prefix={"preferred_foot": "foot", "pos_type": "pos"},
)

In [20]:
# Remove the original categorical columns; their one-hot encoded versions
# live in encod_df
df_final = df1.drop(
    labels=["preferred_foot", "pos_type"],
    axis=1,
)

In [21]:
# Attach the indicator columns to the numeric frame, aligning on the row index
result = df_final.join(encod_df, how="left")

In [22]:
# Column labels of the joined frame as a plain Python list
colnames = list(result.columns)

In [23]:
# Column order for the output frame, with the response variable first
colnew = (
    # response variable
    ['market_val']
    # player / season descriptors
    + ['height', 'season', 'age_current', 'rank_in_season', 'mins']
    # match statistics
    + ['shots_per_game', 'pass_success', 'aerialswon', 'motm', 'rating',
       'tackles', 'inter', 'fouls', 'offside_won', 'clear', 'drb_past',
       'blocks', 'keyp', 'drb', 'fouled', 'offside', 'dispossessed',
       'bad_control', 'passes', 'crosses', 'long_ball', 'through_ball']
    # appearances and previous valuation
    + ['app_start', 'app_sub', 'market_val_prev', 'games']
    # per-90 rates
    + ['goals_per_90', 'assists_per_90', 'yel_per_90', 'red_per_90',
       'owng_per_90']
    # one-hot indicator columns
    + ['foot_both', 'foot_left', 'foot_right']
    + ['pos_Defender', 'pos_Forward', 'pos_Midfielder']
)

In [24]:
# Reorder the columns to match colnew. Note that reindex does not raise for a
# name missing from `result`; it adds that column filled with NaN instead.
dataf = result.reindex(columns=colnew)

In [25]:
# List the rows of the reordered frame that still contain a missing value
dataf.loc[dataf.isnull().any(axis=1)]

Unnamed: 0,market_val,height,season,age_current,rank_in_season,mins,shots_per_game,pass_success,aerialswon,motm,...,assists_per_90,yel_per_90,red_per_90,owng_per_90,foot_both,foot_left,foot_right,pos_Defender,pos_Forward,pos_Midfielder
2008,7200000,,2009,39,281,2369,0.4103,75.4,1.128324,0,...,0.075981,0.227944,0.037991,0.0,0,0,1,1,0,0
1740,5400000,,2010,39,293,2880,0.5,80.1,1.0,0,...,0.0625,0.3125,0.03125,0.0,0,0,1,1,0,0


In [27]:
# Fill missing heights with the column mean (mean() skips NaN by default)
mean_height = dataf["height"].mean()
dataf = dataf.assign(height=dataf["height"].fillna(mean_height))

In [29]:
# Write the prepared frame to pca_data.csv in the working directory. With the
# default index=True the row index is written as the first, unnamed column.
dataf.to_csv("pca_data.csv")

In [None]:
########################################