# 특성변수 파악하기

In [1]:
import pandas as pd

### 1) 데이터 전처리
- 포지션 나누기
- 리그 라벨링

#### 포지션 나누기

In [2]:
total_player_df = pd.read_csv("./merged_player.csv", encoding='utf-8')
total_player_df = total_player_df.drop(['Unnamed: 0', 'Unnamed: 0.1'], axis=1)
total_player_df.columns

Index(['age', 'asists', 'avgp', 'aw', 'blocks', 'clear', 'crosses', 'disp',
       'drb', 'flag', 'fouled', 'fouls', 'full_time', 'goals', 'half_time',
       'inter', 'keyp', 'league', 'longb', 'mins', 'motm', 'name', 'off',
       'offsides', 'owng', 'player_number', 'position', 'ps_x', 'ps_y',
       'rating', 'red', 'spg', 'tackles', 'tall', 'team_name', 'thrb',
       'unstch', 'weight', 'yel'],
      dtype='object')

In [3]:
# 포지션 확인
total_player_df['position'].unique(), len(total_player_df['position'].unique())

(array([' FW', ' AM(LR),FW', ' D(R),M(R)', ' AM(LR)', ' D(C)', ' M(C),FW',
        ' D(LR)', ' M(CR)', ' D(L)', ' D(C),M(C)', ' GK', ' M(CLR)',
        ' D(C),DMC', ' D(R)', ' Defender', ' D(C),M(CR)', ' M(C)',
        ' AM(CL)', ' AM(C)', ' D(L),M(L)', ' Midfielder', ' AM(CLR)',
        ' D(R),M(LR)', ' M(LR)', ' M(LR),FW', ' D(R),M(C)', ' M(CR),FW',
        ' AM(CLR),FW', ' AM(CR),FW', ' Forward', ' M(CLR),FW', ' DMC',
        ' M(CL)', ' D(CLR)', ' D(L),M(CL)', ' AM(L),FW', ' M(R)',
        ' Goalkeeper', ' AM(R),FW', ' D(LR),M(CR)', ' AM(L)', ' AM(C),FW',
        ' AM(CL),FW', ' AM(R)', ' D(L),DMC', ' D(CR)', ' D(CR),DMC',
        ' D(LR),M(C)', ' D(R),M(CR)', ' D(CL),M(L)', ' D(R),DMC',
        ' DMC,M(L)', ' D(CL)', ' AM(CR)', ' D(LR),M(CLR)', ' D(LR),M(R)',
        ' D(R),M(CLR)', ' M(L)', ' D(LR),M(LR)', ' D(CL),DMC', ' D(LR),DMC',
        ' D(CLR),M(R)', ' D(CL),M(C)', ' D(LR),M(L)', ' M(R),FW',
        ' D(L),M(LR)', ' D(L),DMC,M(L)', ' D(R),DMC,M(R)', ' D(CR),M(R)',
        

In [4]:
total_player_df["position"] = total_player_df["position"].apply(lambda x: x.strip())

In [5]:
forward_pos = []
mid_pos = []
defense_pos = []

for position in total_player_df["position"].unique():
    
    
    if position.find("F") == 0:
        forward_pos.append(position)
        
    if position.find("A") == 0:
        forward_pos.append(position)
        
    if position.find("M") == 0:
        mid_pos.append(position)
        
    if position.find("D") == 0:
        defense_pos.append(position)

In [6]:
forward_df = pd.DataFrame(columns=total_player_df.columns)

for position in forward_pos:
    f_df = total_player_df[total_player_df["position"] == position]
    forward_df = pd.concat([forward_df, f_df])
    
mid_df = pd.DataFrame(columns=total_player_df.columns)

for position in mid_pos:
    m_df = total_player_df[total_player_df["position"] == position]
    mid_df = pd.concat([mid_df, m_df])
    
defense_df = pd.DataFrame(columns=total_player_df.columns)

for position in defense_pos:
    d_df = total_player_df[total_player_df["position"] == position]
    defense_df = pd.concat([defense_df, d_df])

#### 리그 라벨링

In [7]:
def labeling_league(x):
    if x == "Bundesliga":
        x = 0
        
    if x == "Ligue1":
        x = 1
        
    if x == "SerieA":
        x = 2
        
    if x == "PremierLeague":
        x = 3
        
    if x == "LaLiga":
        x = 4
        
    if x == "Eredivisie":
        x = 5
        
    return x

In [8]:
forward_df['labeled_league'] = forward_df['league'].apply(lambda x: labeling_league(x))
mid_df['labeled_league'] = mid_df['league'].apply(lambda x: labeling_league(x))
defense_df['labeled_league'] = defense_df['league'].apply(lambda x: labeling_league(x))

In [9]:
forward_df.to_csv('./data/forward.csv')
mid_df.to_csv('./data/mid.csv')
defense_df.to_csv('./data/defense.csv')

### 2) 리그별 특성변수 분석

#### 포워드

In [10]:
forward_league_mean_df = forward_df.groupby(['league']).mean()
forward_league_mean_df

Unnamed: 0_level_0,age,asists,avgp,aw,blocks,clear,crosses,disp,drb,fouled,...,rating,red,spg,tackles,tall,thrb,unstch,weight,yel,labeled_league
league,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Bundesliga,25.430769,1.715385,0.748462,1.400769,0.045385,0.320769,68.439231,0.910769,0.475385,0.980769,...,6.606538,0.061538,1.28,0.693846,182.584615,0.545385,1.124615,76.946154,1.838462,0
Eredivisie,23.497175,1.949153,0.850282,0.90565,0.051977,0.418644,72.550282,1.074576,0.459322,0.994915,...,6.607571,0.062147,1.348588,0.70226,174.248588,0.722599,1.40678,71.322034,1.644068,5
LaLiga,25.48951,2.363636,0.777622,0.811888,0.051748,0.321678,74.022378,1.143357,0.418881,1.038462,...,6.60958,0.076923,1.311189,0.757343,177.076923,0.541259,1.279021,72.594406,2.804196,4
Ligue1,24.373418,1.689873,0.698101,0.959494,0.043038,0.293671,73.575949,1.267722,0.512025,0.922152,...,6.583671,0.164557,1.276582,0.729114,172.886076,0.528481,1.446835,70.658228,1.892405,1
PremierLeague,25.57047,1.637584,0.681208,0.963087,0.048993,0.367114,74.632215,1.095302,0.467114,0.766443,...,6.55094,0.080537,1.2,0.672483,178.47651,0.444295,1.265772,72.912752,1.288591,3
SerieA,26.171779,1.944785,0.788344,0.811043,0.063804,0.368712,73.807692,1.069939,0.552761,1.123926,...,6.585337,0.171779,1.423926,0.738037,179.190184,0.623718,1.331288,73.711656,2.619632,2


In [11]:
forward_league_mean_df[forward_league_mean_df['age'].argsort() == 0].index[0]

'LaLiga'

In [12]:
forward_league_mean_df[forward_league_mean_df['age'].argsort() == 5].index[0]

'SerieA'

In [13]:
bundes_high_list = []; bundes_low_list = []
ered_high_list = []; ered_low_list = []
laliga_high_list = []; laliga_low_list = []
ligue1_high_list = []; ligue1_low_list = []
premier_high_list = []; premier_low_list = []
serie_high_list = []; serie_low_list = []

for column in forward_league_mean_df.columns:
    # high value sort
    high_league = forward_league_mean_df[forward_league_mean_df[column].argsort() == 0].index[0]
    
    if high_league == 'Bundesliga':
        bundes_high_list.append(column)
        
    if high_league == 'Eredivisie':
        ered_high_list.append(column)
        
    if high_league == 'LaLiga':
        laliga_high_list.append(column)
        
    if high_league == 'Ligue1':
        ligue1_high_list.append(column)
        
    if high_league == 'PremierLeague':
        premier_high_list.append(column)
        
    if high_league == 'SerieA':
        serie_high_list.append(column)
    
    # low value sort
    low_league = forward_league_mean_df[forward_league_mean_df[column].argsort() == 5].index[0]
    
    if low_league == 'Bundesliga':
        bundes_low_list.append(column)
        
    if low_league == 'Eredivisie':
        ered_low_list.append(column)
        
    if low_league == 'LaLiga':
        laliga_low_list.append(column)
        
    if low_league == 'Ligue1':
        ligue1_low_list.append(column)
        
    if low_league == 'PremierLeague':
        premier_low_list.append(column)
        
    if low_league == 'SerieA':
        serie_low_list.append(column)

In [14]:
bundes_high_list, bundes_low_list

(['crosses', 'disp', 'owng', 'ps_x', 'red', 'unstch', 'labeled_league'],
 ['aw'])

In [15]:
ered_high_list, ered_low_list

(['blocks',
  'clear',
  'goals',
  'half_time',
  'longb',
  'motm',
  'offsides',
  'ps_y',
  'tackles'],
 ['disp', 'owng', 'player_number'])

In [16]:
laliga_high_list, laliga_low_list

(['age',
  'asists',
  'avgp',
  'fouled',
  'full_time',
  'keyp',
  'mins',
  'player_number',
  'spg',
  'yel'],
 ['inter', 'longb', 'rating', 'labeled_league'])

In [17]:
ligue1_high_list, ligue1_low_list

(['drb', 'rating', 'thrb'],
 ['asists', 'crosses', 'half_time', 'mins', 'ps_x', 'unstch'])

In [18]:
premier_high_list, premier_low_list

(['fouls', 'off'],
 ['avgp',
  'clear',
  'full_time',
  'goals',
  'keyp',
  'motm',
  'ps_y',
  'tackles',
  'tall',
  'thrb',
  'weight',
  'yel'])

In [19]:
serie_high_list, serie_low_list

(['aw', 'inter', 'tall', 'weight'],
 ['age', 'blocks', 'drb', 'fouled', 'fouls', 'off', 'offsides', 'red', 'spg'])

#### 미드필더

In [20]:
mid_league_mean_df = mid_df.groupby(['league']).mean()
mid_league_mean_df

Unnamed: 0_level_0,age,asists,avgp,aw,blocks,clear,crosses,disp,drb,fouled,...,rating,red,spg,tackles,tall,thrb,unstch,weight,yel,labeled_league
league,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Bundesliga,24.84,1.672,0.6992,1.0336,0.092,0.5136,74.2608,0.6944,0.7,0.9024,...,6.66232,0.072,0.96,1.1896,179.592,1.232,0.8472,73.544,2.272,0
Eredivisie,22.597015,1.223881,0.622388,0.526866,0.123881,0.626866,74.646269,0.734328,0.50597,0.698507,...,6.480597,0.059701,0.808955,0.964179,168.701493,1.122388,0.822388,66.970149,1.343284,5
LaLiga,26.466667,1.637037,0.702222,0.863704,0.119259,0.611852,78.533333,0.917778,0.734074,0.993333,...,6.625407,0.214815,0.78,1.425926,174.17037,1.594815,0.808148,70.77037,4.074074,4
Ligue1,25.166667,1.485507,0.67971,0.705797,0.07029,0.444203,79.031884,0.963043,0.731159,0.760145,...,6.599348,0.152174,0.723913,1.221014,172.942029,1.141304,0.957971,69.427536,2.137681,1
PremierLeague,26.440299,1.873134,0.854478,0.544776,0.123881,0.671642,80.655224,0.95,0.863433,0.713433,...,6.655224,0.097015,0.876866,1.364179,172.425373,1.493284,0.945522,69.029851,2.201493,3
SerieA,27.015038,1.18797,0.772932,0.642857,0.167669,0.627068,80.095935,0.811278,0.783459,1.040602,...,6.630977,0.18797,0.878947,1.337594,178.030075,1.695122,0.805263,72.323308,3.18797,2


In [21]:
bundes_high_list = []; bundes_low_list = []
ered_high_list = []; ered_low_list = []
laliga_high_list = []; laliga_low_list = []
ligue1_high_list = []; ligue1_low_list = []
premier_high_list = []; premier_low_list = []
serie_high_list = []; serie_low_list = []

for column in mid_league_mean_df.columns:
    # high value sort
    high_league = mid_league_mean_df[mid_league_mean_df[column].argsort() == 0].index[0]
    
    if high_league == 'Bundesliga':
        bundes_high_list.append(column)
        
    if high_league == 'Eredivisie':
        ered_high_list.append(column)
        
    if high_league == 'LaLiga':
        laliga_high_list.append(column)
        
    if high_league == 'Ligue1':
        ligue1_high_list.append(column)
        
    if high_league == 'PremierLeague':
        premier_high_list.append(column)
        
    if high_league == 'SerieA':
        serie_high_list.append(column)
    
    # low value sort
    low_league = mid_league_mean_df[mid_league_mean_df[column].argsort() == 5].index[0]
    
    if low_league == 'Bundesliga':
        bundes_low_list.append(column)
        
    if low_league == 'Eredivisie':
        ered_low_list.append(column)
        
    if low_league == 'LaLiga':
        laliga_low_list.append(column)
        
    if low_league == 'Ligue1':
        ligue1_low_list.append(column)
        
    if low_league == 'PremierLeague':
        premier_low_list.append(column)
        
    if low_league == 'SerieA':
        serie_low_list.append(column)

In [22]:
bundes_high_list, bundes_low_list

(['crosses', 'disp', 'longb', 'ps_x', 'labeled_league'],
 ['asists', 'goals', 'unstch'])

In [23]:
ered_high_list, ered_low_list

(['age', 'blocks', 'clear', 'drb', 'half_time', 'ps_y', 'red', 'tackles'],
 ['motm', 'off', 'player_number'])

In [24]:
laliga_high_list, laliga_low_list

(['avgp', 'full_time', 'keyp', 'mins', 'thrb'],
 ['aw', 'disp', 'offsides', 'labeled_league'])

In [25]:
ligue1_high_list, ligue1_low_list

(['fouled', 'fouls', 'player_number', 'unstch', 'yel'],
 ['full_time', 'half_time', 'inter', 'mins', 'owng', 'rating', 'tackles'])

In [26]:
premier_high_list, premier_low_list

(['asists', 'motm', 'offsides'],
 ['avgp',
  'clear',
  'crosses',
  'drb',
  'keyp',
  'longb',
  'ps_x',
  'red',
  'spg',
  'tall',
  'weight',
  'yel'])

In [27]:
serie_high_list, serie_low_list

(['aw', 'goals', 'inter', 'off', 'owng', 'rating', 'spg', 'tall', 'weight'],
 ['age', 'blocks', 'fouled', 'fouls', 'ps_y', 'thrb'])

#### 수비수

In [28]:
defense_league_mean_df = defense_df.groupby(['league']).mean()
defense_league_mean_df

Unnamed: 0_level_0,age,asists,avgp,aw,blocks,clear,crosses,disp,drb,fouled,...,rating,red,spg,tackles,tall,thrb,unstch,weight,yel,labeled_league
league,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Bundesliga,26.059701,0.835821,0.423881,1.880597,0.329353,2.4199,74.832836,0.344776,0.696517,0.878607,...,6.739751,0.104478,0.479104,1.564677,184.283582,2.264179,0.527363,78.771144,3.054726,0
Eredivisie,24.828283,1.040404,0.508081,1.281818,0.376768,3.04596,78.480303,0.431818,0.64899,0.706061,...,6.774747,0.156566,0.484343,1.661111,176.30303,3.101515,0.519697,71.479798,2.838384,5
LaLiga,26.88,0.893333,0.356,1.330222,0.324889,2.674222,77.115556,0.428889,0.645778,0.721778,...,6.737956,0.271111,0.412889,1.852444,178.737778,2.240444,0.554667,73.791111,4.457778,4
Ligue1,26.562791,0.734884,0.393023,1.57907,0.297674,2.528372,79.627442,0.554419,0.73907,0.833023,...,6.852698,0.265116,0.411163,1.933488,180.144186,2.297209,0.573953,74.744186,3.455814,1
PremierLeague,26.673171,0.809756,0.414634,1.490244,0.449268,3.34439,77.680976,0.472195,0.735122,0.590732,...,6.827561,0.136585,0.481951,1.79122,180.341463,2.092683,0.533171,75.35122,2.97561,3
SerieA,27.309623,0.74477,0.445188,1.230126,0.385356,2.774895,80.588987,0.410879,0.627615,0.821757,...,6.752301,0.301255,0.440167,1.571967,181.92887,2.313216,0.515481,76.267782,3.991632,2


In [29]:
bundes_high_list = []; bundes_low_list = []
ered_high_list = []; ered_low_list = []
laliga_high_list = []; laliga_low_list = []
ligue1_high_list = []; ligue1_low_list = []
premier_high_list = []; premier_low_list = []
serie_high_list = []; serie_low_list = []

for column in defense_league_mean_df.columns:
    # high value sort
    high_league = defense_league_mean_df[defense_league_mean_df[column].argsort() == 0].index[0]
    
    if high_league == 'Bundesliga':
        bundes_high_list.append(column)
        
    if high_league == 'Eredivisie':
        ered_high_list.append(column)
        
    if high_league == 'LaLiga':
        laliga_high_list.append(column)
        
    if high_league == 'Ligue1':
        ligue1_high_list.append(column)
        
    if high_league == 'PremierLeague':
        premier_high_list.append(column)
        
    if high_league == 'SerieA':
        serie_high_list.append(column)
    
    # low value sort
    low_league = defense_league_mean_df[defense_league_mean_df[column].argsort() == 5].index[0]
    
    if low_league == 'Bundesliga':
        bundes_low_list.append(column)
        
    if low_league == 'Eredivisie':
        ered_low_list.append(column)
        
    if low_league == 'LaLiga':
        laliga_low_list.append(column)
        
    if low_league == 'Ligue1':
        ligue1_low_list.append(column)
        
    if low_league == 'PremierLeague':
        premier_low_list.append(column)
        
    if low_league == 'SerieA':
        serie_low_list.append(column)

In [30]:
bundes_high_list, bundes_low_list

(['clear',
  'crosses',
  'disp',
  'full_time',
  'longb',
  'mins',
  'motm',
  'ps_x',
  'red',
  'tackles',
  'labeled_league'],
 ['aw', 'drb', 'unstch'])

In [31]:
ered_high_list, ered_low_list

(['age', 'fouls', 'goals', 'off', 'rating'],
 ['asists',
  'disp',
  'full_time',
  'inter',
  'mins',
  'owng',
  'player_number',
  'tackles'])

In [32]:
laliga_high_list, laliga_low_list

(['blocks', 'inter', 'owng', 'player_number', 'thrb', 'unstch', 'yel'],
 ['goals', 'motm', 'ps_y', 'rating', 'spg', 'labeled_league'])

In [33]:
ligue1_high_list, ligue1_low_list

(['asists', 'avgp', 'drb', 'keyp', 'ps_y', 'spg'],
 ['clear', 'fouled', 'offsides'])

In [34]:
premier_high_list, premier_low_list

([],
 ['avgp',
  'blocks',
  'half_time',
  'keyp',
  'off',
  'tall',
  'thrb',
  'weight',
  'yel'])

In [35]:
serie_high_list, serie_low_list

(['aw', 'fouled', 'half_time', 'offsides', 'tall', 'weight'],
 ['age', 'crosses', 'fouls', 'longb', 'ps_x', 'red'])

### 3) 팀별 특성변수 분석

In [54]:
bundes_team_mean_df = total_player_df[total_player_df['league'] == 'Bundesliga'].groupby(['team_name']).mean()
ered_team_mean_df = total_player_df[total_player_df['league'] == 'Eredivisie'].groupby(['team_name']).mean()
laliga_team_mean_df = total_player_df[total_player_df['league'] == 'LaLiga'].groupby(['team_name']).mean()
ligue1_team_mean_df = total_player_df[total_player_df['league'] == 'Ligue1'].groupby(['team_name']).mean()
premier_team_mean_df = total_player_df[total_player_df['league'] == 'PremierLeague'].groupby(['team_name']).mean()
serie_team_mean_df = total_player_df[total_player_df['league'] == 'SerieA'].groupby(['team_name']).mean()

In [55]:
bundes_team_mean_df.to_csv('./data/League_team_mean/bundes_team_mean.csv')
ered_team_mean_df.to_csv('./data/League_team_mean/ered_team_mean.csv')
laliga_team_mean_df.to_csv('./data/League_team_mean/laliga_team_mean.csv')
ligue1_team_mean_df.to_csv('./data/League_team_mean/ligue1_team_mean.csv')
premier_team_mean_df.to_csv('./data/League_team_mean/premier_team_mean.csv')
serie_team_mean_df.to_csv('./data/League_team_mean/serie_team_mean.csv')