In [151]:
import pandas as pd
from sklearn.preprocessing import MinMaxScaler

### Učitavanje podataka i odbacivanje atributa koji neće biti koriščeni u analizi
### Interpolacija zbog elimisanja nedostajućih vrednosti

In [152]:
df = pd.read_csv('../data.csv')
df = df.drop(df.columns[0], axis=1)
df.set_index('ID', inplace=True)
df = df.drop(['Name', 'Photo', 'Club', 'Club Logo',
        'Flag', 'Special', 'Jersey Number',
        'Contract Valid Until', 'Loaned From',
        'Joined', 'Work Rate', 'Body Type', 'Real Face',
        'LS', 'ST', 'RS', 'LW', 'LF', 'CF', 'RF', 'RW', 
        'LAM', 'CAM', 'RAM', 'LM', 'LCM',
        'CM', 'RCM', 'RM', 'LWB', 'LDM', 'CDM', 'RDM', 
        'RWB', 'LB', 'LCB', 'CB', 'RCB', 'RB'], axis=1)
df = df.interpolate()

### Parsiranje nekoliko kolona gde su numerički podaci zadati u nekom obliku stringa

In [153]:
df.replace({
    'Left': 0,
    'Right': 1
}, inplace=True)

In [154]:
df['Wage'] = df[['Wage']].replace(
    {'€':'', 
    'K': '00',
    'M': '00000'
    }, regex=True).convert_objects(convert_numeric=True)

For all other conversions use the data-type specific converters pd.to_datetime, pd.to_timedelta and pd.to_numeric.
  """


In [155]:
def fix_value(x: str) -> str:
    if (x.find('.') == -1):
        return x
    else:
        return x[:len(x)-1].replace('.', '')

In [156]:
df['Release Clause'] = df[['Release Clause']].fillna('nan')
df['Height'] = df[['Height']].fillna('nan')

In [157]:
df['Release Clause'] = df['Release Clause'].replace({
    '€': '',
    'M': '00000',
    'K': '000',
}, regex=True).apply(fix_value).convert_objects(convert_numeric=True)

For all other conversions use the data-type specific converters pd.to_datetime, pd.to_timedelta and pd.to_numeric.
  """


In [158]:
df['Value'] = df['Value'].replace({
    '€': '',
    'M': '000000',
    'K': '000'
}, regex=True).map(fix_value).convert_objects(convert_numeric=True)

For all other conversions use the data-type specific converters pd.to_datetime, pd.to_timedelta and pd.to_numeric.
  """


### Mapiranje pozicija igrača na terenu u numeričke vrednosti koje predstavljaju na kom mestu na terenu je angažovan igrač na toj poziciji

In [159]:
df['Position'].replace({
    'GK': 0.0,
    'CB': 1.0,
    'LCB': 1.0,
    'RCB': 1.0,
    'LB': 1.5,
    'RB': 1.5,
    'RWB': 1.9,
    'LWB': 1.9,
    'CM': 2,
    'LCM': 2,
    'RCM': 2,
    'CDM': 1.5,
    'LDM': 1.5,
    'RDM': 1.5,
    'LM': 2.5,
    'RM': 2.5,
    'RAM': 3,
    'CAM': 3,
    'LAM': 3,
    'LW': 3.5,
    'RW': 3.5,
    'CF': 3.8,
    'LF': 3.8,
    'RF': 3.8,
    'LS': 3.9,
    'RS': 3.9,
    'ST': 4.2
}, inplace=True)

In [160]:
def convert_feet_to_centimeters(x : str) -> float:
    if (x == 'nan'):
        return x
    digits = x.split('\'')
    inches = int(digits[1]) + 12 * int(digits[0])
    return (inches * 2.54)

In [161]:
df['Height'] = df['Height'].map(convert_feet_to_centimeters)

In [162]:
df['Weight'].replace('lbs', '', regex=True, inplace=True)

### Učitvavanje koordinata različitih država napravljene pomoću modula geopandas

In [163]:
geodf = pd.read_csv('../geo_position.csv', index_col=0)

In [164]:
def convert_nationality_to_geo_pos(country):
    if country in ['England', 'Wales', 'Scotland', 'Northern Ireland']:
        ctr = geodf.loc[geodf['Country'] == 'United Kingdom']
    elif country == 'Bosnia Herzegovina':
        ctr = geodf.loc[geodf['Country'] == 'Bosnia and Herz.']
    elif country == 'Korea Republic':
        ctr = geodf.loc[geodf['Country'] == 'South Korea']
    elif country == 'Czech Republic':
        ctr = geodf.loc[geodf['Country'] == 'Czechia']
    elif country == 'DR Congo':
        ctr = geodf.loc[geodf['Country'] == 'Dem. Rep. Congo']
    elif country == 'Ivory Coast':
        ctr = geodf.loc[geodf['Country'] == 'Côte d\'Ivoire']
    elif country == 'Dominican Republic':
        ctr = geodf.loc[geodf['Country'] == 'Dominican Rep.']
    elif country == 'Republic of Ireland':
        ctr = geodf.loc[geodf['Country'] == 'Ireland']
    elif country == 'United States':
        ctr = geodf.loc[geodf['Country'] == 'United States of America']
    elif country == 'Equatorial Guinea':
        ctr = geodf.loc[geodf['Country'] == 'Eq. Guinea']
    elif country == 'FYR Macedonia':
        ctr = geodf.loc[geodf['Country'] == 'Macedonia']
    elif country == 'China PR':
        ctr = geodf.loc[geodf['Country'] == 'China']
    elif country == 'Guinea Bissau':
        ctr = geodf.loc[geodf['Country'] == 'Guinea-Bissau']
    elif country == 'Trinidad & Tobago':
        ctr = geodf.loc[geodf['Country'] == 'Trinidad and Tobago']
    elif country == 'Korea DPR':
        ctr = geodf.loc[geodf['Country'] == 'North Korea']
    elif country == 'Liechtenstein':
        ctr = geodf.loc[geodf['Country'] == 'Switzerland']
    elif country == 'Faroe Islands':
        ctr = geodf.loc[geodf['Country'] == 'Iceland']
    else:
        ctr = geodf.loc[geodf['Country'] == country]
    
    if ctr.empty:
        return (float('nan'), float('nan'))
    return (float(ctr['Longitude']), float(ctr['Latitude']))

In [165]:
t = map(lambda x: convert_nationality_to_geo_pos(x),df['Nationality'])

In [166]:
longitude = []
latitude = []
for (x, y) in t:
    longitude.append(x)
    latitude.append(y)


In [167]:
df['LongOfCountry'] = longitude
df['LatitOfCountry'] = latitude
df.drop('Nationality', axis = 1, inplace=True)

In [171]:
df_num_scaled = pd.DataFrame(MinMaxScaler().fit_transform(df))

  return self.partial_fit(X, y)


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,39,40,41,42,43,44,45,46,47,48
0,0.517241,1.000000,0.978723,0.932489,1.000000,0.0,1.00,0.75,0.75,0.904762,...,0.285714,0.261364,0.056180,0.109890,0.155556,0.146067,0.075269,0.992982,0.166224,0.056089
1,0.586207,1.000000,0.978723,0.649789,0.716814,1.0,1.00,0.75,1.00,1.000000,...,0.318681,0.227273,0.067416,0.109890,0.155556,0.146067,0.107527,0.556959,0.366432,0.733595
2,0.344828,0.958333,0.957447,1.000000,0.513274,1.0,1.00,1.00,1.00,0.833333,...,0.241758,0.340909,0.089888,0.087912,0.155556,0.157303,0.107527,1.000000,0.208709,0.278433
3,0.379310,0.937500,0.957447,0.607595,0.460177,1.0,0.75,0.50,0.00,0.000000,...,0.208791,0.113636,1.000000,0.923077,0.955556,0.977528,1.000000,0.607404,0.381991,0.740044
4,0.379310,0.937500,0.936170,0.860759,0.628319,1.0,0.75,1.00,0.75,0.476190,...,0.615385,0.545455,0.157303,0.131868,0.044444,0.101124,0.129032,0.860947,0.410725,0.833022
5,0.379310,0.937500,0.914894,0.784810,0.601770,1.0,0.75,0.75,0.75,0.904762,...,0.274725,0.215909,0.112360,0.120879,0.055556,0.078652,0.075269,0.754354,0.410725,0.833022
6,0.551724,0.937500,0.914894,0.565401,0.743363,1.0,0.75,0.75,0.75,0.476190,...,0.813187,0.795455,0.134831,0.087912,0.066667,0.146067,0.086022,0.602141,0.452734,0.782163
7,0.517241,0.937500,0.914894,0.675105,0.805310,1.0,1.00,0.75,0.50,0.928571,...,0.472527,0.397727,0.292135,0.263736,0.333333,0.359551,0.387097,0.718823,0.198373,0.080145
8,0.551724,0.937500,0.914894,0.430380,0.672566,1.0,0.75,0.50,0.50,0.238095,...,0.989011,1.000000,0.112360,0.076923,0.088889,0.067416,0.107527,0.458262,0.381991,0.740044
9,0.310345,0.916667,0.957447,0.573840,0.166372,1.0,0.50,0.50,0.00,0.000000,...,0.109890,0.170455,0.955056,1.000000,0.855556,0.977528,0.946237,0.633285,0.447028,0.792172


In [174]:
df_num_scaled.columns = df.columns
df_num_scaled = pd.DataFrame(df_num_scaled.interpolate())

In [175]:
df_num_scaled.to_csv('../preprocessed_data.csv')