In [2]:
import pandas as pd
import numpy as np
import attancanceUtil as util


# Load the raw match list. Column 1 ('date') is parsed with the project's
# custom date parser.
# NOTE(review): `date_parser` is deprecated since pandas 2.0; migrating to
# `date_format` requires knowing the format util.custom_date_parser handles.
attendance_df = pd.read_csv(
    '../data/matches.csv',
    sep=',',
    parse_dates=[1],
    date_parser=util.custom_date_parser,
)

# Technical NAs: 16 matches were not played and have no valid values.
attendance_df = attendance_df.dropna()

# Drop matches whose attendance field holds the text marker
# 'Unter Ausschluss der Öffentlichkeit' instead of a number. The explicit
# .copy() makes the result an independent frame, so the column assignments
# below do not trigger SettingWithCopyWarning.
attendance_df = attendance_df[
    attendance_df['attendance'] != 'Unter Ausschluss der Öffentlichkeit'
].copy()

# Declare the numeric columns as such.
attendance_df['attendance'] = pd.to_numeric(attendance_df['attendance'])
attendance_df['matchday'] = pd.to_numeric(attendance_df['matchday'], downcast='integer')


# Derive the season of each match from its date.
def _season_for_match(match):
    """Return util.determine_season() for the 'date' of one match row."""
    return util.determine_season(match['date'])


season = attendance_df.apply(_season_for_match, axis=1)
attendance_df['season'] = season

# Add the geographic distance between home and away team.
def _distance_for_match(match):
    """Return util.calc_dist() for the two teams of one match row."""
    return util.calc_dist(match['hometeam'], match['awayteam'])


team_distance = attendance_df.apply(_distance_for_match, axis=1)
attendance_df['team_distance'] = team_distance


# Add each team's points before the round.
def _points_before_game_for(team_column):
    """Build a row function that calls util.calc_point_average_before_game
    for the team stored in `team_column` ('hometeam' or 'awayteam')."""
    def compute(match):
        return util.calc_point_average_before_game(
            match['season'], match['matchday'], match[team_column]
        )
    return compute


points_home = attendance_df.apply(_points_before_game_for('hometeam'), axis=1)
attendance_df['points_home'] = points_home
points_away = attendance_df.apply(_points_before_game_for('awayteam'), axis=1)
attendance_df['points_away'] = points_away

# Add each team's league position before the game.
def _position_before_game_for(team_column):
    """Build a row function that calls util.calc_postion_before_game
    for the team stored in `team_column` ('hometeam' or 'awayteam')."""
    def compute(match):
        return util.calc_postion_before_game(
            match['season'], match['matchday'], match[team_column]
        )
    return compute


position_home = attendance_df.apply(_position_before_game_for('hometeam'), axis=1)
attendance_df['position_home'] = position_home
position_away = attendance_df.apply(_position_before_game_for('awayteam'), axis=1)
attendance_df['position_away'] = position_away

# Add each team's position in the previous season.
def _last_season_position_for(team_column):
    """Build a row function that calls util.calc_last_seasons_position
    for the team stored in `team_column` ('hometeam' or 'awayteam')."""
    def compute(match):
        return util.calc_last_seasons_position(match['season'], match[team_column])
    return compute


last_season_position_home = attendance_df.apply(_last_season_position_for('hometeam'), axis=1)
attendance_df['last_season_position_home'] = last_season_position_home
last_season_position_away = attendance_df.apply(_last_season_position_for('awayteam'), axis=1)
attendance_df['last_season_position_away'] = last_season_position_away

# Add each team's current form.
def _recent_form_for(team_column):
    """Build a row function that calls
    util.calc_point_average_from_last_five_games for the team stored in
    `team_column` ('hometeam' or 'awayteam').

    The helper also receives the full attendance_df, looked up at call time.
    """
    def compute(match):
        return util.calc_point_average_from_last_five_games(
            match['season'], match['matchday'], match[team_column], attendance_df
        )
    return compute


form_home = attendance_df.apply(_recent_form_for('hometeam'), axis=1)
attendance_df['form_home'] = form_home
form_away = attendance_df.apply(_recent_form_for('awayteam'), axis=1)
attendance_df['form_away'] = form_away

# Show the prepared frame as the cell output.
attendance_df


Unnamed: 0,weekday,date,time,stadium,attendance,hometeam,awayteam,matchday,result,season,team_distance,points_home,points_away,position_home,position_away,last_season_position_home,last_season_position_away,form_home,form_away
0,Sonntag,2004-03-21,14:30,Stadion Wankdorf,7500,BSC Young Boys,Servette Genève,25,4:2,2003,129637.370939,2.041667,1.750000,2,3,4,6,2.2,1.6
1,Sonntag,2003-11-23,16:15,Stadion Wankdorf,11000,BSC Young Boys,Servette Genève,17,3:0,2003,129637.370939,2.125000,1.750000,2,3,4,6,3.0,1.4
2,Sonntag,2004-02-15,16:00,Stadion Wankdorf,11850,BSC Young Boys,FC Basel,19,0:1,2003,68663.335223,2.111111,2.888889,2,1,4,2,2.6,2.6
3,Dienstag,2003-07-22,19:30,Stadion Wankdorf,11850,BSC Young Boys,FC Basel,2,2:3,2003,68663.335223,3.000000,3.000000,1,3,4,2,3.0,3.0
4,Sonntag,2004-05-09,16:15,Stadion Wankdorf,5000,BSC Young Boys,Neuchâtel Xamax FC,33,3:1,2003,39705.085279,2.031250,0.937500,2,9,4,3,2.4,1.8
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2855,Mittwoch,2019-05-22,20:00,Letzigrund,100,Grasshopper Club Zürich,FC Sion,35,0:3,2018,155415.844527,0.705882,1.176471,10,8,9,6,0.4,0.6
2856,Dienstag,2018-09-25,20:00,Letzigrund,3500,Grasshopper Club Zürich,FC Thun,8,0:2,2018,97172.740401,1.000000,1.571429,8,2,9,7,1.4,1.6
2857,Samstag,2019-04-20,19:00,Letzigrund,4300,Grasshopper Club Zürich,FC Thun,30,1:1,2018,97172.740401,0.758621,1.344828,10,3,9,7,0.8,0.4
2858,Sonntag,2018-12-09,16:00,Letzigrund,4800,Grasshopper Club Zürich,FC Luzern,17,2:3,2018,39797.513591,1.062500,1.187500,9,7,9,3,1.2,1.4


In [3]:
# Persist the prepared frame. Passing the path (instead of a manually opened
# text handle) lets pandas open and close the file itself, so no handle is
# leaked if writing fails. pandas then writes UTF-8 with newline translation
# disabled, independent of the platform defaults; this matters because the
# team and stadium names contain non-ASCII characters.
attendance_df.to_csv("../data/matches_prep.csv", index=False)

In [2]:
# Summarise the column dtypes and non-null counts of attendance_df.
# NOTE(review): the recorded output below lists only 10 columns and 2860 rows
# with missing values, which does not match the NA-free frame with the added
# feature columns built by the preparation cell. It appears to come from an
# earlier kernel state (the execution count is also out of sequence);
# re-run after Restart & Run All.
attendance_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2860 entries, 0 to 2859
Data columns (total 10 columns):
 #   Column      Non-Null Count  Dtype         
---  ------      --------------  -----         
 0   weekday     2860 non-null   object        
 1   date        2860 non-null   datetime64[ns]
 2   time        2844 non-null   object        
 3   stadium     2844 non-null   object        
 4   attendance  2844 non-null   object        
 5   hometeam    2860 non-null   object        
 6   awayteam    2860 non-null   object        
 7   matchday    2844 non-null   float64       
 8   result      2860 non-null   object        
 9   season      2860 non-null   int64         
dtypes: datetime64[ns](1), float64(1), int64(1), object(7)
memory usage: 223.6+ KB
