1. [Introduction](#introduction)
2. [Format](#Data)
    1. [DataFrame from File](#Datasub1)
    2. [Prepare DataFrame for Analysis](#Datasub2)


In [1]:
import numpy as np
import pandas as pd
import json
from collections import defaultdict



For the overall statistical modeling and analysis I need to break the data into time periods that separate the World Cup tournaments. Each time period is defined by the end of one World Cup and the start of the next World Cup, as shown below.  The naming convention for the time periods is that "wc74" is the date range of the 1974 World Cup tournament and "1974" is the date range from the end of the 1974 World Cup to the beginning of the 1978 World Cup.  







In [2]:
# This file holds the begin and end dates (month and day) of the World Cups from 1970 to 2018.
file = 'beg_end.csv'
bgn_end = pd.read_csv(file)

In [5]:
# Preview the first rows of the begin/end date table.
bgn_end.head()

Unnamed: 0,year,bgn_mth,bgn_day,end_mth,end_day
0,1970,5,31,6,21
1,1974,6,13,7,7
2,1978,6,1,6,25
3,1982,6,13,7,11
4,1986,5,31,6,29


 The data for the project is stored in a dictionary of dictionaries saved as a JSON file.  The data format is:
 ```
 {1970 : {'year': '1970',     
          'df1': csv file name for DataFrame from result_1().to_csv()
          'df2': csv file name for DataFrame from result_2().to_csv()
          'rtg': dict with all team ratings from end of period,
          'rsl': csv file name for DataFrame from results().to_csv(),
          'bgn': begin date as pandas datetime 64,
          'end': end date as pandas datetime 64,
          }, 
   1974 : {'year': '1974', ......    
  }           
 ```

In [None]:
# Reads the saved data dictionary back from 'read.json'.
# json.load returns a plain dict, not the defaultdict used when the data was first built.
with open('read.json') as data_file:
    data = json.load(data_file)

In [None]:
# Writes any revisions of the data dictionary back to 'read.json' ('w' mode overwrites the file).
# sort_keys and indent keep the saved file ordered and human-readable.
with open('read.json', 'w') as outfile:
    json.dump(data, outfile, sort_keys=True, indent=4)

In [2]:
# Load the match results; parse_dates=[0] parses the first column as dates.
df = pd.read_csv(r'results.csv', parse_dates=[0])
df.head()

Unnamed: 0,date,home_team,away_team,home_score,away_score,tournament,city,country,neutral
0,1872-11-30,Scotland,England,0,0,Friendly,Glasgow,Scotland,False
1,1873-03-08,England,Scotland,4,2,Friendly,London,England,False
2,1874-03-07,Scotland,England,2,1,Friendly,Glasgow,Scotland,False
3,1875-03-06,England,Scotland,2,2,Friendly,London,England,False
4,1876-03-04,Scotland,England,3,0,Friendly,Glasgow,Scotland,False


In [3]:
# Inspect the column dtypes of the loaded results.
df.dtypes

date          datetime64[ns]
home_team             object
away_team             object
home_score             int64
away_score             int64
tournament            object
city                  object
country               object
neutral                 bool
dtype: object

In [None]:
def results_1(begin, end, path=r'results.csv'):
    """Return the matches played between ``begin`` and ``end``, inclusive.

    Args:
        begin: First date to keep (a pandas Timestamp, or anything comparable
            with the parsed 'date' column).
        end: Last date to keep.
        path: CSV file to read. Its first column is parsed as dates and it
            must contain a 'date' column. Defaults to 'results.csv'.

    Returns:
        DataFrame holding the rows inside the window, re-indexed 0..n-1.
    """
    df = pd.read_csv(path, parse_dates=[0])
    in_window = df['date'].between(begin, end)  # both bounds are inclusive
    return df[in_window].reset_index(drop=True)

In [None]:
# Populate the data dictionary with the df1 file name, begin date and end date
# of every World Cup tournament window listed in bgn_end.
data = defaultdict(dict)
for row in bgn_end.itertuples(index=False):
    year = int(row.year)
    # pd.datetime is no longer part of pandas; pd.Timestamp builds the same dates.
    bgn = pd.Timestamp(year=year, month=int(row.bgn_mth), day=int(row.bgn_day))
    end = pd.Timestamp(year=year, month=int(row.end_mth), day=int(row.end_day))
    df = results_1(bgn, end)
    name = 'wc' + str(year)[2:]          # e.g. 1974 -> 'wc74'
    file = 'df1_' + name + '.csv'
    df.to_csv(file, index=False)
    # The dates are stored as strings, which keeps the entry JSON-serializable.
    data[name] = {'year': str(year), 'df1': file, 'bgn': str(bgn), 'end': str(end)}
    
    

Second phase of data processing. The final product is a DataFrame as follows:

|  date    |  home   | away   | neutral | gdf | draw | win |  k   | result |
| -------- | ------- | ------ | ------- | --- | ---- | ---  | --- | ------ |
| date obj | string  | string |   bool  | int | bool | bool | int |  float |

In [None]:
# File name of the second-phase ('df2') CSV recorded for the '1974' period.
file = data['1974']['df2']


## Prepare DataFrame for Analysis <a name="Datasub2"></a>

Next I modify the DataFrame prior to applying either rating system.  I keep the *'date'*, *'home_team'* and *'away_team'* columns.  The *'home_score'* and *'away_score'* columns are combined (winning score - losing score) into a new column **'gdf'** (goal differential as int) and then deleted.  Along with that, a **'win'** (bool, home win) and a **'draw'** (bool) column are added.  The *'neutral'* column is cast as a bool.  The *'tournament'* column values are changed to the appropriate match weighting constant and the column is renamed **'k'**.  Columns *'city'* and *'country'* are not needed.
```
60 for World Cup, Olympic Games (1908–1980)
50 for Continental championship and intercontinental tournaments
40 for World Cup and Continental qualifiers and major tournaments
30 for All other tournaments
20 for Friendly matches.
```

In [None]:
def conv_tournament(S):
    """Group the tournament names found in ``S`` by match-weight constant.

    ``S`` is a pandas Series of tournament names. The returned dict maps each
    weight (60, 50, 40, 30, 20) to the set of names carrying that weight.
    Names in ``S`` that are not listed explicitly below land in the 30 group.
    """
    world_cup = {'FIFA World Cup'}
    continental = {
        'AFC Asian Cup', 'African Cup of Nations', 'Copa América',
        'Gold Cup', 'Nations Cup', 'UEFA Euro',
    }
    qualifiers = {
        'FIFA World Cup qualification', 'AFC Asian Cup qualification',
        'African Cup of Nations qualification', 'Copa América qualification',
        'Gold Cup qualification', 'CONCACAF Championship',
        'UEFA Euro qualification',
    }
    friendly = {'Friendly'}

    # Whatever appears in the data but in none of the named groups.
    listed = world_cup | continental | qualifiers | friendly
    other = set(S.unique()) - listed

    return {60: world_cup, 50: continental, 40: qualifiers, 30: other, 20: friendly}


def set_k(S, dict):
    """Translate tournament names into their match-weight constants.

    ``dict`` maps a weight to the collection of tournament names carrying it.
    For every name in ``S``, in order, the weight of each group containing
    that name is emitted; a name found in no group contributes nothing.
    """
    return [
        weight
        for name in S
        for weight, group in dict.items()
        if name in group
    ]


def result(gdf):
    """Score a match from the home side's goal difference.

    Returns 0.50 for a draw (zero difference), 1.00 when the difference is
    positive and 0.00 when it is negative.
    """
    if not gdf:
        return 0.50
    is_positive = abs(gdf) == gdf
    return 1.00 if is_positive else 0.00

    
def explore_3(df):
    """Turn a raw results frame into the second-phase analysis frame.

    The nine columns of ``df`` are renamed in place and the derived columns
    are added to it. The returned frame keeps 'date', 'home', 'away',
    'neutral', 'gdf' (absolute goal difference), 'draw', 'win' (home win),
    'k' (match weight) and 'result' (0.0 / 0.5 / 1.0 for the home side).
    """
    df.columns = ['date', 'home', 'away', 'home_score', 'away_score',
                  'tournament', 'city', 'country', 'neutral']

    # Signed margin from the home side's point of view.
    margin = df['home_score'] - df['away_score']
    df['gdf'] = margin
    df['draw'] = margin == 0
    df['win'] = margin > 0
    df['k'] = set_k(df['tournament'], conv_tournament(df['tournament']))
    df['result'] = margin.apply(result)
    df['gdf'] = margin.abs()

    no_longer_needed = ['home_score', 'away_score', 'city', 'country', 'tournament']
    return df.drop(columns=no_longer_needed)

In [None]:
# Build the second-phase ('df2') CSV for every period in data except '2018'.
A = list(data.keys())
A.remove('2018')        # raises ValueError if '2018' is not a key of data

for a in A:
    file_df1 = 'df1_' + a + '.csv'
    file_df2 = 'df2_' + a + '.csv'
    # NOTE(review): results_2 is not defined in the visible part of this file --
    # presumably it is the df1 -> df2 conversion (explore_3?); confirm before running.
    df = results_2(pd.read_csv(file_df1))
    df.to_csv(file_df2, index=False)
    data[a]['df2'] = file_df2        # record the output file name for this period

In [None]:
def calc_g(gdf):    # gdf is int >= 0
    """Return the G multiplier for a goal difference.

    1.00 for a difference of 0 or 1, 1.50 for 2, and (11 + gdf) / 8 otherwise.
    """
    if not gdf or gdf == 1:
        return 1.00
    return 1.50 if gdf == 2 else (11 + gdf) / 8

def win_exp(dfr):
    """Return the home side's expected result from the rating difference.

    ``dfr`` is home rating minus away rating. The away side's expectation is
    ``1.00 - win_exp(dfr)``.
    """
    exponent = -dfr / 400
    return 1 / (10 ** exponent + 1)

def start_results(date, dict):
    """Build the results frame holding every team's rating at ``date``.

    Args:
        date: Date stamped on every row (the "begin" date of the period).
        dict: Mapping of country name -> rating; each rating is cast to int.

    Returns:
        DataFrame with columns 'date', 'country' and 'rating', one row per
        team. An empty mapping yields an empty frame with those columns
        instead of failing on the column assignment.
    """
    rows = [(date, country, int(rating)) for country, rating in dict.items()]
    return pd.DataFrame(rows, columns=['date', 'country', 'rating'])

def teams_rate_begin(df):
    """Create the ratings dict that starts the analysis of the first period.

    Every team found in the 'home' or 'away' column of ``df`` is given the
    starting rating of 1500.
    """
    teams = set(df.home.unique()) | set(df.away.unique())
    return {team: 1500 for team in teams}

def teams_rate_next(df, dict):
    """Extend a ratings dict with the teams that first appear in ``df``.

    Args:
        df: Match frame with 'home' and 'away' team columns.
        dict: Ratings carried over from earlier periods (team -> rating).
            It is updated in place.

    Returns:
        The same mapping, with every previously unseen team added at 1500.
        Ratings of teams already present are left untouched.
    """
    all_team = set(df.home.unique()) | set(df.away.unique())
    # Write into the mapping that was passed in, not a module-level 'rtgs'.
    for team in all_team - set(dict):
        dict[team] = 1500
    return dict

In [None]:
# Module-level state used by result_3: current ratings, results frame and win_probs list.
rtgs = teams_rate_begin(df)
# NOTE(review): the assignment below has no right-hand side, so this cell does not parse.
# result_3_initial seeds 'results' with start_results(...); presumably the same is intended here -- confirm.
results = 
win_probs = []

In [None]:
# Period keys: 'wcYY' names a World Cup tournament window, 'YYYY' a period between tournaments.
# NOTE(review): why the keys are split into A, B and C is not visible here --
# presumably the groups are processed in separate passes; confirm.
A = ['1970']

B = [ 'wc70', '1974', 'wc74', '1978', 'wc78', '1982', 'wc82', '1986', 'wc86', '1990',
      'wc90', '1994', 'wc94', '1998', 'wc98', '2002', 'wc02', '2006',  'wc06', '2010'
    ]

C =  ['wc10', '2014', 'wc14', '2018', 'wc18']


In [None]:
def result_3(df):
    """Replay the matches in ``df`` and update the team ratings.

    Reads and updates three module-level objects, which must exist before
    the call: ``rtgs`` (team -> current rating), ``results`` (DataFrame with
    'date', 'country' and 'rating' columns) and ``win_probs`` (list).

    ``df`` is read by column position: date, home, away, neutral, gdf, draw,
    win, k, result.

    For every match both ratings move by
    ``round(calc_g(gdf) * k * (result - expected))``; the home side gets a
    100-point bonus in the expectation unless the venue is neutral. The new
    ratings are written back to ``rtgs`` after each match and recorded in
    ``results`` (two rows per match). For matches that are not draws, the
    home side's residual (1 - expected on a home win, -expected otherwise)
    is appended to ``win_probs``.

    Returns:
        None. ``results`` is rebound at module level with a fresh 0..n-1 index.
    """
    global results  # rebound below; without this the name would be local

    new_rows = []
    for row in df.itertuples(index=False):
        date, home, away, neutral, gdf, draw, win, k, outcome = row[:9]

        home_rtg, away_rtg = rtgs[home], rtgs[away]

        # Home advantage is worth 100 rating points unless the venue is neutral.
        dfr = home_rtg - away_rtg if neutral else home_rtg + 100 - away_rtg

        expect = win_exp(dfr)
        adjust = int(np.round(calc_g(gdf) * k * (outcome - expect), decimals=0))
        home_rtg += adjust
        away_rtg -= adjust

        if not draw:
            win_probs.append(1 - expect if win else -expect)

        # Store the new ratings straight away so the next match of either
        # team starts from them.
        rtgs[home] = home_rtg
        rtgs[away] = away_rtg

        new_rows.append({'date': date, 'country': home, 'rating': home_rtg})
        new_rows.append({'date': date, 'country': away, 'rating': away_rtg})

    if new_rows:
        # One concat instead of a DataFrame.append per row (append is gone in pandas 2.0).
        added = pd.DataFrame(new_rows, columns=['date', 'country', 'rating'])
        results = pd.concat([results, added], ignore_index=True)
    else:
        results = results.reset_index(drop=True)

In [None]:
def result_3_initial(df, dict):
    """Run the rating update for the first period, starting every team at 1500.

    Args:
        df: Second-phase match frame for the period, in the column layout
            result_3 expects.
        dict: The period's entry of the data dictionary; its 'bgn' value is
            used as the date of the starting ratings.

    Returns:
        ``dict``, unchanged.
    """
    # result_3 works on module-level state, so the starting values have to be
    # bound at module level rather than as locals. A module-level 'win_probs'
    # list must also exist before this is called.
    global rtgs, results
    rtgs = teams_rate_begin(df)
    results = start_results(pd.Timestamp(dict['bgn']), rtgs)

    # The original had an unfinished nested `def result_3(df, dict)` here (a
    # syntax error); calling the module-level result_3 is the presumed intent.
    result_3(df)

    # TODO(review): storing the end-of-period ratings and results in `dict`
    # (its 'rtg' / 'rsl' entries) presumably belongs here -- confirm.
    return dict
    
    
    
    
    
    