This notebook builds the large DataFrame (df) used for the subsequent EPV calculations.
It loads the data and then determines the starting/ending buckets for each player.
It then uses these buckets to create a transition probability (TP) matrix. The actual computation of
the TP matrix is done on my lab computer.

In [None]:
import PlottingFunction as pf
import DataManipulation as dm
import matplotlib.pyplot as plt
from pathlib import Path
import pandas as pd
import numpy as np
import winsound
# from sklearn.mixture import GaussianMixture

%matplotlib qt5

In [None]:
# Import data: the serve, serve-return and rally tables live under <cwd>/data/vast.
# The per-file frames are read straight into the concat and never bound to a
# name, so nothing has to be deleted afterwards to release their memory.
vast_dir = Path.cwd() / 'data' / 'vast'
df = pd.concat([
    pd.read_csv(vast_dir / csv_name)
    for csv_name in ("vast_serve_200k.csv",
                     "vast_serve_return_200k.csv",
                     "vast_rally_600k.csv")
])

# Move the state variables to the end of the frame: re-add them under
# capitalised names (new columns are appended last), then drop the originals.
df['Type'] = df['type']
df['Error'] = df['error']
df['Prediction'] = df['prediction']
df = df.drop(columns=['error', 'type', 'prediction'])

# Correct prediction values for errors and winners:
#   - a shot flagged as an error gets prediction 0
#   - a prediction above 0.80 is treated as a certain winner (1)
df['Prediction'] = np.where(df['Error'] == 1, 0, df['Prediction'])
df['Prediction'] = np.where(df['Prediction'] > 0.80, 1, df['Prediction'])

# Remove outliers in the data (filtering is delegated to the project helper),
# then renumber the rows 0..n-1 because the concatenated frame repeats the
# per-file row labels.
df = dm.keep_data_in_frame(df, compressed=True, only_half=False, big_buckets=True)
df = df.reset_index(drop=True)

In [None]:
# Label every shot with the bucket each player starts and ends in.
# Starting buckets are computed before ending buckets, impact player first,
# so the helper calls and the resulting column order match the layout
# impact_starting, receiver_starting, impact_ending, receiver_ending.
for moment, locate_bucket in (('starting', dm.get_starting_bucket),
                              ('ending', dm.get_ending_bucket)):
    for role in ('impact', 'receiver'):
        df[role + '_' + moment + '_bucket'] = locate_bucket(df, role)

# drop shots for which any of the four buckets is missing
bucket_columns = [
    'impact_starting_bucket',
    'receiver_starting_bucket',
    'impact_ending_bucket',
    'receiver_ending_bucket',
]
df = df.dropna(subset=bucket_columns)

# Keep players on their side (start and end): the impact player must stay
# in buckets up to 42 and the receiver in buckets 43 and above.
impact_on_own_side = (df['impact_starting_bucket'].le(42)
                      & df['impact_ending_bucket'].le(42))
receiver_on_own_side = (df['receiver_starting_bucket'].ge(43)
                        & df['receiver_ending_bucket'].ge(43))
df = df[impact_on_own_side & receiver_on_own_side]

In [None]:
# df = df.iloc[0:10,]
# df['Prediction'] = np.where((df.Prediction > 0.75),1,df.Prediction)
# df.at[7,'receiver_ending_bucket'] = 83
# df.at[9,'receiver_starting_bucket'] = 83
# df.at[9,'Prediction'] = 1
# df.at[9,'Type'] = 'servereturn'
# df.at[8,'impact_starting_bucket'] = 15
# df.at[8,'receiver_starting_bucket'] = 83
# df.at[8,'Type'] = 'servereturn'
# df.at[0,'impact_starting_bucket'] = 15
# df.at[0,'receiver_starting_bucket'] = 83
# df.at[0,'Type'] = 'servereturn'
# df.at[0,'Prediction'] = 1

# OR do this
# df = pd.concat([df.iloc[0:100,], df.iloc[250000:250100,], df.iloc[500000:500100,]])
# df = df.reset_index(drop=True)

In [None]:
# For every shot that keeps the point alive, sample one "second shot" (the
# opponent's reply) from the data and record
#   - whether that reply was an error (second_error) or a winner (second_score)
#   - where both players start the shot after the reply
#     (impact_second_start_bucket / receiver_second_start_bucket).
#
# Markers written into all four outputs:
#   'na'   - the first shot already ended the point, so no reply is sampled
#   'none' - the data holds no shot of the required type starting from the
#            required pair of buckets; these rows are removed by the
#            `df.second_error != 'none'` filter in the transition-matrix cell.
#            (Previously such a row silently reused the sample drawn for an
#            earlier row, or raised NameError if it was the first one.)
#
# Each output is a one-column DataFrame (column 0) indexed like df, so that
# `df[col] = output` lines up row for row.

# shot type that follows each shot type
next_play = {'serve':'servereturn', 'servereturn': 'rally', 'rally':'rally' }

# Row positions of all shots sharing a state (type, impact start, receiver
# start).  Built once so each lookup is a dict access instead of a scan of df.
state_columns = ['Type', 'impact_starting_bucket', 'receiver_starting_bucket']
candidates_by_state = df.groupby(state_columns).indices

# Plain arrays for fast positional access inside the loop
shot_type = df['Type'].to_numpy()
shot_error = df['Error'].to_numpy()
shot_prediction = df['Prediction'].to_numpy()
impact_end = df['impact_ending_bucket'].to_numpy()
receiver_end = df['receiver_ending_bucket'].to_numpy()

# Object dtype because the columns mix numbers with the 'na'/'none' markers
n_shots = len(df)
impact_second_start = np.zeros(n_shots, dtype=object)
receiver_second_start = np.zeros(n_shots, dtype=object)
second_score_values = np.zeros(n_shots, dtype=object)
second_error_values = np.zeros(n_shots, dtype=object)
outputs = (impact_second_start, receiver_second_start,
           second_score_values, second_error_values)

for pos in range(n_shots):

    # if first shot ended, then don't need to sample a second shot
    if shot_error[pos] == 1 or shot_prediction[pos] == 1:
        for values in outputs:
            values[pos] = 'na'
        continue

    # Starting state of the second shot, based on the ending locations of the
    # first shot.  Roles swap: the receiver becomes the impact player, and
    # buckets are mirrored with 85 - bucket.
    reply_state = (next_play[shot_type[pos]],
                   85 - receiver_end[pos],
                   85 - impact_end[pos])
    pool = candidates_by_state.get(reply_state)
    if pool is None:
        # no second shot possibilities for this state
        for values in outputs:
            values[pos] = 'none'
        continue

    # Sample one instance of a second shot.  The draw uses NumPy's global
    # random state, the same default source DataFrame.sample() uses.
    reply = pool[np.random.randint(len(pool))]

    # if second shot errors/wins, record that
    if shot_error[reply] == 1:
        second_error_values[pos] = 1
    if shot_prediction[reply] == 1:
        second_score_values[pos] = 1

    # get starting positions of the double prime (the shot after the reply),
    # mirrored and role-swapped again
    impact_second_start[pos] = 85 - receiver_end[reply]
    receiver_second_start[pos] = 85 - impact_end[reply]

impact_second_start_bucket = pd.DataFrame(impact_second_start, index=df.index)
receiver_second_start_bucket = pd.DataFrame(receiver_second_start, index=df.index)
second_score = pd.DataFrame(second_score_values, index=df.index)
second_error = pd.DataFrame(second_error_values, index=df.index)

In [None]:
# Attach the sampled second-shot results to the main frame, one new column
# per result, in the order: error flag, winner flag, then the two start buckets.
second_shot_results = (
    ('second_error', second_error),
    ('second_score', second_score),
    ('impact_second_start_bucket', impact_second_start_bucket),
    ('receiver_second_start_bucket', receiver_second_start_bucket),
)
for column_name, column_values in second_shot_results:
    df[column_name] = column_values

In [None]:
# Build the transition probability matrix TP and the per-state sample counts.
#
# State layout (rows and columns of TP):
#   0 .. 5291    in-play states: shot type (serve, servereturn, rally) x
#                impact start bucket (1..42) x receiver start bucket (43..84)
#   5292         MY error        5293  MY winner
#   5294         THEIR error     5295  THEIR winner
#   5296         delta (final absorbing state)
# Each in-play row holds the proportion of its shots that lead to each column.

# Drop rows whose second_error is the string 'none' (presumably shots for which
# no second shot could be sampled - confirm against the sampling cell).
df = df[df.second_error != 'none']

types = ['serve','servereturn','rally']
# Block of columns an in-play shot lands in; every entry is 2, i.e. the third
# block, whatever the first shot's type was.
col_offset = [2,2,2]

n_buckets = 42                                   # buckets per court half
states_per_type = n_buckets * n_buckets          # 1764
n_play_states = len(types) * states_per_type     # 5292
(my_error_col, my_winner_col,
 their_error_col, their_winner_col, delta_col) = range(n_play_states, n_play_states + 5)
n_states = n_play_states + 5                     # 5297

# Accumulate in NumPy arrays (element updates on a DataFrame are very slow)
# and wrap them into DataFrames once at the end.
tp_values = np.zeros((n_states, n_states))
count_values = np.zeros((n_states, 1))

# `shot_type` rather than `type`, so the builtin stays usable in later cells
for k, shot_type in enumerate(types):
    shots_of_type = df[df.Type == shot_type]
    for i in range(1, n_buckets + 1):
        shots_from_i = shots_of_type[shots_of_type.impact_starting_bucket == i]
        for j in range(n_buckets + 1, 2 * n_buckets + 1):
            # data for that type and pair of starting locations (aka the state)
            state_shots = shots_from_i[shots_from_i.receiver_starting_bucket == j]

            n = state_shots.shape[0]
            # find the row of TP that this state corresponds to
            row = k * states_per_type + (i - 1) * n_buckets + (j - n_buckets) - 1
            count_values[row, 0] = n
            if n == 0:
                continue

            shot_fields = zip(state_shots['Error'],
                              state_shots['second_error'],
                              state_shots['Prediction'],
                              state_shots['second_score'],
                              state_shots['impact_second_start_bucket'],
                              state_shots['receiver_second_start_bucket'])
            # loop through the shots of this state; the first matching outcome wins
            for error, error_2, prediction, score_2, impact_2, receiver_2 in shot_fields:
                if error == 1:
                    col = my_error_col          # MY error shots
                elif error_2 == 1:
                    col = their_error_col       # THEIR error shots
                elif prediction == 1:
                    col = my_winner_col         # MY winning shots
                elif score_2 == 1:
                    col = their_winner_col      # THEIR winning shots
                else:
                    # still in play: column of the state the next shot starts from
                    # (int() because the bucket values may be stored as floats)
                    col = int(col_offset[k] * states_per_type
                              + (impact_2 - 1) * n_buckets
                              + (receiver_2 - n_buckets) - 1)
                tp_values[row, col] += 1

            tp_values[row] /= n  # get the proportion by dividing by n

# add transitions for scoring states (and delta itself) going to delta
tp_values[n_play_states:n_states, delta_col] = 1

TP = pd.DataFrame(tp_values)
Counts = pd.DataFrame(count_values)

# audible signal that the long computation has finished
winsound.Beep(2000, 800)

