In [102]:
import pprint
import json
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from json_shot_scraper import flatten_shot, flatten_goal, flatten_complete_pass, flatten_incomplete_pass, flatten_corner
from  player_scraper import flatten_player, flatten_sub
from dataframe_cleaner import (pass_to_shot, corner_to_shot, transpose_coordinates, coord_to_yards, 
                               shot_distance_angle, dummy_columns, drop_own_goals, goal_dummy)

In [103]:
pd.set_option('display.max_columns', 50)


In [104]:
from html_scraper import db

In [151]:
from mongo_to_db import game_to_cleaned_df, create_frame, game_to_player_df, create_player_min_frame

In [106]:
db.list_collection_names()

['games']

In [107]:
db.games.count()

  """Entry point for launching an IPython kernel.


159

In [108]:
game = db.games.find_one()

# Player Dataframe

In [152]:
to_attach_df = create_player_min_frame()

In [153]:
games = db.games.find()
for game in games:
    df = game_to_player_df(game)
    master_df = pd.concat([to_attach_df, df], axis=0, ignore_index=True)
    to_attach_df = master_df.copy()

In [156]:
master_df.columns

Index(['game_id', 'name', 'player_id', 'position_id', 'squad_number',
       'substitute', 'team_id', 'minutes_played'],
      dtype='object')

In [145]:
player_sub_df = db_to_playersub_df(game)

In [147]:
player_sub_df.columns

Index(['game_id', 'name', 'player_id', 'position_id', 'squad_number',
       'substitute', 'team_id', 'minutes_played'],
      dtype='object')

# games stuff

In [110]:
shots = list(game['incidences']['shots'].items())
game_id = game['match']['matchId']
shot_list_dicts = [flatten_shot(shot, game_id) for shot in shots]
shot_df = pd.DataFrame(shot_list_dicts)
shot_df.head()

Unnamed: 0,game_id,player_id,shot_coord_x1,shot_coord_x2,shot_coord_y1,shot_coord_y2,shot_coord_z1,shot_coord_z2,shot_id,shot_type,team_id,time_of_event(min)
0,448548,99779.0,0.85,1.0,0.05,-0.04,-1,0.83,22336297,11,20,1.033333
1,448548,60730.0,0.52,0.73,0.14,-0.03,-1,0.73,22336386,35,20,9.683333
2,448548,177285.0,0.73,0.96,0.0,-0.1,-1,0.21,22336583,13,20,26.833333
3,448548,185745.0,0.81,0.98,0.48,-0.22,-1,-1.0,22336723,33,20,38.883333
4,448548,77892.0,-0.5,-0.99,-0.46,0.28,-1,-1.0,22336739,33,13,40.083333


In [111]:
completed_passes = list(game['incidences']['correctPasses'].items())
completed_list_dicts = [flatten_complete_pass(apass, game_id) for apass in completed_passes]
completed_passes_df = pd.DataFrame(completed_list_dicts)
completed_passes_df.head()

Unnamed: 0,game_id,pass_coord_x1,pass_coord_x2,pass_coord_y1,pass_coord_y2,pass_coord_z1,pass_coord_z2,pass_player,pass_type,rec_player,team_id,time_of_event(min)
0,448548,-0.02,-0.33,-0.04,-0.14,-1,-1,42505.0,180,60730.0,20,0.116667
1,448548,-0.61,-0.61,-0.63,-0.63,-1,-1,60730.0,180,208534.0,20,0.15
2,448548,-0.56,-0.56,-0.25,-0.25,-1,-1,208534.0,180,94225.0,20,0.2
3,448548,-0.52,-0.52,0.79,0.79,-1,-1,94225.0,180,30896.0,20,0.25
4,448548,-0.48,-0.48,-0.05,-0.05,-1,-1,30896.0,180,99779.0,20,0.283333


In [112]:
shot_pass_df = pass_to_shot(shot_df, completed_passes_df)
shot_pass_df.head()

Unnamed: 0,game_id,player_id,shot_coord_x1,shot_coord_x2,shot_coord_y1,shot_coord_y2,shot_coord_z1,shot_coord_z2,shot_id,shot_type,team_id,time_of_event(min),passed_from_id,pass_coord_x1,pass_coord_x2,pass_coord_y1,pass_coord_y2,pass_coord_z1,pass_coord_z2
0,448548,99779.0,0.85,1.0,0.05,-0.04,-1,0.83,22336297,11,20,1.033333,193955.0,0.89,0.85,0.47,-0.19,-1.0,-1.0
1,448548,60730.0,0.52,0.73,0.14,-0.03,-1,0.73,22336386,35,20,9.683333,,,,,,,
2,448548,177285.0,0.73,0.96,0.0,-0.1,-1,0.21,22336583,13,20,26.833333,,,,,,,
3,448548,185745.0,0.81,0.98,0.48,-0.22,-1,-1.0,22336723,33,20,38.883333,177285.0,0.49,0.86,0.13,0.65,-1.0,-1.0
4,448548,77892.0,-0.5,-0.99,-0.46,0.28,-1,-1.0,22336739,33,13,40.083333,,,,,,,


In [113]:
corners = list(game['incidences']['cornerKicks'].items())
corner_dicts = [flatten_corner(kick, game_id) for kick in corners]
corner_df = pd.DataFrame(corner_dicts)
corner_df.head()

Unnamed: 0,ck_coord_x1,ck_coord_x2,ck_coord_y1,ck_coord_y2,ck_coord_z1,ck_coord_z2,ck_id,game_id,player_id,time_of_event(min)
0,0.98,0.72,0.97,-0.16,-1,-1,22336296,448548,177285.0,0.8
1,0.96,0.81,-0.94,-0.29,-1,-1,22336338,448548,177285.0,4.333333
2,0.98,0.74,1.0,-0.02,-1,-1,22336570,448548,185745.0,24.75
3,0.97,0.72,-0.95,-0.06,-1,-1,22336593,448548,185745.0,29.133333
4,0.99,0.73,0.99,-0.1,-1,-1,22336716,448548,185745.0,38.45


In [114]:
shot_pass_corner = corner_to_shot(shot_pass_df, corner_df)
shot_pass_corner.head()

Unnamed: 0,game_id,player_id,shot_coord_x1,shot_coord_x2,shot_coord_y1,shot_coord_y2,shot_coord_z1,shot_coord_z2,shot_id,shot_type,team_id,time_of_event(min),passed_from_id,pass_coord_x1,pass_coord_x2,pass_coord_y1,pass_coord_y2,pass_coord_z1,pass_coord_z2,corner_kick
0,448548,99779.0,0.85,1.0,0.05,-0.04,-1,0.83,22336297,11,20,1.033333,177285.0,0.98,0.72,0.97,-0.16,-1.0,-1.0,1.0
1,448548,60730.0,0.52,0.73,0.14,-0.03,-1,0.73,22336386,35,20,9.683333,,,,,,,,0.0
2,448548,177285.0,0.73,0.96,0.0,-0.1,-1,0.21,22336583,13,20,26.833333,,,,,,,,0.0
3,448548,185745.0,0.81,0.98,0.48,-0.22,-1,-1.0,22336723,33,20,38.883333,177285.0,0.49,0.86,0.13,0.65,-1.0,-1.0,0.0
4,448548,77892.0,-0.5,-0.99,-0.46,0.28,-1,-1.0,22336739,33,13,40.083333,,,,,,,,0.0


In [115]:
transposed_df = transpose_coordinates(shot_pass_corner)
transposed_df.head()

Unnamed: 0,game_id,player_id,shot_coord_x1,shot_coord_x2,shot_coord_y1,shot_coord_y2,shot_coord_z1,shot_coord_z2,shot_id,shot_type,team_id,time_of_event(min),passed_from_id,pass_coord_x1,pass_coord_x2,pass_coord_y1,pass_coord_y2,pass_coord_z1,pass_coord_z2,corner_kick
0,448548,99779.0,0.85,1.0,0.05,-0.04,-1,0.83,22336297,11,20,1.033333,177285.0,0.98,0.72,0.97,-0.16,-1.0,-1.0,1.0
1,448548,60730.0,0.52,0.73,0.14,-0.03,-1,0.73,22336386,35,20,9.683333,,,,,,,,0.0
2,448548,177285.0,0.73,0.96,0.0,-0.1,-1,0.21,22336583,13,20,26.833333,,,,,,,,0.0
3,448548,185745.0,0.81,0.98,0.48,-0.22,-1,-1.0,22336723,33,20,38.883333,177285.0,0.49,0.86,0.13,0.65,-1.0,-1.0,0.0
4,448548,77892.0,0.5,0.99,0.46,-0.28,-1,-1.0,22336739,33,13,40.083333,,,,,,,,0.0


In [116]:
yard_df = coord_to_yards(transposed_df)
yard_df

Unnamed: 0,game_id,player_id,shot_coord_x1,shot_coord_x2,shot_coord_y1,shot_coord_y2,shot_coord_z1,shot_coord_z2,shot_id,shot_type,team_id,time_of_event(min),passed_from_id,pass_coord_x1,pass_coord_x2,pass_coord_y1,pass_coord_y2,pass_coord_z1,pass_coord_z2,corner_kick
0,448548,99779.0,6.67,0.0,1.51,-1.21,-1,0.83,22336297,11,20,1.033333,177285.0,0.89,12.44,29.36,-4.84,-1.0,-1.0,1.0
1,448548,60730.0,21.33,12.0,4.24,-0.91,-1,0.73,22336386,35,20,9.683333,,,,,,,,0.0
2,448548,177285.0,12.0,1.78,0.0,-3.03,-1,0.21,22336583,13,20,26.833333,,,,,,,,0.0
3,448548,185745.0,8.44,0.89,14.53,-6.66,-1,-1.0,22336723,33,20,38.883333,177285.0,22.67,6.22,3.93,19.67,-1.0,-1.0,0.0
4,448548,77892.0,22.22,0.44,13.92,-8.48,-1,-1.0,22336739,33,13,40.083333,,,,,,,,0.0
5,448548,77892.0,11.56,0.44,10.59,-1.21,-1,0.14,22336802,35,13,44.483333,,,,,,,,0.0
6,448548,42505.0,11.11,1.33,11.5,4.24,-1,0.25,22336958,35,20,54.983333,177285.0,49.78,16.44,-11.5,19.67,-1.0,-1.0,0.0
7,448548,94225.0,12.44,1.78,-2.42,0.61,-1,0.6,22337005,35,20,59.516667,,,,,,,,0.0
8,448548,185745.0,8.89,0.44,-3.03,1.82,-1,-1.0,22337018,33,20,61.283333,,,,,,,,0.0
9,448548,36681.0,13.78,1.78,8.78,-3.03,-1,0.19,22337074,35,13,68.55,,,,,,,,0.0


In [117]:
shot_distance_df = shot_distance_angle(yard_df)
shot_distance_df.head()

Unnamed: 0,game_id,player_id,shot_coord_x1,shot_coord_x2,shot_coord_y1,shot_coord_y2,shot_coord_z1,shot_coord_z2,shot_id,shot_type,team_id,time_of_event(min),passed_from_id,pass_coord_x1,pass_coord_x2,pass_coord_y1,pass_coord_y2,pass_coord_z1,pass_coord_z2,corner_kick,shot_distance,shot_angle
0,448548,99779.0,6.67,0.0,1.51,-1.21,-1,0.83,22336297,11,20,1.033333,177285.0,0.89,12.44,29.36,-4.84,-1.0,-1.0,1.0,6.838786,12.75599
1,448548,60730.0,21.33,12.0,4.24,-0.91,-1,0.73,22336386,35,20,9.683333,,,,,,,,0.0,21.747333,11.242763
2,448548,177285.0,12.0,1.78,0.0,-3.03,-1,0.21,22336583,13,20,26.833333,,,,,,,,0.0,12.0,0.0
3,448548,185745.0,8.44,0.89,14.53,-6.66,-1,-1.0,22336723,33,20,38.883333,177285.0,22.67,6.22,3.93,19.67,-1.0,-1.0,0.0,16.803407,59.849102
4,448548,77892.0,22.22,0.44,13.92,-8.48,-1,-1.0,22336739,33,13,40.083333,,,,,,,,0.0,26.220122,32.065607


In [118]:
df = dummy_columns(shot_distance_df)
df1 = drop_own_goals(df)
df2 = goal_dummy(df1)

In [119]:
df2

Unnamed: 0,game_id,player_id,shot_coord_x1,shot_coord_x2,shot_coord_y1,shot_coord_y2,shot_coord_z1,shot_coord_z2,shot_id,shot_type,team_id,time_of_event(min),passed_from_id,pass_coord_x1,pass_coord_x2,pass_coord_y1,pass_coord_y2,pass_coord_z1,pass_coord_z2,corner_kick,shot_distance,shot_angle,assisted_shot,is_penalty_attempt,is_goal
0,448548,99779.0,6.67,0.0,1.51,-1.21,-1,0.83,22336297,11,20,1.033333,177285.0,0.89,12.44,29.36,-4.84,-1.0,-1.0,1.0,6.838786,12.75599,1,0,1
1,448548,60730.0,21.33,12.0,4.24,-0.91,-1,0.73,22336386,35,20,9.683333,,,,,,,,0.0,21.747333,11.242763,0,0,0
2,448548,177285.0,12.0,1.78,0.0,-3.03,-1,0.21,22336583,13,20,26.833333,,,,,,,,0.0,12.0,0.0,0,1,1
3,448548,185745.0,8.44,0.89,14.53,-6.66,-1,-1.0,22336723,33,20,38.883333,177285.0,22.67,6.22,3.93,19.67,-1.0,-1.0,0.0,16.803407,59.849102,1,0,0
4,448548,77892.0,22.22,0.44,13.92,-8.48,-1,-1.0,22336739,33,13,40.083333,,,,,,,,0.0,26.220122,32.065607,0,0,0
5,448548,77892.0,11.56,0.44,10.59,-1.21,-1,0.14,22336802,35,13,44.483333,,,,,,,,0.0,15.677426,42.492487,0,0,0
6,448548,42505.0,11.11,1.33,11.5,4.24,-1,0.25,22336958,35,20,54.983333,177285.0,49.78,16.44,-11.5,19.67,-1.0,-1.0,0.0,15.990063,45.988197,1,0,0
7,448548,94225.0,12.44,1.78,-2.42,0.61,-1,0.6,22337005,35,20,59.516667,,,,,,,,0.0,12.6732,-11.008472,0,0,0
8,448548,185745.0,8.89,0.44,-3.03,1.82,-1,-1.0,22337018,33,20,61.283333,,,,,,,,0.0,9.392178,-18.820773,0,0,0
9,448548,36681.0,13.78,1.78,8.78,-3.03,-1,0.19,22337074,35,13,68.55,,,,,,,,0.0,16.339425,32.503466,0,0,0


# Takes a game from the collection and cleans it

In [120]:
cleaned_df = game_to_cleaned_df(game)
cleaned_df.head()

Unnamed: 0,game_id,player_id,shot_coord_x1,shot_coord_x2,shot_coord_y1,shot_coord_y2,shot_coord_z1,shot_coord_z2,shot_id,shot_type,team_id,time_of_event(min),passed_from_id,pass_coord_x1,pass_coord_x2,pass_coord_y1,pass_coord_y2,pass_coord_z1,pass_coord_z2,corner_kick,shot_distance,shot_angle,assisted_shot,is_penalty_attempt,is_goal
0,448548,99779.0,6.67,0.0,1.51,-1.21,-1,0.83,22336297,11,20,1.033333,177285.0,0.89,12.44,29.36,-4.84,-1.0,-1.0,1.0,6.838786,12.75599,1,0,1
1,448548,60730.0,21.33,12.0,4.24,-0.91,-1,0.73,22336386,35,20,9.683333,,,,,,,,0.0,21.747333,11.242763,0,0,0
2,448548,177285.0,12.0,1.78,0.0,-3.03,-1,0.21,22336583,13,20,26.833333,,,,,,,,0.0,12.0,0.0,0,1,1
3,448548,185745.0,8.44,0.89,14.53,-6.66,-1,-1.0,22336723,33,20,38.883333,177285.0,22.67,6.22,3.93,19.67,-1.0,-1.0,0.0,16.803407,59.849102,1,0,0
4,448548,77892.0,22.22,0.44,13.92,-8.48,-1,-1.0,22336739,33,13,40.083333,,,,,,,,0.0,26.220122,32.065607,0,0,0


In [121]:
cleaned_df.columns

Index(['game_id', 'player_id', 'shot_coord_x1', 'shot_coord_x2',
       'shot_coord_y1', 'shot_coord_y2', 'shot_coord_z1', 'shot_coord_z2',
       'shot_id', 'shot_type', 'team_id', 'time_of_event(min)',
       'passed_from_id', 'pass_coord_x1', 'pass_coord_x2', 'pass_coord_y1',
       'pass_coord_y2', 'pass_coord_z1', 'pass_coord_z2', 'corner_kick',
       'shot_distance', 'shot_angle', 'assisted_shot', 'is_penalty_attempt',
       'is_goal'],
      dtype='object')

In [122]:
attach_to_df = pd.DataFrame(columns=['game_id', 'player_id', 'shot_coord_x1', 'shot_coord_x2',
       'shot_coord_y1', 'shot_coord_y2', 'shot_coord_z1', 'shot_coord_z2',
       'shot_id', 'shot_type', 'team_id', 'time_of_event(min)',
       'passed_from_id', 'pass_coord_x1', 'pass_coord_x2', 'pass_coord_y1',
       'pass_coord_y2', 'pass_coord_z1', 'pass_coord_z2', 'corner_kick',
       'shot_distance', 'shot_angle', 'assisted_shot', 'is_penalty_attempt',
       'goal'])

In [123]:
attach_to_df

Unnamed: 0,game_id,player_id,shot_coord_x1,shot_coord_x2,shot_coord_y1,shot_coord_y2,shot_coord_z1,shot_coord_z2,shot_id,shot_type,team_id,time_of_event(min),passed_from_id,pass_coord_x1,pass_coord_x2,pass_coord_y1,pass_coord_y2,pass_coord_z1,pass_coord_z2,corner_kick,shot_distance,shot_angle,assisted_shot,is_penalty_attempt,goal


In [124]:
pd.concat([attach_to_df, cleaned_df])

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  """Entry point for launching an IPython kernel.


Unnamed: 0,assisted_shot,corner_kick,game_id,goal,is_goal,is_penalty_attempt,pass_coord_x1,pass_coord_x2,pass_coord_y1,pass_coord_y2,pass_coord_z1,pass_coord_z2,passed_from_id,player_id,shot_angle,shot_coord_x1,shot_coord_x2,shot_coord_y1,shot_coord_y2,shot_coord_z1,shot_coord_z2,shot_distance,shot_id,shot_type,team_id,time_of_event(min)
0,1,1.0,448548,,1.0,0,0.89,12.44,29.36,-4.84,-1.0,-1.0,177285.0,99779.0,12.75599,6.67,0.0,1.51,-1.21,-1,0.83,6.838786,22336297,11,20,1.033333
1,0,0.0,448548,,0.0,0,,,,,,,,60730.0,11.242763,21.33,12.0,4.24,-0.91,-1,0.73,21.747333,22336386,35,20,9.683333
2,0,0.0,448548,,1.0,1,,,,,,,,177285.0,0.0,12.0,1.78,0.0,-3.03,-1,0.21,12.0,22336583,13,20,26.833333
3,1,0.0,448548,,0.0,0,22.67,6.22,3.93,19.67,-1.0,-1.0,177285.0,185745.0,59.849102,8.44,0.89,14.53,-6.66,-1,-1.0,16.803407,22336723,33,20,38.883333
4,0,0.0,448548,,0.0,0,,,,,,,,77892.0,32.065607,22.22,0.44,13.92,-8.48,-1,-1.0,26.220122,22336739,33,13,40.083333
5,0,0.0,448548,,0.0,0,,,,,,,,77892.0,42.492487,11.56,0.44,10.59,-1.21,-1,0.14,15.677426,22336802,35,13,44.483333
6,1,0.0,448548,,0.0,0,49.78,16.44,-11.5,19.67,-1.0,-1.0,177285.0,42505.0,45.988197,11.11,1.33,11.5,4.24,-1,0.25,15.990063,22336958,35,20,54.983333
7,0,0.0,448548,,0.0,0,,,,,,,,94225.0,-11.008472,12.44,1.78,-2.42,0.61,-1,0.6,12.6732,22337005,35,20,59.516667
8,0,0.0,448548,,0.0,0,,,,,,,,185745.0,-18.820773,8.89,0.44,-3.03,1.82,-1,-1.0,9.392178,22337018,33,20,61.283333
9,0,0.0,448548,,0.0,0,,,,,,,,36681.0,32.503466,13.78,1.78,8.78,-3.03,-1,0.19,16.339425,22337074,35,13,68.55


In [125]:
create_frame()

Unnamed: 0,game_id,player_id,shot_coord_x1,shot_coord_x2,shot_coord_y1,shot_coord_y2,shot_coord_z1,shot_coord_z2,shot_id,shot_type,team_id,time_of_event(min),passed_from_id,pass_coord_x1,pass_coord_x2,pass_coord_y1,pass_coord_y2,pass_coord_z1,pass_coord_z2,corner_kick,shot_distance,shot_angle,assisted_shot,is_penalty_attempt,goal


In [126]:
# games_played = []
# games = db.games.find()
# for game in games:
#     games_played.append([game['match']['matchId'], game['status']['value']])
# games_played

In [127]:
create_frame()

Unnamed: 0,game_id,player_id,shot_coord_x1,shot_coord_x2,shot_coord_y1,shot_coord_y2,shot_coord_z1,shot_coord_z2,shot_id,shot_type,team_id,time_of_event(min),passed_from_id,pass_coord_x1,pass_coord_x2,pass_coord_y1,pass_coord_y2,pass_coord_z1,pass_coord_z2,corner_kick,shot_distance,shot_angle,assisted_shot,is_penalty_attempt,goal


In [47]:
game_to_cleaned_df(game)

Unnamed: 0,game_id,player_id,shot_coord_x1,shot_coord_x2,shot_coord_y1,shot_coord_y2,shot_coord_z1,shot_coord_z2,shot_id,shot_type,team_id,time_of_event(min),passed_from_id,pass_coord_x1,pass_coord_x2,pass_coord_y1,pass_coord_y2,pass_coord_z1,pass_coord_z2,corner_kick,shot_distance,shot_angle,assisted_shot,is_penalty_attempt,is_goal
0,448702,94965.0,10.22,0.89,1.21,0.3,-1,0.17,23925811,35,490,2.45,61919.0,0.44,9.78,28.76,0.91,-1.0,-1.0,1.0,10.29138,6.752119,1,0,0
1,448702,70512.0,10.22,1.78,-5.45,-2.42,-1,0.49,23926501,35,490,10.9,94965.0,17.33,1.78,23.91,0.91,-1.0,-1.0,0.0,11.582353,-28.069577,1,0,0
2,448702,19622.0,25.33,12.89,-4.24,-1.21,-1,0.28,23927163,35,6,22.05,,,,,,,,0.0,25.682416,-9.502666,0,0,0
3,448702,52937.0,24.44,0.44,-3.63,8.17,-1,-1.0,23927262,33,490,23.916667,98359.0,57.33,33.78,10.9,-2.42,-1.0,-1.0,0.0,24.708106,-8.448209,1,0,0
4,448702,15113.0,23.56,2.22,2.12,0.61,-1,0.13,23928087,35,490,38.583333,,,,,,,,0.0,23.65519,5.1418,0,0,0
5,448702,172730.0,28.0,0.89,-3.33,-0.3,-1,-1.0,23928388,33,6,44.9,,,,,,,,0.0,28.197321,-6.782249,0,0,0
6,448702,172730.0,9.78,0.44,-3.03,-15.44,-1,-1.0,23929341,33,6,55.666667,89017.0,5.33,8.89,18.77,1.21,-1.0,-1.0,0.0,10.238618,-17.213815,1,0,0
7,448702,15113.0,11.11,0.44,-8.17,-8.17,-1,-1.0,23929412,33,490,57.833333,,,,,,,,0.0,13.790613,-36.329779,0,0,0
8,448702,15113.0,12.89,0.0,5.45,6.66,-1,-1.0,23929520,33,490,60.7,186030.0,7.56,10.22,8.78,-1.82,-1.0,-1.0,0.0,13.994806,22.919049,1,0,0
9,448702,52937.0,12.44,0.89,-1.82,-7.26,-1,-1.0,23929534,33,490,61.533333,94965.0,17.33,3.11,19.37,-0.0,-1.0,-1.0,0.0,12.57243,-8.323451,1,0,0


# Clean and Comine All Dataframes

In [128]:
games = db.games.find()
for game in games:
    df = game_to_cleaned_df(game)
    master_df = pd.concat([attach_to_df, df], axis=0, ignore_index=True)
    attach_to_df = master_df.copy()


of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  after removing the cwd from sys.path.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  input_df['goal'] = input_df['shot_type'].isin([9, 11, 12, 13]).astype(int)


In [129]:
master_df.head()

Unnamed: 0,assisted_shot,corner_kick,game_id,goal,is_goal,is_penalty_attempt,pass_coord_x1,pass_coord_x2,pass_coord_y1,pass_coord_y2,pass_coord_z1,pass_coord_z2,passed_from_id,player_id,shot_angle,shot_coord_x1,shot_coord_x2,shot_coord_y1,shot_coord_y2,shot_coord_z1,shot_coord_z2,shot_distance,shot_id,shot_type,team_id,time_of_event(min)
0,1,1.0,448548,,1.0,0,0.89,12.44,29.36,-4.84,-1.0,-1.0,177285.0,99779.0,12.75599,6.67,0.0,1.51,-1.21,-1,0.83,6.838786,22336297,11,20,1.033333
1,0,0.0,448548,,0.0,0,,,,,,,,60730.0,11.242763,21.33,12.0,4.24,-0.91,-1,0.73,21.747333,22336386,35,20,9.683333
2,0,0.0,448548,,1.0,1,,,,,,,,177285.0,0.0,12.0,1.78,0.0,-3.03,-1,0.21,12.0,22336583,13,20,26.833333
3,1,0.0,448548,,0.0,0,22.67,6.22,3.93,19.67,-1.0,-1.0,177285.0,185745.0,59.849102,8.44,0.89,14.53,-6.66,-1,-1.0,16.803407,22336723,33,20,38.883333
4,0,0.0,448548,,0.0,0,,,,,,,,77892.0,32.065607,22.22,0.44,13.92,-8.48,-1,-1.0,26.220122,22336739,33,13,40.083333


In [130]:
master_df.describe()

Unnamed: 0,corner_kick,is_goal,pass_coord_x1,pass_coord_x2,pass_coord_y1,pass_coord_y2,pass_coord_z1,pass_coord_z2,passed_from_id,player_id,shot_angle,shot_coord_x1,shot_coord_x2,shot_coord_y1,shot_coord_y2,shot_coord_z2,shot_distance,time_of_event(min)
count,2945.0,2955.0,1221.0,1221.0,1221.0,1221.0,1221.0,1221.0,1221.0,2955.0,2955.0,2955.0,2955.0,2955.0,2955.0,2955.0,2955.0,2955.0
mean,0.086927,0.109983,20.738591,17.941474,1.218714,0.186478,-1.0,-1.0,85681.678133,83417.628088,0.275928,15.376156,2.049354,0.202876,0.057773,-0.252988,17.779827,50.018252
std,0.281776,0.312922,16.754803,11.353598,18.873264,10.908477,0.0,0.0,62086.040107,64229.759131,31.213869,6.624601,3.816875,8.80382,5.448328,0.73858,6.458381,26.590282
min,0.0,0.0,0.0,0.44,-30.27,-28.45,-1.0,-1.0,51.0,51.0,-90.0,0.0,0.0,-29.06,-24.52,-1.0,0.3,0.25
25%,0.0,0.0,6.22,10.67,-14.23,-6.96,-1.0,-1.0,37152.0,30012.0,-22.223473,10.67,0.44,-6.05,-3.03,-1.0,12.441808,27.616667
50%,0.0,0.0,20.0,14.22,1.21,-0.3,-1.0,-1.0,70512.0,69857.0,0.0,13.33,0.89,0.0,0.3,0.1,16.716028,50.6
75%,0.0,0.0,30.22,23.56,18.16,6.96,-1.0,-1.0,122008.0,126644.0,22.826916,20.89,2.22,6.66,3.03,0.38,23.315904,73.208333
max,1.0,1.0,88.89,83.56,30.27,28.76,-1.0,-1.0,220396.0,220396.0,88.149038,40.89,88.44,27.54,26.03,1.29,47.678323,98.85


In [131]:
columns = ['game_id', 'player_id', 'shot_coord_x1', 'shot_coord_x2',
       'shot_coord_y1', 'shot_coord_y2', 'shot_coord_z1', 'shot_coord_z2',
       'shot_id', 'shot_type', 'team_id', 'time_of_event(min)',
       'passed_from_id', 'pass_coord_x1', 'pass_coord_x2', 'pass_coord_y1',
       'pass_coord_y2', 'pass_coord_z1', 'pass_coord_z2', 'corner_kick',
       'shot_distance', 'shot_angle', 'assisted_shot', 'is_penalty_attempt',
       'is_goal']

In [132]:
df_final = master_df[columns].copy()

In [133]:
df_final.head()

Unnamed: 0,game_id,player_id,shot_coord_x1,shot_coord_x2,shot_coord_y1,shot_coord_y2,shot_coord_z1,shot_coord_z2,shot_id,shot_type,team_id,time_of_event(min),passed_from_id,pass_coord_x1,pass_coord_x2,pass_coord_y1,pass_coord_y2,pass_coord_z1,pass_coord_z2,corner_kick,shot_distance,shot_angle,assisted_shot,is_penalty_attempt,is_goal
0,448548,99779.0,6.67,0.0,1.51,-1.21,-1,0.83,22336297,11,20,1.033333,177285.0,0.89,12.44,29.36,-4.84,-1.0,-1.0,1.0,6.838786,12.75599,1,0,1.0
1,448548,60730.0,21.33,12.0,4.24,-0.91,-1,0.73,22336386,35,20,9.683333,,,,,,,,0.0,21.747333,11.242763,0,0,0.0
2,448548,177285.0,12.0,1.78,0.0,-3.03,-1,0.21,22336583,13,20,26.833333,,,,,,,,0.0,12.0,0.0,0,1,1.0
3,448548,185745.0,8.44,0.89,14.53,-6.66,-1,-1.0,22336723,33,20,38.883333,177285.0,22.67,6.22,3.93,19.67,-1.0,-1.0,0.0,16.803407,59.849102,1,0,0.0
4,448548,77892.0,22.22,0.44,13.92,-8.48,-1,-1.0,22336739,33,13,40.083333,,,,,,,,0.0,26.220122,32.065607,0,0,0.0


In [134]:
df_final.describe()

Unnamed: 0,player_id,shot_coord_x1,shot_coord_x2,shot_coord_y1,shot_coord_y2,shot_coord_z2,time_of_event(min),passed_from_id,pass_coord_x1,pass_coord_x2,pass_coord_y1,pass_coord_y2,pass_coord_z1,pass_coord_z2,corner_kick,shot_distance,shot_angle,is_goal
count,2955.0,2955.0,2955.0,2955.0,2955.0,2955.0,2955.0,1221.0,1221.0,1221.0,1221.0,1221.0,1221.0,1221.0,2945.0,2955.0,2955.0,2955.0
mean,83417.628088,15.376156,2.049354,0.202876,0.057773,-0.252988,50.018252,85681.678133,20.738591,17.941474,1.218714,0.186478,-1.0,-1.0,0.086927,17.779827,0.275928,0.109983
std,64229.759131,6.624601,3.816875,8.80382,5.448328,0.73858,26.590282,62086.040107,16.754803,11.353598,18.873264,10.908477,0.0,0.0,0.281776,6.458381,31.213869,0.312922
min,51.0,0.0,0.0,-29.06,-24.52,-1.0,0.25,51.0,0.0,0.44,-30.27,-28.45,-1.0,-1.0,0.0,0.3,-90.0,0.0
25%,30012.0,10.67,0.44,-6.05,-3.03,-1.0,27.616667,37152.0,6.22,10.67,-14.23,-6.96,-1.0,-1.0,0.0,12.441808,-22.223473,0.0
50%,69857.0,13.33,0.89,0.0,0.3,0.1,50.6,70512.0,20.0,14.22,1.21,-0.3,-1.0,-1.0,0.0,16.716028,0.0,0.0
75%,126644.0,20.89,2.22,6.66,3.03,0.38,73.208333,122008.0,30.22,23.56,18.16,6.96,-1.0,-1.0,0.0,23.315904,22.826916,0.0
max,220396.0,40.89,88.44,27.54,26.03,1.29,98.85,220396.0,88.89,83.56,30.27,28.76,-1.0,-1.0,1.0,47.678323,88.149038,1.0


# EDA

In [135]:
len(df_final['player_id'].unique())

492

In [136]:
df_final[df_final['player_id'] == 51.0]

Unnamed: 0,game_id,player_id,shot_coord_x1,shot_coord_x2,shot_coord_y1,shot_coord_y2,shot_coord_z1,shot_coord_z2,shot_id,shot_type,team_id,time_of_event(min),passed_from_id,pass_coord_x1,pass_coord_x2,pass_coord_y1,pass_coord_y2,pass_coord_z1,pass_coord_z2,corner_kick,shot_distance,shot_angle,assisted_shot,is_penalty_attempt,is_goal
40,448542,51.0,25.78,0.44,7.26,-1.82,-1,-1.0,22347673,33,2,21.566667,,,,,,,,0.0,26.782756,15.727945,0,0,0.0
300,448552,51.0,24.0,0.44,3.03,6.96,-1,-1.0,22445059,33,2,18.9,189736.0,33.33,23.11,-0.3,9.38,-1.0,-1.0,0.0,24.190513,7.195523,1,0,0.0
301,448552,51.0,24.44,1.33,-10.59,-3.63,-1,0.7,22445412,35,2,29.283333,,,,,,,,0.0,26.635722,-23.427377,0,0,0.0
303,448552,51.0,19.56,0.0,10.9,-1.82,-1,-1.0,22447854,33,2,69.566667,,,,,,,,0.0,22.392043,29.129163,0,0,0.0
1976,448641,51.0,21.33,0.44,-13.32,-2.12,-1,0.18,23370922,35,2,41.45,,,,,,,,0.0,25.147392,-31.983647,0,0,0.0
2304,448662,51.0,28.44,0.89,-9.38,-0.91,-1,-1.0,23519142,33,2,63.75,,,,,,,,0.0,29.94692,-18.253442,0,0,0.0
2452,448673,51.0,25.78,2.22,12.71,-0.61,-1,0.36,23595855,35,2,79.066667,,,,,,,,0.0,28.742869,26.24412,0,0,0.0
2459,448673,51.0,24.0,0.89,-14.53,-7.87,-1,-1.0,23596646,33,2,92.766667,,,,,,,,0.0,28.055675,-31.191411,0,0,0.0
2684,448687,51.0,27.11,0.0,-8.48,7.87,-1,-1.0,23777810,33,2,19.65,,,,,,,,0.0,28.405325,-17.369667,0,0,0.0


In [137]:
len(df_final['shot_id'].unique())


2955

In [138]:
len(df_final['time_of_event(min)'].unique())

2304

In [139]:
len(df_final['game_id'].unique())

159

3180

Unnamed: 0,player_id,shot_coord_x1,shot_coord_x2,shot_coord_y1,shot_coord_y2,shot_coord_z2,time_of_event(min),passed_from_id,pass_coord_x1,pass_coord_x2,pass_coord_y1,pass_coord_y2,pass_coord_z1,pass_coord_z2,corner_kick,shot_distance,shot_angle,is_goal
count,2955.0,2955.0,2955.0,2955.0,2955.0,2955.0,2955.0,1221.0,1221.0,1221.0,1221.0,1221.0,1221.0,1221.0,2945.0,2955.0,2955.0,2955.0
mean,83417.628088,15.376156,2.049354,0.202876,0.057773,-0.252988,50.018252,85681.678133,20.738591,17.941474,1.218714,0.186478,-1.0,-1.0,0.086927,17.779827,0.275928,0.109983
std,64229.759131,6.624601,3.816875,8.80382,5.448328,0.73858,26.590282,62086.040107,16.754803,11.353598,18.873264,10.908477,0.0,0.0,0.281776,6.458381,31.213869,0.312922
min,51.0,0.0,0.0,-29.06,-24.52,-1.0,0.25,51.0,0.0,0.44,-30.27,-28.45,-1.0,-1.0,0.0,0.3,-90.0,0.0
25%,30012.0,10.67,0.44,-6.05,-3.03,-1.0,27.616667,37152.0,6.22,10.67,-14.23,-6.96,-1.0,-1.0,0.0,12.441808,-22.223473,0.0
50%,69857.0,13.33,0.89,0.0,0.3,0.1,50.6,70512.0,20.0,14.22,1.21,-0.3,-1.0,-1.0,0.0,16.716028,0.0,0.0
75%,126644.0,20.89,2.22,6.66,3.03,0.38,73.208333,122008.0,30.22,23.56,18.16,6.96,-1.0,-1.0,0.0,23.315904,22.826916,0.0
max,220396.0,40.89,88.44,27.54,26.03,1.29,98.85,220396.0,88.89,83.56,30.27,28.76,-1.0,-1.0,1.0,47.678323,88.149038,1.0


In [86]:
df_final[df_final['player_id'] == 24256]['shot_id'].unique()

array(['23678386', '23679592', '23680365', '23681468', '22425491',
       '22426095', '22426286', '22427128', '22534020', '22819915',
       '22820864', '22820877', '22821436', '22821573', '22821891',
       '22953338', '23032336', '23032617', '23188442', '23190013',
       '23377909', '23378383', '23383816', '23389814', '23389862',
       '23511284', '23593861', '23596164', '23596233', '23596613',
       '23749413', '23749466', '23753168'], dtype=object)

In [77]:
df_final[df_final['is_goal']==1]['player_id'].value_counts()

24256.0     24
488.0       18
16081.0     14
98292.0     12
101.0       10
119431.0    10
55210.0      8
172801.0     8
85272.0      8
30266.0      8
3148.0       8
39414.0      8
39578.0      8
117606.0     6
40397.0      6
136659.0     6
1325.0       6
114108.0     6
15113.0      6
17266.0      6
66503.0      6
8907.0       6
121814.0     6
121341.0     6
99008.0      6
47913.0      6
68605.0      6
31939.0      6
84739.0      6
144.0        6
            ..
52390.0      2
176537.0     2
147633.0     2
215457.0     2
121313.0     2
85311.0      2
65598.0      2
60772.0      2
42505.0      2
150346.0     2
36681.0      2
44918.0      2
118186.0     2
19622.0      2
30027.0      2
17276.0      2
60771.0      2
126700.0     2
16183.0      2
49610.0      2
16278.0      2
193485.0     2
97738.0      2
52937.0      2
8951.0       2
193905.0     2
147570.0     2
163302.0     2
172760.0     2
148044.0     2
Name: player_id, Length: 174, dtype: int64

In [99]:
#if need to debug
game.keys()
game['match']['matchId']
game['match']
game['incidences']

dict_keys(['_id', 'Revision', 'lastChangedDate', 'lastChangeBy', 'match', 'status', 'venueInformation', 'broadcast', 'scoreStatus', 'players', 'teams', 'officials', 'incidences', 'deletedIncidences', 'summary', 'commentary'])