In [1]:
#################################################
#################################################

# Python code for plot 2
#For each of 7 stats, determine whether the winning team recorded more of that stat than the losing team
#7 stats: Points, Assists, Total rebounds, 3PT'ers made, Offensive rebounds, Turnovers, Personal fouls

#organize in DataFrame and save to CSV for future analysis in R

#################################################
#################################################

In [2]:
import pandas as pd
import numpy as np

import copy

In [3]:
#team stats for every game from the '08-'09 season through the '17-'18 season;
#only the columns from 'first_qtr_points' onward are kept
all_team_stats = (
    pd.read_csv("all_team_stats_2009_to_2018.csv")
    .loc[:, 'first_qtr_points':]
)

In [4]:
#preview the first rows of the loaded table
all_team_stats.head()

Unnamed: 0,first_qtr_points,second_qtr_points,third_qtr_points,fourth_qtr_points,total_points,fg_made,fg_attempted,fg_percentage,threept_made,threept_attempted,...,fast_break_points,points_in_paint,personal_fouls,technical_fouls,flagrant_fouls,number_of_ot_periods,ot_points,won,away_or_home,matchup_id
0,21,23,25,27,96,39,78,50.0,9,23,...,8,0,16,1,0,0,[],1,Away,281217028
1,33,16,14,23,86,36,84,42.9,4,11,...,8,0,14,0,0,0,[],0,Home,281217028
2,26,19,17,26,88,36,90,40.0,5,21,...,10,38,21,1,0,0,[],0,Away,290304004
3,20,31,24,35,110,39,86,45.3,5,8,...,18,46,16,0,0,0,[],1,Home,290304004
4,20,19,19,15,73,28,75,37.3,4,18,...,10,24,23,0,0,0,[],0,Away,290304005


In [5]:
#columns to keep: the 7 stats of interest, followed by the game result and the matchup ID
#(later cells rely on the last two entries being 'won' and 'matchup_id')
interesting_columns = [
    'total_points',
    'threept_made',
    'total_rebounds',
    'offensive_rebounds',
    'assists',
    'total_turnovers',
    'personal_fouls',
    'won',
    'matchup_id',
]

#narrow the full table down to just those columns
some_team_stats = all_team_stats.loc[:, interesting_columns]

some_team_stats.head(10)

Unnamed: 0,total_points,threept_made,total_rebounds,offensive_rebounds,assists,total_turnovers,personal_fouls,won,matchup_id
0,96,9,52,8,21,28,16,1,281217028
1,86,4,39,7,17,12,14,0,281217028
2,88,5,48,10,13,32,21,0,290304004
3,110,5,60,17,19,26,16,1,290304004
4,73,4,46,8,19,36,23,0,290304005
5,91,9,54,9,20,24,20,1,290304005
6,102,7,49,13,16,14,22,0,290304006
7,107,9,37,6,25,20,19,1,290304006
8,101,13,44,3,26,32,20,1,290517002
9,82,4,44,7,21,26,22,0,290517002


In [6]:
#separate the rows into one frame per game outcome

#row masks built from the 'won' flag
win_bool = some_team_stats['won'].eq(1) #team won
lose_bool = some_team_stats['won'].eq(0) #team lost

#rows for winning teams, renumbered from 0
winning_team_stats = some_team_stats.loc[win_bool].reset_index(drop=True)

#rows for losing teams, renumbered from 0
losing_team_stats = some_team_stats.loc[lose_bool].reset_index(drop=True)

In [7]:
#preview the winning-team rows
winning_team_stats.head()

Unnamed: 0,total_points,threept_made,total_rebounds,offensive_rebounds,assists,total_turnovers,personal_fouls,won,matchup_id
0,96,9,52,8,21,28,16,1,281217028
1,110,5,60,17,19,26,16,1,290304004
2,91,9,54,9,20,24,20,1,290304005
3,107,9,37,6,25,20,19,1,290304006
4,101,13,44,3,26,32,20,1,290517002


In [8]:
#preview the losing-team rows
losing_team_stats.head()

Unnamed: 0,total_points,threept_made,total_rebounds,offensive_rebounds,assists,total_turnovers,personal_fouls,won,matchup_id
0,86,4,39,7,17,12,14,0,281217028
1,88,5,48,10,13,32,21,0,290304004
2,73,4,46,8,19,36,23,0,290304005
3,102,7,49,13,16,14,22,0,290304006
4,82,4,44,7,21,26,22,0,290517002


In [9]:
#subtract losing team stats from winning team stats

#the subtraction pairs rows by position, so row i of both frames must be the same game
if not winning_team_stats['matchup_id'].equals(losing_team_stats['matchup_id']):
    raise ValueError("winning_team_stats and losing_team_stats are not aligned on matchup_id")

#work on a copy: plain assignment would only alias winning_team_stats, so the
#in-place subtraction below would overwrite it (and re-running this cell would subtract twice)
difference_team_stats = winning_team_stats.copy()

#only the stat columns are differenced; 'won' and 'matchup_id' keep the winning team's values
difference_team_stats.loc[:,'total_points':'personal_fouls'] = winning_team_stats.loc[:,'total_points':'personal_fouls'] - losing_team_stats.loc[:,'total_points':'personal_fouls']

difference_team_stats.head()

Unnamed: 0,total_points,threept_made,total_rebounds,offensive_rebounds,assists,total_turnovers,personal_fouls,won,matchup_id
0,10,5,13,1,4,16,2,1,281217028
1,22,0,12,7,6,-6,-5,1,290304004
2,18,5,8,1,1,-12,-3,1,290304005
3,5,2,-12,-7,9,6,-3,1,290304006
4,19,9,0,-4,5,6,-2,1,290517002


In [10]:
#want to replace each interesting stat with 1, 0, or 0.5 based on if win, loss, or tie

def boolerize_num(num):
    '''
    Map the sign of a number onto a 1 / 0 / 0.5 score.
    
    Input: 
        num: number
    
    Output:
        1 if num is positive, 0 if num is negative, 0.5 if num is exactly zero
        (None if num compares as none of these, e.g. NaN)
    '''
    #handle the tie case first, then decide by sign
    if num == 0:
        return 0.5
    if num > 0:
        return 1
    if num < 0:
        return 0

In [11]:
#replace each stat difference with 1.0 (winner had more), 0.0 (winner had fewer), or 0.5 (tie)
#
#assign with df[col] = ... rather than df.loc[:, col] = ...: recent pandas versions write
#.loc[:, col] values into the existing column in place, so the 0.5 ties clash with the
#integer dtype and a float cast of an all-integer column would silently be ignored
for col in interesting_columns[:-2]:
    #astype(float) keeps every stat column float, even one that contains no ties
    difference_team_stats[col] = difference_team_stats[col].apply(boolerize_num).astype(float)

In [12]:
#check the result: the stat columns should now hold only 1.0, 0.5, or 0.0
difference_team_stats.head()

Unnamed: 0,total_points,threept_made,total_rebounds,offensive_rebounds,assists,total_turnovers,personal_fouls,won,matchup_id
0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1,281217028
1,1.0,0.5,1.0,1.0,1.0,0.0,0.0,1,290304004
2,1.0,1.0,1.0,1.0,1.0,0.0,0.0,1,290304005
3,1.0,1.0,0.0,0.0,1.0,1.0,0.0,1,290304006
4,1.0,1.0,0.5,0.0,1.0,1.0,0.0,1,290517002


In [13]:
#reshape to long format: one row per (game, stat) holding the stat name, whether the
#winning team had more of it (1, 0, or 0.5 for a tie), and the matchup ID
#built with array operations instead of one .loc lookup per cell; rows come out in the
#same order as a game-by-game, stat-by-stat loop and no particular row index is required

stat_columns = interesting_columns[:-2]
games_count = len(difference_team_stats)

stats_comparison_df = pd.DataFrame(
    {
        #the stat names repeat once per game, in column order
        'stat_name': np.tile(stat_columns, games_count),
        #row-major flattening: all stats of the first game, then all stats of the second, ...
        'greater': difference_team_stats[stat_columns].values.ravel(),
        #each matchup ID repeats once per stat
        'matchup_id': np.repeat(difference_team_stats['matchup_id'].values, len(stat_columns)),
    },
    columns=['stat_name', 'greater', 'matchup_id'],
)

In [14]:
#make sure matchup IDs are stored as integers
#(assign the column directly: .loc[:, col] = ... writes into the existing column in place
#in recent pandas versions and therefore cannot change its dtype)
stats_comparison_df['matchup_id'] = stats_comparison_df['matchup_id'].astype('int64')

In [15]:
#write the long-format comparison table (ties still encoded as 0.5) to CSV,
#with the row index included as the first column
stats_comparison_df.to_csv(path_or_buf="win_team_stats_comparison.csv", index=True)

In [16]:
'''
Running some SQL queries in R, we find that the teams tie in various statistics surprisingly often (at least to me!).
Here are the percentages of games that end in a tie for each statistic:

- assists: 5.96% of games
- offensive rebounds: 8.01% of games
- personal fouls: 7.32% of games
- three-point shots made: 8.83% of games
- total rebounds: 3.87% of games
- total turnovers: 8.07% of games

To decide what to do with these ties, we think ahead to what types of plots we are interested in creating.

We wish to plot the percentage of games in which the winning team had more of each statistic.
Since the word we're using is "more" and equal is not more, we will treat every tie as a "less".
More concretely, we will convert all of the 0.5's in `stats_comparison_df` into 0's.
'''

'\nRunning some SQL queries in R, we find that the teams tie in various statistics surprisingly often (at least to me!).\nHere are the percentages of games that end in a tie for each statistic:\n\n- assists: 5.96% of games\n- offensive rebounds: 8.01% of games\n- personal fouls: 7.32% of games\n- three-point shots made: 8.83% of games\n- total rebounds: 3.87% of games\n- total turnovers: 8.07% of games\n\nTo decide what to do with these ties, we think ahead to what types of plots we are interested in creating.\n\nWe wish to plot the percentage of games in which the winning team had more of each statistic.\nSince the word we\'re using is "more" and equal is not more, we will treat every tie as a "less".\nMore concretely, we will convert all of the 0.5\'s in `stats_comparison_df` into 0\'s.\n'

In [17]:
#preview the long-format table before the ties are removed
stats_comparison_df.head()

Unnamed: 0,stat_name,greater,matchup_id
0,total_points,1.0,281217028
1,threept_made,1.0,281217028
2,total_rebounds,1.0,281217028
3,offensive_rebounds,1.0,281217028
4,assists,1.0,281217028


In [18]:
#build a separate frame so the tie information in stats_comparison_df is left untouched

#convert all 0.5's to 0's, turn everything else into int with same value
#(assign returns a new frame and replaces the whole column, so the integer dtype actually
#sticks; writing ints through .loc[:, "greater"] into the float column keeps it float in
#recent pandas versions)
stats_comparison_df_no_ties = stats_comparison_df.assign(
    greater=lambda df: df["greater"].mask(df["greater"] == 0.5, 0).astype("int64")
)

In [19]:
#confirm that the `greater` column now holds integers
stats_comparison_df_no_ties.head()

Unnamed: 0,stat_name,greater,matchup_id
0,total_points,1,281217028
1,threept_made,1,281217028
2,total_rebounds,1,281217028
3,offensive_rebounds,1,281217028
4,assists,1,281217028


In [20]:
#write the tie-free table to CSV (row index included as the first column)
#so it can be explored further in R
stats_comparison_df_no_ties.to_csv(path_or_buf="stats_comparison_no_ties.csv", index=True)

#################################################
#################################################