In [1]:
import numpy as np
import pandas as pd
import os
import glob

pd.set_option('display.max_rows', 500)

# Sport - NIBRS Join

[Football poll data](#poll)\
[Football / basketball data alignment](#alignment)\
[Spark SQL join](#join)\
[Checking full DataFrame](#check)

<a id='poll'></a>


&nbsp;

## Football poll data



[College Football Data Glossary](https://collegefootballdata.com/glossary)


In [2]:
# create DataFrame 'df' from cfbd .json files, sourced from their /games API endpoint - one .json for each year
df = pd.concat(map(lambda file: pd.read_json(file), 
                   glob.glob(os.path.join('', r'../02_sport_rawdata/cfbd_games_json/' + '*.json'))))
df.reset_index(inplace=True, drop=True)

# create DataFrame 'df_poll' from cfbd .json files, sourced from their /rankings API endpoint - one .json for each year
df_poll = pd.concat(map(lambda file: pd.read_json(file, orient='records'), 
                        glob.glob(os.path.join('', r'../02_sport_rawdata/cfbd_poll_json/' + '*.json'))))
df_poll.reset_index(inplace=True, drop=True)

In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8966 entries, 0 to 8965
Data columns (total 30 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   id                  8966 non-null   int64  
 1   season              8966 non-null   int64  
 2   week                8966 non-null   int64  
 3   season_type         8966 non-null   object 
 4   start_date          8966 non-null   object 
 5   start_time_tbd      0 non-null      float64
 6   neutral_site        8966 non-null   bool   
 7   conference_game     8966 non-null   bool   
 8   attendance          8963 non-null   float64
 9   venue_id            8821 non-null   float64
 10  venue               8821 non-null   object 
 11  home_id             8966 non-null   int64  
 12  home_team           8966 non-null   object 
 13  home_conference     8955 non-null   object 
 14  home_points         8966 non-null   int64  
 15  home_line_scores    8966 non-null   object 
 16  home_p

In [4]:
df_poll.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 168 entries, 0 to 167
Data columns (total 4 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   season      168 non-null    int64 
 1   seasonType  168 non-null    object
 2   week        168 non-null    int64 
 3   polls       168 non-null    object
dtypes: int64(2), object(2)
memory usage: 5.4+ KB


In [5]:
df_lst = []

def GetRankings(poll_data):
    ''' 
    This function takes a row of data and determines from which poll to extract rankings.
    There are 5 polls from which to select for a team's ranking on any given week.
    Note: This is not scaleable for large data due to looping of rows.
    '''
    
    tmp_df = pd.DataFrame(poll_data['polls'])

    search_term = ""
    if poll_data['season'] > 2013:
        if poll_data['week'] < 10: 
            search_term = 'AP Top 25'
        else:
            search_term = 'Playoff Committee'
        
    elif poll_data['season'] == 2013:
        if poll_data['week'] < 9:
            search_term = 'AP Top 25'
        else:
            search_term = 'BCS'
    else:
        if poll_data['week'] < 8:
            search_term = 'AP Top 25'
        else:
            search_term = 'BCS'
    
    if search_term == "":
        search_term = 'AP Top 25 i'
    rank_type = tmp_df[tmp_df['poll'].str.contains(search_term) == True]
    
    if rank_type.shape[0] == 0:
        search_term = 'AP Top 25'
        rank_type = tmp_df[tmp_df['poll'].str.contains('AP') == True]
    try:
        rankings = pd.json_normalize(rank_type.iloc[0]['ranks']).explode(['rank', 'school', 'conference'])
        # temp_ap = pd.json_normalize(r['polls']['poll']['ranks']).explode(['rank','school','conference'])
    except Exception:
        print(poll_data['season'], poll_data['week'], search_term)
        quit()
    
    rankings['season'] = poll_data['season']
    rankings['season_type'] = poll_data['seasonType']
    rankings['week'] = poll_data['week']
    rankings['poll'] = search_term
    
    df_lst.append(rankings)


# We don't want to return anything when we loop through the rows 
# as there will be a list of DataFrames stored we'll use in subsequent cells.
_ = df_poll.apply(lambda r: GetRankings(r),axis=1)

In [6]:
# Create the dataframe from the list of frames and validate the different polls that we're using and use some basic manipulation
cmb_ranks = pd.concat(df_lst)
cmb_ranks['school'] = cmb_ranks['school'].str.lower()

# Duplicate column for merging for opponent rank values later
cmb_ranks['opponent'] = cmb_ranks['school']
cmb_ranks

Unnamed: 0,rank,school,conference,firstPlaceVotes,points,season,season_type,week,poll,opponent
0,25,wisconsin,Big Ten,0,0,2009,regular,15,BCS,wisconsin
1,10,iowa,Big Ten,0,0,2009,regular,15,BCS,iowa
2,8,ohio state,Big Ten,0,0,2009,regular,15,BCS,ohio state
3,24,usc,Pac-10,0,0,2009,regular,15,BCS,usc
4,21,stanford,Pac-10,0,0,2009,regular,15,BCS,stanford
...,...,...,...,...,...,...,...,...,...,...
20,21,cincinnati,American Athletic,0,0,2019,regular,16,Playoff Committee,cincinnati
21,22,usc,Pac-12,0,0,2019,regular,16,Playoff Committee,usc
22,23,navy,American Athletic,0,0,2019,regular,16,Playoff Committee,navy
23,24,virginia,ACC,0,0,2019,regular,16,Playoff Committee,virginia


<a id='alignment'></a>


&nbsp;

## Football / basketball data alignment


The football data is not specific to a school but rather a record of games.\
\
The Basketball data is generated by school and thus we have two representations of the same game based on the school.\
\
We need to translate the football data to a similar style.

In [7]:
# reduce 'df' to football games Michigan played
um = df[(df['home_team'] =='Michigan') | (df['away_team']=='Michigan')].copy() # where Michigan played home or away
um['school'] = 'michigan'
um['opponent'] = um.apply(lambda row: row['away_team'] if row['home_team']=='Michigan' else row['home_team'], axis=1)
um['h_a'] = um.apply(lambda row: 'H' if row['home_team']=='Michigan' else 'A', axis=1) # H if home, A if away

# reduce 'df' to football games Michigan State played
msu = df[(df['home_team']=='Michigan State') | (df['away_team']=='Michigan State')].copy() # where MSU played home or away
msu['school'] = 'michigan state'
msu['opponent'] = msu.apply(lambda row: row['away_team'] if row['home_team']=='Michigan State' else row['home_team'], axis=1)
msu['h_a'] = msu.apply(lambda row: 'H' if row['home_team']=='Michigan State' else 'A', axis=1)

In [8]:
um_msu = pd.concat([um, msu]) # concatenate 'um', 'msu' together for football DataFrame
um_msu['sport'] = 'football'

In [9]:
def DetermineResult(h_a, home_points, away_points):
    if h_a == 'H': # if it's home
        if home_points > away_points:
            return 'W'
        elif home_points < away_points:
            return 'L'
        else:
            return 'T'
    else: # if it's away
        if home_points > away_points:
            return 'L'
        elif home_points < away_points:
            return 'W'
        else:
            return 'T'
        

# DetermineResult() for 'W', 'L', 'T'
um_msu['result'] = um_msu.apply(lambda r: DetermineResult(r['h_a'], r['home_points'], r['away_points']), axis=1)

In [10]:
# always Michigan or Michigan State's points regardless of whether the game was home or away
um_msu['team_points'] = um_msu.apply(lambda r: r['home_points'] if r['h_a'] == 'H' else r['away_points'], axis=1)

# always the opponent's points
um_msu['opp_points'] = um_msu.apply(lambda r: r['home_points'] if r['h_a'] == 'A' else r['away_points'], axis=1)

In [11]:
# overtime column
um_msu['ot'] = um_msu.apply(lambda r: None if (len(r['home_line_scores']) -  4) == 0 else "OT" , axis=1)


#### Merging football data together


In [12]:
# merge Michigan / Michigan State football data w/ Michigan / Michigan State poll data
um_msu = um_msu.merge(cmb_ranks[['rank', 'school', 'season', 'season_type', 'week']], 
                      how='left', 
                      on=['school', 'season', 'week', 'season_type'])
um_msu.opponent = um_msu.opponent.str.lower()

# new column created before merge to set proper rank of opponent
cmb_ranks['opponent_rank'] = cmb_ranks['rank']

# merge opponent data into Michigan / Michigan State DataFrame
um_msu = um_msu.merge(cmb_ranks[['opponent_rank', 'opponent', 'season', 'season_type', 'week']], 
                      how='left', 
                      on=['opponent', 'season', 'week', 'season_type'])

In [13]:
# um_msu['rank'] = um_msu['rank'].astype('int64')
# um_msu['opponent_rank'] = um_msu['opponent_rank'].astype('int64')

# um_msu[um_msu['rank'].isna() == False]['rank'].unique()

# um_msu[['start_date','home_team','away_team','h_a','home_points','away_points','result','team_points','opp_points','ot']].sample(10)

In [14]:
print(len(um_msu)) # number of football games played by Michigan and Michigan State over 10 year period
assert len(um_msu) == 267, f"Incorrect number of Michigan/MSU football games, got {len(um_msu)}"

267


In [15]:
um_msu[['school', 'rank', 'opponent', 'opponent_rank', 'season', 'week', 'attendance']].sample(10)

Unnamed: 0,school,rank,opponent,opponent_rank,season,week,attendance
57,michigan,,northwestern,,2013,12,47330.0
197,michigan state,9.0,wyoming,,2014,5,74227.0
91,michigan,2.0,michigan state,,2016,9,75802.0
219,michigan state,12.0,furman,,2016,1,74516.0
206,michigan state,5.0,western michigan,,2015,1,30885.0
115,michigan,6.0,michigan state,24.0,2018,8,0.0
2,michigan,25.0,eastern michigan,,2009,3,0.0
265,michigan state,,rutgers,,2019,13,24641.0
245,michigan state,24.0,indiana,,2018,4,0.0
224,michigan state,,northwestern,,2016,7,75625.0


In [16]:
um_msu[um_msu['start_date'] == '2012-10-13T16:00:00.000Z'][['home_team','away_team','away_line_scores','ot']]

Unnamed: 0,home_team,away_team,away_line_scores,ot
175,Michigan State,Iowa,"[0, 3, 0, 10, 3, 3]",OT


In [17]:
um_msu['start_dt'] = pd.to_datetime(um_msu['start_date']) # convert to datetime
um_msu['end_inc_window'] = um_msu['start_dt'] + pd.to_timedelta(11, unit='h') # set incident time window from game start time

In [18]:
um_msu.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 267 entries, 0 to 266
Data columns (total 42 columns):
 #   Column              Non-Null Count  Dtype              
---  ------              --------------  -----              
 0   id                  267 non-null    int64              
 1   season              267 non-null    int64              
 2   week                267 non-null    int64              
 3   season_type         267 non-null    object             
 4   start_date          267 non-null    object             
 5   start_time_tbd      0 non-null      float64            
 6   neutral_site        267 non-null    bool               
 7   conference_game     267 non-null    bool               
 8   attendance          267 non-null    float64            
 9   venue_id            263 non-null    float64            
 10  venue               263 non-null    object             
 11  home_id             267 non-null    int64              
 12  home_team           267 non-null    

In [19]:
# drop columns - reduce from 42 columns to 14
um_msu = um_msu[['school', 'sport', 'start_dt', 'season_type', 'opponent', 'result', 'team_points', 
                 'opp_points', 'ot', 'rank', 'opponent_rank', 'end_inc_window', 'attendance', 'venue', 'excitement_index']]
um_msu.columns

Index(['school', 'sport', 'start_dt', 'season_type', 'opponent', 'result',
       'team_points', 'opp_points', 'ot', 'rank', 'opponent_rank',
       'end_inc_window', 'attendance', 'venue', 'excitement_index'],
      dtype='object')


### Basketball Data


In [20]:
# Read in college basketball data and match column names w/ football before merge

cbb = pd.read_csv('../04_finaldata/df_basketball_final.csv', parse_dates=['start_dt', 'end_inc_window'])

In [21]:
# drop columns
cbb = cbb[['school', 'sport', 'start_dt', 'type', 
           'opponent', 'result', 'team_points', 'opp_points', 
           'ot', 'rank', 'opponent_rank', 'end_inc_window']]

In [22]:
# text formatting
cbb = cbb.rename({'type': 'season_type'}, axis=1)
cbb['opponent'] = cbb['opponent'].str.lower()
cbb['season_type'] = cbb['season_type'].apply(lambda x: "regular" if x=='REG' else x)
cbb.columns
# fbb = spark.createDataFrame(um_msu[['school','sport','game_dt','season_type','opponent','result','team_points','opp_points','ot','end_inc_window']])

Index(['school', 'sport', 'start_dt', 'season_type', 'opponent', 'result',
       'team_points', 'opp_points', 'ot', 'rank', 'opponent_rank',
       'end_inc_window'],
      dtype='object')


#### Final sports merge


In [23]:
# We join the football and basketball data and do some manipulation across the sports including binarizing results and OT, 
# cleaning up / naming the indexes (used for a unique key)
cbb.school = cbb.school.replace({'michigan-state':'michigan state'})
sports = pd.concat([um_msu, cbb])
sports['ot'] = sports.ot.replace([np.nan], [None])
sports['ot'] = sports.ot.replace({None: 0, "OT": 1, "2OT": 1})
sports['result'] = sports.result.replace({"W": 1,"L": 0})
sports = sports.reset_index(drop=True)
sports.index.names = ['game_id']
sports

Unnamed: 0_level_0,school,sport,start_dt,season_type,opponent,result,team_points,opp_points,ot,rank,opponent_rank,end_inc_window,attendance,venue,excitement_index
game_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
0,michigan,football,2009-09-05 19:30:00+00:00,regular,western michigan,1,31,7,0,,,2009-09-06 06:30:00+00:00,0.0,Michigan Stadium,
1,michigan,football,2009-09-12 19:30:00+00:00,regular,notre dame,1,38,34,0,,18.0,2009-09-13 06:30:00+00:00,0.0,Michigan Stadium,
2,michigan,football,2009-09-19 16:00:00+00:00,regular,eastern michigan,1,45,17,0,25.0,,2009-09-20 03:00:00+00:00,0.0,Michigan Stadium,
3,michigan,football,2009-09-26 16:00:00+00:00,regular,indiana,1,36,33,0,23.0,,2009-09-27 03:00:00+00:00,0.0,Michigan Stadium,
4,michigan,football,2009-10-03 16:00:00+00:00,regular,michigan state,0,20,26,1,22.0,,2009-10-04 03:00:00+00:00,0.0,Spartan Stadium,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
988,michigan state,basketball,2019-03-21 14:45:00,NCAA,bradley,1,76,65,0,,,2019-03-22 00:45:00,,,
989,michigan state,basketball,2019-03-23 19:45:00,NCAA,minnesota,1,70,50,0,,,2019-03-24 05:45:00,,,
990,michigan state,basketball,2019-03-29 19:00:00,NCAA,louisiana state (12),1,80,63,0,,12.0,2019-03-30 05:00:00,,,
991,michigan state,basketball,2019-03-31 17:05:00,NCAA,duke (1),1,68,67,0,,1.0,2019-04-01 03:05:00,,,


In [24]:
sports.columns # verify our columms

Index(['school', 'sport', 'start_dt', 'season_type', 'opponent', 'result',
       'team_points', 'opp_points', 'ot', 'rank', 'opponent_rank',
       'end_inc_window', 'attendance', 'venue', 'excitement_index'],
      dtype='object')

In [25]:
# Reduce number of columns that will be used in our analysis (prior to join with incident data)
sports = sports[['school', 'sport', 'start_dt', 'season_type', 'opponent', 'result', 'team_points',
                 'opp_points', 'ot', 'rank', 'opponent_rank', 'end_inc_window', 'attendance', 'venue', 'excitement_index']]

In [26]:
sports.season_type.unique()

array(['regular', 'CTOURN', 'NCAA'], dtype=object)

In [27]:
print(len(sports)) # 726 basketball games + 267 football games
assert len(sports) == 993, f"Incorrect number of football + basketball games, got {len(sports)}, should be 267 football games + 726 basketball games"

993


In [28]:
sports.to_csv('../04_finaldata/sports.csv', header=True)

<a id='join'></a>


&nbsp;

## Join Sports and Incidents via Spark SQL

Now that the sports and incident data are manipulated, we'll use Spark SQL to create a dataset of all the games and their associated incidents / offenses.\
\
We'll ultimately end up with a large table that identifies every offense and its attribution to a particular incident and game.



#### sportrdd


In [29]:
from pyspark.sql.types import TimestampType
from pyspark.sql.functions import col
from pyspark.sql import SparkSession
spark = SparkSession \
    .builder \
    .master("local[*]") \
    .appName('Merge') \
    .getOrCreate() 

sc = spark.sparkContext

In [30]:
# Convert the sports DataFrame from prior step into a Spark DataFrame and ensure dates are casted correctly

sportsrdd = spark.read.csv('../04_finaldata/sports.csv', header=True, inferSchema=True)
sportsrdd = sportsrdd.withColumn('start_dt', col('start_dt').cast(TimestampType())) # convert start_dt to Timestamp
sportsrdd = sportsrdd.withColumn('end_inc_window', col('end_inc_window').cast(TimestampType())) # convert end_inc_window to Timestamp

In [31]:
print(sportsrdd.count())
sportsrdd.printSchema()

993
root
 |-- game_id: integer (nullable = true)
 |-- school: string (nullable = true)
 |-- sport: string (nullable = true)
 |-- start_dt: timestamp (nullable = true)
 |-- season_type: string (nullable = true)
 |-- opponent: string (nullable = true)
 |-- result: integer (nullable = true)
 |-- team_points: integer (nullable = true)
 |-- opp_points: integer (nullable = true)
 |-- ot: integer (nullable = true)
 |-- rank: string (nullable = true)
 |-- opponent_rank: double (nullable = true)
 |-- end_inc_window: timestamp (nullable = true)
 |-- attendance: double (nullable = true)
 |-- venue: string (nullable = true)
 |-- excitement_index: double (nullable = true)



In [32]:
# Let's run a quick test to see that the columns came over properly
test = sportsrdd.sample(withReplacement=False, fraction=.1)
test.show(5)

+-------+--------+--------+-------------------+-----------+----------------+------+-----------+----------+---+----+-------------+-------------------+----------+----------------+----------------+
|game_id|  school|   sport|           start_dt|season_type|        opponent|result|team_points|opp_points| ot|rank|opponent_rank|     end_inc_window|attendance|           venue|excitement_index|
+-------+--------+--------+-------------------+-----------+----------------+------+-----------+----------+---+----+-------------+-------------------+----------+----------------+----------------+
|      5|michigan|football|2009-10-10 20:05:00|    regular|            iowa|     0|         28|        30|  0|null|         12.0|2009-10-11 07:05:00|       0.0| Kinnick Stadium|            null|
|     11|michigan|football|2009-11-21 12:00:00|    regular|      ohio state|     0|         10|        21|  0|null|         10.0|2009-11-21 23:00:00|       0.0|Michigan Stadium|            null|
|     24|michigan|footbal


#### Incident Spark DataFrame


In [33]:
# Let's get the incident data for UM and MSU and do the same thing with the dates
incs = spark.read.csv('../04_finaldata/UM_MSU_Offs.csv', header=True, inferSchema=True)
incs = incs.withColumn('INCIDENT_DATE', col('INCIDENT_DATE').cast(TimestampType())) # convert 'INCIDENT_DATE' to Timestamp type

In [34]:
assert incs.count() == 97009, "Incorrect number of incidents - check 03_eda_baseline_analysis.ipynb for correct number"

In [35]:
# Create Spark SQL Table Views from both RDDs
sportsrdd.createOrReplaceTempView("sports")
incs.createOrReplaceTempView("incs")

In [36]:
sportsrdd.printSchema()

root
 |-- game_id: integer (nullable = true)
 |-- school: string (nullable = true)
 |-- sport: string (nullable = true)
 |-- start_dt: timestamp (nullable = true)
 |-- season_type: string (nullable = true)
 |-- opponent: string (nullable = true)
 |-- result: integer (nullable = true)
 |-- team_points: integer (nullable = true)
 |-- opp_points: integer (nullable = true)
 |-- ot: integer (nullable = true)
 |-- rank: string (nullable = true)
 |-- opponent_rank: double (nullable = true)
 |-- end_inc_window: timestamp (nullable = true)
 |-- attendance: double (nullable = true)
 |-- venue: string (nullable = true)
 |-- excitement_index: double (nullable = true)



In [37]:
# We need to get a list of every incident / offense associated to a game
# This grabs columns from 'sports' pandas DataFrame and the 'incs' Spark DataFrame and saves into a Spark DataFrame

query = """

SELECT
    s.game_id,
    s.sport, 
    s.school, 
    s.start_dt, 
    s.end_inc_window,
    s.rank,
    s.season_type,
    s.opponent,
    s.opponent_rank,
    s.result,
    s.team_points,
    s.opp_points,
    s.ot,
    s.attendance, 
    i.INCIDENT_ID as incident_id,
    i.OFFENSE_ID as offense_id,
    i.OFFENSE_NAME as offense_name,
    i.INCIDENT_DATE as incident_date

FROM
    sports s,
    incs i

WHERE
    i.INCIDENT_DATE BETWEEN s.start_dt AND s.end_inc_window AND
    i.school = s.school
"""


result = spark.sql(query)

In [38]:
df = result.toPandas() # convert Spark DataFrame to pandas DataFrame
print(df.shape)
df.head()

(6690, 18)


Unnamed: 0,game_id,sport,school,start_dt,end_inc_window,rank,season_type,opponent,opponent_rank,result,team_points,opp_points,ot,attendance,incident_id,offense_id,offense_name,incident_date
0,0,football,michigan,2009-09-05 15:30:00,2009-09-06 02:30:00,,regular,western michigan,,1,31,7,0,0.0,50917845,55723958,Shoplifting,2009-09-05 20:00:00
1,0,football,michigan,2009-09-05 15:30:00,2009-09-06 02:30:00,,regular,western michigan,,1,31,7,0,0.0,50954457,55757376,Pocket-picking,2009-09-05 16:00:00
2,0,football,michigan,2009-09-05 15:30:00,2009-09-06 02:30:00,,regular,western michigan,,1,31,7,0,0.0,50917864,55728651,Credit Card/Automated Teller Machine Fraud,2009-09-05 20:00:00
3,0,football,michigan,2009-09-05 15:30:00,2009-09-06 02:30:00,,regular,western michigan,,1,31,7,0,0.0,50917837,55728647,Destruction/Damage/Vandalism of Property,2009-09-05 20:00:00
4,0,football,michigan,2009-09-05 15:30:00,2009-09-06 02:30:00,,regular,western michigan,,1,31,7,0,0.0,50917882,55725838,Destruction/Damage/Vandalism of Property,2009-09-05 20:00:00


In [39]:
assert df.shape[0] == 6690, "Incorrect number of joined rows between Sport RDD and Incident Spark DataFrame"

In [40]:
len(df.game_id.unique()) # check how many unique games had some type of incident associated to them

953

In [41]:
df.isna().sum()

game_id              0
sport                0
school               0
start_dt             0
end_inc_window       0
rank              1542
season_type          0
opponent             0
opponent_rank     4489
result               0
team_points          0
opp_points           0
ot                   0
attendance        3636
incident_id          0
offense_id           0
offense_name         0
incident_date        0
dtype: int64

Now, since we performed an SQL operation that returned results that matched our criteria, we need to join these results to the games dataset.\
\
We'll create another table view and call it `game_incs` and use a left join to the sports data.

In [42]:
result.createOrReplaceTempView('game_incs')

In [43]:
query2 = """

SELECT s.*, g.incident_id, g.offense_id, g.offense_name, g.incident_date

FROM
    sports s LEFT JOIN game_incs g
    ON s.game_id = g.game_id
    
"""

result2 = spark.sql(query2)


In [44]:
full = result2.toPandas().set_index('game_id')

In [45]:
len(full.index.unique()), full.shape

(993, (6730, 19))

In [46]:
assert full.shape[0] == 6730, "Incorrect number of incidents joined to game_id values"

In [47]:
full.isna().sum() # na values mostly from games where no incidents occurred w/in time window

school                 0
sport                  0
start_dt               0
season_type            0
opponent               0
result                 0
team_points            0
opp_points             0
ot                     0
rank                1542
opponent_rank       4521
end_inc_window         0
attendance          3673
venue               3736
excitement_index    4278
incident_id           40
offense_id            40
offense_name          40
incident_date         40
dtype: int64

<a id='check'></a>


&nbsp;

## Checking the full DataFrame


In [48]:
full.sample(20)

Unnamed: 0_level_0,school,sport,start_dt,season_type,opponent,result,team_points,opp_points,ot,rank,opponent_rank,end_inc_window,attendance,venue,excitement_index,incident_id,offense_id,offense_name,incident_date
game_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
367,michigan,basketball,2012-03-16 18:20:00,NCAA,ohio,0,60,65,0,,,2012-03-17 04:20:00,,,,64736525.0,68683402.0,Simple Assault,2012-03-16 23:00:00
38,michigan,football,2012-09-15 15:30:00,regular,umass,1,63,13,0,17.0,,2012-09-16 02:30:00,110708.0,Michigan Stadium,2.029581,64753997.0,68701870.0,Drug/Narcotic Violations,2012-09-15 17:00:00
850,michigan state,basketball,2015-11-17 22:00:00,regular,kansas (4),1,79,73,0,13,4.0,2015-11-18 08:00:00,,,,81417366.0,89053926.0,Theft From Building,2015-11-18 07:00:00
330,michigan,basketball,2011-03-11 14:30:00,CTOURN,illinois,1,60,55,0,-,,2011-03-12 00:30:00,,,,59271742.0,62411268.0,Drug/Narcotic Violations,2011-03-12 00:00:00
702,michigan state,basketball,2011-11-20 18:00:00,regular,little rock,1,69,47,0,-,,2011-11-21 04:00:00,,,,60150490.0,62386759.0,All Other Larceny,2011-11-20 18:00:00
689,michigan state,basketball,2011-02-15 21:00:00,regular,ohio state (2),0,61,71,0,-,2.0,2011-02-16 07:00:00,,,,60144732.0,62385870.0,Theft From Building,2011-02-15 21:00:00
685,michigan state,basketball,2011-01-30 18:00:00,regular,indiana,1,84,83,1,25,,2011-01-31 04:00:00,,,,60072366.0,66485711.0,Destruction/Damage/Vandalism of Property,2011-01-31 02:00:00
21,michigan,football,2010-11-13 12:00:00,regular,purdue,1,27,16,0,,,2010-11-13 23:00:00,0.0,Ross-Ade Stadium,,55953625.0,57592702.0,Stolen Property Offenses,2010-11-13 19:00:00
67,michigan,football,2014-10-25 15:30:00,regular,michigan state,0,11,35,0,,8.0,2014-10-26 02:30:00,76331.0,Spartan Stadium,1.37687,74922684.0,81328753.0,Motor Vehicle Theft,2014-10-25 20:00:00
93,michigan,football,2016-11-12 20:00:00,regular,iowa,0,13,14,0,3.0,,2016-11-13 07:00:00,70585.0,Kinnick Stadium,5.953105,88537941.0,96895297.0,Theft From Building,2016-11-13 00:00:00


In [49]:
full[(full['school']=='michigan state') & (full['sport']=='basketball')]

Unnamed: 0_level_0,school,sport,start_dt,season_type,opponent,result,team_points,opp_points,ot,rank,opponent_rank,end_inc_window,attendance,venue,excitement_index,incident_id,offense_id,offense_name,incident_date
game_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
833,michigan state,basketball,2015-02-10 19:00:00,regular,northwestern,1,68,44,0,-,,2015-02-11 05:00:00,,,,,,,NaT
737,michigan state,basketball,2012-11-13 20:30:00,regular,kansas (7),1,67,64,0,21,7.0,2012-11-14 06:30:00,,,,66991130.0,69033726.0,Drug/Narcotic Violations,2012-11-14 00:00:00
737,michigan state,basketball,2012-11-13 20:30:00,regular,kansas (7),1,67,64,0,21,7.0,2012-11-14 06:30:00,,,,66991129.0,69033725.0,Theft From Building,2012-11-13 22:00:00
737,michigan state,basketball,2012-11-13 20:30:00,regular,kansas (7),1,67,64,0,21,7.0,2012-11-14 06:30:00,,,,66910632.0,68926517.0,Simple Assault,2012-11-13 21:00:00
858,michigan state,basketball,2015-12-09 19:00:00,regular,maryland-eastern shore,1,78,35,0,1,,2015-12-10 05:00:00,,,,81355133.0,86293754.0,Theft From Motor Vehicle,2015-12-10 02:00:00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
963,michigan state,basketball,2018-12-08 12:00:00,regular,florida,1,63,59,0,10,,2018-12-08 22:00:00,,,,102879319.0,126709921.0,All Other Larceny,2018-12-08 16:00:00
954,michigan state,basketball,2018-11-06 19:00:00,regular,kansas (1),0,87,92,0,10,1.0,2018-11-07 05:00:00,,,,101773494.0,125445404.0,Theft From Building,2018-11-07 00:00:00
954,michigan state,basketball,2018-11-06 19:00:00,regular,kansas (1),0,87,92,0,10,1.0,2018-11-07 05:00:00,,,,101779877.0,125449009.0,All Other Larceny,2018-11-06 19:00:00
954,michigan state,basketball,2018-11-06 19:00:00,regular,kansas (1),0,87,92,0,10,1.0,2018-11-07 05:00:00,,,,101774189.0,125451561.0,Drug/Narcotic Violations,2018-11-06 19:00:00


In [50]:
# Quick calculation to determine the number of sporting events that have an actual incident associated to them
grp = full.groupby('start_dt')[['incident_id','offense_name']].count()
print('{:.3f}'.format(grp[grp['incident_id'] > 0].shape[0] / 993))

0.899


In [51]:
full.to_csv('../04_finaldata/full_data.csv') # DataFrame to be used for main report analysis

In [52]:
# Let's do a quick validation to make sure that the data associated ok. Take the game start_date with the highest number of incidents and check what offenses were created. Validate that we have the same number.
grp.sort_values('incident_id', ascending=False)

Unnamed: 0_level_0,incident_id,offense_name
start_dt,Unnamed: 1_level_1,Unnamed: 2_level_1
2014-10-25 15:30:00,53,53
2011-10-15 12:00:00,49,49
2016-10-29 12:00:00,46,46
2013-11-02 15:30:00,41,41
2009-10-03 12:00:00,40,40
...,...,...
2015-11-23 19:00:00,0,0
2016-12-30 18:00:00,0,0
2015-02-22 19:30:00,0,0
2009-12-10 19:00:00,0,0


In [53]:
# Seeing 10-25-2014 has the highest number of offenses.  Let's see what incidents/offenses were generated
testsql = "SELECT i.school, i.INCIDENT_ID,i.OFFENSE_NAME FROM incs i WHERE i.INCIDENT_DATE BETWEEN '2014-10-25 15:30:00' AND '2014-10-26 2:30:00'"
r = spark.sql(testsql)
r.show(50)

+--------------+-----------+--------------------+
|        school|INCIDENT_ID|        OFFENSE_NAME|
+--------------+-----------+--------------------+
|michigan state|   73350634| Theft From Building|
|michigan state|   73418637|      Simple Assault|
|michigan state|   73351423|      Simple Assault|
|michigan state|   73418636|      Simple Assault|
|michigan state|   73350593|Destruction/Damag...|
|      michigan|   74955897| Theft From Building|
|michigan state|   73350592|  Aggravated Assault|
|michigan state|   73418625|      Simple Assault|
|michigan state|   73418627|Destruction/Damag...|
|michigan state|   73350608|Burglary/Breaking...|
|michigan state|   73350599|Destruction/Damag...|
|michigan state|   73350583|  Aggravated Assault|
|michigan state|   73418618|      Simple Assault|
|michigan state|   73418619|      Simple Assault|
|michigan state|   73350603| Motor Vehicle Theft|
|      michigan|   74955899|Drug/Narcotic Vio...|
|michigan state|   73418617|  Aggravated Assault|


In [54]:
assert r.count() == 53, "Incorrect number of incidents for Michigan vs. Michigan State 2014"

^\
|

Looks like 53 offenses - most attributed to MSU's campus (assault).\
\
This was a game where Michigan State and played Michigan and won, btw. It was played at Spartan Stadium.


&nbsp;

## Result checks


In [55]:
sports.columns

Index(['school', 'sport', 'start_dt', 'season_type', 'opponent', 'result',
       'team_points', 'opp_points', 'ot', 'rank', 'opponent_rank',
       'end_inc_window', 'attendance', 'venue', 'excitement_index'],
      dtype='object')

In [56]:
test_query = """SELECT s.school home, s.opponent, s.start_dt start, s.end_inc_window end
                FROM sports s
                WHERE s.school == 'michigan'"""
spark.sql(test_query).show(3)

+--------+----------------+-------------------+-------------------+
|    home|        opponent|              start|                end|
+--------+----------------+-------------------+-------------------+
|michigan|western michigan|2009-09-05 15:30:00|2009-09-06 02:30:00|
|michigan|      notre dame|2009-09-12 15:30:00|2009-09-13 02:30:00|
|michigan|eastern michigan|2009-09-19 12:00:00|2009-09-19 23:00:00|
+--------+----------------+-------------------+-------------------+
only showing top 3 rows



In [57]:
# Determine if number of incidents for a game matches on Timestamp alone
test_query = """SELECT i.INCIDENT_ID id 
                FROM incs i 
                WHERE i.INCIDENT_DATE BETWEEN '2009-09-12 15:30:00' AND '2009-09-13 02:30:00'"""
spark.sql(test_query).count()

35

In [58]:
test_query = """SELECT s.school home, s.opponent, i.INCIDENT_ID id, i.OFFENSE_NAME offense
                FROM sports s, incs i
                WHERE (s.sport == 'football' 
                        AND s.school == 'michigan' 
                        AND s.opponent == 'notre dame' 
                        AND i.INCIDENT_DATE BETWEEN '2012-01-01 00:00:00' AND '2012-12-31 23:59:59')"""
spark.sql(test_query).count()

76184

In [59]:
df_full = pd.read_csv('../04_finaldata/full_data.csv', parse_dates=['start_dt', 'end_inc_window'])