In [56]:
import numpy as np
import pandas as pd
import os
import glob

pd.set_option('display.max_rows', 500)

# Sport - NIBRS Join

[Football poll data](#poll)\
[Football / basketball data alignment](#alignment)\
[Spark SQL join](#join)\
[Checking full DataFrame](#check)

<a id='poll'></a>


&nbsp;

## Football poll data



[College Football Data Glossary](https://collegefootballdata.com/glossary)


In [57]:
# create DataFrame 'df' from cfbd .json files, sourced from their /games API endpoint - one .json for each year
df = pd.concat(map(lambda file: pd.read_json(file), 
                   glob.glob(os.path.join('', r'../02_sport_rawdata/cfbd_games_json/' + '*.json'))))
df.reset_index(inplace=True, drop=True)

# create DataFrame 'df_poll' from cfbd .json files, sourced from their /rankings API endpoint - one .json for each year
df_poll = pd.concat(map(lambda file: pd.read_json(file, orient='records'), 
                        glob.glob(os.path.join('', r'../02_sport_rawdata/cfbd_poll_json/' + '*.json'))))
df_poll.reset_index(inplace=True, drop=True)

In [58]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8966 entries, 0 to 8965
Data columns (total 30 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   id                  8966 non-null   int64  
 1   season              8966 non-null   int64  
 2   week                8966 non-null   int64  
 3   season_type         8966 non-null   object 
 4   start_date          8966 non-null   object 
 5   start_time_tbd      0 non-null      float64
 6   neutral_site        8966 non-null   bool   
 7   conference_game     8966 non-null   bool   
 8   attendance          8963 non-null   float64
 9   venue_id            8821 non-null   float64
 10  venue               8821 non-null   object 
 11  home_id             8966 non-null   int64  
 12  home_team           8966 non-null   object 
 13  home_conference     8955 non-null   object 
 14  home_points         8966 non-null   int64  
 15  home_line_scores    8966 non-null   object 
 16  home_p

In [59]:
df_poll.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 168 entries, 0 to 167
Data columns (total 4 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   season      168 non-null    int64 
 1   seasonType  168 non-null    object
 2   week        168 non-null    int64 
 3   polls       168 non-null    object
dtypes: int64(2), object(2)
memory usage: 5.4+ KB


In [60]:
df_lst = []

def GetRankings(poll_data):
    ''' 
    This function takes a row of data and determines from which poll to extract rankings.
    There are 5 polls from which to select for a team's ranking on any given week.
    Note: This is not scaleable for large data due to looping of rows.
    '''
    
    tmp_df = pd.DataFrame(poll_data['polls'])

    search_term = ""
    if poll_data['season'] > 2013:
        if poll_data['week'] < 10: 
            search_term = 'AP Top 25'
        else:
            search_term = 'Playoff Committee'
        
    elif poll_data['season'] == 2013:
        if poll_data['week'] < 9:
            search_term = 'AP Top 25'
        else:
            search_term = 'BCS'
    else:
        if poll_data['week'] < 8:
            search_term = 'AP Top 25'
        else:
            search_term = 'BCS'
    
    if search_term == "":
        search_term = 'AP Top 25 i'
    rank_type = tmp_df[tmp_df['poll'].str.contains(search_term) == True]
    
    if rank_type.shape[0] == 0:
        search_term = 'AP Top 25'
        rank_type = tmp_df[tmp_df['poll'].str.contains('AP') == True]
    try:
        rankings = pd.json_normalize(rank_type.iloc[0]['ranks']).explode(['rank', 'school', 'conference'])
        # temp_ap = pd.json_normalize(r['polls']['poll']['ranks']).explode(['rank','school','conference'])
    except Exception:
        print(poll_data['season'], poll_data['week'], search_term)
        quit()
    
    rankings['season'] = poll_data['season']
    rankings['season_type'] = poll_data['seasonType']
    rankings['week'] = poll_data['week']
    rankings['poll'] = search_term
    
    df_lst.append(rankings)


# We don't want to return anything when we loop through the rows 
# as there will be a list of DataFrames stored we'll use in subsequent cells.
_ = df_poll.apply(lambda r: GetRankings(r),axis=1)

In [62]:
# Create the dataframe from the list of frames and validate the different polls that we're using and use some basic manipulation
cmb_ranks = pd.concat(df_lst)
cmb_ranks['school'] = cmb_ranks['school'].str.lower()

# Duplicate column for merging for opponent rank values later
cmb_ranks['opponent'] = cmb_ranks['school']
cmb_ranks

Unnamed: 0,rank,school,conference,firstPlaceVotes,points,season,season_type,week,poll,opponent
0,1,alabama,SEC,30,1503,2019,regular,8,AP Top 25,alabama
1,2,lsu,SEC,12,1449,2019,regular,8,AP Top 25,lsu
2,3,clemson,ACC,11,1427,2019,regular,8,AP Top 25,clemson
3,4,ohio state,Big Ten,9,1404,2019,regular,8,AP Top 25,ohio state
4,5,oklahoma,Big 12,0,1333,2019,regular,8,AP Top 25,oklahoma
...,...,...,...,...,...,...,...,...,...,...
20,2,ohio state,Big Ten,3,1414,2017,regular,1,AP Top 25,ohio state
21,11,michigan,Big Ten,0,881,2017,regular,1,AP Top 25,michigan
22,6,penn state,Big Ten,0,1196,2017,regular,1,AP Top 25,penn state
23,9,wisconsin,Big Ten,0,926,2017,regular,1,AP Top 25,wisconsin


<a id='alignment'></a>


&nbsp;

## Football / basketball data alignment


The football data is not specific to a school but rather a record of games.\
\
The Basketball data is generated by school and thus we have two representations of the same game based on the school.\
\
We need to translate the football data to a similar style.

In [63]:
# reduce 'df' to football games Michigan played
um = df[(df['home_team'] =='Michigan') | (df['away_team']=='Michigan')].copy() # where Michigan played home or away
um['school'] = 'michigan'
um['opponent'] = um.apply(lambda row: row['away_team'] if row['home_team']=='Michigan' else row['home_team'], axis=1)
um['h_a'] = um.apply(lambda row: 'H' if row['home_team']=='Michigan' else 'A', axis=1) # H if home, A if away

# reduce 'df' to football games Michigan State played
msu = df[(df['home_team']=='Michigan State') | (df['away_team']=='Michigan State')].copy() # where MSU played home or away
msu['school'] = 'michigan state'
msu['opponent'] = msu.apply(lambda row: row['away_team'] if row['home_team']=='Michigan State' else row['home_team'], axis=1)
msu['h_a'] = msu.apply(lambda row: 'H' if row['home_team']=='Michigan State' else 'A', axis=1)

In [64]:
um_msu = pd.concat([um, msu]) # concatenate 'um', 'msu' together for football DataFrame
um_msu['sport'] = 'football'

In [65]:
def DetermineResult(h_a, home_points, away_points):
    if h_a == 'H': # if it's home
        if home_points > away_points:
            return 'W'
        elif home_points < away_points:
            return 'L'
        else:
            return 'T'
    else: # if it's away
        if home_points > away_points:
            return 'L'
        elif home_points < away_points:
            return 'W'
        else:
            return 'T'
        

# DetermineResult() for 'W', 'L', 'T'
um_msu['result'] = um_msu.apply(lambda r: DetermineResult(r['h_a'], r['home_points'], r['away_points']), axis=1)

In [66]:
# always Michigan or Michigan State's points regardless of whether the game was home or away
um_msu['team_points'] = um_msu.apply(lambda r: r['home_points'] if r['h_a'] == 'H' else r['away_points'], axis=1)

# always the opponent's points
um_msu['opp_points'] = um_msu.apply(lambda r: r['home_points'] if r['h_a'] == 'A' else r['away_points'], axis=1)

In [67]:
# overtime column
um_msu['ot'] = um_msu.apply(lambda r: None if (len(r['home_line_scores']) -  4) == 0 else "OT" , axis=1)


#### Merging football data together


In [68]:
# merge Michigan / Michigan State football data w/ Michigan / Michigan State poll data
um_msu = um_msu.merge(cmb_ranks[['rank', 'school', 'season', 'season_type', 'week']], 
                      how='left', 
                      on=['school', 'season', 'week', 'season_type'])
um_msu.opponent = um_msu.opponent.str.lower()

# new column created before merge to set proper rank of opponent
cmb_ranks['opponent_rank'] = cmb_ranks['rank']

# merge opponent data into Michigan / Michigan State DataFrame
um_msu = um_msu.merge(cmb_ranks[['opponent_rank', 'opponent', 'season', 'season_type', 'week']], 
                      how='left', 
                      on=['opponent', 'season', 'week', 'season_type'])

In [69]:
# um_msu['rank'] = um_msu['rank'].astype('int64')
# um_msu['opponent_rank'] = um_msu['opponent_rank'].astype('int64')

# um_msu[um_msu['rank'].isna() == False]['rank'].unique()

# um_msu[['start_date','home_team','away_team','h_a','home_points','away_points','result','team_points','opp_points','ot']].sample(10)

In [70]:
print(len(um_msu)) # number of football games played by Michigan and Michigan State over 10 year period

267


In [71]:
um_msu[['school', 'rank', 'opponent', 'opponent_rank', 'season', 'week', 'attendance']].sample(10)

Unnamed: 0,school,rank,opponent,opponent_rank,season,week,attendance
190,michigan state,,rutgers,,2016,11,73701.0
266,michigan state,16.0,rutgers,,2017,13,35021.0
33,michigan,,purdue,,2009,10,0.0
12,michigan,7.0,middle tennessee,,2019,1,110811.0
141,michigan state,14.0,minnesota,,2010,10,0.0
212,michigan state,8.0,indiana,,2014,8,44403.0
9,michigan,,purdue,,2010,11,0.0
120,michigan,11.0,florida,17.0,2017,1,75802.0
107,michigan,15.0,ohio state,,2011,13,0.0
69,michigan,,northwestern,,2013,12,47330.0


In [72]:
um_msu[um_msu['start_date'] == '2012-10-13T16:00:00.000Z'][['home_team','away_team','away_line_scores','ot']]

Unnamed: 0,home_team,away_team,away_line_scores,ot
249,Michigan State,Iowa,"[0, 3, 0, 10, 3, 3]",OT


In [73]:
um_msu['start_dt'] = pd.to_datetime(um_msu['start_date']) # convert to datetime
um_msu['end_inc_window'] = um_msu['start_dt'] + pd.to_timedelta(11, unit='h') # set incident time window from game start time

In [74]:
um_msu.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 267 entries, 0 to 266
Data columns (total 42 columns):
 #   Column              Non-Null Count  Dtype              
---  ------              --------------  -----              
 0   id                  267 non-null    int64              
 1   season              267 non-null    int64              
 2   week                267 non-null    int64              
 3   season_type         267 non-null    object             
 4   start_date          267 non-null    object             
 5   start_time_tbd      0 non-null      float64            
 6   neutral_site        267 non-null    bool               
 7   conference_game     267 non-null    bool               
 8   attendance          267 non-null    float64            
 9   venue_id            263 non-null    float64            
 10  venue               263 non-null    object             
 11  home_id             267 non-null    int64              
 12  home_team           267 non-null    

In [20]:
# drop columns - reduce from 42 columns to 14
um_msu = um_msu[['school', 'sport', 'start_dt', 'season_type', 'opponent', 'result', 'team_points', 
                 'opp_points', 'ot', 'rank', 'opponent_rank', 'end_inc_window', 'attendance', 'venue', 'excitement_index']]
um_msu.columns

Index(['school', 'sport', 'start_dt', 'season_type', 'opponent', 'result',
       'team_points', 'opp_points', 'ot', 'rank', 'opponent_rank',
       'end_inc_window', 'attendance', 'venue', 'excitement_index'],
      dtype='object')


### Basketball Data


In [21]:
# Read in college basketball data and match column names w/ football before merge

cbb = pd.read_csv('../04_finaldata/df_basketball_final.csv', parse_dates=['start_dt', 'end_inc_window'])

In [22]:
# drop columns
cbb = cbb[['school', 'sport', 'start_dt', 'type', 
           'opponent', 'result', 'team_points', 'opp_points', 
           'ot', 'rank', 'opponent_rank', 'end_inc_window']]

In [23]:
# text formatting
cbb.school = cbb.school.replace({'michigan-state':'michigan state'})
cbb = cbb.rename({'type': 'season_type'}, axis=1)
cbb['opponent'] = cbb['opponent'].str.lower()
cbb['season_type'] = cbb['season_type'].apply(lambda x: "regular" if x=='REG' else x)
cbb.columns
# fbb = spark.createDataFrame(um_msu[['school','sport','game_dt','season_type','opponent','result','team_points','opp_points','ot','end_inc_window']])

Index(['school', 'sport', 'start_dt', 'season_type', 'opponent', 'result',
       'team_points', 'opp_points', 'ot', 'rank', 'opponent_rank',
       'end_inc_window'],
      dtype='object')


#### Final sports merge


In [24]:
# We join the football and basketball data and do some manipulation across the sports including binarizing results and OT, 
# cleaning up / naming the indexes (used for a unique key)
cbb.school = cbb.school.replace({'michigan-state':'michigan state'})
sports = pd.concat([um_msu, cbb])
sports['ot'] = sports.ot.replace([np.nan], [None])
sports['ot'] = sports.ot.replace({None: 0, "OT": 1, "2OT": 1})
sports['result'] = sports.result.replace({"W": 1,"L": 0})
sports = sports.reset_index(drop=True)
sports.index.names = ['game_id']
sports

Unnamed: 0_level_0,school,sport,start_dt,season_type,opponent,result,team_points,opp_points,ot,rank,opponent_rank,end_inc_window,attendance,venue,excitement_index
game_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
0,michigan,football,2010-09-04 19:30:00+00:00,regular,connecticut,1,30,10,0,,,2010-09-05 06:30:00+00:00,0.0,Michigan Stadium,
1,michigan,football,2010-09-11 19:30:00+00:00,regular,notre dame,1,28,24,0,,,2010-09-12 06:30:00+00:00,0.0,Notre Dame Stadium,
2,michigan,football,2010-09-18 16:00:00+00:00,regular,umass,1,42,37,0,20.0,,2010-09-19 03:00:00+00:00,0.0,Michigan Stadium,
3,michigan,football,2010-09-25 16:00:00+00:00,regular,bowling green,1,65,21,0,21.0,,2010-09-26 03:00:00+00:00,0.0,Michigan Stadium,
4,michigan,football,2010-10-02 19:30:00+00:00,regular,indiana,1,42,35,0,19.0,,2010-10-03 06:30:00+00:00,0.0,Memorial Stadium,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
988,michigan state,basketball,2019-03-21 14:45:00,NCAA,bradley,1,76,65,0,,,2019-03-22 00:45:00,,,
989,michigan state,basketball,2019-03-23 19:45:00,NCAA,minnesota,1,70,50,0,,,2019-03-24 05:45:00,,,
990,michigan state,basketball,2019-03-29 19:00:00,NCAA,louisiana state (12),1,80,63,0,,12.0,2019-03-30 05:00:00,,,
991,michigan state,basketball,2019-03-31 17:05:00,NCAA,duke (1),1,68,67,0,,1.0,2019-04-01 03:05:00,,,


In [25]:
sports.columns # verify our columms

Index(['school', 'sport', 'start_dt', 'season_type', 'opponent', 'result',
       'team_points', 'opp_points', 'ot', 'rank', 'opponent_rank',
       'end_inc_window', 'attendance', 'venue', 'excitement_index'],
      dtype='object')

In [26]:
# Reduce number of columns that will be used in our analysis (prior to join with incident data)
sports = sports[['school', 'sport', 'start_dt', 'season_type', 'opponent', 'result', 'team_points',
                 'opp_points', 'ot', 'rank', 'opponent_rank', 'end_inc_window', 'attendance', 'venue', 'excitement_index']]

In [27]:
sports.season_type.unique()

array(['regular', 'CTOURN', 'NCAA'], dtype=object)

In [28]:
print(len(sports)) # 726 basketball games + 267 football games

993


In [29]:
sports.to_csv('../04_finaldata/sports.csv', header=True)

<a id='join'></a>


&nbsp;

## Join Sports and Incidents via Spark SQL

Now that the sports and incident data are manipulated, we'll use Spark SQL to create a dataset of all the games and their associated incidents / offenses.\
\
We'll ultimately end up with a large table that identifies every offense and its attribution to a particular incident and game.



#### sportrdd


In [30]:
from pyspark.sql.types import TimestampType
from pyspark.sql.functions import col
from pyspark.sql import SparkSession
spark = SparkSession \
    .builder \
    .master("local[*]") \
    .appName('Merge') \
    .getOrCreate() 

sc = spark.sparkContext

Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
22/05/21 14:11:09 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
22/05/21 14:11:10 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.
22/05/21 14:11:10 WARN Utils: Service 'SparkUI' could not bind on port 4041. Attempting port 4042.


In [31]:
# Convert the sports DataFrame from prior step into a Spark DataFrame and ensure dates are casted correctly

sportsrdd = spark.read.csv('../04_finaldata/sports.csv', header=True, inferSchema=True)
sportsrdd = sportsrdd.withColumn('start_dt', col('start_dt').cast(TimestampType())) # convert start_dt to Timestamp
sportsrdd = sportsrdd.withColumn('end_inc_window', col('end_inc_window').cast(TimestampType())) # convert end_inc_window to Timestamp

In [32]:
print(sportsrdd.count())
sportsrdd.printSchema()

993
root
 |-- game_id: integer (nullable = true)
 |-- school: string (nullable = true)
 |-- sport: string (nullable = true)
 |-- start_dt: timestamp (nullable = true)
 |-- season_type: string (nullable = true)
 |-- opponent: string (nullable = true)
 |-- result: integer (nullable = true)
 |-- team_points: integer (nullable = true)
 |-- opp_points: integer (nullable = true)
 |-- ot: integer (nullable = true)
 |-- rank: string (nullable = true)
 |-- opponent_rank: double (nullable = true)
 |-- end_inc_window: timestamp (nullable = true)
 |-- attendance: double (nullable = true)
 |-- venue: string (nullable = true)
 |-- excitement_index: double (nullable = true)



In [33]:
# Let's run a quick test to see that the columns came over properly
test = sportsrdd.sample(withReplacement=False, fraction=.1)
test.show(5)

+-------+--------+--------+-------------------+-----------+----------------+------+-----------+----------+---+----+-------------+-------------------+----------+--------------------+-----------------+
|game_id|  school|   sport|           start_dt|season_type|        opponent|result|team_points|opp_points| ot|rank|opponent_rank|     end_inc_window|attendance|               venue| excitement_index|
+-------+--------+--------+-------------------+-----------+----------------+------+-----------+----------+---+----+-------------+-------------------+----------+--------------------+-----------------+
|      5|michigan|football|2010-10-09 15:30:00|    regular|  michigan state|     0|         17|        34|  0|18.0|         17.0|2010-10-10 02:30:00|       0.0|    Michigan Stadium|             null|
|     14|michigan|football|2019-09-21 12:00:00|    regular|       wisconsin|     0|         14|        35|  0|11.0|         13.0|2019-09-21 23:00:00|   80245.0|Camp Randall Stadium|     1.5613870332|



#### Incident Spark DataFrame


In [34]:
# Let's get the incident data for UM and MSU and do the same thing with the dates
incs = spark.read.csv('../04_finaldata/UM_MSU_Offs.csv', header=True, inferSchema=True)
incs = incs.withColumn('INCIDENT_DATE', col('INCIDENT_DATE').cast(TimestampType())) # convert 'INCIDENT_DATE' to Timestamp type

In [35]:
# Create Spark SQL Table Views from both RDDs
sportsrdd.createOrReplaceTempView("sports")
incs.createOrReplaceTempView("incs")

In [36]:
# We need to get a list of every incident / offense associated to a game
# This grabs columns from 'sports' pandas DataFrame and the 'incs' Spark DataFrame and saves into a Spark DataFrame

query = """

SELECT
    s.game_id,
    s.sport, 
    s.school, 
    s.start_dt, 
    s.end_inc_window,
    s.rank,
    s.season_type,
    s.opponent,
    s.opponent_rank,
    s.result,
    s.team_points,
    s.opp_points,
    s.ot,
    s.attendance, 
    i.INCIDENT_ID as incident_id,
    i.OFFENSE_ID as offense_id,
    i.OFFENSE_NAME as offense_name,
    i.INCIDENT_DATE as incident_date

FROM
    sports s,
    incs i

WHERE
    i.INCIDENT_DATE BETWEEN s.start_dt AND s.end_inc_window AND
    i.school = s.school
"""


result = spark.sql(query)

In [37]:
df = result.toPandas() # convert Spark DataFrame to pandas DataFrame
print(df.shape)
df.head()

(1625, 18)


Unnamed: 0,game_id,sport,school,start_dt,end_inc_window,rank,season_type,opponent,opponent_rank,result,team_points,opp_points,ot,attendance,incident_id,offense_id,offense_name,incident_date
0,27,football,michigan,2009-09-26 12:00:00,2009-09-26 23:00:00,23.0,regular,indiana,,1,36,33,0,0.0,50954556,55764025,Pocket-picking,2009-09-26 15:00:00
1,35,football,michigan,2009-11-21 12:00:00,2009-11-21 23:00:00,,regular,ohio state,10.0,0,10,21,0,0.0,50956387,55769038,Destruction/Damage/Vandalism of Property,2009-11-21 22:00:00
2,163,football,michigan state,2009-10-24 19:00:00,2009-10-25 06:00:00,,regular,iowa,6.0,0,13,15,0,0.0,51479688,55427802,Simple Assault,2009-10-24 23:00:00
3,294,basketball,michigan,2010-02-27 12:00:00,2010-02-27 22:00:00,-,regular,ohio state (9),9.0,0,55,66,0,,55980511,61583743,Theft From Motor Vehicle,2010-02-27 13:00:00
4,26,football,michigan,2009-09-19 12:00:00,2009-09-19 23:00:00,25.0,regular,eastern michigan,,1,45,17,0,0.0,50954526,55762040,All Other Larceny,2009-09-19 17:00:00


In [38]:
len(df.game_id.unique()) # check how many unique games had some type of incident associated to them

683

In [39]:
df.isna().sum()

game_id              0
sport                0
school               0
start_dt             0
end_inc_window       0
rank               359
season_type          0
opponent             0
opponent_rank     1135
result               0
team_points          0
opp_points           0
ot                   0
attendance         911
incident_id          0
offense_id           0
offense_name         0
incident_date        0
dtype: int64

Now, since we performed an SQL operation that returned results that matched our criteria, we need to join these results to the games dataset.\
\
We'll create another table view and call it `game_incs` and use a left join to the sports data.

In [40]:
result.createOrReplaceTempView('game_incs')

In [41]:
query2 = """

SELECT s.*, g.incident_id, g.offense_id, g.offense_name, g.incident_date

FROM
    sports s LEFT JOIN game_incs g
    ON s.game_id = g.game_id
    
"""

result2 = spark.sql(query2)


In [42]:
full = result2.toPandas().set_index('game_id')

In [43]:
len(full.index.unique()), full.shape

(993, (1935, 19))

In [44]:
full.isna().sum() # na values mostly from games where no incidents occurred w/in time window

school                 0
sport                  0
start_dt               0
season_type            0
opponent               0
result                 0
team_points            0
opp_points             0
ot                     0
rank                 393
opponent_rank       1362
end_inc_window         0
attendance          1176
venue               1186
excitement_index    1313
incident_id          310
offense_id           310
offense_name         310
incident_date        310
dtype: int64

<a id='check'></a>


&nbsp;

## Checking the full DataFrame


In [45]:
full.sample(20)

Unnamed: 0_level_0,school,sport,start_dt,season_type,opponent,result,team_points,opp_points,ot,rank,opponent_rank,end_inc_window,attendance,venue,excitement_index,incident_id,offense_id,offense_name,incident_date
game_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
870,michigan state,basketball,2016-01-28 21:00:00,regular,northwestern,1,76,45,0,12,,2016-01-29 07:00:00,,,,90500831.0,99142930.0,Drug/Narcotic Violations,2016-01-29 01:00:00
906,michigan state,basketball,2017-02-02 19:00:00,regular,nebraska,1,72,61,0,-,,2017-02-03 05:00:00,,,,,,,NaT
543,michigan,basketball,2017-03-09 12:20:00,CTOURN,illinois,1,75,55,0,-,,2017-03-09 22:20:00,,,,90103572.0,112107040.0,Theft From Building,2017-03-09 14:00:00
93,michigan,football,2018-11-10 15:30:00,regular,rutgers,1,42,7,0,4.0,,2018-11-11 02:30:00,0.0,High Point Solutions Stadium,0.374048,,,,NaT
297,michigan,basketball,2010-03-11 14:30:00,CTOURN,iowa,1,59,52,0,-,,2010-03-12 00:30:00,,,,55981389.0,61571778.0,Theft From Building,2010-03-11 16:00:00
109,michigan,football,2012-09-08 15:30:00,regular,air force,1,31,25,0,19.0,,2012-09-09 02:30:00,112522.0,Michigan Stadium,3.742049,64805805.0,68767778.0,Simple Assault,2012-09-08 20:00:00
444,michigan,basketball,2014-11-15 14:00:00,regular,hillsdale,1,92,68,0,24,,2014-11-16 00:00:00,,,,74955966.0,82002757.0,Stolen Property Offenses,2014-11-15 23:00:00
663,michigan state,basketball,2010-03-28 14:20:00,NCAA,tennessee (15),1,70,69,0,,15.0,2010-03-29 00:20:00,,,,,,,NaT
919,michigan state,basketball,2017-11-10 20:00:00,regular,north florida,1,98,66,0,2,,2017-11-11 06:00:00,,,,94351552.0,116948488.0,Burglary/Breaking & Entering,2017-11-10 21:00:00
843,michigan state,basketball,2015-03-15 15:30:00,CTOURN,wisconsin (6),0,69,80,1,-,6.0,2015-03-16 01:30:00,,,,81413551.0,89048441.0,Fondling,2015-03-16 00:00:00


In [46]:
full[(full['school']=='michigan state') & (full['sport']=='basketball')]

Unnamed: 0_level_0,school,sport,start_dt,season_type,opponent,result,team_points,opp_points,ot,rank,opponent_rank,end_inc_window,attendance,venue,excitement_index,incident_id,offense_id,offense_name,incident_date
game_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
833,michigan state,basketball,2015-02-10 19:00:00,regular,northwestern,1,68,44,0,-,,2015-02-11 05:00:00,,,,,,,NaT
737,michigan state,basketball,2012-11-13 20:30:00,regular,kansas (7),1,67,64,0,21,7.0,2012-11-14 06:30:00,,,,66991130.0,69033726.0,Drug/Narcotic Violations,2012-11-14 00:00:00
737,michigan state,basketball,2012-11-13 20:30:00,regular,kansas (7),1,67,64,0,21,7.0,2012-11-14 06:30:00,,,,66991129.0,69033725.0,Theft From Building,2012-11-13 22:00:00
858,michigan state,basketball,2015-12-09 19:00:00,regular,maryland-eastern shore,1,78,35,0,1,,2015-12-10 05:00:00,,,,81418091.0,89043819.0,Theft From Building,2015-12-09 19:00:00
858,michigan state,basketball,2015-12-09 19:00:00,regular,maryland-eastern shore,1,78,35,0,1,,2015-12-10 05:00:00,,,,81418071.0,89046110.0,Drug/Narcotic Violations,2015-12-10 00:00:00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
963,michigan state,basketball,2018-12-08 12:00:00,regular,florida,1,63,59,0,10,,2018-12-08 22:00:00,,,,102879319.0,126709921.0,All Other Larceny,2018-12-08 16:00:00
963,michigan state,basketball,2018-12-08 12:00:00,regular,florida,1,63,59,0,10,,2018-12-08 22:00:00,,,,102875943.0,126709923.0,Theft From Building,2018-12-08 16:00:00
954,michigan state,basketball,2018-11-06 19:00:00,regular,kansas (1),0,87,92,0,10,1.0,2018-11-07 05:00:00,,,,101779877.0,125449009.0,All Other Larceny,2018-11-06 19:00:00
954,michigan state,basketball,2018-11-06 19:00:00,regular,kansas (1),0,87,92,0,10,1.0,2018-11-07 05:00:00,,,,101774189.0,125451561.0,Drug/Narcotic Violations,2018-11-06 19:00:00


In [47]:
# Quick calculation to determine the number of sporting events that have an actual incident associated to them
grp = full.groupby('start_dt')[['incident_id','offense_name']].count()
grp[grp['incident_id'] > 0].shape[0] / 993

0.6525679758308157

In [48]:
full.to_csv('../04_finaldata/full_data.csv') # DataFrame to be used for main report analysis

In [49]:
# Let's do a quick validation to make sure that the data associated ok. Take the game start_date with the highest number of incidents and check what offenses were created. Validate that we have the same number.
grp.sort_values('incident_id', ascending=False)

Unnamed: 0_level_0,incident_id,offense_name
start_dt,Unnamed: 1_level_1,Unnamed: 2_level_1
2014-10-25 15:30:00,14,14
2011-11-19 12:00:00,12,12
2016-10-29 12:00:00,12,12
2017-10-21 15:30:00,12,12
2011-10-15 12:00:00,11,11
...,...,...
2012-11-13 20:00:00,0,0
2012-11-21 21:30:00,0,0
2017-01-04 18:30:00,0,0
2012-11-23 12:00:00,0,0


In [50]:
# Seeing 10-25-2014 has the highest number of offenses.  Let's see what incidents/offenses were generated
testsql = "SELECT i.school, i.INCIDENT_ID,i.OFFENSE_NAME FROM incs i WHERE i.INCIDENT_DATE BETWEEN '2014-10-25 15:30:00' AND '2014-10-26 2:30:00'"
r = spark.sql(testsql)
r.show()

+--------------+-----------+--------------------+
|        school|INCIDENT_ID|        OFFENSE_NAME|
+--------------+-----------+--------------------+
|      michigan|   74955899|Drug/Narcotic Vio...|
|michigan state|   73419507| Theft From Building|
|michigan state|   73418616|Destruction/Damag...|
|michigan state|   73418627|Destruction/Damag...|
|      michigan|   74955896| Theft From Building|
|michigan state|   73418619|      Simple Assault|
|michigan state|   73418637|      Simple Assault|
|michigan state|   73418625|      Simple Assault|
|michigan state|   73418636|      Simple Assault|
|michigan state|   73418620|      Simple Assault|
|michigan state|   73418617|  Aggravated Assault|
|michigan state|   73418618|      Simple Assault|
|      michigan|   74955897| Theft From Building|
|michigan state|   73418623|      Simple Assault|
+--------------+-----------+--------------------+



^\
|

Looks like 14 offenses - most attributed to MSU's campus (assault).\
\
This was a game where Michigan State and played Michigan and won, btw. It was played at Spartan Stadium.


&nbsp;

## Result checks


In [51]:
sports.columns

Index(['school', 'sport', 'start_dt', 'season_type', 'opponent', 'result',
       'team_points', 'opp_points', 'ot', 'rank', 'opponent_rank',
       'end_inc_window', 'attendance', 'venue', 'excitement_index'],
      dtype='object')

In [52]:
test_query = """SELECT s.school home, s.opponent, s.start_dt start, s.end_inc_window end
                FROM sports s
                WHERE s.school == 'michigan'"""
spark.sql(test_query).show(3)

+--------+-----------+-------------------+-------------------+
|    home|   opponent|              start|                end|
+--------+-----------+-------------------+-------------------+
|michigan|connecticut|2010-09-04 15:30:00|2010-09-05 02:30:00|
|michigan| notre dame|2010-09-11 15:30:00|2010-09-12 02:30:00|
|michigan|      umass|2010-09-18 12:00:00|2010-09-18 23:00:00|
+--------+-----------+-------------------+-------------------+
only showing top 3 rows



In [53]:
# Determine if number of incidents for a game matches on Timestamp alone
test_query = """SELECT i.INCIDENT_ID id 
                FROM incs i 
                WHERE i.INCIDENT_DATE BETWEEN '2009-09-12 15:30:00' AND '2009-09-13 02:30:00'"""
spark.sql(test_query).count()

9

In [54]:
test_query = """SELECT s.school home, s.opponent, i.INCIDENT_ID id, i.OFFENSE_NAME offense
                FROM sports s, incs i
                WHERE (s.sport == 'football' 
                        AND s.school == 'michigan' 
                        AND s.opponent == 'notre dame' 
                        AND i.INCIDENT_DATE BETWEEN '2012-01-01 00:00:00' AND '2012-12-31 23:59:59')"""
spark.sql(test_query).count()

19432

In [55]:
df_full = pd.read_csv('../04_finaldata/full_data.csv', parse_dates=['start_dt', 'end_inc_window'])