In [63]:
import numpy as np
import pandas as pd
import os
import glob

pd.set_option('display.max_rows', 500)

# Sport - NIBRS Join

[Football poll data](#poll)\
[Football / basketball data alignment](#alignment)\
[Spark SQL join](#join)\
[Checking full DataFrame](#check)

<a id='poll'></a>


&nbsp;

## Football poll data


In [64]:
# create DataFrame 'df' from cfbd .json files, sourced from their /games API endpoint - one .json for each year
df = pd.concat(map(lambda file: pd.read_json(file), 
                   glob.glob(os.path.join('', r'../02_sport_rawdata/cfbd_games_json/' + '*.json'))))

# create DataFrame 'df_poll' from cfbd .json files, sourced from their /rankings API endpoint - one .json for each year
df_poll = pd.concat(map(lambda file: pd.read_json(file, orient='records'), 
                        glob.glob(os.path.join('', r'../02_sport_rawdata/cfbd_poll_json/' + '*.json'))))
df_poll.reset_index(inplace=True, drop=True)


df_lst = []

def GetRankings(poll_data):
    ''' 
    This function takes a row of data and determines from which poll to extract rankings.
    There are 5 polls from which to select for a team's ranking on any given week.
    Note: This is not scaleable for large data due to looping of rows.
    '''
    
    tmp_df = pd.DataFrame(poll_data['polls'])

    search_term = ""
    if poll_data['season'] > 2013:
        if poll_data['week'] < 10: 
            search_term = 'AP Top 25'
        else:
            search_term = 'Playoff Committee'
        
    elif poll_data['season'] == 2013:
        if poll_data['week'] < 9:
            search_term = 'AP Top 25'
        else:
            search_term = 'BCS'
    else:
        if poll_data['week'] < 8:
            search_term = 'AP Top 25'
        else:
            search_term = 'BCS'
    
    if search_term == "":
        search_term = 'AP Top 25 i'
    rank_type = tmp_df[tmp_df['poll'].str.contains(search_term) == True]
    
    if rank_type.shape[0] == 0:
        search_term = 'AP Top 25'
        rank_type = tmp_df[tmp_df['poll'].str.contains('AP') == True]
    try:
        rankings = pd.json_normalize(rank_type.iloc[0]['ranks']).explode(['rank', 'school', 'conference'])
        # temp_ap = pd.json_normalize(r['polls']['poll']['ranks']).explode(['rank','school','conference'])
    except Exception:
        print(poll_data['season'], poll_data['week'], search_term)
        quit()
    
    rankings['season'] = poll_data['season']
    rankings['season_type'] = poll_data['seasonType']
    rankings['week'] = poll_data['week']
    rankings['poll'] = search_term
    
    df_lst.append(rankings)

# We don't want to return anything when we loop through the rows 
# as there will be a list of DataFrames stored we'll use in subsequent cells.
_ = df_poll.apply(lambda r: GetRankings(r),axis=1)

In [65]:
# Create the dataframe from the list of frames and validate the different polls that we're using and use some basic manipulation
cmb_ranks = pd.concat(df_lst)
cmb_ranks.poll.value_counts()
cmb_ranks['school'] = cmb_ranks['school'].str.lower()

#creating a duplicate column so that I can use that when merging for opponent ranks
cmb_ranks['opponent'] = cmb_ranks['school']
cmb_ranks

Unnamed: 0,rank,school,conference,firstPlaceVotes,points,season,season_type,week,poll,opponent
0,25,wisconsin,Big Ten,0,0,2009,regular,15,BCS,wisconsin
1,10,iowa,Big Ten,0,0,2009,regular,15,BCS,iowa
2,8,ohio state,Big Ten,0,0,2009,regular,15,BCS,ohio state
3,24,usc,Pac-10,0,0,2009,regular,15,BCS,usc
4,21,stanford,Pac-10,0,0,2009,regular,15,BCS,stanford
...,...,...,...,...,...,...,...,...,...,...
20,21,cincinnati,American Athletic,0,0,2019,regular,16,Playoff Committee,cincinnati
21,22,usc,Pac-12,0,0,2019,regular,16,Playoff Committee,usc
22,23,navy,American Athletic,0,0,2019,regular,16,Playoff Committee,navy
23,24,virginia,ACC,0,0,2019,regular,16,Playoff Committee,virginia


In [66]:
df.columns

Index(['id', 'season', 'week', 'season_type', 'start_date', 'start_time_tbd',
       'neutral_site', 'conference_game', 'attendance', 'venue_id', 'venue',
       'home_id', 'home_team', 'home_conference', 'home_points',
       'home_line_scores', 'home_post_win_prob', 'home_pregame_elo',
       'home_postgame_elo', 'away_id', 'away_team', 'away_conference',
       'away_points', 'away_line_scores', 'away_post_win_prob',
       'away_pregame_elo', 'away_postgame_elo', 'excitement_index',
       'highlights', 'notes'],
      dtype='object')

<a id='alignment'></a>


&nbsp;

## Football / basketball data alignment


The football data is not specific to a school but rather a record of games.\
\
The Basketball data is generated by school and thus we have two representations of the same game based on the school.\
\
We need to translate the football data to a similar style.

In [67]:
def DetermineResult(h_a, home_points, away_points):
    if h_a == 'H':
        if home_points > away_points:
            return 'W'
        elif home_points < away_points:
            return 'L'
        else:
            return 'T'
    else:
        if home_points > away_points:
            return 'L'
        elif home_points < away_points:
            return 'W'
        else:
            return 'T'


# reduce 'df' to football games Michigan played
um = df[(df['home_team'] =='Michigan') | (df['away_team']=='Michigan')].copy()
um['school'] = 'michigan'
um['opponent'] = um.apply(lambda row: row['away_team'] if row['home_team']=='Michigan' else row['home_team'], axis=1)
um['h_a'] = um.apply(lambda row: 'H' if row['home_team']=='Michigan' else 'A', axis=1)

# reduce 'df' to football games Michigan State played
msu = df[(df['home_team']=='Michigan State') | (df['away_team']=='Michigan State')].copy()
msu['school'] = 'michigan state'
msu['opponent'] = msu.apply(lambda row: row['away_team'] if row['home_team']=='Michigan State' else row['home_team'], axis=1)
msu['h_a'] = msu.apply(lambda row: 'H' if row['home_team']=='Michigan State' else 'A', axis=1)

um_msu = pd.concat([um, msu]) # concatenate 'um', 'msu' together for football DataFrame
um_msu['sport'] = 'football'

um_msu['result'] = um_msu.apply(lambda r: DetermineResult(r['h_a'], r['home_points'], r['away_points']), axis=1)
um_msu['team_points'] = um_msu.apply(lambda r: r['home_points'] if r['h_a'] == 'H' else r['away_points'], axis=1)
um_msu['opp_points'] = um_msu.apply(lambda r: r['home_points'] if r['h_a'] == 'A' else r['away_points'], axis=1)
um_msu['ot'] = um_msu.apply(lambda r: None if (len(r['home_line_scores']) -  4) == 0 else "OT" , axis=1)

um_msu = um_msu.merge(cmb_ranks[['rank','school','season','season_type','week']], how='left', on=['school','season','week','season_type'])
cmb_ranks['opponent_rank'] = cmb_ranks['rank']
um_msu.opponent = um_msu.opponent.str.lower()
um_msu = um_msu.merge(cmb_ranks[['opponent_rank','opponent','season','season_type','week']], how='left', on=['opponent', 'season', 'week', 'season_type'])
# um_msu['rank'] = um_msu['rank'].astype('int64')
# um_msu['opponent_rank'] = um_msu['opponent_rank'].astype('int64')


um_msu[um_msu['rank'].isna() == False]['rank'].unique()

# um_msu[['start_date','home_team','away_team','h_a','home_points','away_points','result','team_points','opp_points','ot']].sample(10)

array([25., 23., 22., 20., 21., 19., 18., 12., 11., 15., 24.,  8., 17.,
       14., 10.,  7.,  5.,  4.,  3.,  2.,  6., 16., 13.,  9.])

In [68]:
print(len(um_msu))

267


In [69]:
um_msu[['school','rank','opponent','opponent_rank','season','week','attendance']].sample(10)

Unnamed: 0,school,rank,opponent,opponent_rank,season,week,attendance
113,michigan,15.0,maryland,,2018,6,0.0
193,michigan state,10.0,ohio state,2.0,2013,15,66002.0
186,michigan state,,indiana,,2013,7,73815.0
260,michigan state,25.0,ohio state,4.0,2019,6,104797.0
103,michigan,,rutgers,,2017,9,111213.0
162,michigan state,16.0,wisconsin,6.0,2011,8,0.0
99,michigan,8.0,purdue,,2017,4,60042.0
27,michigan,22.0,san diego state,,2011,4,0.0
161,michigan state,23.0,michigan,11.0,2011,7,0.0
82,michigan,12.0,penn state,,2015,12,107418.0


In [70]:
um_msu[um_msu['start_date'] == '2012-10-13T16:00:00.000Z'][['home_team','away_team','away_line_scores','ot']]

Unnamed: 0,home_team,away_team,away_line_scores,ot
175,Michigan State,Iowa,"[0, 3, 0, 10, 3, 3]",OT


In [71]:
um_msu['start_dt'] = pd.to_datetime(um_msu['start_date']) # convert to datetime
um_msu['end_inc_window'] = um_msu['start_dt'] + pd.to_timedelta(11, unit='h') # set incident time window from game start time

In [72]:
um_msu = um_msu[['school', 'sport', 'start_dt', 'season_type', 'opponent', 'result', 'team_points', 
                 'opp_points', 'ot', 'rank', 'opponent_rank', 'end_inc_window', 'attendance']]
um_msu.columns

In [74]:
# Read in college basketball data and do some quick manipulation to ensure columns match.

cbb = pd.read_csv('../04_finaldata/df_basketball_final.csv', parse_dates=['start_dt', 'end_inc_window'])
cbb = cbb[['school','sport','start_dt','type','opponent','result','team_points','opp_points','ot','rank','opponent_rank','end_inc_window']]
cbb = cbb.rename({'type': 'season_type'}, axis=1)
cbb['opponent'] = cbb['opponent'].str.lower()
cbb['season_type'] = cbb['season_type'].apply(lambda x: "regular" if x=='REG' else x)
cbb.columns
# fbb = spark.createDataFrame(um_msu[['school','sport','game_dt','season_type','opponent','result','team_points','opp_points','ot','end_inc_window']])

Index(['school', 'sport', 'start_dt', 'season_type', 'opponent', 'result',
       'team_points', 'opp_points', 'ot', 'rank', 'opponent_rank',
       'end_inc_window'],
      dtype='object')

In [75]:
# We join the football and basketball data and do some manipulation across the sports including binarizing results and OT, 
# cleaning up / naming the indexes (used for a unique key)
sports = pd.concat([um_msu, cbb])
sports['ot'] = sports.ot.replace([np.nan], [None])
sports['ot'] = sports.ot.replace({None: 0, "OT": 1, "2OT": 1})
sports['result'] = sports.result.replace({"W": 1,"L": 0})
sports = sports.reset_index(drop=True)
sports.index.names = ['game_id']
sports

Unnamed: 0_level_0,school,sport,start_dt,season_type,opponent,result,team_points,opp_points,ot,rank,opponent_rank,end_inc_window,attendance
game_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
0,michigan,football,2009-09-05 19:30:00+00:00,regular,western michigan,1,31,7,0,,,2009-09-06 06:30:00+00:00,0.0
1,michigan,football,2009-09-12 19:30:00+00:00,regular,notre dame,1,38,34,0,,18.0,2009-09-13 06:30:00+00:00,0.0
2,michigan,football,2009-09-19 16:00:00+00:00,regular,eastern michigan,1,45,17,0,25.0,,2009-09-20 03:00:00+00:00,0.0
3,michigan,football,2009-09-26 16:00:00+00:00,regular,indiana,1,36,33,0,23.0,,2009-09-27 03:00:00+00:00,0.0
4,michigan,football,2009-10-03 16:00:00+00:00,regular,michigan state,0,20,26,1,22.0,,2009-10-04 03:00:00+00:00,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
988,michigan-state,basketball,2019-03-21 14:45:00,NCAA,bradley,1,76,65,0,,,2019-03-22 00:45:00,
989,michigan-state,basketball,2019-03-23 19:45:00,NCAA,minnesota,1,70,50,0,,,2019-03-24 05:45:00,
990,michigan-state,basketball,2019-03-29 19:00:00,NCAA,louisiana state (12),1,80,63,0,,12.0,2019-03-30 05:00:00,
991,michigan-state,basketball,2019-03-31 17:05:00,NCAA,duke (1),1,68,67,0,,1.0,2019-04-01 03:05:00,


In [76]:
sports.columns # verify our columms

Index(['school', 'sport', 'start_dt', 'season_type', 'opponent', 'result',
       'team_points', 'opp_points', 'ot', 'rank', 'opponent_rank',
       'end_inc_window', 'attendance'],
      dtype='object')

In [77]:
# Reduce number of columns that will be used in our analysis (prior to join with incident data)
sports = sports[['school', 'sport', 'start_dt', 'season_type', 'opponent', 'result', 'team_points',
                 'opp_points', 'ot', 'rank', 'opponent_rank', 'end_inc_window', 'attendance']]

In [78]:
sports.season_type.unique()

array(['regular', 'CTOURN', 'NCAA'], dtype=object)

In [79]:
print(len(sports)) # 726 basketball games + 267 football games

993


In [80]:
sports.to_csv('../04_finaldata/sports.csv', header=True)

<a id='join'></a>


&nbsp;

## Join Sports and Incidents via Spark SQL

Now that the sports and incident data are manipulated, we'll use Spark SQL to create a dataset of all the games and their associated incidents / offenses.\
\
We'll ultimately end up with a large table that identifies every offense and its attribution to a particular incident and game.



#### sportrdd


In [81]:
from pyspark.sql.types import TimestampType
from pyspark.sql.functions import col
from pyspark.sql import SparkSession
spark = SparkSession \
    .builder \
    .master("local[*]") \
    .appName('Merge') \
    .getOrCreate() 

sc = spark.sparkContext

In [100]:
# Convert the sports DataFrame from prior step into a Spark DataFrame and ensure dates are casted correctly

sportsrdd = spark.read.csv('../04_finaldata/sports.csv', header=True, inferSchema=True)
sportsrdd = sportsrdd.withColumn('start_dt', col('start_dt').cast(TimestampType())) # convert start_dt to Timestamp
sportsrdd = sportsrdd.withColumn('end_inc_window', col('end_inc_window').cast(TimestampType())) # convert end_inc_window to Timestamp

In [101]:
print(sportsrdd.count())
sportsrdd.printSchema()

993
root
 |-- game_id: integer (nullable = true)
 |-- school: string (nullable = true)
 |-- sport: string (nullable = true)
 |-- start_dt: timestamp (nullable = true)
 |-- season_type: string (nullable = true)
 |-- opponent: string (nullable = true)
 |-- result: integer (nullable = true)
 |-- team_points: integer (nullable = true)
 |-- opp_points: integer (nullable = true)
 |-- ot: integer (nullable = true)
 |-- rank: string (nullable = true)
 |-- opponent_rank: double (nullable = true)
 |-- end_inc_window: timestamp (nullable = true)
 |-- attendance: double (nullable = true)



In [96]:
# Let's run a quick test to see that the columns came over properly
test = sportsrdd.sample(withReplacement=False, fraction=.1)
test.show(5)

+-------+--------+--------+-------------------+-----------+----------------+------+-----------+----------+---+----+-------------+-------------------+----------+
|game_id|  school|   sport|           start_dt|season_type|        opponent|result|team_points|opp_points| ot|rank|opponent_rank|     end_inc_window|attendance|
+-------+--------+--------+-------------------+-----------+----------------+------+-----------+----------+---+----+-------------+-------------------+----------+
|     18|michigan|football|2010-10-16 15:30:00|    regular|            iowa|     0|         28|        38|  0|null|         15.0|2010-10-17 02:30:00|       0.0|
|     20|michigan|football|2010-11-06 12:00:00|    regular|        illinois|     1|         67|        65|  1|null|         null|2010-11-06 23:00:00|       0.0|
|     26|michigan|football|2011-09-17 12:00:00|    regular|eastern michigan|     1|         31|         3|  0|null|         null|2011-09-17 23:00:00|       0.0|
|     29|michigan|football|2011-10


#### Incident Spark DataFrame


In [87]:
# Let's get the incident data for UM and MSU and do the same thing with the dates
incs = spark.read.csv('../01_nibrs_rawdata/UM_MSU_Offs.csv', header=True, inferSchema=True)
incs = incs.withColumn('INCIDENT_DATE', col('INCIDENT_DATE').cast(TimestampType())) # convert 'INCIDENT_DATE' to Timestamp type

In [88]:
# Create Spark SQL Table Views from both RDDs
sportsrdd.createOrReplaceTempView("sports")
incs.createOrReplaceTempView("incs")

In [105]:
# We need to get a list of every incident / offense associated to a game
# This grabs columns from 'sports' pandas DataFrame and the 'incs' Spark DataFrame and saves into a Spark DataFrame

query = """

SELECT
    s.game_id,
    s.sport, 
    s.school, 
    s.start_dt, 
    s.end_inc_window,
    s.rank,
    s.season_type,
    s.opponent,
    s.opponent_rank,
    s.result,
    s.team_points,
    s.opp_points,
    s.ot,
    s.attendance, 
    i.INCIDENT_ID as incident_id,
    i.OFFENSE_ID as offense_id,
    i.OFFENSE_NAME as offense_name,
    i.INCIDENT_DATE as incident_date

FROM
    sports s,
    incs i

WHERE
    i.INCIDENT_DATE BETWEEN s.start_dt AND s.end_inc_window AND
    i.school = s.school
"""


result = spark.sql(query)

In [90]:
df = result.toPandas() # convert Spark DataFrame to pandas DataFrame
print(df.shape)
df.head()

(1172, 18)


Unnamed: 0,game_id,sport,school,start_dt,end_inc_window,rank,season_type,opponent,opponent_rank,result,team_points,opp_points,ot,attendance,incident_id,offense_id,offense_name,incident_date
0,188,football,michigan state,2013-10-26 15:30:00,2013-10-27 02:30:00,,regular,illinois,,1,42,3,0,45895.0,68584649,77593987,Rape,2013-10-27 01:00:00
1,3,football,michigan,2009-09-26 12:00:00,2009-09-26 23:00:00,23.0,regular,indiana,,1,36,33,0,0.0,50954556,55764025,Pocket-picking,2009-09-26 15:00:00
2,11,football,michigan,2009-11-21 12:00:00,2009-11-21 23:00:00,,regular,ohio state,10.0,0,10,21,0,0.0,50956387,55769038,Destruction/Damage/Vandalism of Property,2009-11-21 22:00:00
3,55,football,michigan,2013-11-02 15:30:00,2013-11-03 01:30:00,21.0,regular,michigan state,22.0,0,6,29,0,76306.0,69063561,75222618,All Other Larceny,2013-11-02 22:00:00
4,193,football,michigan state,2013-12-07 20:17:00,2013-12-08 07:17:00,10.0,regular,ohio state,2.0,1,34,24,0,66002.0,68586276,77590661,Arson,2013-12-08 00:00:00


In [91]:
len(df.game_id.unique()) # check how many unique games had some type of incident associated to them

451

Now, since we performed an SQL operation that returned results that matched our criteria, we need to join these results to the games dataset.\
\
We'll create another table view and call it `game_incs` and use a left join to the sports data.

In [92]:
result.createOrReplaceTempView('game_incs')

In [93]:
query2 = """

SELECT s.*, g.incident_id, g.offense_id, g.offense_name, g.incident_date

FROM
    sports s LEFT JOIN game_incs g
    ON s.game_id = g.game_id
    
"""

result2 = spark.sql(query2)


In [94]:
full = result2.toPandas().set_index('game_id')

In [95]:
len(full.index.unique()), full.shape

(993, (1714, 17))

<a id='check'></a>


&nbsp;

## Checking the full DataFrame


In [35]:
full.sample(20)

Unnamed: 0_level_0,school,sport,start_dt,season_type,opponent,result,team_points,opp_points,ot,rank,opponent_rank,end_inc_window,attendance,incident_id,offense_id,offense_name,incident_date
game_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
92,michigan,football,2016-11-05 15:30:00,regular,maryland,1,59,3,0,3.0,,2016-11-06 01:30:00,110626.0,88535126.0,96893178.0,Theft From Building,2016-11-05 17:00:00
433,michigan,basketball,2014-02-26 19:00:00,regular,purdue,1,77,76,1,16,,2014-02-27 05:00:00,,74950904.0,81996544.0,Destruction/Damage/Vandalism of Property,2014-02-27 00:00:00
326,michigan,basketball,2011-02-19 16:30:00,regular,iowa,1,75,72,1,-,,2011-02-20 02:30:00,,59271695.0,62410646.0,Drug/Narcotic Violations,2011-02-20 02:00:00
563,michigan,basketball,2017-12-21 21:00:00,regular,alabama a&m,1,97,47,0,-,,2017-12-22 07:00:00,,,,,NaT
696,michigan-state,basketball,2011-03-11 18:30:00,CTOURN,purdue (9),1,74,56,0,-,9.0,2011-03-12 04:30:00,,,,,NaT
547,michigan,basketball,2017-03-17 12:15:00,NCAA,oklahoma state,1,92,91,0,,,2017-03-17 22:15:00,,90103582.0,112107050.0,Intimidation,2017-03-17 17:00:00
237,michigan state,football,2017-10-21 15:30:00,regular,indiana,1,17,9,0,18.0,,2017-10-22 02:30:00,74111.0,96011770.0,118856075.0,Theft of Motor Vehicle Parts or Accessories,2017-10-21 23:00:00
516,michigan,basketball,2016-11-23 17:00:00,regular,south carolina,0,46,61,0,25,,2016-11-24 03:00:00,,89208456.0,97661651.0,Embezzlement,2016-11-24 00:00:00
24,michigan,football,2011-09-03 15:30:00,regular,western michigan,1,34,10,1,,,2011-09-04 02:30:00,0.0,59274502.0,62414074.0,Simple Assault,2011-09-04 00:00:00
227,michigan state,football,2016-11-05 12:00:00,regular,illinois,0,27,31,0,,,2016-11-05 23:00:00,47144.0,88501256.0,96857112.0,Theft From Building,2016-11-05 20:00:00


In [36]:
# Quick calculation to determine the number of sporting events that have an actual incident associated to them
grp = full.groupby('start_dt')[['incident_id','offense_name']].count()
grp[grp['incident_id'] > 0].shape[0] / 993

0.43202416918429004

In [37]:
full.to_csv('../04_finaldata/full_data.csv')

In [38]:
# Let's do a quick validation to make sure that the data associated ok. Take the game start_date with the highest number of incidents and check what offenses were created. Validate that we have the same number.
grp.sort_values('incident_id', ascending=False)

Unnamed: 0_level_0,incident_id,offense_name
start_dt,Unnamed: 1_level_1,Unnamed: 2_level_1
2014-10-25 15:30:00,14,14
2016-10-29 12:00:00,12,12
2017-10-21 15:30:00,12,12
2011-11-19 12:00:00,12,12
2009-10-03 12:00:00,11,11
...,...,...
2012-02-11 18:00:00,0,0
2015-11-27 17:30:00,0,0
2015-11-26 21:30:00,0,0
2015-11-26 18:30:00,0,0


In [39]:
# Seeing 10-25-2014 has the highest number of offenses.  Let's see what incidents/offenses were generated
testsql = "SELECT i.school, i.INCIDENT_ID,i.OFFENSE_NAME FROM incs i WHERE i.INCIDENT_DATE BETWEEN '2014-10-25 15:30:00' AND '2014-10-26 2:30:00'"
r = spark.sql(testsql)
r.show()

+--------------+-----------+--------------------+
|        school|INCIDENT_ID|        OFFENSE_NAME|
+--------------+-----------+--------------------+
|      michigan|   74955899|Drug/Narcotic Vio...|
|michigan state|   73419507| Theft From Building|
|michigan state|   73418616|Destruction/Damag...|
|michigan state|   73418627|Destruction/Damag...|
|      michigan|   74955896| Theft From Building|
|michigan state|   73418619|      Simple Assault|
|michigan state|   73418637|      Simple Assault|
|michigan state|   73418625|      Simple Assault|
|michigan state|   73418636|      Simple Assault|
|michigan state|   73418620|      Simple Assault|
|michigan state|   73418617|  Aggravated Assault|
|michigan state|   73418618|      Simple Assault|
|      michigan|   74955897| Theft From Building|
|michigan state|   73418623|      Simple Assault|
+--------------+-----------+--------------------+



Looks like 14 offenses and most attributed on MSU's campus (assault).  This was a game that MSU and U-M played and MSU won, btw.

In [40]:
tstsql = "SELECT i.school, i.INCIDENT_ID,i.OFFENSE_NAME FROM incs i WHERE i.INCIDENT_DATE BETWEEN '2011-01-30 18:00:00' AND '2011-01-31 04:00:00'"
tresult = spark.sql(tstsql)
tresult.show()

+------+-----------+------------+
|school|INCIDENT_ID|OFFENSE_NAME|
+------+-----------+------------+
+------+-----------+------------+



In [41]:
df_full = pd.read_csv('../04_finaldata/full_data.csv', parse_dates=['start_dt', 'end_inc_window'])