### Library Imports

In [23]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

### Exploratory Data Analysis

In [4]:
df = pd.read_csv('ProjectDataset.csv')
df.head()

Unnamed: 0,state,playerid,wagerid,event_start,placed_date,settled_date,sportname,bet_type,result,net_stake,ggr,legresult,decimalodds
0,State1,30651210.0,1693004.0,2021-04-28 00:30:00+00,2021-04-27,2021-04-27,nhl,straight,won,6.64,-4.96,won,1.74627
1,State1,22237170.0,1696371.0,2021-04-28 01:45:00+00,2021-04-27,2021-04-27,nba,parlay,lost,5.0,5.0,won,1.78125
2,State1,22237170.0,1696371.0,2021-04-28 01:45:00+00,2021-04-27,2021-04-27,nba,parlay,lost,5.0,5.0,lost,1.86207
3,State1,22237170.0,1696371.0,2021-04-28 01:45:00+00,2021-04-27,2021-04-27,nba,parlay,lost,5.0,5.0,lost,1.74627
4,State1,22237170.0,1696371.0,2021-04-28 01:45:00+00,2021-04-27,2021-04-27,nba,parlay,lost,5.0,5.0,lost,1.78125


In [6]:
# rows and columns count

df.shape

(4174500, 13)

In [7]:
# names of columns

df.columns.values

array(['state', 'playerid', 'wagerid', 'event_start', 'placed_date',
       'settled_date', 'sportname', 'bet_type', 'result', 'net_stake',
       'ggr', 'legresult', 'decimalodds'], dtype=object)

In [10]:
# count of unique values in each column

df.nunique()

state                 3
playerid          36387
wagerid         2402300
event_start       11991
placed_date         366
settled_date        367
sportname             7
bet_type              2
result                2
net_stake         31701
ggr               96979
legresult             5
decimalodds       18694
dtype: int64

In [19]:
# where do null values exist in the dataframe

df.isna().sum()

state              0
playerid           0
wagerid            0
event_start        0
placed_date        0
settled_date       0
sportname          0
bet_type           0
result             0
net_stake          0
ggr                0
legresult          0
decimalodds     2472
dtype: int64

There are 4174500 rows and 13 columns in the dataset. There are 2472 null values, all of which are in the 'decimalodds' columns.

In [41]:
df.describe()

Unnamed: 0,playerid,wagerid,net_stake,ggr,decimalodds
count,4174500.0,4174500.0,4174500.0,4174500.0,4172028.0
mean,22739270.0,47541730.0,23.61655,2.451643,3.984459
std,10338950.0,36751710.0,81.89246,85.32259,16.0293
min,59716.4,1691622.0,4.001,-21000.0,1.0001
25%,17502100.0,22962530.0,5.0,0.0,1.64935
50%,23798740.0,38594040.0,10.0,6.0,1.90909
75%,30640890.0,58615770.0,20.0,12.0,2.2
max,42976710.0,177919200.0,19672.0,19672.0,5001.0


ggr values on average are slightly higher than the net_stake values. How to increase the margin?

### Examining Null Values

In [21]:
null_data = df[df.isnull().any(axis=1)]
null_data.head()

Unnamed: 0,state,playerid,wagerid,event_start,placed_date,settled_date,sportname,bet_type,result,net_stake,ggr,legresult,decimalodds
936440,State1,22274820.0,48954810.0,2021-03-29 23:25:00+00,2021-03-29,2021-03-30,college basketball,parlay,won,5.0,-6.25,won,
936441,State1,22274820.0,48954810.0,2021-03-30 02:02:00+00,2021-03-29,2021-03-30,college basketball,parlay,won,5.0,-6.25,won,
936833,State1,28869490.0,49117400.0,2021-03-30 23:25:00+00,2021-03-30,2021-03-31,college basketball,parlay,lost,5.0,5.0,won,
936834,State1,28869490.0,49117400.0,2021-03-31 02:07:00+00,2021-03-30,2021-03-31,college basketball,parlay,lost,5.0,5.0,lost,
936850,State1,30914580.0,49121960.0,2021-03-31 02:07:00+00,2021-03-30,2021-03-31,college basketball,parlay,lost,4.84,4.84,lost,


In [30]:
null_data.shape

(2472, 13)

In [34]:
null_data.nunique()

state             3
playerid        288
wagerid         928
event_start     394
placed_date      88
settled_date     84
sportname         3
bet_type          1
result            2
net_stake       191
ggr             248
legresult         3
decimalodds       0
dtype: int64

In [35]:
null_data.sportname.value_counts()

nba                   1823
college basketball     638
college football        11
Name: sportname, dtype: int64

Rows with null values seem to be random and shouldn't be counted as outlier data points so no reason to drop them.

### Examining Individual Columns

In [26]:
# how many states are included in dataframe

df.state.value_counts(normalize=True)

State2    0.400008
State3    0.330195
State1    0.269797
Name: state, dtype: float64

In [25]:
# which sports are included in the dataframe

df.sportname.value_counts(normalize=True)

nfl                   0.329091
nba                   0.325649
college basketball    0.119011
mlb                   0.113918
college football      0.062496
nhl                   0.044950
champions league      0.004885
Name: sportname, dtype: float64

In [27]:
# what kinds of bets were placed

df.bet_type.value_counts(normalize=True)

parlay      0.589196
straight    0.410804
Name: bet_type, dtype: float64

In [28]:
# wins vs losses

df.result.value_counts(normalize=True)

lost    0.732141
won     0.267859
Name: result, dtype: float64

In [29]:
# 

df.legresult.value_counts(normalize=True)

lost       0.505631
won        0.488063
void       0.005670
open       0.000626
unknown    0.000010
Name: legresult, dtype: float64

In [38]:
# which kinds of bets produced the highest ggr

df.ggr.sort_values(ascending=False)

3330266    19672.00
102129     12000.00
565327     12000.00
3224061    10000.00
2122683     8743.32
             ...   
2532086   -10000.00
3737475   -10909.09
3329787   -14297.50
1057409   -19620.00
557657    -21000.00
Name: ggr, Length: 4174500, dtype: float64

In [48]:
df['net_stake'].groupby(df['playerid']).value_counts()

playerid      net_stake 
5.971640e+04  5.000000      141
              10.000000      66
              6.666666       56
              7.500000       46
              25.000000      46
                           ... 
4.297319e+07  200.000000      1
4.297671e+07  25.000000       2
              10.000000       1
              20.000000       1
              30.000000       1
Name: net_stake, Length: 567135, dtype: int64

### Visualizations

### Things to look at

In [None]:
# histogram breaking down the number of bets placed by people, how to count that?
# which sports have the most returning users, what is the amount that they are betting, wins/losses?
# 