In [3]:
# The purpose of this notebook is to perform chi-squared
# analysis on the various categorical variables of the data.

import os
import numpy as np
import pandas as pd
from scipy.stats import chi2_contingency

# abspath('') expands to the current working directory, which is
# where the notebook kernel was started.
script_dir = os.path.abspath('')

# Location of the interim training data, relative to script_dir;
# realpath collapses the '..' component into a canonical path.
_relative_csv = '/../data/interim/train_users_2_2.csv'
file = os.path.realpath(script_dir + _relative_csv)

df = pd.read_csv(file)

# Preview the first rows of the loaded frame.
df.head()

Unnamed: 0.1,Unnamed: 0,id,date_account_created,timestamp_first_active,date_first_booking,gender,age,signup_method,signup_flow,language,affiliate_channel,affiliate_provider,first_affiliate_tracked,signup_app,first_device_type,first_browser,country_destination,days_thinking,number_of_actions
0,0,d1mm9tcy42,2014-01-01,2014-01-01 00:09:36,2014-01-04 00:00:00,MALE,62.0,basic,0,en,sem-non-brand,google,omg,Web,Windows Desktop,Chrome,other,2.0,127
1,1,xwxei6hdk4,2014-01-01,2014-01-01 00:27:42,2014-01-07 00:00:00,FEMALE,32.0,facebook,0,en,seo,google,linked,Web,iPad,Mobile Safari,US,5.0,7
2,2,ro2stddszp,2014-01-01,2014-01-01 00:55:03,2014-12-04 00:00:00,-unknown-,19.0,basic,0,en,sem-brand,google,untracked,Web,Mac Desktop,Safari,other,336.0,43
3,3,qtw88d9pbl,2014-01-01,2014-01-01 00:58:37,,MALE,25.0,basic,0,en,direct,direct,untracked,Web,Mac Desktop,Chrome,NDF,,364
4,4,awiurksqr3,2014-01-01,2014-01-01 01:01:13,2014-01-02 00:00:00,FEMALE,32.0,facebook,0,en,direct,direct,untracked,Web,iPad,Mobile Safari,US,0.0,8


In [14]:
# Cross-tabulate destination (rows) against gender (columns) to get
# the observed-frequency table that the chi-squared test operates on.
# '-unknown-' is deliberately kept as a gender level of its own
# rather than being discarded as noise; that choice may be revisited
# as we tweak our model.

dest_by_gender = pd.crosstab(df['country_destination'], df['gender'])

print(dest_by_gender)

gender               -unknown-  FEMALE  MALE  OTHER
country_destination                                
AU                          15      39    57      0
CA                          65     119   117      2
DE                          29      60    94      1
ES                          92     214   163      0
FR                         205     435   308      0
GB                          98     244   155      1
IT                         127     283   191      0
NDF                       1835    7209  5985     18
NL                          36      68    71      1
PT                           4      21    19      0
US                        3017    6078  5106     22
other                      480     860  1002      5


In [19]:
chi2_contingency(observed=dest_by_gender)

# The p-value is vanishingly small, so we reject the hypothesis that
# country destination is independent of gender.

# Caveat: the 'OTHER' column is very sparse. Several of its observed
# counts are zero and most of its expected counts are far below 5,
# which weakens the chi-squared approximation. Let's repeat the test
# with this category removed.

(582.5265216284108,
 1.0254057356102045e-101,
 33,
 array([[1.90647764e+01, 4.96389231e+01, 4.21375068e+01, 1.58793740e-01],
        [5.20416869e+01, 1.35500844e+02, 1.15024005e+02, 4.33463992e-01],
        [3.16028726e+01, 8.22843409e+01, 6.98495608e+01, 2.63225659e-01],
        [8.05529742e+01, 2.09735630e+02, 1.78040457e+02, 6.70939315e-01],
        [1.62823496e+02, 4.23943235e+02, 3.59877085e+02, 1.35618437e+00],
        [8.55338617e+01, 2.22704358e+02, 1.89049355e+02, 7.12425968e-01],
        [1.03224600e+02, 2.68765701e+02, 2.28149924e+02, 8.59775114e-01],
        [2.58439361e+03, 6.72898086e+03, 5.71209968e+03, 2.15258505e+01],
        [3.02288347e+01, 7.87067609e+01, 6.68126234e+01, 2.51781065e-01],
        [7.55720866e+00, 1.96766902e+01, 1.67031558e+01, 6.29452662e-02],
        [2.44286770e+03, 6.36049011e+03, 5.39929513e+03, 2.03470573e+01],
        [4.03108380e+02, 1.04957254e+03, 8.90961518e+02, 3.35755772e+00]]))

In [7]:
# Same test as before, with the sparse 'OTHER' gender rows excluded.
df2 = df.loc[df['gender'].ne('OTHER')]

dest_by_gender2 = pd.crosstab(df2['country_destination'], df2['gender'])

print(dest_by_gender2)

chi2_contingency(observed=dest_by_gender2)

# The p-value is still extremely low without 'OTHER', so the earlier
# result was not driven by the sparse column. That's good.

gender               -unknown-  FEMALE  MALE
country_destination                         
AU                          15      39    57
CA                          65     119   117
DE                          29      60    94
ES                          92     214   163
FR                         205     435   308
GB                          98     244   155
IT                         127     283   191
NDF                       1835    7209  5985
NL                          36      68    71
PT                           4      21    19
US                        3017    6078  5106
other                      480     860  1002


(567.8597147238904,
 4.771902511772956e-106,
 22,
 array([[  19.09208905,   49.71003696,   42.19787399],
        [  51.77224148,  134.79928942,  114.4284691 ],
        [  31.47614682,   81.95438526,   69.56946792],
        [  80.66837626,  210.03610212,  178.29552162],
        [ 163.05676055,  424.55058594,  360.39265351],
        [  85.48439873,  222.5755709 ,  188.94003037],
        [ 103.37248216,  269.15074067,  228.47677717],
        [2585.0000573 , 6730.55986934, 5713.44007335],
        [  30.1001404 ,   78.37167989,   66.52817971],
        [   7.5680353 ,   19.70487952,   16.72708518],
        [2442.58339303, 6359.7498639 , 5398.66674307],
        [ 402.82587891, 1048.83699607,  890.33712501]]))

In [22]:
# Destination (rows) against signup method (columns).
dest_by_signup = pd.crosstab(df['country_destination'], df['signup_method'])

print(dest_by_signup)

chi2_contingency(observed=dest_by_signup)

# As with gender, one column is nearly empty: 'google' has only a
# handful of signups, so its observed counts are mostly zero and all
# of its expected counts are below 5. That makes the result
# unreliable. Run the test again without 'google'.

signup_method        basic  facebook  google
country_destination                         
AU                      81        30       0
CA                     233        70       0
DE                     125        59       0
ES                     327       142       0
FR                     695       253       0
GB                     363       135       0
IT                     449       152       0
NDF                   6928      8116       3
NL                     128        48       0
PT                      28        16       0
US                   10112      4105       6
other                 1658       689       0


(2310.7467136516993,
 0.0,
 22,
 array([[6.70967068e+01, 4.38747103e+01, 2.85828732e-02],
        [1.83155875e+02, 1.19766101e+02, 7.80235186e-02],
        [1.11223370e+02, 7.27292495e+01, 4.73806186e-02],
        [2.83498698e+02, 1.85380533e+02, 1.20769077e-01],
        [5.73042145e+02, 3.74713742e+02, 2.44113187e-01],
        [3.01028468e+02, 1.96843295e+02, 1.28236674e-01],
        [3.63289377e+02, 2.37555864e+02, 1.54759520e-01],
        [9.09553286e+03, 5.94759249e+03, 3.87465309e+00],
        [1.06387571e+02, 6.95671082e+01, 4.53205917e-02],
        [2.65968928e+01, 1.73917771e+01, 1.13301479e-02],
        [8.59744560e+03, 5.62189193e+03, 3.66247032e+00],
        [1.41870244e+03, 9.27693199e+02, 6.04360390e-01]]))

In [25]:
# Signup-method test repeated with the sparse 'google' rows excluded.
df3 = df.loc[df['signup_method'].ne('google')]

dest_by_signup2 = pd.crosstab(df3['country_destination'], df3['signup_method'])

print(dest_by_signup2)

chi2_contingency(observed=dest_by_signup2)

# The p-value is still effectively zero once 'google' is removed.

signup_method        basic  facebook
country_destination                 
AU                      81        30
CA                     233        70
DE                     125        59
ES                     327       142
FR                     695       253
GB                     363       135
IT                     449       152
NDF                   6928      8116
NL                     128        48
PT                      28        16
US                   10112      4105
other                 1658       689


(2307.5453176407136, 0.0, 11, array([[  67.1139889 ,   43.8860111 ],
        [ 183.20305077,  119.79694923],
        [ 111.25201763,   72.74798237],
        [ 283.57171885,  185.42828115],
        [ 573.189743  ,  374.810257  ],
        [ 301.10600424,  196.89399576],
        [ 363.38294889,  237.61705111],
        [9096.06170225, 5947.93829775],
        [ 106.41497338,   69.58502662],
        [  26.60374335,   17.39625665],
        [8596.03225345, 5620.96774655],
        [1419.0678553 ,  927.9321447 ]]))

In [26]:
# Destination (rows) against interface language (columns).
dest_by_language = pd.crosstab(df['country_destination'], df['language'])

print(dest_by_language)

chi2_contingency(observed=dest_by_language)

# Almost every user is in the 'en' column, so most of the other
# destination/language cells are empty and their expected counts
# are well below 1. With that many sparse cells the chi-squared
# test is not usable here. I'm fairly certain language has some
# effect on the destination, but this analysis can't be conclusive.

language             cs  da  de  el     en   es  fi   fr  hu  id  ...  ko  nl  \
country_destination                                               ...           
AU                    0   0   1   0    109    0   0    1   0   0  ...   0   0   
CA                    0   0   1   0    298    0   0    0   0   0  ...   1   0   
DE                    0   0   8   0    171    0   0    1   0   0  ...   0   1   
ES                    0   0   2   1    451    3   0    4   0   0  ...   0   0   
FR                    0   0   4   0    915    3   1   12   0   0  ...   5   1   
GB                    0   0   0   0    491    1   0    2   0   0  ...   1   0   
IT                    0   0   3   0    584    0   0    2   0   0  ...   2   0   
NDF                   3   3  56   4  14186  145   2  127   2   8  ...  93   6   
NL                    0   0   0   0    172    1   0    1   0   0  ...   0   1   
PT                    0   0   1   0     41    0   0    2   0   0  ...   0   0   
US                    1   2 

(557.5784866368358,
 5.936552622769447e-27,
 242,
 array([[1.27034992e-02, 1.58793740e-02, 3.62049727e-01, 2.54069984e-02,
         1.06576006e+02, 6.22471460e-01, 1.27034992e-02, 6.54230208e-01,
         1.27034992e-02, 2.54069984e-02, 6.35174959e-03, 2.73125232e-01,
         1.65145489e-01, 5.04964093e-01, 3.81104976e-02, 9.52762439e-03,
         3.81104976e-02, 1.77848989e-01, 2.82652857e-01, 7.30451203e-02,
         2.54069984e-02, 2.22311236e-02, 1.06391806e+00],
        [3.46771194e-02, 4.33463992e-02, 9.88297903e-01, 6.93542388e-02,
         2.90923693e+02, 1.69917885e+00, 3.46771194e-02, 1.78587165e+00,
         3.46771194e-02, 6.93542388e-02, 1.73385597e-02, 7.45558067e-01,
         4.50802552e-01, 1.37841550e+00, 1.04031358e-01, 2.60078395e-02,
         1.04031358e-01, 4.85479672e-01, 7.71565907e-01, 1.99393437e-01,
         6.93542388e-02, 6.06849589e-02, 2.90420875e+00],
        [2.10580527e-02, 2.63225659e-02, 6.00154502e-01, 4.21161054e-02,
         1.76666533e+02, 1.0318

In [23]:
# Destination (rows) against affiliate provider (columns).
dest_by_affiliate = pd.crosstab(df['country_destination'],
                                df['affiliate_provider'])

print(dest_by_affiliate)

chi2_contingency(observed=dest_by_affiliate)

# Too many destination/provider pairings are empty for this test
# to give a reliable result, and there is no clear way to fix that
# by combining columns.
# Additionally, this feature looks like it is significantly
# cross-correlated with 'signup_method'.
# This one should get sidelined.

affiliate_provider   baidu  bing  craigslist  direct  email-marketing  \
country_destination                                                     
AU                       0     1           0      68                0   
CA                       0     2           0     199                0   
DE                       0     4           0     100                0   
ES                       0     3           0     303                0   
FR                       0     9           0     577                0   
GB                       0     4           0     299                0   
IT                       0     7           0     364                1   
NDF                      2   189           1    8353               24   
NL                       0     1           0     101                0   
PT                       0     3           0      21                0   
US                       3   153           0    8816               16   
other                    2    31           0    147

(340.0006962356616,
 3.244720036272799e-14,
 165,
 array([[2.22311236e-02, 1.29258104e+00, 3.17587480e-03, 6.56643873e+01,
         1.46090241e-01, 1.40691253e+00, 1.84200738e-01, 3.77357443e+01,
         4.76381219e-02, 6.35174959e-03, 2.54069984e-02, 3.95714000e+00,
         3.14411605e-01, 3.81104976e-02, 1.27034992e-01, 2.85828732e-02],
        [6.06849589e-02, 3.52839690e+00, 8.66927985e-03, 1.79246030e+02,
         3.98786873e-01, 3.84049097e+00, 5.02818231e-01, 1.03008383e+02,
         1.30039198e-01, 1.73385597e-02, 6.93542388e-02, 1.08019227e+01,
         8.58258705e-01, 1.04031358e-01, 3.46771194e-01, 7.80235186e-02],
        [3.68515922e-02, 2.14265686e+00, 5.26451318e-03, 1.08849074e+02,
         2.42167606e-01, 2.33217934e+00, 3.05341764e-01, 6.25529456e+01,
         7.89676976e-02, 1.05290264e-02, 4.21161054e-02, 6.55958342e+00,
         5.21186804e-01, 6.31741581e-02, 2.10580527e-01, 4.73806186e-02],
        [9.39315041e-02, 5.46144602e+00, 1.34187863e-02, 2.77446826e+02

In [27]:
# Destination (rows) against signup application (columns).
dest_by_app = pd.crosstab(df['country_destination'], df['signup_app'])

print(dest_by_app)

chi2_contingency(observed=dest_by_app)

# A fair number of observed cells are zero, though fewer than
# twenty percent of them (8 of 48). Judged by expected counts the
# table is sparser: 14 of the 48 expected frequencies are below 5,
# mostly in the 'Android' and 'Moweb' columns. The test is therefore
# a little dubious, but its very small p-value still indicates that
# application type is not independent of destination.

signup_app           Android  Moweb    Web  iOS
country_destination                            
AU                         0      0    105    6
CA                         1      8    288    6
DE                         0      2    178    4
ES                         2      6    444   17
FR                         0      9    914   25
GB                         1      7    474   16
IT                         2      8    570   21
NDF                       30    411  13744  862
NL                         0      3    166    7
PT                         0      0     44    0
US                        44    433  13098  648
other                      4     41   2207   95


(113.47190036709551,
 9.213051820558939e-11,
 33,
 array([[2.66773483e-01, 2.94721181e+00, 1.02364796e+02, 5.42121828e+00],
        [7.28219507e-01, 8.04509170e+00, 2.79428228e+02, 1.47984607e+01],
        [4.42219107e-01, 4.88546823e+00, 1.69685789e+02, 8.98652399e+00],
        [1.12717805e+00, 1.24526337e+01, 4.32514320e+02, 2.29058682e+01],
        [2.27838975e+00, 2.51707820e+01, 8.74250694e+02, 4.63001345e+01],
        [1.19687563e+00, 1.32226260e+01, 4.59258276e+02, 2.43222225e+01],
        [1.44442219e+00, 1.59574261e+01, 5.54245429e+02, 2.93527224e+01],
        [3.61634288e+01, 3.99519785e+02, 1.38764243e+04, 7.34892535e+02],
        [4.22992189e-01, 4.67305656e+00, 1.62308146e+02, 8.59580556e+00],
        [1.05748047e-01, 1.16826414e+00, 4.05770364e+01, 2.14895139e+00],
        [3.41830563e+01, 3.77641384e+02, 1.31165270e+04, 6.94648537e+02],
        [5.64069698e+00, 6.23162714e+01, 2.16441601e+03, 1.14627021e+02]]))

In [29]:
# Destination (rows) against first device type (columns), using the
# raw device categories.
dest_by_device = pd.crosstab(df['country_destination'],
                             df['first_device_type'])

print(dest_by_device)

chi2_contingency(observed=dest_by_device)

# Several device categories are very thinly populated. Let's try
# this again with fewer categories.

first_device_type    Android Phone  Android Tablet  Desktop (Other)  \
country_destination                                                   
AU                               0               0                1   
CA                               3               2                8   
DE                               0               1                1   
ES                               1               1                2   
FR                               3              12                8   
GB                               1               1                2   
IT                               2               4                3   
NDF                            177             133              130   
NL                               0               2                0   
PT                               0               1                0   
US                             115             105              117   
other                           15              23               19   

first

(222.69672814169243,
 1.192418269634232e-13,
 88,
 array([[1.00675231e+00, 9.05124317e-01, 9.24179566e-01, 5.41454894e+01,
         6.98692455e-02, 4.76381219e-02, 4.08735086e+01, 7.55223027e+00,
         5.47520815e+00],
        [2.74816171e+00, 2.47074476e+00, 2.52276044e+00, 1.47802552e+02,
         1.90724157e-01, 1.30039198e-01, 1.11573632e+02, 2.06155475e+01,
         1.49458385e+01],
        [1.66885068e+00, 1.50038626e+00, 1.53197333e+00, 8.97546851e+01,
         1.15819290e-01, 7.89676976e-02, 6.77542846e+01, 1.25190123e+01,
         9.07602071e+00],
        [4.25375526e+00, 3.82435410e+00, 3.90486681e+00, 2.28776888e+02,
         2.95213299e-01, 2.01281795e-01, 1.72699780e+02, 3.19098738e+01,
         2.31339876e+01],
        [8.59820892e+00, 7.73025092e+00, 7.89299305e+00, 4.62431747e+02,
         5.96721124e-01, 4.06855312e-01, 3.49081857e+02, 6.45001288e+01,
         4.67612372e+01],
        [4.51678064e+00, 4.06082802e+00, 4.14631913e+00, 2.42923006e+02,
         3.134674

In [38]:
# Collapse the fragmented device categories into broader groups:
# iPad/iPhone become 'iOS', and the thinly populated Android,
# generic and unknown categories become 'Other'. 'Mac Desktop' and
# 'Windows Desktop' are left unchanged.
device_groups = {
    'iPad': 'iOS',
    'iPhone': 'iOS',
    'Android Phone': 'Other',
    'Android Tablet': 'Other',
    'Desktop (Other)': 'Other',
    'SmartPhone (Other)': 'Other',
    'Other/Unknown': 'Other',
}

# Build df4 as a new frame rather than aliasing df. With `df4 = df`
# the relabelling would also overwrite df['first_device_type'], so
# re-running the earlier raw-device cell would no longer reproduce
# its output.
df4 = df.assign(
    first_device_type=df['first_device_type'].replace(device_groups)
)

dest_by_device2 = pd.crosstab(index=df4['country_destination'],
                              columns=df4['first_device_type'])

print(dest_by_device2)

chi2_contingency(dest_by_device2)

# The 'Other' group is still thin: its expected count is below 5
# for several of the smaller destinations. One more try.

first_device_type    Mac Desktop  Other  Windows Desktop   iOS
country_destination                                           
AU                            59      1               32    19
CA                           148     13              119    23
DE                           113      2               53    16
ES                           245      4              163    57
FR                           527     23              311    87
GB                           275      5              162    56
IT                           314      9              212    66
NDF                         7008    462             5664  1913
NL                           103      2               54    17
PT                            18      1               21     4
US                          7139    351             5148  1585
other                       1100     57              931   259


(141.23522756690292,
 2.396913929083777e-15,
 33,
 array([[5.41454894e+01, 2.95356356e+00, 4.08735086e+01, 1.30274384e+01],
        [1.47802552e+02, 8.06243026e+00, 1.11573632e+02, 3.55613859e+01],
        [8.97546851e+01, 4.89599725e+00, 6.77542846e+01, 2.15950330e+01],
        [2.28776888e+02, 1.24794713e+01, 1.72699780e+02, 5.50438614e+01],
        [4.62431747e+02, 2.52250293e+01, 3.49081857e+02, 1.11261366e+02],
        [2.42923006e+02, 1.32511230e+01, 1.83378444e+02, 5.84474264e+01],
        [2.93166118e+02, 1.59918171e+01, 2.21306114e+02, 7.05359503e+01],
        [7.33988450e+03, 4.00380819e+02, 5.54075391e+03, 1.76598077e+03],
        [8.58523075e+01, 4.68312781e+00, 6.48084461e+01, 2.06561186e+01],
        [2.14630769e+01, 1.17078195e+00, 1.62021115e+01, 5.16402964e+00],
        [6.93793960e+03, 3.78455266e+02, 5.23733255e+03, 1.66927258e+03],
        [1.14486003e+03, 6.24505737e+01, 8.64235358e+02, 2.75454036e+02]]))

In [40]:
# Device test repeated with the thin 'Other' group excluded.
df5 = df4.loc[df4['first_device_type'].ne('Other')]

dest_by_device3 = pd.crosstab(df5['country_destination'],
                              df5['first_device_type'])

print(dest_by_device3)

chi2_contingency(observed=dest_by_device3)

# Ok, the result is significant with every expected count above 5.
# We want to use this variable in our test as well.

first_device_type    Mac Desktop  Windows Desktop   iOS
country_destination                                    
AU                            59               32    19
CA                           148              119    23
DE                           113               53    16
ES                           245              163    57
FR                           527              311    87
GB                           275              162    56
IT                           314              212    66
NDF                         7008             5664  1913
NL                           103               54    17
PT                            18               21     4
US                          7139             5148  1585
other                       1100              931   259


(106.2180120432377,
 5.192320318874483e-13,
 22,
 array([[5.51244819e+01, 4.16125334e+01, 1.32629846e+01],
        [1.45328180e+02, 1.09705770e+02, 3.49660504e+01],
        [9.12059610e+01, 6.88498280e+01, 2.19442109e+01],
        [2.33026219e+02, 1.75907528e+02, 5.60662532e+01],
        [4.63546780e+02, 3.49923577e+02, 1.11529643e+02],
        [2.47057905e+02, 1.86499809e+02, 5.94422856e+01],
        [2.96669939e+02, 2.23951089e+02, 7.13789718e+01],
        [7.30900517e+03, 5.51744364e+03, 1.75855119e+03],
        [8.71969078e+01, 6.58234620e+01, 2.09796302e+01],
        [2.15486611e+01, 1.62667176e+01, 5.18462126e+00],
        [6.95169830e+03, 5.24771876e+03, 1.67258293e+03],
        [1.14759149e+03, 8.66297287e+02, 2.76111225e+02]]))

In [42]:
# Destination (rows) against first browser (columns), using every
# browser value present in the data.
dest_by_browser = pd.crosstab(df['country_destination'], df['first_browser'])

print(dest_by_browser)

chi2_contingency(observed=dest_by_browser)

# Ugh, the fragmentation!! Most browsers have almost no users, so
# the table is full of empty cells and tiny expected counts.

first_browser        AOL Explorer  Android Browser  Apple Mail  Avant Browser  \
country_destination                                                             
AU                              0                0           0              0   
CA                              0                0           0              0   
DE                              0                0           1              0   
ES                              0                1           0              0   
FR                              0                1           0              0   
GB                              0                1           0              0   
IT                              1                1           0              0   
NDF                             3               71           4              1   
NL                              0                0           0              0   
PT                              0                0           0              0   
US                          

(438.4940391715434,
 9.835244213659776e-06,
 319,
 array([[2.54069984e-02, 3.62049727e-01, 2.85828732e-02, 3.17587480e-03,
         2.22311236e-02, 4.48179451e+01, 1.08614918e+00, 5.08139967e-02,
         6.35174959e-03, 1.66130011e+01, 9.21321278e+00, 2.54069984e-02,
         9.52762439e-03, 6.35174959e-03, 2.54069984e-02, 3.17587480e-02,
         1.25986953e+01, 3.17587480e-03, 1.11155618e-01, 3.17587480e-03,
         3.17587480e-03, 3.17587480e-03, 2.58103345e+01, 3.17587480e-03,
         9.21003691e-02, 9.52762439e-03, 9.52762439e-03, 3.17587480e-03,
         3.17587480e-03, 1.90552488e-02],
        [6.93542388e-02, 9.88297903e-01, 7.80235186e-02, 8.66927985e-03,
         6.06849589e-02, 1.22340877e+02, 2.96489371e+00, 1.38708478e-01,
         1.73385597e-02, 4.53490029e+01, 2.51495808e+01, 6.93542388e-02,
         2.60078395e-02, 1.73385597e-02, 6.93542388e-02, 8.66927985e-02,
         3.43910332e+01, 8.66927985e-03, 3.03424795e-01, 8.66927985e-03,
         8.66927985e-03, 8.66927

In [43]:
# Restrict the browser test to the three browsers listed below to
# avoid the fragmentation seen in the full table.
topbrowsers = ['Chrome', 'Firefox', 'Safari']

df6 = df.loc[df['first_browser'].isin(topbrowsers)]

dest_by_browser2 = pd.crosstab(df6['country_destination'],
                               df6['first_browser'])

print(dest_by_browser2)

chi2_contingency(observed=dest_by_browser2)

# There's likely something to this one, too. Here we need to be
# careful due to the risk of cross-correlation with device type.
# (Safari users are almost certainly on iOS or Mac Desktop.)
# But there is significance!

first_browser        Chrome  Firefox  Safari
country_destination                         
AU                       40       12      30
CA                      130       56      70
DE                       75       26      51
ES                      185       74     119
FR                      366      132     278
GB                      193       76     136
IT                      237       86     159
NDF                    5902     2079    3500
NL                       73       28      43
PT                       20        9       8
US                     5912     2259    3236
other                   979      394     497


(56.14465126359901,
 8.15184542139852e-05,
 22,
 array([[  42.12537313,   15.61492537,   24.25970149],
        [ 131.51336003,   48.74903531,   75.73760466],
        [  78.08605752,   28.94473972,   44.96920277],
        [ 194.18769567,   71.98099745,  111.83130688],
        [ 398.64987259,  147.77051329,  229.57961412],
        [ 208.05824536,   77.12249727,  119.81925737],
        [ 247.61499818,   91.78529305,  142.59970877],
        [5898.06596287, 2186.27997816, 3396.65405897],
        [  73.97626502,   27.42133236,   42.60240262],
        [  19.00779032,    7.04575901,   10.94645067],
        [5860.05038224, 2172.18846014, 3374.76115763],
        [ 960.66399709,  356.09646888,  553.23953404]]))

In [9]:
# Lots of significant results here, for basically every variable
# we were able to test. Essentially this tells us that none of those
# categorical variables appears to be independent of the target
# variable. This test does not tell us, however, what that
# relationship is, how strong it is, or whether the variables should
# be included in our ML model.