<a href="https://colab.research.google.com/github/dgazeyreyn/amazon-purchases/blob/main/Categorical_Data_Analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Authenticate the current Colab user; the BigQuery cells below presumably
# rely on these credentials (NOTE(review): confirm no other auth path is used).
from google.colab import auth
auth.authenticate_user()
print('Authenticated')

Authenticated


In [None]:
# Load Colab's data_table IPython extension (affects how DataFrames are displayed).
%load_ext google.colab.data_table

In [None]:
%%bigquery --project mind-diagnostics-414622
-- Display query output immediately (no destination variable is given).
-- The %%bigquery cell magic has to be the very first line of the cell, so the
-- explanatory comment lives here as a SQL comment instead of above the magic.
-- NOTE(review): the job runs under project mind-diagnostics-414622 while the
-- table (and project_id further down) use sandbox-418514 - confirm intended.
SELECT
  COUNT(*) as total_rows
FROM `sandbox-418514.sample_data.visitor_signups_data`

In [None]:
# Google Cloud project passed to the BigQuery client created below.
project_id = 'sandbox-418514'

In [None]:
from google.cloud import bigquery

# BigQuery client bound to the project configured in `project_id`
client = bigquery.Client(project=project_id)

# Pull every column and row of the visitor sign-ups table into pandas.
df = client.query(
    '''
  SELECT
    *
  FROM
    `sandbox-418514.sample_data.visitor_signups_data`'''
).to_dataframe()

In [None]:
# =============================================================================
# Import dependencies
# =============================================================================
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
plt.rc("font", size=14)
import scipy.stats as stats

In [None]:
# Per-column completeness summary.
# values_cnt: number of non-missing entries in each column
fields_values_cnt_df = df.notna().sum().to_frame(name='values_cnt')
# nulls_cnt: number of missing entries in each column
fields_nulls_cnt_df = df.isna().sum().to_frame(name='nulls_cnt')
# both frames are indexed by df's column names, so they line up side by side
fields_summary_df = pd.concat([fields_values_cnt_df, fields_nulls_cnt_df], axis=1)

fields_summary_df.head(10)

Unnamed: 0,values_cnt,nulls_cnt
mduid,535391,0
bh_goal_name,8727,526664
region,535391,0
browser,535391,0
os_name,529900,5491


In [None]:
# replacing spaces with '_' to prevent the downstream logistic regression
# formulas from throwing errors. This runs FIRST so that the derived columns
# below are built from already-normalised labels; the cell is then idempotent
# (re-running it no longer moves 'Chrome_OS' rows into 'remaining' because the
# raw 'Chrome OS' label has disappeared from the data).
df = df.replace(' ', '_', regex=True)
# cleaning up some funkiness related to browser (contains both 'mobile_web' and
# 'mobile_web}' values); anything else is treated as 'desktop_web'
df['browser_rev'] = np.where(df['browser'].isin(['mobile_web', 'mobile_web}']),
                             'mobile_web', 'desktop_web')
# assigning null operating system (os_name) values to an 'unassigned' category;
# fillna covers None, NaN and pd.NA, whereas `x == None` only matched None
df['os_name_rev'] = df['os_name'].fillna('unassigned')
# operating system categories kept as their own level for subsequent analysis
# (underscore form, matching the normalised data); everything else -> 'remaining'
os_list = ['iOS', 'Android', 'Windows', 'macOS', 'Chrome_OS']
df['os_name_grouped'] = np.where(df['os_name_rev'].isin(os_list), df['os_name_rev'], 'remaining')
# binary 0/1 dependent variable: 1 when a signup goal name is present
df['user_signups'] = df['bh_goal_name'].notnull().astype(int)

In [None]:
# preview the first rows, including the derived *_rev / *_grouped / user_signups columns
df.head()

Unnamed: 0,mduid,bh_goal_name,region,browser,os_name,browser_rev,os_name_rev,os_name_grouped,user_signups
0,c3dd9c9787287cb2c3649933e05aadb6,,Rhode_Island,desktop_web,Linux,desktop_web,Linux,remaining,0
1,26a3aaee9cb2b0230f871a120b197dc0,,South_Dakota,desktop_web,Linux,desktop_web,Linux,remaining,0
2,ef7040df7893df97e3de1dab358a04bd,,Rhode_Island,desktop_web,Linux,desktop_web,Linux,remaining,0
3,1e5615df1c93f69a17095fb353563a7c,,Rhode_Island,desktop_web,Linux,desktop_web,Linux,remaining,0
4,e8ac65b98db15608c190bf30cb487693,,Montana,desktop_web,Ubuntu,desktop_web,Ubuntu,remaining,0


In [None]:
# =============================================================================
# =============================================================================
# CHI-SQUARE TEST OF INDEPENDENCE - Determining whether a statistically
# significant relationship exists between independent and dependent variable
# =============================================================================
# =============================================================================

In [None]:
# =============================================================================
# Browser and User Signups - Contingency Tables
# =============================================================================
# Observed counts of signups (columns) per browser type (rows); margins=True
# appends the 'All' row/column totals.
browser_tab = pd.crosstab(
    index=df['browser_rev'],
    columns=df['user_signups'],
    margins=True,
)
print(browser_tab)

user_signups       0     1     All
browser_rev                       
desktop_web    81503   766   82269
mobile_web    445161  7961  453122
All           526664  8727  535391


In [None]:
# =============================================================================
# Browser and User Signups - Expected Values
# =============================================================================
# Relabel the margins/outcome columns by name; rename() leaves the browser
# labels that came from the data untouched and is a no-op on re-run.
browser_tab = browser_tab.rename(index={"All": "col_totals"},
                                 columns={0: "no", 1: "yes", "All": "row_totals"})
# margins occupy the last row and column; everything before them is observed
browser_observed = browser_tab.iloc[:-1, :-1]

# Expected count under independence = row total * column total / grand total.
# The grand total is read from the table rather than hard-coded (535391) so the
# result stays correct if the underlying row count changes.
browser_expected = pd.DataFrame(
    np.outer(browser_tab["row_totals"].iloc[:-1],
             browser_tab.loc["col_totals"].iloc[:-1])
    / browser_tab.loc["col_totals", "row_totals"],
    index=browser_observed.index,
    columns=browser_observed.columns,
)

browser_expected

Unnamed: 0,no,yes
desktop_web,80927.995831,1341.004169
mobile_web,445736.004169,7385.995831


In [None]:
# observed counts (margins excluded), shown for comparison with the expected values
browser_observed

Unnamed: 0,no,yes
desktop_web,81503,766
mobile_web,445161,7961


In [None]:
# =============================================================================
# Browser and User Signups - Chi-Squared Test of Independence
# =============================================================================
# Returns statistic, p-value, degrees of freedom and expected frequencies.
# With the default settings SciPy applies Yates' continuity correction when
# dof == 1, which is the case for this 2x2 table.
stats.chi2_contingency(browser_observed)

Chi2ContingencyResult(statistic=295.63074097725536, pvalue=2.949230378104203e-66, dof=1, expected_freq=array([[ 80927.99583108,   1341.00416892],
       [445736.00416892,   7385.99583108]]))

In [None]:
# =============================================================================
# Browser and User Signups - Chi-Square Post Hoc Testing
# =============================================================================
# One-vs-rest follow-up: each browser level becomes a True/False indicator that
# is tested against user_signups in its own 2x2 table. p-values are printed
# as returned (no multiple-comparison adjustment is applied here).
nl = "\n"
browser_dummies = pd.get_dummies(df['browser_rev'])
for series in browser_dummies.columns:
    browser_crosstab = pd.crosstab(browser_dummies[series], df['user_signups'])
    print(browser_crosstab, nl)
    chi2, p, dof, expected = stats.chi2_contingency(browser_crosstab)
    print(f"Chi2 value= {chi2}{nl}p-value= {p}{nl}Degrees of freedom= {dof}{nl}")

user_signups       0     1
desktop_web               
False         445161  7961
True           81503   766 

Chi2 value= 295.63074097725536
p-value= 2.949230378104203e-66
Degrees of freedom= 1

user_signups       0     1
mobile_web                
False          81503   766
True          445161  7961 

Chi2 value= 295.63074097725536
p-value= 2.949230378104203e-66
Degrees of freedom= 1



In [None]:
# =============================================================================
# Operating System and User Signups - Contingency Tables
# =============================================================================
# Observed signup counts (columns) per grouped operating system (rows), with
# 'All' totals appended by margins=True.
os_tab = pd.crosstab(
    index=df['os_name_grouped'],
    columns=df['user_signups'],
    margins=True,
)
print(os_tab)

user_signups          0     1     All
os_name_grouped                      
Android          144409  2877  147286
Chrome_OS          6522   139    6661
Windows           30847   345   31192
iOS              323860  5190  329050
macOS             14021   142   14163
remaining          7005    34    7039
All              526664  8727  535391


In [None]:
# =============================================================================
# Operating System and User Signups - Expected Values
# =============================================================================
# Relabel only the margins/outcome columns. The OS labels are kept exactly as
# they appear in the data (e.g. 'Chrome_OS') instead of being overwritten with
# a hand-typed list that could drift from the crosstab's row order.
os_tab = os_tab.rename(index={"All": "col_totals"},
                       columns={0: "no", 1: "yes", "All": "row_totals"})
# margins occupy the last row and column; everything before them is observed
os_observed = os_tab.iloc[:-1, :-1]

# Expected count under independence = row total * column total / grand total,
# with the grand total taken from the table rather than hard-coded.
os_expected = pd.DataFrame(
    np.outer(os_tab["row_totals"].iloc[:-1],
             os_tab.loc["col_totals"].iloc[:-1])
    / os_tab.loc["col_totals", "row_totals"],
    index=os_observed.index,
    columns=os_observed.columns,
)

os_expected

Unnamed: 0,no,yes
Android,144885.203345,2400.796655
Chrome OS,6552.424124,108.575876
Windows,30683.563018,508.436982
iOS,323686.40713,5363.59287
macOS,13932.139748,230.860252
remaining,6924.262634,114.737366


In [None]:
# =============================================================================
# Operating System and User Signups - Chi-Squared Test of Independence
# =============================================================================
# Omnibus test across all OS groups; result holds statistic, p-value, dof and
# the expected frequencies.
stats.chi2_contingency(os_observed)

Chi2ContingencyResult(statistic=256.33021213584755, pvalue=2.4082025200005342e-53, dof=5, expected_freq=array([[1.44885203e+05, 2.40079666e+03],
       [6.55242412e+03, 1.08575876e+02],
       [3.06835630e+04, 5.08436982e+02],
       [3.23686407e+05, 5.36359287e+03],
       [1.39321397e+04, 2.30860252e+02],
       [6.92426263e+03, 1.14737366e+02]]))

In [None]:
# =============================================================================
# Operating System and User Signups - Chi-Square Post Hoc Testing
# =============================================================================
# One-vs-rest follow-up: each OS group is compared against all other groups in
# its own 2x2 table. p-values are printed as returned (no multiple-comparison
# adjustment is applied here).
nl = "\n"
os_dummies = pd.get_dummies(df['os_name_grouped'])
for series in os_dummies.columns:
    os_crosstab = pd.crosstab(os_dummies[series], df['user_signups'])
    print(os_crosstab, nl)
    chi2, p, dof, expected = stats.chi2_contingency(os_crosstab)
    print(f"Chi2 value= {chi2}{nl}p-value= {p}{nl}Degrees of freedom= {dof}{nl}")

user_signups       0     1
Android                   
False         382255  5850
True          144409  2877 

Chi2 value= 132.1832100650528
p-value= 1.3642907584803792e-30
Degrees of freedom= 1

user_signups       0     1
Chrome_OS                 
False         520142  8588
True            6522   139 

Chi2 value= 8.489538562617811
p-value= 0.003571943892870518
Degrees of freedom= 1

user_signups       0     1
Windows                   
False         495817  8382
True           30847   345 

Chi2 value= 56.36489743199644
p-value= 6.019546830580369e-14
Degrees of freedom= 1

user_signups       0     1
iOS                       
False         202804  3537
True          323860  5190 

Chi2 value= 14.73416466407495
p-value= 0.00012378280360961967
Degrees of freedom= 1

user_signups       0     1
macOS                     
False         512643  8585
True           14021   142 

Chi2 value= 35.313883938028134
p-value= 2.8062440540018382e-09
Degrees of freedom= 1

user_signups       0     1


In [None]:
# =============================================================================
# State and User Signups - Contingency Tables
# =============================================================================
# Observed signup counts (columns) per state (rows; the column is named
# 'region' in the source data), with 'All' totals appended by margins=True.
state_tab = pd.crosstab(
    index=df['region'],
    columns=df['user_signups'],
    margins=True,
)
print(state_tab)

In [None]:
# =============================================================================
# State and User Signups - Expected Values
# =============================================================================
# Relabel only the margins/outcome columns. The state labels already come from
# the data, so they are no longer overwritten with a hand-typed 51-entry list
# (which contained a syntax error, a 'West Virginia' label that did not match
# the underscore-normalised data, and would raise on any change in the number
# of states). rename() is also a no-op when the cell is re-run.
state_tab = state_tab.rename(index={"All": "col_totals"},
                             columns={0: "no", 1: "yes", "All": "row_totals"})
# margins occupy the last row and column; everything before them is observed
state_observed = state_tab.iloc[:-1, :-1]

# Expected count under independence = row total * column total / grand total,
# with the grand total taken from the table rather than hard-coded.
state_expected = pd.DataFrame(
    np.outer(state_tab["row_totals"].iloc[:-1],
             state_tab.loc["col_totals"].iloc[:-1])
    / state_tab.loc["col_totals", "row_totals"],
    index=state_observed.index,
    columns=state_observed.columns,
)

state_expected

In [None]:
# =============================================================================
# State and User Signups - Chi-Squared Test of Independence
# =============================================================================
# Omnibus test across all states; result holds statistic, p-value, dof and the
# expected frequencies.
stats.chi2_contingency(state_observed)

In [None]:
# =============================================================================
# State and User Signups - Chi-Square Post Hoc Testing
# =============================================================================
# One-vs-rest follow-up: each state is compared against all other states in
# its own 2x2 table. p-values are printed as returned (no multiple-comparison
# adjustment is applied here).
nl = "\n"
state_dummies = pd.get_dummies(df['region'])
for series in state_dummies.columns:
    state_crosstab = pd.crosstab(state_dummies[series], df['user_signups'])
    print(state_crosstab, nl)
    chi2, p, dof, expected = stats.chi2_contingency(state_crosstab)
    print(f"Chi2 value= {chi2}{nl}p-value= {p}{nl}Degrees of freedom= {dof}{nl}")

In [None]:
# =============================================================================
# Region and User Signups - Contingency Tables
# =============================================================================
# creating dictionary of states to corresponding region assignments
# (spelled out region-first so each region's membership reads as one group;
# the comprehension flattens it into the state -> region lookup)
states_to_regions = {
    state: region
    for region, states in (
        ('Northeast', ('Connecticut', 'Maine', 'Massachusetts', 'New_Hampshire',
                       'Rhode_Island', 'Vermont', 'New_Jersey', 'New_York',
                       'Pennsylvania')),
        ('Midwest', ('Indiana', 'Illinois', 'Michigan', 'Ohio', 'Wisconsin',
                     'Iowa', 'Kansas', 'Minnesota', 'Missouri', 'Nebraska',
                     'North_Dakota', 'South_Dakota')),
        ('South', ('Delaware', 'District_of_Columbia', 'Florida', 'Georgia',
                   'Maryland', 'North_Carolina', 'South_Carolina', 'Virginia',
                   'West_Virginia', 'Alabama', 'Kentucky', 'Mississippi',
                   'Tennessee', 'Arkansas', 'Louisiana', 'Oklahoma', 'Texas')),
        ('West', ('Arizona', 'Colorado', 'Idaho', 'New_Mexico', 'Montana',
                  'Utah', 'Nevada', 'Wyoming', 'Alaska', 'California',
                  'Hawaii', 'Oregon', 'Washington')),
    )
    for state in states
}
# look up each row's region from its state; states missing from the dictionary map to NaN
df['state_region'] = df['region'].map(states_to_regions)

# observed signup counts per region, with 'All' totals appended by margins=True
region_tab = pd.crosstab(
    index=df['state_region'],
    columns=df['user_signups'],
    margins=True,
)
print(region_tab)

In [None]:
# =============================================================================
# Region and User Signups - Expected Values
# =============================================================================
# Relabel only the margins/outcome columns; the region labels come from the
# data and rename() is a no-op when the cell is re-run.
region_tab = region_tab.rename(index={"All": "col_totals"},
                               columns={0: "no", 1: "yes", "All": "row_totals"})
# margins occupy the last row and column; everything before them is observed
region_observed = region_tab.iloc[:-1, :-1]

# Expected count under independence = row total * column total / grand total.
# The grand total is taken from the table itself: rows whose state could not
# be mapped to a region are absent from the crosstab, so the hard-coded full
# row count (535391) would be the wrong denominator in that case.
region_expected = pd.DataFrame(
    np.outer(region_tab["row_totals"].iloc[:-1],
             region_tab.loc["col_totals"].iloc[:-1])
    / region_tab.loc["col_totals", "row_totals"],
    index=region_observed.index,
    columns=region_observed.columns,
)

region_expected

In [None]:
# =============================================================================
# Region and User Signups - Chi-Squared Test of Independence
# =============================================================================
# Omnibus test across the regions; result holds statistic, p-value, dof and
# the expected frequencies.
stats.chi2_contingency(region_observed)

In [None]:
# =============================================================================
# Region and User Signups - Chi-Square Post Hoc Testing
# =============================================================================
# One-vs-rest follow-up: each region is compared against all other regions in
# its own 2x2 table. p-values are printed as returned (no multiple-comparison
# adjustment is applied here).
nl = "\n"
region_dummies = pd.get_dummies(df['state_region'])
for series in region_dummies.columns:
    region_crosstab = pd.crosstab(region_dummies[series], df['user_signups'])
    print(region_crosstab, nl)
    chi2, p, dof, expected = stats.chi2_contingency(region_crosstab)
    print(f"Chi2 value= {chi2}{nl}p-value= {p}{nl}Degrees of freedom= {dof}{nl}")

In [None]:
# =============================================================================
# Sub-Region and User Signups - Contingency Tables
# =============================================================================
# creating dictionary of states to corresponding sub-region assignments
# (spelled out sub-region-first so each group's membership reads as one unit;
# the comprehension flattens it into the state -> sub-region lookup)
states_to_sub_regions = {
    state: sub_region
    for sub_region, states in (
        ('New_England', ('Connecticut', 'Maine', 'Massachusetts',
                         'New_Hampshire', 'Rhode_Island', 'Vermont')),
        ('Middle_Atlantic', ('New_Jersey', 'New_York', 'Pennsylvania')),
        ('East_North_Central', ('Indiana', 'Illinois', 'Michigan', 'Ohio',
                                'Wisconsin')),
        ('West_North_Central', ('Iowa', 'Kansas', 'Minnesota', 'Missouri',
                                'Nebraska', 'North_Dakota', 'South_Dakota')),
        ('South_Atlantic', ('Delaware', 'District_of_Columbia', 'Florida',
                            'Georgia', 'Maryland', 'North_Carolina',
                            'South_Carolina', 'Virginia', 'West_Virginia')),
        ('East_South_Central', ('Alabama', 'Kentucky', 'Mississippi',
                                'Tennessee')),
        ('West_South_Central', ('Arkansas', 'Louisiana', 'Oklahoma', 'Texas')),
        ('Mountain', ('Arizona', 'Colorado', 'Idaho', 'New_Mexico', 'Montana',
                      'Utah', 'Nevada', 'Wyoming')),
        ('Pacific', ('Alaska', 'California', 'Hawaii', 'Oregon', 'Washington')),
    )
    for state in states
}
# look up each row's sub-region from its state; states missing from the dictionary map to NaN
df['state_sub_region'] = df['region'].map(states_to_sub_regions)

# observed signup counts per sub-region, with 'All' totals appended by margins=True
sub_region_tab = pd.crosstab(
    index=df['state_sub_region'],
    columns=df['user_signups'],
    margins=True,
)
print(sub_region_tab)

In [None]:
# =============================================================================
# Sub-Region and User Signups - Expected Values
# =============================================================================
# Relabel only the margins/outcome columns; the sub-region labels come from the
# data and rename() is a no-op when the cell is re-run.
sub_region_tab = sub_region_tab.rename(index={"All": "col_totals"},
                                       columns={0: "no", 1: "yes", "All": "row_totals"})
# margins occupy the last row and column; everything before them is observed
sub_region_observed = sub_region_tab.iloc[:-1, :-1]

# Expected count under independence = row total * column total / grand total.
# The grand total is taken from the table itself: rows whose state could not
# be mapped to a sub-region are absent from the crosstab, so the hard-coded
# full row count (535391) would be the wrong denominator in that case.
sub_region_expected = pd.DataFrame(
    np.outer(sub_region_tab["row_totals"].iloc[:-1],
             sub_region_tab.loc["col_totals"].iloc[:-1])
    / sub_region_tab.loc["col_totals", "row_totals"],
    index=sub_region_observed.index,
    columns=sub_region_observed.columns,
)

sub_region_expected

In [None]:
# =============================================================================
# Sub-Region and User Signups - Chi-Squared Test of Independence
# =============================================================================
# Omnibus test across the sub-regions; result holds statistic, p-value, dof
# and the expected frequencies.
stats.chi2_contingency(sub_region_observed)

In [None]:
# =============================================================================
# Sub-Region and User Signups - Chi-Square Post Hoc Testing
# =============================================================================
# One-vs-rest follow-up: each sub-region is compared against all others in its
# own 2x2 table. p-values are printed as returned (no multiple-comparison
# adjustment is applied here).
nl = "\n"
sub_region_dummies = pd.get_dummies(df['state_sub_region'])
for series in sub_region_dummies.columns:
    sub_region_crosstab = pd.crosstab(sub_region_dummies[series], df['user_signups'])
    print(sub_region_crosstab, nl)
    chi2, p, dof, expected = stats.chi2_contingency(sub_region_crosstab)
    print(f"Chi2 value= {chi2}{nl}p-value= {p}{nl}Degrees of freedom= {dof}{nl}")

In [None]:
# =============================================================================
# =============================================================================
# UNI-VARIATE (LOGISTIC REGRESSION) ANALYSIS
# =============================================================================
# =============================================================================

In [None]:
# =============================================================================
# BROWSER
# =============================================================================

In [None]:
import statsmodels.formula.api as smf

# Univariate logit of signup on browser type; 'desktop_web' is the reference
# level, so the remaining coefficient is relative to desktop traffic.
model = smf.logit(
    formula="user_signups ~ C(browser_rev, Treatment('desktop_web'))",
    data=df,
)
results = model.fit()
results.summary()

In [None]:
# Odds-ratio table for the fitted model: raw coefficient (log-odds), its
# exponential (odds ratio vs. the reference level), the p-value and the term
# name, one row per model term.
coefs = pd.concat(
    [
        results.params.rename('coef'),
        np.exp(results.params).rename('odds ratio'),
        results.pvalues.rename('p value'),
    ],
    axis=1,
)
coefs['name'] = coefs.index
coefs

In [None]:
# =============================================================================
# OPERATING SYSTEM
# =============================================================================

In [None]:
# Univariate logit of signup on grouped operating system; 'iOS' is the
# reference level.
model = smf.logit(
    formula="user_signups ~ C(os_name_grouped, Treatment('iOS'))",
    data=df,
)
results = model.fit()
results.summary()

In [None]:
# Odds-ratio table for the fitted model: raw coefficient (log-odds), its
# exponential (odds ratio vs. the reference level), the p-value and the term
# name, one row per model term.
coefs = pd.concat(
    [
        results.params.rename('coef'),
        np.exp(results.params).rename('odds ratio'),
        results.pvalues.rename('p value'),
    ],
    axis=1,
)
coefs['name'] = coefs.index
coefs

In [None]:
# =============================================================================
# STATES
# =============================================================================

In [None]:
# Univariate logit of signup on state (the 'region' column); 'California' is
# the reference level.
model = smf.logit(
    formula="user_signups ~ C(region, Treatment('California'))",
    data=df,
)
results = model.fit()
results.summary()

In [None]:
# Odds-ratio table for the fitted model: raw coefficient (log-odds), its
# exponential (odds ratio vs. the reference level), the p-value and the term
# name, one row per model term.
coefs = pd.concat(
    [
        results.params.rename('coef'),
        np.exp(results.params).rename('odds ratio'),
        results.pvalues.rename('p value'),
    ],
    axis=1,
)
coefs['name'] = coefs.index
coefs

In [None]:
# =============================================================================
# REGIONS
# =============================================================================

In [None]:
# Univariate logit of signup on region; 'West' is the reference level.
model = smf.logit(
    formula="user_signups ~ C(state_region, Treatment('West'))",
    data=df,
)
results = model.fit()
results.summary()

In [None]:
# Odds-ratio table for the fitted model: raw coefficient (log-odds), its
# exponential (odds ratio vs. the reference level), the p-value and the term
# name, one row per model term.
coefs = pd.concat(
    [
        results.params.rename('coef'),
        np.exp(results.params).rename('odds ratio'),
        results.pvalues.rename('p value'),
    ],
    axis=1,
)
coefs['name'] = coefs.index
coefs

In [None]:
# =============================================================================
# SUB-REGIONS
# =============================================================================

In [None]:
# Univariate logit of signup on sub-region; 'Pacific' is the reference level.
model = smf.logit(
    formula="user_signups ~ C(state_sub_region, Treatment('Pacific'))",
    data=df,
)
results = model.fit()
results.summary()

In [None]:
# Odds-ratio table for the fitted model: raw coefficient (log-odds), its
# exponential (odds ratio vs. the reference level), the p-value and the term
# name, one row per model term.
coefs = pd.concat(
    [
        results.params.rename('coef'),
        np.exp(results.params).rename('odds ratio'),
        results.pvalues.rename('p value'),
    ],
    axis=1,
)
coefs['name'] = coefs.index
coefs

In [None]:
# =============================================================================
# =============================================================================
# MULTI-VARIATE (LOGISTIC REGRESSION) ANALYSIS
# =============================================================================
# =============================================================================

In [None]:
# =============================================================================
# MODEL A - MODEL FIT
# =============================================================================

In [None]:
# Model A: browser + grouped OS + state as separate factors
# (reference levels: mobile_web, iOS, California).
model = smf.logit(
    "user_signups ~ C(browser_rev, Treatment('mobile_web'))"
    " + C(os_name_grouped, Treatment('iOS'))"
    " + C(region, Treatment('California'))",
    data=df,
)
results = model.fit()
results.summary()

In [None]:
# Odds-ratio table for the fitted model: raw coefficient (log-odds), its
# exponential (odds ratio vs. the reference level), the p-value and the term
# name, one row per model term.
coefs = pd.concat(
    [
        results.params.rename('coef'),
        np.exp(results.params).rename('odds ratio'),
        results.pvalues.rename('p value'),
    ],
    axis=1,
)
coefs['name'] = coefs.index
coefs

In [None]:
# =============================================================================
# MODEL B - MODEL FIT
# =============================================================================

In [None]:
# combined browser/OS label, e.g. 'mobile_web_iOS'
df['browser_os'] = df['browser_rev'].str.cat(df['os_name_grouped'], sep="_")
# browser/OS combinations that keep their own level
browser_os_list = [
    'mobile_web_iOS',
    'mobile_web_Android',
    'desktop_web_Windows',
    'desktop_web_iOS',
    'desktop_web_macOS',
    'desktop_web_Chrome_OS',
    'desktop_web_Android',
]
# every combination not listed above (the *_remaining groups as well as any
# other unlisted pairing) is pooled into a single 'remaining' level
df['browser_os_grouped'] = df['browser_os'].where(df['browser_os'].isin(browser_os_list), 'remaining')

In [None]:
# Model B: combined browser/OS factor + state
# (reference levels: mobile_web_iOS, California).
model = smf.logit(
    "user_signups ~ C(browser_os_grouped, Treatment('mobile_web_iOS'))"
    " + C(region, Treatment('California'))",
    data=df,
)
results = model.fit()
results.summary()

In [None]:
# Odds-ratio table for the fitted model: raw coefficient (log-odds), its
# exponential (odds ratio vs. the reference level) and the p-value, indexed by
# model term.
coefs = pd.concat(
    [
        results.params.rename('coef'),
        np.exp(results.params).rename('odds ratio'),
        results.pvalues.rename('p value'),
    ],
    axis=1,
)
coefs

In [None]:
# =============================================================================
# MODEL C - MODEL FIT
# =============================================================================

In [None]:
# Model C: combined browser/OS factor + region
# (reference levels: mobile_web_iOS, West).
model = smf.logit(
    "user_signups ~ C(browser_os_grouped, Treatment('mobile_web_iOS'))"
    " + C(state_region, Treatment('West'))",
    data=df,
)
results = model.fit()
results.summary()

In [None]:
# Odds-ratio table for the fitted model: raw coefficient (log-odds), its
# exponential (odds ratio vs. the reference level) and the p-value, indexed by
# model term.
coefs = pd.concat(
    [
        results.params.rename('coef'),
        np.exp(results.params).rename('odds ratio'),
        results.pvalues.rename('p value'),
    ],
    axis=1,
)
coefs

In [None]:
# =============================================================================
# MODEL D - MODEL FIT
# =============================================================================

In [None]:
# Model D: combined browser/OS factor + sub-region
# (reference levels: mobile_web_iOS, Pacific).
model = smf.logit(
    "user_signups ~ C(browser_os_grouped, Treatment('mobile_web_iOS'))"
    " + C(state_sub_region, Treatment('Pacific'))",
    data=df,
)
results = model.fit()
results.summary()

In [None]:
# Odds-ratio table for the fitted model: raw coefficient (log-odds), its
# exponential (odds ratio vs. the reference level) and the p-value, indexed by
# model term.
coefs = pd.concat(
    [
        results.params.rename('coef'),
        np.exp(results.params).rename('odds ratio'),
        results.pvalues.rename('p value'),
    ],
    axis=1,
)
coefs

In [None]:
# =============================================================================
# MODEL E - MODEL FIT
# =============================================================================

In [None]:
# Model E: browser + grouped OS + region as separate factors
# (reference levels: mobile_web, iOS, West).
model = smf.logit(
    "user_signups ~ C(browser_rev, Treatment('mobile_web'))"
    " + C(os_name_grouped, Treatment('iOS'))"
    " + C(state_region, Treatment('West'))",
    data=df,
)
results = model.fit()
results.summary()

In [None]:
# Odds-ratio table for the fitted model: raw coefficient (log-odds), its
# exponential (odds ratio vs. the reference level) and the p-value, indexed by
# model term.
coefs = pd.concat(
    [
        results.params.rename('coef'),
        np.exp(results.params).rename('odds ratio'),
        results.pvalues.rename('p value'),
    ],
    axis=1,
)
coefs

In [None]:
# =============================================================================
# MODEL F - MODEL FIT
# =============================================================================

In [None]:
# Model F: browser + grouped OS + sub-region as separate factors
# (reference levels: mobile_web, iOS, Pacific).
model = smf.logit(
    "user_signups ~ C(browser_rev, Treatment('mobile_web'))"
    " + C(os_name_grouped, Treatment('iOS'))"
    " + C(state_sub_region, Treatment('Pacific'))",
    data=df,
)
results = model.fit()
results.summary()

In [None]:
# Odds-ratio table for the fitted model: raw coefficient (log-odds), its
# exponential (odds ratio vs. the reference level) and the p-value, indexed by
# model term.
coefs = pd.concat(
    [
        results.params.rename('coef'),
        np.exp(results.params).rename('odds ratio'),
        results.pvalues.rename('p value'),
    ],
    axis=1,
)
coefs

In [None]:
# =============================================================================
# =============================================================================
# MODEL EVALUATION
# =============================================================================
# =============================================================================

In [None]:
# =============================================================================
# Model A - MODEL EVALUATION
# =============================================================================

In [None]:
# Model A inputs: the target plus the three categorical predictors
model_a_cols = ['user_signups', 'region', 'browser_rev', 'os_name_grouped']
df_model_a = df[model_a_cols]
# one-hot encode the categorical columns; the numeric target is left as-is
df_model_a_cat = pd.get_dummies(df_model_a, prefix=['region', 'browser', 'os'])
# Features = every column except the target. Dropping the target by name
# replaces the hard-coded iloc[:, 1:60] slice, which silently selected the
# wrong set of columns whenever the number of category levels changed.
X = df_model_a_cat.drop(columns='user_signups')
y = df_model_a_cat['user_signups']

#importing and training the model

from sklearn.linear_model import LogisticRegression
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, accuracy_score, confusion_matrix, classification_report, accuracy_score, roc_curve, auc, roc_auc_score
import seaborn as sns

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)
lr = LogisticRegression(solver='newton-cg', class_weight='balanced')
lr.fit(X_train, y_train)

# predicting on the test data
pred_test = lr.predict(X_test)

# calculating and printing the f1 score
f1_test = f1_score(y_test, pred_test)
print('The f1 score for the testing data:', f1_test)

# function to create a confusion matrix
def conf_matrix(y_test, pred_test):

    # creating a confusion matrix
    con_mat = confusion_matrix(y_test, pred_test)
    con_mat = pd.DataFrame(con_mat, range(2), range(2))

    # ploting the confusion matrix
    plt.figure(figsize=(6,6))
    sns.set(font_scale=1.5)
    sns.heatmap(con_mat, annot=True, annot_kws={"size": 16}, fmt='g', cmap='Blues', cbar=False)

# calling function
conf_matrix(y_test, pred_test)

In [None]:
# cross-tabulating actual vs. predicted classes (confusion matrix with row/column totals)
pd.crosstab(
    index=y_test,
    columns=pred_test,
    rownames=['Actual'],
    colnames=['Predicted'],
    margins=True,
)

In [None]:
# re-predicting on the test split, then reporting the classifier's accuracy on it
pred_test = lr.predict(X_test)
print(f'Accuracy of logistic regression classifier on test set: {lr.score(X_test, y_test):.2f}')

In [None]:
# per-class precision, recall, F-measure and support on the test split
print(classification_report(y_true=y_test, y_pred=pred_test))

In [None]:
# generating an ROC Curve
# scoring with the predicted probability of the second class in lr.classes_ (column 1);
# the same scores feed both the AUC shown in the legend and the plotted curve.
# Previously the AUC was computed from hard lr.predict labels, so the reported area
# did not belong to the curve being drawn.
test_scores = lr.predict_proba(X_test)[:, 1]
logit_roc_auc = roc_auc_score(y_test, test_scores)
fpr, tpr, thresholds = roc_curve(y_test, test_scores)
plt.figure()
plt.plot(fpr, tpr, label='Logistic Regression (area = %0.2f)' % logit_roc_auc)
# dashed diagonal: reference line where TPR equals FPR
plt.plot([0, 1], [0, 1],'r--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic')
plt.legend(loc="lower right")
# NOTE: every model section in this notebook saves to the same 'Log_ROC' name,
# so each run overwrites the previous model's figure
plt.savefig('Log_ROC')
plt.show()

In [None]:
# =============================================================================
# MODEL B - MODEL EVALUATION
# =============================================================================

In [None]:
# assembling the Model B design matrix: one-hot encoding region and browser_os_grouped
model_b_cols = ['user_signups', 'region', 'browser_os_grouped']
df_model_b = df[model_b_cols]
df_model_b_cat = pd.get_dummies(df_model_b, prefix=['region', 'browser'])
# every dummy column is a predictor; the target is dropped by name instead of using a
# hard-coded positional slice (1:60), which would silently discard features if the
# encoding ever produced more columns than the slice allowed for
X = df_model_b_cat.drop(columns='user_signups')
y = df_model_b_cat['user_signups']

# training the model (sklearn names are imported in the Model A evaluation cell)

# 70/30 train/test split with a fixed seed so the evaluation is reproducible
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)
lr = LogisticRegression(solver='newton-cg', class_weight='balanced')
lr.fit(X_train, y_train)

# predicting on the test data
pred_test = lr.predict(X_test)

# calculating and printing the f1 score
f1_test = f1_score(y_test, pred_test)
print('The f1 score for the testing data:', f1_test)

# plotting the confusion matrix with conf_matrix, which is defined in the Model A
# evaluation cell (the identical copy that used to be re-declared here was removed)
conf_matrix(y_test, pred_test)

In [None]:
# cross-tabulating actual vs. predicted classes (confusion matrix with row/column totals)
pd.crosstab(
    index=y_test,
    columns=pred_test,
    rownames=['Actual'],
    colnames=['Predicted'],
    margins=True,
)

In [None]:
# re-predicting on the test split, then reporting the classifier's accuracy on it
pred_test = lr.predict(X_test)
print(f'Accuracy of logistic regression classifier on test set: {lr.score(X_test, y_test):.2f}')

In [None]:
# per-class precision, recall, F-measure and support on the test split
print(classification_report(y_true=y_test, y_pred=pred_test))

In [None]:
# generating an ROC Curve
# scoring with the predicted probability of the second class in lr.classes_ (column 1);
# the same scores feed both the AUC shown in the legend and the plotted curve.
# Previously the AUC was computed from hard lr.predict labels, so the reported area
# did not belong to the curve being drawn.
test_scores = lr.predict_proba(X_test)[:, 1]
logit_roc_auc = roc_auc_score(y_test, test_scores)
fpr, tpr, thresholds = roc_curve(y_test, test_scores)
plt.figure()
plt.plot(fpr, tpr, label='Logistic Regression (area = %0.2f)' % logit_roc_auc)
# dashed diagonal: reference line where TPR equals FPR
plt.plot([0, 1], [0, 1],'r--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic')
plt.legend(loc="lower right")
# NOTE: every model section in this notebook saves to the same 'Log_ROC' name,
# so each run overwrites the previous model's figure
plt.savefig('Log_ROC')
plt.show()

In [None]:
# =============================================================================
# MODEL C - MODEL EVALUATION
# =============================================================================

In [None]:
# assembling the Model C design matrix: one-hot encoding state_region and browser_os_grouped
model_c_cols = ['user_signups', 'state_region', 'browser_os_grouped']
df_model_c = df[model_c_cols]
df_model_c_cat = pd.get_dummies(df_model_c, prefix=['region', 'browser'])
# every dummy column is a predictor; the target is dropped by name instead of using a
# hard-coded positional slice (1:13), which would silently discard features if the
# encoding ever produced more columns than the slice allowed for
X = df_model_c_cat.drop(columns='user_signups')
y = df_model_c_cat['user_signups']

# training the model (sklearn names are imported in the Model A evaluation cell)

# 70/30 train/test split with a fixed seed so the evaluation is reproducible
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)
lr = LogisticRegression(solver='newton-cg', class_weight='balanced')
lr.fit(X_train, y_train)

# predicting on the test data
pred_test = lr.predict(X_test)

# calculating and printing the f1 score
f1_test = f1_score(y_test, pred_test)
print('The f1 score for the testing data:', f1_test)

# plotting the confusion matrix with conf_matrix, which is defined in the Model A
# evaluation cell (the identical copy that used to be re-declared here was removed)
conf_matrix(y_test, pred_test)

In [None]:
# cross-tabulating actual vs. predicted classes (confusion matrix with row/column totals)
pd.crosstab(
    index=y_test,
    columns=pred_test,
    rownames=['Actual'],
    colnames=['Predicted'],
    margins=True,
)

In [None]:
# re-predicting on the test split, then reporting the classifier's accuracy on it
pred_test = lr.predict(X_test)
print(f'Accuracy of logistic regression classifier on test set: {lr.score(X_test, y_test):.2f}')

In [None]:
# per-class precision, recall, F-measure and support on the test split
print(classification_report(y_true=y_test, y_pred=pred_test))

In [None]:
# generating an ROC Curve
# scoring with the predicted probability of the second class in lr.classes_ (column 1);
# the same scores feed both the AUC shown in the legend and the plotted curve.
# Previously the AUC was computed from hard lr.predict labels, so the reported area
# did not belong to the curve being drawn.
test_scores = lr.predict_proba(X_test)[:, 1]
logit_roc_auc = roc_auc_score(y_test, test_scores)
fpr, tpr, thresholds = roc_curve(y_test, test_scores)
plt.figure()
plt.plot(fpr, tpr, label='Logistic Regression (area = %0.2f)' % logit_roc_auc)
# dashed diagonal: reference line where TPR equals FPR
plt.plot([0, 1], [0, 1],'r--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic')
plt.legend(loc="lower right")
# NOTE: every model section in this notebook saves to the same 'Log_ROC' name,
# so each run overwrites the previous model's figure
plt.savefig('Log_ROC')
plt.show()

In [None]:
# =============================================================================
# MODEL D - MODEL EVALUATION
# =============================================================================

In [None]:
# assembling the Model D design matrix: one-hot encoding state_sub_region and browser_os_grouped
model_d_cols = ['user_signups', 'state_sub_region', 'browser_os_grouped']
df_model_d = df[model_d_cols]
df_model_d_cat = pd.get_dummies(df_model_d, prefix=['region', 'browser'])
# every dummy column is a predictor; the target is dropped by name instead of using a
# hard-coded positional slice (1:18), which would silently discard features if the
# encoding ever produced more columns than the slice allowed for
X = df_model_d_cat.drop(columns='user_signups')
y = df_model_d_cat['user_signups']

# training the model (sklearn names are imported in the Model A evaluation cell)

# 70/30 train/test split with a fixed seed so the evaluation is reproducible
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)
lr = LogisticRegression(solver='newton-cg', class_weight='balanced')
lr.fit(X_train, y_train)

# predicting on the test data
pred_test = lr.predict(X_test)

# calculating and printing the f1 score
f1_test = f1_score(y_test, pred_test)
print('The f1 score for the testing data:', f1_test)

# plotting the confusion matrix with conf_matrix, which is defined in the Model A
# evaluation cell (the identical copy that used to be re-declared here was removed)
conf_matrix(y_test, pred_test)

In [None]:
# cross-tabulating actual vs. predicted classes (confusion matrix with row/column totals)
pd.crosstab(
    index=y_test,
    columns=pred_test,
    rownames=['Actual'],
    colnames=['Predicted'],
    margins=True,
)

In [None]:
# re-predicting on the test split, then reporting the classifier's accuracy on it
pred_test = lr.predict(X_test)
print(f'Accuracy of logistic regression classifier on test set: {lr.score(X_test, y_test):.2f}')

In [None]:
# per-class precision, recall, F-measure and support on the test split
print(classification_report(y_true=y_test, y_pred=pred_test))

In [None]:
# generating an ROC Curve
# scoring with the predicted probability of the second class in lr.classes_ (column 1);
# the same scores feed both the AUC shown in the legend and the plotted curve.
# Previously the AUC was computed from hard lr.predict labels, so the reported area
# did not belong to the curve being drawn.
test_scores = lr.predict_proba(X_test)[:, 1]
logit_roc_auc = roc_auc_score(y_test, test_scores)
fpr, tpr, thresholds = roc_curve(y_test, test_scores)
plt.figure()
plt.plot(fpr, tpr, label='Logistic Regression (area = %0.2f)' % logit_roc_auc)
# dashed diagonal: reference line where TPR equals FPR
plt.plot([0, 1], [0, 1],'r--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic')
plt.legend(loc="lower right")
# NOTE: every model section in this notebook saves to the same 'Log_ROC' name,
# so each run overwrites the previous model's figure
plt.savefig('Log_ROC')
plt.show()

In [None]:
# =============================================================================
# MODEL E - MODEL EVALUATION
# =============================================================================

In [None]:
# assembling the Model E design matrix: one-hot encoding state_region, browser_rev and os_name_grouped
model_e_cols = ['user_signups', 'state_region', 'browser_rev', 'os_name_grouped']
df_model_e = df[model_e_cols]
df_model_e_cat = pd.get_dummies(df_model_e, prefix=['region', 'browser', 'os'])
# every dummy column is a predictor; the target is dropped by name instead of using a
# hard-coded positional slice (1:13), which would silently discard features if the
# encoding ever produced more columns than the slice allowed for
X = df_model_e_cat.drop(columns='user_signups')
y = df_model_e_cat['user_signups']

# training the model (sklearn names are imported in the Model A evaluation cell)

# 70/30 train/test split with a fixed seed so the evaluation is reproducible
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)
lr = LogisticRegression(solver='newton-cg', class_weight='balanced')
lr.fit(X_train, y_train)

# predicting on the test data
pred_test = lr.predict(X_test)

# calculating and printing the f1 score
f1_test = f1_score(y_test, pred_test)
print('The f1 score for the testing data:', f1_test)

# plotting the confusion matrix with conf_matrix, which is defined in the Model A
# evaluation cell (the identical copy that used to be re-declared here was removed)
conf_matrix(y_test, pred_test)

In [None]:
# cross-tabulating actual vs. predicted classes (confusion matrix with row/column totals)
pd.crosstab(
    index=y_test,
    columns=pred_test,
    rownames=['Actual'],
    colnames=['Predicted'],
    margins=True,
)

In [None]:
# re-predicting on the test split, then reporting the classifier's accuracy on it
pred_test = lr.predict(X_test)
print(f'Accuracy of logistic regression classifier on test set: {lr.score(X_test, y_test):.2f}')

In [None]:
# per-class precision, recall, F-measure and support on the test split
print(classification_report(y_true=y_test, y_pred=pred_test))

In [None]:
# generating an ROC Curve
# scoring with the predicted probability of the second class in lr.classes_ (column 1);
# the same scores feed both the AUC shown in the legend and the plotted curve.
# Previously the AUC was computed from hard lr.predict labels, so the reported area
# did not belong to the curve being drawn.
test_scores = lr.predict_proba(X_test)[:, 1]
logit_roc_auc = roc_auc_score(y_test, test_scores)
fpr, tpr, thresholds = roc_curve(y_test, test_scores)
plt.figure()
plt.plot(fpr, tpr, label='Logistic Regression (area = %0.2f)' % logit_roc_auc)
# dashed diagonal: reference line where TPR equals FPR
plt.plot([0, 1], [0, 1],'r--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic')
plt.legend(loc="lower right")
# NOTE: every model section in this notebook saves to the same 'Log_ROC' name,
# so each run overwrites the previous model's figure
plt.savefig('Log_ROC')
plt.show()

In [None]:
# =============================================================================
# MODEL F - MODEL EVALUATION
# =============================================================================

In [None]:
# assembling the Model F design matrix: one-hot encoding state_sub_region, browser_rev and os_name_grouped
model_f_cols = ['user_signups', 'state_sub_region', 'browser_rev', 'os_name_grouped']
df_model_f = df[model_f_cols]
df_model_f_cat = pd.get_dummies(df_model_f, prefix=['region', 'browser', 'os'])
# every dummy column is a predictor; the target is dropped by name instead of using a
# hard-coded positional slice (1:18), which would silently discard features if the
# encoding ever produced more columns than the slice allowed for
X = df_model_f_cat.drop(columns='user_signups')
y = df_model_f_cat['user_signups']

# training the model (sklearn names are imported in the Model A evaluation cell)

# 70/30 train/test split with a fixed seed so the evaluation is reproducible
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)
lr = LogisticRegression(solver='newton-cg', class_weight='balanced')
lr.fit(X_train, y_train)

# predicting on the test data
pred_test = lr.predict(X_test)

# calculating and printing the f1 score
f1_test = f1_score(y_test, pred_test)
print('The f1 score for the testing data:', f1_test)

# plotting the confusion matrix with conf_matrix, which is defined in the Model A
# evaluation cell (the identical copy that used to be re-declared here was removed)
conf_matrix(y_test, pred_test)

In [None]:
# creating confusion matrix dataframe with row and column labels
# (this comment line previously lacked its leading '#', which made the cell a SyntaxError)
pd.crosstab(y_test, pred_test, rownames = ['Actual'], colnames =['Predicted'], margins = True)

In [None]:
# re-predicting on the test split, then reporting the classifier's accuracy on it
pred_test = lr.predict(X_test)
print(f'Accuracy of logistic regression classifier on test set: {lr.score(X_test, y_test):.2f}')

In [None]:
# per-class precision, recall, F-measure and support on the test split
print(classification_report(y_true=y_test, y_pred=pred_test))

In [None]:
# generating an ROC Curve
# scoring with the predicted probability of the second class in lr.classes_ (column 1);
# the same scores feed both the AUC shown in the legend and the plotted curve.
# Previously the AUC was computed from hard lr.predict labels, so the reported area
# did not belong to the curve being drawn.
test_scores = lr.predict_proba(X_test)[:, 1]
logit_roc_auc = roc_auc_score(y_test, test_scores)
fpr, tpr, thresholds = roc_curve(y_test, test_scores)
plt.figure()
plt.plot(fpr, tpr, label='Logistic Regression (area = %0.2f)' % logit_roc_auc)
# dashed diagonal: reference line where TPR equals FPR
plt.plot([0, 1], [0, 1],'r--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic')
plt.legend(loc="lower right")
# NOTE: every model section in this notebook saves to the same 'Log_ROC' name,
# so each run overwrites the previous model's figure
plt.savefig('Log_ROC')
plt.show()