In [171]:
import json
import pandas as pd
import requests
import io
from urllib.request import urlopen
import json
import logging
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
from pandas.io.json import json_normalize

In [172]:
# Load the two cleaned CSV exports; both paths are relative to the current working directory.
# NOTE(review): v1/v2 presumably denote two versions of the study — confirm.
df_v1 = pd.read_csv('data/v1_cleaned.csv')
df_v2 = pd.read_csv('data/v2_cleaned.csv')

In [173]:
# Stack the v1 and v2 rows into one frame and renumber the index from 0.
frames = [df_v1, df_v2]
df = pd.concat(frames, ignore_index=True)

## Filtering out assumed cheaters

Assume that participants who answered at least 5 of the 6 control questions correctly were paying attention and gave valid answers.

In [174]:
# Keep only participants who scored exactly 2 points in the qualification;
# everyone else is excluded from the analysis.
passed_qualification = df['POINTS.qualification'].eq(2)
df = df.loc[passed_qualification]

In [175]:
# Keep only rows whose POINTS.main score is at least 5.
enough_main_points = df["POINTS.main"].ge(5)
df = df.loc[enough_main_points]

In [176]:
# Count the remaining rows per value of METADATA.FEATURE.
feature_counts = df["METADATA.FEATURE"].value_counts()
feature_counts

basic      133
salient    105
Name: METADATA.FEATURE, dtype: int64

## Demographics

In [177]:
# Collect every column whose name starts with "demographics".
# Country and nationality are not relevant, since the study was conducted in the US.
demographics_cols = list(filter(lambda name: name.startswith('demographics'), df.columns))
demographics_cols

['demographics.age',
 'demographics.country',
 'demographics.education',
 'demographics.employment',
 'demographics.gender',
 'demographics.income',
 'demographics.nationality']

In [178]:
# Percentage of answers per age bracket, ordered by bracket label.
age_share = df["demographics.age"].value_counts(normalize=True).sort_index()
age_share.mul(100)

18-20           2.941176
21-29          28.991597
30-39          38.235294
40-49          21.008403
50-59           6.722689
60-or-older     2.100840
Name: demographics.age, dtype: float64

In [179]:
# Percentage of answers per education level, most frequent first.
# University degree is the most common education level, which is surprising.
education_share = df["demographics.education"].value_counts(normalize=True)
education_share.mul(100)

university         55.462185
high-school        39.495798
no-degree           2.100840
secondary           1.680672
upper-secondary     1.260504
Name: demographics.education, dtype: float64

In [180]:
# Percentage of answers per employment status, most frequent first.
employment_share = df["demographics.employment"].value_counts(normalize=True)
employment_share.mul(100)

salaried-employee    35.714286
self-employed        31.932773
unemployed           12.605042
student              10.924370
other                 6.722689
civil-servant         0.840336
retiree-pensioner     0.840336
apprentice            0.420168
Name: demographics.employment, dtype: float64

In [181]:
# Print the percentage of answers in each income bracket, listed in ascending
# bracket order (with "no-answer" last) rather than by frequency.
income_vc = df["demographics.income"].value_counts(normalize=True)
income_brackets = [
    "less-than-20000-usd",
    "20000-34999-usd",
    "35000-49999-usd",
    "50000-74999-usd",
    "75000-99999-usd",
    "over-100000-usd",
    "no-answer",
]
for bracket in income_brackets:
    # .get() falls back to 0.0 so a bracket that no remaining participant
    # selected is reported as 0 % instead of raising a KeyError.
    print(bracket, income_vc.get(bracket, 0.0) * 100)

less-than-20000-usd 16.80672268907563
20000-34999-usd 17.22689075630252
35000-49999-usd 12.605042016806722
50000-74999-usd 18.487394957983195
75000-99999-usd 12.605042016806722
over-100000-usd 10.92436974789916
no-answer 11.344537815126051


In [182]:
# Percentage of answers per gender, most frequent first.
gender_share = df["demographics.gender"].value_counts(normalize=True)
gender_share.mul(100)

female     67.647059
male       31.092437
diverse     1.260504
Name: demographics.gender, dtype: float64