In [1]:
import csv
import numpy as np

**Format data as JSON**

Create (1) nodes holding the details of each paper and (2) links whose source/target pairs represent citations between papers in the dataset.


In [2]:
def make_node(row):
    """Build the node dict for one paper from a single row of the TSV file.

    Columns are read by position: 0 year, 1 author label, 2 title,
    3 citation count, 4 social network(s), 5 sample size, 6 sample
    country/countries, 7 study type(s), 8 variables. Columns 4 and 6-8 are
    comma-separated; each entry is lower-cased and stripped of whitespace.

    Returns a dict with the keys id, year, title, citations, social_network,
    sample_size, sample_country, study_type and variables.
    Raises ValueError when column 0, 3 or 5 is not an integer string.
    """
    def as_list(cell):
        # Normalise one comma-separated cell into a list of clean entries.
        return [part.lower().strip() for part in cell.split(",")]

    # The id is "<lower-cased author label> <year>".
    node = {"id": " ".join((row[1].lower(), row[0]))}
    node["year"] = int(row[0])
    node["title"] = row[2]
    node["citations"] = int(row[3])
    node["social_network"] = as_list(row[4])
    node["sample_size"] = int(row[5])
    for key, column in (("sample_country", 6), ("study_type", 7), ("variables", 8)):
        node[key] = as_list(row[column])
    return node

In [3]:
# Location of the tab-separated dataset, relative to the notebook's working directory.
filepath = "../data/data.tsv"

In [4]:
# Ids ("<author label> <year>", lower-cased) of the papers that are allowed to
# appear as link targets; citations to anything else are dropped.
in_dataset = ['acquisti & gross 2006', 'banks et al 2010', 'bartsch & dienlin 2016', 'bateman et al 2011', 'benson et al 2015', 'buchi et al 2016', 'collins et al 2012', 'coventry et al 2014', 'cranor et al 1999', 'de wolf et al 2014', 'debatin et al 2009', 'dienlin & trepte 2014', 'dinev & hart 2005', 'doherty & lang 2014', 'dong et al 2015', 'drennan et al 2006', 'dwyer et al 2007', 'feng & xie 2014', 'fox & royne 2018', 'garg et al 2014', 'golbeck & mauriello 2016', 'govani & pashley 2005', 'graeff & harmon 2002', 'hajli & lin 2016', 'hazari & brown 2013', 'heirman et al 2013', 'hoofnagle et al 2010', 'hossain & zhang 2015', 'jensen et al 2005', 'jiang et al 2013', 'johnson et al 2012', 'keith et al 2013', 'kezer et al 2016', 'kisilevich & mansmann 2010', 'krasnova & kift 2012', 'lawler et al 2012', 'lin & liu 2012', 'litt 2013', 'liu et al 2011', 'madejski et al 2012', 'malik et al 2016', 'mcknight et al 2010', 'mesch 2010', 'millham & atkin 2016', 'milne & culnan 2004', 'miltgen & smith 2015', 'moll et al 2014', 'orito et al 2014', 'ozdemir et al 2017', 'paine et al 2007', 'park 2011', 'park et al 2012', 'pitkanen et al 2012', 'posey et al 2010', 'potzch et al 2010', 'reynolds et al 2011', 'staddon et al 2012', 'steijn et al 2016', 'stutzman et al 2011', 'taddicken 2014', 'torres 2012', 'turow 2003', 'tuunainen et al 2009', 'veltri et al 2011', 'wills & zeljkovic 2011', 'xu et al 2008', 'yao et al 2007', 'youn 2009', 'young & quan-haase 2009', 'zlatolas et al 2015']
# Set view of the same ids for constant-time lookups; `in_dataset` stays a list.
known_ids = set(in_dataset)
nodes = []
links = []
# (source, target) pairs already emitted, so duplicate citations add one link only.
seen_links = set()

# NOTE: 'rU' is the Python 2 spelling of universal-newline mode. Python 3.11
# rejects the 'U' flag; when porting, use open(filepath, newline='') instead.
with open(filepath, 'rU') as fd:
    rd = csv.reader(fd, delimiter="\t", quotechar='"')
    for row in rd:
        # csv yields [] for a blank line; the header row starts with "year".
        if not row or row[0] == "year":
            continue
        # Make nodes
        nodes.append(make_node(row))
        # Make links: column 9 holds the comma-separated ids this paper cites.
        source = row[1].lower() + " " + row[0]
        for cited in row[9].split(","):
            # Strip like every other list column so "a, b" still matches an id.
            target = cited.lower().strip()
            pair = (source, target)
            if target in known_ids and pair not in seen_links:
                seen_links.add(pair)
                links.append({"source": source, "target": target})

In [5]:
import json

In [6]:
# Serialise both structures. A notebook cell only echoes its final expression,
# so each JSON string is kept in a name instead of the first being discarded.
nodes_json = json.dumps(nodes)
links_json = json.dumps(links)
links_json

'[{"source": "bartsch & dienlin 2016", "target": "acquisti & gross 2006"}, {"source": "bartsch & dienlin 2016", "target": "hoofnagle et al 2010"}, {"source": "bartsch & dienlin 2016", "target": "litt 2013"}, {"source": "bartsch & dienlin 2016", "target": "taddicken 2014"}, {"source": "buchi et al 2016", "target": "feng & xie 2014"}, {"source": "buchi et al 2016", "target": "park et al 2012"}, {"source": "buchi et al 2016", "target": "taddicken 2014"}, {"source": "coventry et al 2014", "target": "jensen et al 2005"}, {"source": "coventry et al 2014", "target": "drennan et al 2006"}, {"source": "de wolf et al 2014", "target": "acquisti & gross 2006"}, {"source": "de wolf et al 2014", "target": "dwyer et al 2007"}, {"source": "debatin et al 2009", "target": "acquisti & gross 2006"}, {"source": "debatin et al 2009", "target": "litt 2013"}, {"source": "debatin et al 2009", "target": "stutzman et al 2011"}, {"source": "dienlin & trepte 2014", "target": "yao et al 2007"}, {"source": "doherty 

Count most common variables:

In [7]:
# Collect every non-empty variable label (column 8, comma-separated) from all
# data rows, lower-cased and stripped; labels repeat once per occurrence.
variables = []
# NOTE: 'rU' is the Python 2 spelling of universal-newline mode. Python 3.11
# rejects the 'U' flag; when porting, use open(filepath, newline='') instead.
with open(filepath, 'rU') as fd:
    rd = csv.reader(fd, delimiter="\t", quotechar='"')
    for row in rd:
        # csv yields [] for a blank line; the header row starts with "year".
        if not row or row[0] == "year":
            continue
        cleaned = (part.lower().strip() for part in row[8].split(","))
        variables.extend(part for part in cleaned if part != "")

In [8]:
from collections import Counter
# Tally how many times each variable label occurs in `variables`.
v = Counter(variables)

In [9]:
# Show every (label, count) pair, most frequent first.
v.most_common()

[('privacy concern', 20),
 ('age', 14),
 ('gender', 11),
 ('trust', 10),
 ('privacy concerns', 8),
 ('privacy attitudes', 7),
 ('information disclosure', 6),
 ('self-disclosure', 5),
 ('privacy awareness', 4),
 ('privacy behavior', 3),
 ('knowledge', 3),
 ('privacy control', 3),
 ('income', 3),
 ('privacy settings', 3),
 ('perceived risk', 3),
 ('privacy knowledge', 3),
 ('privacy protection', 2),
 ('social influence', 2),
 ('perceived rewards', 2),
 ('perceived benefits', 2),
 ('privacy social norms', 2),
 ('perceived severity', 2),
 ('risk', 2),
 ('privacy protection strategies', 2),
 ('privacy breaches', 2),
 ('control over personal information', 2),
 ('privacy protection behaviors', 2),
 ('willingness to pay for privacy', 2),
 ('concern about unwanted audiences', 2),
 ('disclosure', 2),
 ('internet use', 2),
 ('use', 2),
 ('interest', 2),
 ('information control', 2),
 ('perceived vulnerability', 2),
 ('behavior', 2),
 ('sharing intentions', 2),
 ('education', 2),
 ('teen privacy co

In [10]:
# Tokenise each label on single spaces. The loop variable is deliberately not
# called `v`: under Python 2 a list-comprehension variable leaks into the
# enclosing scope and would overwrite the Counter bound to `v`.
# This cell rebinds `variables` from strings to token lists, so re-running it
# on its own fails; re-run the cell that builds `variables` first.
variables = [label.split(" ") for label in variables]

In [11]:
import nltk
from nltk.corpus import stopwords
# English stop words, held in a set for fast membership tests.
# NOTE(review): presumably needs the NLTK "stopwords" corpus to be downloaded beforehand — confirm the setup steps.
s_words = set(stopwords.words('english'))

In [12]:
# Drop stop words, then make sure every remaining token is a text string.
# `.decode` only exists on byte strings (the Python 2 `str`); guarding on
# `bytes` keeps the Python 2 result and lets the cell run on Python 3 too.
# Loop names avoid `v` so the Counter bound to `v` is not overwritten by a
# leaking Python 2 comprehension variable.
variables = [
    [tok.decode('utf-8', 'ignore') if isinstance(tok, bytes) else tok
     for tok in tokens if tok not in s_words]
    for tokens in variables
]

In [13]:
from nltk.stem import SnowballStemmer
# English Snowball stemmer, used below to reduce each token to its stem.
stemmer = SnowballStemmer("english")

In [14]:
# Stem every token and glue each label back together with single spaces,
# turning `variables` from a list of token lists back into a list of strings.
stemmed_labels = []
for tokens in variables:
    stems = [stemmer.stem(tok) for tok in tokens]
    stemmed_labels.append(" ".join(stems))
variables = stemmed_labels

In [15]:
# One "<stemmed label>\t<count>" line per label, most frequent first.
# The call form of print gives the same output on Python 2 and Python 3,
# whereas the print statement is a syntax error on Python 3.
for label, count in Counter(variables).most_common():
    print(label + "\t" + str(count))

privaci concern	28
age	14
gender	11
trust	10
privaci attitud	8
inform disclosur	6
self-disclosur	5
privaci awar	4
privaci behavior	4
privaci knowledg	3
knowledg	3
privaci set	3
concern unwant audienc	3
perceiv risk	3
privaci control	3
incom	3
willing pay privaci	2
activ	2
risk	2
privaci valu	2
social influenc	2
internet use	2
inform control	2
privaci breach	2
perceiv privaci risk	2
disclosur	2
privaci protect	2
control person inform	2
perceiv sever	2
use	2
privaci protect behavior	2
perceiv vulner	2
interest	2
privaci protect strategi	2
educ	2
privaci social norm	2
perceiv benefit	2
behavior	2
share intent	2
perceiv reward	2
trust site	2
presenc text/audio/pictori cue privaci polici	1
composit	1
common-dread	1
reason use	1
comput experi	1
trust legal assur	1
site use	1
confidenti disclosur boundari	1
profil updat frequenc	1
perceiv boundari connect disclosur osn	1
content correct	1
privaci self-efficaci	1
altern read notic	1
info disclosur benefit	1
fear	1
privaci concept	1
privaci saf

In [16]:
# Index the stemmed labels by word: each word maps to the list of labels that
# contain it (one entry per occurrence, so repeated labels appear repeatedly).
unique_words = {}
for label in variables:
    for token in label.split(" "):
        unique_words.setdefault(token, []).append(label)

In [17]:
# For each word, in sorted order, print how many distinct labels contain it.
# The call form of print works on both Python 2 and Python 3; a set gives the
# same distinct count as the length of a Counter.
for word in sorted(unique_words):
    print(len(set(unique_words[word])))

1
1
1
1
3
3
2
3
1
1
1
1
1
1
3
1
1
1
2
10
3
9
1
14
4
3
1
1
3
3
1
2
1
1
1
1
1
1
2
1
1
1
1
2
1
1
7
1
1
1
1
1
1
2
1
1
10
2
1
1
12
1
1
1
1
6
10
2
1
1
1
3
1
1
1
1
1
9
1
1
1
3
1
1
2
1
1
1
3
1
1
3
1
1
2
1
2
1
1
1
1
1
4
1
1
3
25
1
3
1
2
7
1
9
2
1
1
5
1
2
2
1
1
3
1
1
1
3
5
1
1
1
1
1
1
4
1
1
5
1
1
1
1
8
2
3
3
1
1
1
1
1
1
34
3
10
1
1
10
2
3
1
79
6
1
1
5
3
1
1
2
3
1
4
1
1
1
3
1
1
1
1
1
3
1
1
10
1
1
1
1
1
1
1
1
4
3
2
1
1
3
8
2
10
1
4
1
2
9
12
1
1
1
2
1
1
1
2
1
5
1
1
1
1
1
4
1
3
2
1
15
1
2
4
3
1
1
1
3
27
2
1
1
2
3
1
1
4
2
1
6
