In [59]:
import pandas as pd

# Read the relations table that the graph-building code below consumes
# (it uses the Source, Target, Subcategory, Category, Kind and Text columns).
file_path = "./../tiktok_PP.csv"
df = pd.read_csv(filepath_or_buffer=file_path)

def create_xml_node(row):
    """Build the ``<node>`` element describing one data piece.

    Args:
        row: Mapping-like record (a dict or a DataFrame row) that provides the
            'Target', 'Subcategory', 'Category' and 'Kind' values.

    Returns:
        str: The node markup, starting and ending with a newline. The node id,
        ``level_0`` and ``d0`` hold the 'Target' value, ``level_1``..``level_3``
        hold subcategory, category and kind, and ``d1`` is the constant ``DATA``.
    """
    # Imported locally so the function works on its own inside any notebook cell.
    from xml.sax.saxutils import escape

    def _text(value):
        # Element content: '&', '<' and '>' must be escaped to stay well-formed.
        return escape(str(value))

    def _attr(value):
        # Attribute values are double-quoted, so '"' has to be escaped as well.
        return escape(str(value), {'"': '&quot;'})

    data_piece_name = row['Target']
    subcategory = row['Subcategory']
    category = row['Category']
    kind = row['Kind']

    xml_structure = f"""
<node id="{_attr(data_piece_name)}">
    <data key="level_0">{_text(data_piece_name)}</data>
    <data key="level_1">{_text(subcategory)}</data>
    <data key="level_2">{_text(category)}</data>
    <data key="level_3">{_text(kind)}</data>
    <data key="d0">{_text(data_piece_name)}</data>
    <data key="d1">DATA</data>
</node>
"""
    return xml_structure

def create_actor_node(actor_name):
    """Build the ``<node>`` element for an actor.

    Args:
        actor_name: Name of the actor; used as node id and as the ``d0`` value.

    Returns:
        str: The node markup (with ``d1`` set to ``ACTOR``), starting and
        ending with a newline.
    """
    # Imported locally so the function works on its own inside any notebook cell.
    from xml.sax.saxutils import escape

    # Attribute values are double-quoted, so '"' is escaped on top of '&', '<', '>'.
    node_id = escape(str(actor_name), {'"': '&quot;'})
    # Element content only needs '&', '<' and '>' escaped.
    label = escape(str(actor_name))

    xml_structure = f"""
<node id="{node_id}">
    <data key="d0">{label}</data>
    <data key="d1">ACTOR</data>
</node>
"""
    return xml_structure

def create_subsum_edges(row, created_subsum_edges):
    """Build the SUBSUM edges of one row that have not been emitted yet.

    Three parent -> child links are considered, in this order:
    subcategory -> data piece, category -> subcategory, kind -> category.

    Args:
        row: Mapping-like record providing 'Target', 'Subcategory',
            'Category' and 'Kind'.
        created_subsum_edges: Set of ``(source, target)`` pairs already
            emitted. It is updated in place with every pair emitted here.

    Returns:
        list[str]: Markup of the newly created ``<edge>`` elements (possibly empty).
    """
    # Imported locally so the function works on its own inside any notebook cell.
    from xml.sax.saxutils import escape

    def _attr(value):
        # All values end up inside double-quoted attributes.
        return escape(str(value), {'"': '&quot;'})

    data_piece_name = row['Target']
    subcategory = row['Subcategory']
    category = row['Category']
    kind = row['Kind']

    hierarchy = (
        (subcategory, data_piece_name),
        (category, subcategory),
        (kind, category),
    )

    edges = []
    for source, target in hierarchy:
        # The tracker stores the raw (unescaped) names so that de-duplication
        # does not depend on how the names are serialised.
        if (source, target) in created_subsum_edges:
            continue
        edges.append(f"""
<edge source="{_attr(source)}" target="{_attr(target)}" id="edge_subsum_{_attr(source)}_{_attr(target)}">
    <data key="d2">SUBSUM</data>
</edge>
""")
        created_subsum_edges.add((source, target))

    return edges

def create_collect_edges(row, collect_edges_tracker):
    """Record the row's text under its (actor, data piece) pair.

    Args:
        row: Mapping-like record providing 'Target', 'Source' and 'Text'.
        collect_edges_tracker: Dict mapping ``(actor, data_piece_name)`` to the
            list of texts seen so far for that pair; updated in place.

    Returns:
        None
    """
    # Read every field before touching the tracker.
    target_name, source_name, snippet = row['Target'], row['Source'], row['Text']

    # First occurrence creates the list, later ones extend it.
    collect_edges_tracker.setdefault((source_name, target_name), []).append(snippet)

def generate_xml_from_dataframe(df, output_file):
    """Serialise the relations in ``df`` into a ``<graph>`` XML file.

    For every row an actor node ('Source') and a data node ('Target') are
    emitted once, SUBSUM edges are emitted once per parent/child pair, and one
    COLLECT edge is emitted per (actor, data piece) pair with all of that
    pair's 'Text' values joined by newlines.

    Args:
        df: DataFrame with the columns 'Source', 'Target', 'Subcategory',
            'Category', 'Kind' and 'Text'.
        output_file: Path of the file to write (overwritten if it exists).

    Returns:
        None. The XML is written to ``output_file`` and a confirmation is printed.
    """
    # Imported locally so the function works on its own inside any notebook cell.
    from xml.sax.saxutils import escape

    def _attr(value):
        # Attribute values are double-quoted, so '"' is escaped too.
        return escape(str(value), {'"': '&quot;'})

    # Fragments are collected in lists and joined once at the end.
    actor_fragments = []
    data_fragments = []
    subsum_fragments = []
    collect_fragments = []

    actor_nodes = set()  # Actors that already have a node
    data_nodes = set()  # Data pieces that already have a node
    created_subsum_edges = set()  # (source, target) pairs already linked
    collect_edges_tracker = {}  # (actor, data piece) -> list of texts

    for _, row in df.iterrows():
        actor = row['Source']
        if actor not in actor_nodes:
            actor_fragments.append(create_actor_node(actor))
            actor_nodes.add(actor)

        data_piece_name = row['Target']
        if data_piece_name not in data_nodes:
            data_fragments.append(create_xml_node(row))
            data_nodes.add(data_piece_name)

        subsum_fragments.extend(create_subsum_edges(row, created_subsum_edges))

        # Only records the text here; the edges are rendered after the loop.
        create_collect_edges(row, collect_edges_tracker)

    for (actor, data_piece_name), texts in collect_edges_tracker.items():
        # Empty CSV cells are read by pandas as NaN (a float); joining them
        # directly would raise TypeError, so missing texts are skipped.
        combined_text = "\n".join(str(text) for text in texts if not pd.isna(text))
        collect_fragments.append(f"""
<edge source="{_attr(actor)}" target="{_attr(data_piece_name)}" id="edge_collect_{_attr(actor)}_{_attr(data_piece_name)}">
    <data key="d2">COLLECT</data>
    <data key="d3">{escape(combined_text)}</data>
</edge>
""")

    # Nodes first, then edges.
    xml_actor_nodes = "".join(actor_fragments)
    xml_data_nodes = "".join(data_fragments)
    xml_subsum_edges = "".join(subsum_fragments)
    xml_collect_edges = "".join(collect_fragments)
    final_xml_structure = f"<graph>\n{xml_actor_nodes}\n{xml_data_nodes}\n{xml_subsum_edges}\n{xml_collect_edges}\n</graph>"

    # The file carries no XML declaration, so parsers assume UTF-8; write it
    # as UTF-8 explicitly instead of relying on the platform default encoding.
    with open(output_file, 'w', encoding='utf-8') as handle:
        handle.write(final_xml_structure)

    print(f"XML written to {output_file}")

# Example run: `df` is the DataFrame loaded at the top of this cell and must
# provide the columns the generator reads.
output_file = "./../graphmls/tiktok.graphml"
generate_xml_from_dataframe(df=df, output_file=output_file)


XML written to ./../graphmls/tiktok.graphml


### Generating the lists of unique actors and unique data pieces:

In [70]:
import pandas as pd

# Re-read the CSV so that this cell can run on its own
file_path = './../tiktok_PP.csv'
df = pd.read_csv(file_path)

# Distinct actors ('Source' column) and distinct data pieces ('Target' column)
unique_actors = df['Source'].unique().tolist()
unique_data_pieces = df['Target'].unique().tolist()

# Print each list under its heading, one entry per line
for heading, entries in (
    ("Unique Actors:", unique_actors),
    ("\nUnique Data Pieces:", unique_data_pieces),
):
    print(heading)
    for entry in entries:
        print(entry)


Unique Actors:
we
Advertising, Measurement and Other Partners
Merchants, Payment and Transaction Fulfillment Providers
Third Party Platforms
Other Users
Third Party Services with TikTok Developer Tools
Third Party Providers
Organisations, Businesses, People, and Others
Service Providers
Third Party Platforms and Partners
Advertisers
Third Party Measurement Providers
Merchants, Payment and Transaction Fulfilment Providers, and Other Service Providers
Entities within our corporate group
Users and the Public
Search Engines, Content Aggregators, and News Sites
Researchers
Corporate Transaction Parties
Law Enforcement Agencies, or Other Third Parties
Public Authorities
Copyright Holders
Other Third Parties

Unique Data Pieces:
Date of Birth
Username
Email Address
Telephone Number
Password
Profile Bio
Profile Photo
Photographs
Videos
Audio Recordings
Livestreams
Comments
Hashtags
Feedback
Reviews
Creation Time
Creation Date
Location of Content Creation
Creator Identity
Text from Clipboard
Im

### Generating edges representing access to data pieces or subcategories

In [84]:
import pandas as pd

# Read the relations table that the graph-building code below consumes
# (it uses the Source, Target, Subcategory, Category, Kind and Text columns).
file_path = "./../tiktok_PP.csv"
df = pd.read_csv(filepath_or_buffer=file_path)

def create_xml_node(data_piece_name, subcategory, category, kind):
    """Build the ``<node>`` element describing one data piece.

    Args:
        data_piece_name: Name of the data piece; used as node id, ``level_0``
            and ``d0``.
        subcategory: Value stored under ``level_1``.
        category: Value stored under ``level_2``.
        kind: Value stored under ``level_3``.

    Returns:
        str: The node markup (with ``d1`` set to ``DATA``), starting and
        ending with a newline.
    """
    # Imported locally so the function works on its own inside any notebook cell.
    from xml.sax.saxutils import escape

    def _text(value):
        # Element content: '&', '<' and '>' must be escaped to stay well-formed.
        return escape(str(value))

    # Attribute values are double-quoted, so '"' has to be escaped as well.
    node_id = escape(str(data_piece_name), {'"': '&quot;'})

    xml_structure = f"""
<node id="{node_id}">
    <data key="level_0">{_text(data_piece_name)}</data>
    <data key="level_1">{_text(subcategory)}</data>
    <data key="level_2">{_text(category)}</data>
    <data key="level_3">{_text(kind)}</data>
    <data key="d0">{_text(data_piece_name)}</data>
    <data key="d1">DATA</data>
</node>
"""
    return xml_structure

def create_subcategory_node(subcategory, category, kind):
    """Build the ``<node>`` element for a subcategory.

    Args:
        subcategory: Name of the subcategory; used as node id, ``level_0``,
            ``level_1`` and ``d0``.
        category: Value stored under ``level_2``.
        kind: Value stored under ``level_3``.

    Returns:
        str: The node markup (with ``d1`` set to ``DATA``), starting and
        ending with a newline.
    """
    # Imported locally so the function works on its own inside any notebook cell.
    from xml.sax.saxutils import escape

    def _text(value):
        # Element content: '&', '<' and '>' must be escaped to stay well-formed.
        return escape(str(value))

    # Attribute values are double-quoted, so '"' has to be escaped as well.
    node_id = escape(str(subcategory), {'"': '&quot;'})

    xml_structure = f"""
<node id="{node_id}">
    <data key="level_0">{_text(subcategory)}</data>
    <data key="level_1">{_text(subcategory)}</data>
    <data key="level_2">{_text(category)}</data>
    <data key="level_3">{_text(kind)}</data>
    <data key="d0">{_text(subcategory)}</data>
    <data key="d1">DATA</data>
</node>
"""
    return xml_structure

def create_category_node(category, kind):
    """Build the ``<node>`` element for a category.

    Args:
        category: Name of the category; used as node id, ``level_0``,
            ``level_1`` and ``d0``.
        kind: Value stored under ``level_2``.

    Returns:
        str: The node markup (with ``d1`` set to ``DATA``), starting and
        ending with a newline.
    """
    # Imported locally so the function works on its own inside any notebook cell.
    from xml.sax.saxutils import escape

    def _text(value):
        # Element content: '&', '<' and '>' must be escaped to stay well-formed.
        return escape(str(value))

    # Attribute values are double-quoted, so '"' has to be escaped as well.
    node_id = escape(str(category), {'"': '&quot;'})

    xml_structure = f"""
<node id="{node_id}">
    <data key="level_0">{_text(category)}</data>
    <data key="level_1">{_text(category)}</data>
    <data key="level_2">{_text(kind)}</data>
    <data key="d0">{_text(category)}</data>
    <data key="d1">DATA</data>
</node>
"""
    return xml_structure

def create_kind_node(kind):
    """Build the ``<node>`` element for a kind (the top hierarchy level).

    Args:
        kind: Name of the kind; used as node id, ``level_0`` and ``d0``.

    Returns:
        str: The node markup (with ``d1`` set to ``DATA``), starting and
        ending with a newline.
    """
    # Imported locally so the function works on its own inside any notebook cell.
    from xml.sax.saxutils import escape

    # Attribute values are double-quoted, so '"' is escaped on top of '&', '<', '>'.
    node_id = escape(str(kind), {'"': '&quot;'})
    # Element content only needs '&', '<' and '>' escaped.
    label = escape(str(kind))

    xml_structure = f"""
<node id="{node_id}">
    <data key="level_0">{label}</data>
    <data key="d0">{label}</data>
    <data key="d1">DATA</data>
</node>
"""
    return xml_structure

def create_actor_node(actor_name):
    """Build the ``<node>`` element for an actor.

    Args:
        actor_name: Name of the actor; used as node id and as the ``d0`` value.

    Returns:
        str: The node markup (with ``d1`` set to ``ACTOR``), starting and
        ending with a newline.
    """
    # Imported locally so the function works on its own inside any notebook cell.
    from xml.sax.saxutils import escape

    # Attribute values are double-quoted, so '"' is escaped on top of '&', '<', '>'.
    node_id = escape(str(actor_name), {'"': '&quot;'})
    # Element content only needs '&', '<' and '>' escaped.
    label = escape(str(actor_name))

    xml_structure = f"""
<node id="{node_id}">
    <data key="d0">{label}</data>
    <data key="d1">ACTOR</data>
</node>
"""
    return xml_structure

def create_subsum_edges(row, created_subsum_edges):
    """Build the SUBSUM edges of one row that have not been emitted yet.

    Three parent -> child links are considered, in this order:
    subcategory -> data piece, category -> subcategory, kind -> category.

    Args:
        row: Mapping-like record providing 'Subcategory', 'Category', 'Kind'
            and 'Target'.
        created_subsum_edges: Set of ``(source, target)`` pairs already
            emitted. It is updated in place with every pair emitted here.

    Returns:
        list[str]: Markup of the newly created ``<edge>`` elements (possibly empty).
    """
    # Imported locally so the function works on its own inside any notebook cell.
    from xml.sax.saxutils import escape

    def _attr(value):
        # All values end up inside double-quoted attributes.
        return escape(str(value), {'"': '&quot;'})

    subcategory = row['Subcategory']
    category = row['Category']
    kind = row['Kind']
    data_piece_name = row['Target']

    hierarchy = (
        (subcategory, data_piece_name),
        (category, subcategory),
        (kind, category),
    )

    edges = []
    for source, target in hierarchy:
        # The tracker stores the raw (unescaped) names so that de-duplication
        # does not depend on how the names are serialised.
        if (source, target) in created_subsum_edges:
            continue
        edges.append(f"""
<edge source="{_attr(source)}" target="{_attr(target)}" id="edge_subsum_{_attr(source)}_{_attr(target)}">
    <data key="d2">SUBSUM</data>
</edge>
""")
        created_subsum_edges.add((source, target))

    return edges

def create_collect_edges(row, collect_edges_tracker, access_to_subcategories=True):
    """Record the row's text under its (actor, target) pair.

    Args:
        row: Mapping-like record providing 'Source', 'Text' and either
            'Subcategory' or 'Target' depending on ``access_to_subcategories``.
        collect_edges_tracker: Dict mapping ``(actor, target)`` to the list of
            texts seen so far for that pair; updated in place.
        access_to_subcategories: When truthy the pair's target is the row's
            'Subcategory'; otherwise it is the row's 'Target'.

    Returns:
        None
    """
    source_name = row['Source']
    if access_to_subcategories:
        target_name = row['Subcategory']
    else:
        target_name = row['Target']
    snippet = row['Text']

    # First occurrence creates the list, later ones extend it.
    collect_edges_tracker.setdefault((source_name, target_name), []).append(snippet)

def generate_xml_from_dataframe(df, output_file, access_to_subcategories=True):
    """Serialise the relations in ``df`` into a ``<graph>`` XML file.

    For every row an actor node plus one node per hierarchy level (kind,
    category, subcategory, data piece) is emitted once, SUBSUM edges are
    emitted once per parent/child pair, and one COLLECT edge is emitted per
    (actor, target) pair with all of that pair's 'Text' values joined by
    newlines.

    Args:
        df: DataFrame with the columns 'Source', 'Target', 'Subcategory',
            'Category', 'Kind' and 'Text'.
        output_file: Path of the file to write (overwritten if it exists).
        access_to_subcategories: Forwarded to ``create_collect_edges``; selects
            whether COLLECT edges target the row's 'Subcategory' (truthy) or
            its 'Target' (falsy).

    Returns:
        None. The XML is written to ``output_file`` and a confirmation is printed.
    """
    # Imported locally so the function works on its own inside any notebook cell.
    from xml.sax.saxutils import escape

    def _attr(value):
        # Attribute values are double-quoted, so '"' is escaped too.
        return escape(str(value), {'"': '&quot;'})

    # Fragments are collected in lists and joined once at the end.
    actor_fragments = []
    data_fragments = []
    subsum_fragments = []
    collect_fragments = []

    actor_nodes = set()  # Actors that already have a node
    # One shared set for kind, category, subcategory and data-piece names, so
    # a name that occurs on several levels yields a single node.
    data_nodes = set()
    created_subsum_edges = set()  # (source, target) pairs already linked
    collect_edges_tracker = {}  # (actor, target) -> list of texts

    for _, row in df.iterrows():
        actor = row['Source']
        data_piece_name = row['Target']
        subcategory = row['Subcategory']
        category = row['Category']
        kind = row['Kind']

        if actor not in actor_nodes:
            actor_fragments.append(create_actor_node(actor))
            actor_nodes.add(actor)

        # Hierarchy nodes are created top-down: kind, category, subcategory, data piece.
        if kind not in data_nodes:
            data_fragments.append(create_kind_node(kind))
            data_nodes.add(kind)

        if category not in data_nodes:
            data_fragments.append(create_category_node(category, kind))
            data_nodes.add(category)

        if subcategory not in data_nodes:
            data_fragments.append(create_subcategory_node(subcategory, category, kind))
            data_nodes.add(subcategory)

        if data_piece_name not in data_nodes:
            data_fragments.append(create_xml_node(data_piece_name, subcategory, category, kind))
            data_nodes.add(data_piece_name)

        subsum_fragments.extend(create_subsum_edges(row, created_subsum_edges))

        # Only records the text here; the edges are rendered after the loop.
        create_collect_edges(row, collect_edges_tracker, access_to_subcategories)

    for (actor, target), texts in collect_edges_tracker.items():
        # Empty CSV cells are read by pandas as NaN (a float); joining them
        # directly would raise TypeError, so missing texts are skipped.
        combined_text = "\n".join(str(text) for text in texts if not pd.isna(text))
        collect_fragments.append(f"""
<edge source="{_attr(actor)}" target="{_attr(target)}" id="edge_collect_{_attr(actor)}_{_attr(target)}">
    <data key="d2">COLLECT</data>
    <data key="d3">{escape(combined_text)}</data>
</edge>
""")

    # Nodes first, then edges.
    xml_actor_nodes = "".join(actor_fragments)
    xml_data_nodes = "".join(data_fragments)
    xml_subsum_edges = "".join(subsum_fragments)
    xml_collect_edges = "".join(collect_fragments)
    final_xml_structure = f"<graph>\n{xml_actor_nodes}\n{xml_data_nodes}\n{xml_subsum_edges}\n{xml_collect_edges}\n</graph>"

    # The file carries no XML declaration, so parsers assume UTF-8; write it
    # as UTF-8 explicitly instead of relying on the platform default encoding.
    with open(output_file, 'w', encoding='utf-8') as handle:
        handle.write(final_xml_structure)

    print(f"XML written to {output_file}")

# Example run: set `access_to_subcategories` to True to point COLLECT edges at
# subcategories, or to False to point them at the individual data pieces.
output_file = "./../graphmls/tiktok.graphml"
generate_xml_from_dataframe(df=df, output_file=output_file, access_to_subcategories=False)


XML written to ./../graphmls/tiktok.graphml


### Merging both versions of `dataCategories` into the union of their values for each key

In [93]:
import json
from collections import defaultdict

# First version of dataCategories: maps each category name to a list of item
# names. The merge step further down unions these lists with those of
# dataCategories_v2, key by key.
# NOTE(review): the items look like individual data pieces (several match the
# 'Target' values printed earlier in this notebook) -- confirm.
dataCategories_v1 = {
    "Personally Identifiable Information": [
        "Date of Birth",
        "Email Address",
        "Telephone Number",
        "Username",
        "Password",
        "Profile Bio",
        "Profile Photo",
        "Proof of Identity",
        "Proof of Age"
    ],
    "Contact Information": [
        "Names from Phone Book",
        "Phone Numbers from Phone Book",
        "Email Addresses from Phone Book",
        "Your Social Network Public Profile Information",
        "Names of Social Network Contacts",
        "Profiles of Social Network Contacts",
        "Contacts Provided by Others",
        "Contact Information Synced by Others"
    ],
    "User-Generated Content": [
        "Photographs",
        "Videos",
        "Audio Recordings",
        "Livestreams",
        "Comments",
        "Hashtags",
        "Feedback",
        "Reviews",
        "Text from Clipboard",
        "Images from Clipboard",
        "Videos from Clipboard",
        "Survey Responses",
        "Research Participation Data",
        "Contest Entries",
        "Marketing Campaign Participation",
        "Event Participation",
        "Form Data",
        "Creation Time",
        "Creation Date",
        "Creator Identity"
    ],
    "Communication Data": [
        "Message Content",
        "Timestamps",
        "Chats with Merchants",
        "Virtual Assistant Interactions",
        "Policy Violation Reports"
    ],
    "Transaction and Financial Information": [
        "Payment Card Details",
        "Billing Information",
        "Delivery Information",
        "Contact Information (Purchases)",
        "Items Purchased",
        "Payment Confirmation Details",
        "Transaction Amounts",
        "Purchase or Payment Dates",
        "Shipping Address",
        "Delivery Status"
    ],
    "Device and Technical Information": [
        "Device Model",
        "Operating System",
        "Keystroke Patterns or Rhythms",
        "IP Address",
        "System Language",
        "Crash Reports",
        "Performance Logs",
        "Device ID",
        "User ID",
        "Network Type",
        "Notification Settings"
    ],
    "Location Data": [
        "Approximate Location (SIM and IP)",
        "Approximate Location (Device)",
        "Location Information",
        "Location of Content Creation"
    ],
    "Usage and Interaction Data": [
        "Content Viewed",
        "Duration of Use",
        "Frequency of Use",
        "Interactions with Other Users",
        "Search History",
        "Settings"
    ],
    "Cookies and Tracking Technologies": [
        "Cookie Identifiers",
        "Session Tokens",
        "Web Beacons",
        "Pixel Tags"
    ],
    "Inferred and Analytical Data": [
        "Inferred Age-Range",
        "Inferred Gender",
        "Interests and Preferences",
        "Objects and Scenery Recognition",
        "Face or Body Part Detection",
        "Speech-to-Text Transcriptions",
        "Information about ad performance"
    ],
    "Third-Party and External Data": [
        "Activities on Other Websites and Apps",
        "Products or Services Purchased Elsewhere",
        "Mobile Identifiers for Advertising",
        "Hashed Email Addresses",
        "Hashed Phone Numbers",
        "Email Address from Third Parties",
        "User ID from Third Parties",
        "Public Profile from Third Parties",
        "Data from TikTok Developer Tools Integrations",
        "Safety and Content Moderation Data",
        "Publicly Available Information",
        "Data from Government Authorities",
        "Data from Professional Organisations",
        "Data from Charity Groups",
        "Mentions in Content",
        "Cookie Identifiers (Third Parties)",
        "Delivery Information (from Merchants)"
    ],
    "Ambiguous or Non-specified Data": [
        "Ambiguous or Non-specified Data"
    ]
}

# Second version of dataCategories: uses the same twelve category keys as
# dataCategories_v1, each mapped to a shorter list of item names. A few names
# also occur in v1 (e.g. "Contacts Provided by Others", "Mentions in Content").
# NOTE(review): these entries read like subcategory-level groupings rather
# than individual data pieces -- confirm.
dataCategories_v2 = {
    "Personally Identifiable Information": [
        "Basic Personal Details",
        "Profile Information",
        "Identification Documents"
    ],
    "Contact Information": [
        "Device Contacts",
        "Social Network Contacts",
        "Contacts Provided by Others"
    ],
    "User-Generated Content": [
        "Media Content",
        "Textual Content",
        "Clipboard Content",
        "Content Metadata",
        "User Participation Data"
    ],
    "Communication Data": [
        "Direct Messages",
        "Merchant Communications",
        "Support and Feedback"
    ],
    "Transaction and Financial Information": [
        "Payment Information",
        "Purchase Details",
        "Delivery Information"
    ],
    "Device and Technical Information": [
        "Device Information",
        "Network Information",
        "Technical Diagnostics",
        "Automatically Assigned Identifiers"
    ],
    "Location Data": [
        "Approximate Location",
        "Precise Location (with Permission)",
        "Location Tags in Content"
    ],
    "Usage and Interaction Data": [
        "Engagement Metrics",
        "Search and Activity History",
        "Settings and Preferences"
    ],
    "Cookies and Tracking Technologies": [
        "Cookie Data",
        "Tracking Information",
        "Usage Purposes"
    ],
    "Inferred and Analytical Data": [
        "Inferred Attributes",
        "Content Analysis",
        "Behavioral Profiles",
        "Aggregated Statistics"
    ],
    "Third-Party and External Data": [
        "Data from Advertising and Analytics Partners",
        "Data from Merchants and Service Providers",
        "Data from Third-Party Platforms",
        "Data from Public and External Sources",
        "Mentions in Content"
    ],
    "Ambiguous or Non-specified Data": [
        "All Data"
    ]
}

# Union the item lists of both versions, category by category
category_union = defaultdict(set)
for version in (dataCategories_v1, dataCategories_v2):
    for category, items in version.items():
        category_union[category] |= set(items)

# Plain dict with every category's items as an alphabetically sorted list
merged_dataCategories = dict(
    (name, sorted(members)) for name, members in category_union.items()
)

# Render as indented JSON and show it
pretty_json = json.dumps(merged_dataCategories, indent=4)
print(pretty_json)


{
    "Personally Identifiable Information": [
        "Basic Personal Details",
        "Date of Birth",
        "Email Address",
        "Identification Documents",
        "Password",
        "Profile Bio",
        "Profile Information",
        "Profile Photo",
        "Proof of Age",
        "Proof of Identity",
        "Telephone Number",
        "Username"
    ],
    "Contact Information": [
        "Contact Information Synced by Others",
        "Contacts Provided by Others",
        "Device Contacts",
        "Email Addresses from Phone Book",
        "Names from Phone Book",
        "Names of Social Network Contacts",
        "Phone Numbers from Phone Book",
        "Profiles of Social Network Contacts",
        "Social Network Contacts",
        "Your Social Network Public Profile Information"
    ],
    "User-Generated Content": [
        "Audio Recordings",
        "Clipboard Content",
        "Comments",
        "Content Metadata",
        "Contest Entries",
        "Crea