In [23]:
from xml.sax.saxutils import escape

import pandas as pd

# Load the source CSV into a DataFrame. The path is relative to the current
# working directory. The functions defined below read the 'Source', 'Target',
# 'Text', 'Subcategory', 'Category' and 'Kind' columns from each row.
file_path = "./../draft_six_columns.csv"
df = pd.read_csv(file_path)

def create_xml_node(row):
    """Build the XML ``<node>`` element describing one data piece.

    Args:
        row: Mapping-like record (e.g. a pandas Series or a dict) providing the
            'Target', 'Subcategory', 'Category' and 'Kind' fields.

    Returns:
        str: The node markup, starting and ending with a newline. 'Target' is
        used as the node id and as the ``level_0`` and ``d0`` values; the
        other three fields fill ``level_1`` .. ``level_3``; ``d1`` is always
        ``DATA``.
    """
    # Escape XML special characters so that values containing &, < or >
    # cannot produce malformed markup. Values without such characters are
    # emitted exactly as before.
    data_piece_name = escape(str(row['Target']))
    subcategory = escape(str(row['Subcategory']))
    category = escape(str(row['Category']))
    kind = escape(str(row['Kind']))
    # Inside a double-quoted attribute the quote character must be escaped too.
    node_id = escape(str(row['Target']), {'"': '&quot;'})

    xml_structure = f"""
<node id="{node_id}">
    <data key="level_0">{data_piece_name}</data>
    <data key="level_1">{subcategory}</data>
    <data key="level_2">{category}</data>
    <data key="level_3">{kind}</data>
    <data key="d0">{data_piece_name}</data>
    <data key="d1">DATA</data>
</node>
"""
    return xml_structure

def create_actor_node(actor_name):
    """Build the XML ``<node>`` element describing one actor.

    Args:
        actor_name: Name of the actor; used both as the node id and as the
            ``d0`` value.

    Returns:
        str: The node markup, starting and ending with a newline. ``d1`` is
        always ``ACTOR``.
    """
    # Escape XML special characters; the attribute additionally needs the
    # double quote escaped because it is delimited by double quotes.
    actor_text = escape(str(actor_name))
    actor_id = escape(str(actor_name), {'"': '&quot;'})

    xml_structure = f"""
<node id="{actor_id}">
    <data key="d0">{actor_text}</data>
    <data key="d1">ACTOR</data>
</node>
"""
    return xml_structure

def create_subsum_edges(row, created_subsum_edges):
    """Build the SUBSUM edges implied by one row, skipping pairs already seen.

    Up to three edges are produced, in this order: Subcategory -> Target,
    Category -> Subcategory, Kind -> Category.

    Args:
        row: Mapping-like record (e.g. a pandas Series or a dict) providing the
            'Target', 'Subcategory', 'Category' and 'Kind' fields.
        created_subsum_edges: Set of ``(source, target)`` tuples for edges that
            were already emitted. It is updated in place with every pair
            emitted by this call; the tuples hold the raw (unescaped) values.

    Returns:
        list[str]: Markup for each newly emitted edge (possibly empty); every
        item starts and ends with a newline.
    """
    data_piece_name = row['Target']
    subcategory = row['Subcategory']
    category = row['Category']
    kind = row['Kind']

    # (parent, child) pairs from the most specific level up to the broadest.
    hierarchy = (
        (subcategory, data_piece_name),
        (category, subcategory),
        (kind, category),
    )

    quote_entity = {'"': '&quot;'}
    edges = []
    for parent, child in hierarchy:
        # Avoid repeating subsum edges by tracking created relationships.
        if (parent, child) in created_subsum_edges:
            continue
        # All three values land inside double-quoted attributes, so escape the
        # XML special characters including the double quote.
        source = escape(str(parent), quote_entity)
        target = escape(str(child), quote_entity)
        edges.append(f"""
<edge source="{source}" target="{target}" id="edge_subsum_{source}_{target}">
    <data key="d2">SUBSUM</data>
</edge>
""")
        created_subsum_edges.add((parent, child))

    return edges

def create_collect_edges(row, collect_edges_tracker):
    """Record one row's text under its (actor, data piece) pair.

    Args:
        row: Mapping-like record providing the 'Target', 'Source' and 'Text'
            fields.
        collect_edges_tracker: Dict mapping ``(source, target)`` tuples to the
            list of texts seen for that pair. It is updated in place: a new
            pair gets a fresh one-element list, a known pair gets the text
            appended.

    Returns:
        None.
    """
    target = row['Target']
    source = row['Source']
    snippet = row['Text']

    # setdefault creates the list on first sight of the pair, then we append.
    collect_edges_tracker.setdefault((source, target), []).append(snippet)

def generate_xml_from_dataframe(df, output_file):
    """Serialize the rows of ``df`` into a ``<graph>`` XML document on disk.

    The output contains, in this order: one node per distinct 'Source'
    (actor), one node per distinct 'Target' (data piece), the de-duplicated
    SUBSUM edges, and one COLLECT edge per distinct (Source, Target) pair whose
    ``d3`` value is every 'Text' seen for that pair joined with newlines.

    Args:
        df: pandas DataFrame with 'Source', 'Target', 'Text', 'Subcategory',
            'Category' and 'Kind' columns.
        output_file: Path of the file to write (overwritten if it exists).

    Returns:
        None. Writes the file and prints a confirmation message.

    NOTE(review): SUBSUM edges use Subcategory/Category/Kind values as
    endpoints, but this function only emits ``<node>`` elements for Source and
    Target values -- confirm the consumer of the file tolerates that.
    """
    # Collect fragments in lists and join once at the end instead of growing
    # strings with += inside the loops.
    actor_fragments = []
    data_fragments = []
    subsum_fragments = []
    collect_fragments = []

    actor_nodes = set()  # To track actor nodes
    data_nodes = set()  # To track data nodes
    created_subsum_edges = set()  # To track subsum relationships
    collect_edges_tracker = {}  # To track collect relationships with multiple texts

    for _, row in df.iterrows():
        # Add actor node if not already created
        actor = row['Source']
        if actor not in actor_nodes:
            actor_fragments.append(create_actor_node(actor))
            actor_nodes.add(actor)

        # Add data node if not already created
        data_piece_name = row['Target']
        if data_piece_name not in data_nodes:
            data_fragments.append(create_xml_node(row))
            data_nodes.add(data_piece_name)

        # Create edges for subsum relations (avoiding repetition)
        subsum_fragments.extend(create_subsum_edges(row, created_subsum_edges))

        # Create collect edges (merging multiple texts for the same source-target pair)
        create_collect_edges(row, collect_edges_tracker)

    # Now, create the collect edges with combined texts
    quote_entity = {'"': '&quot;'}
    for (actor, data_piece_name), texts in collect_edges_tracker.items():
        # Missing 'Text' cells arrive as NaN floats, which str.join rejects;
        # drop them. Escape the result so quoted text containing &, < or >
        # stays well-formed XML.
        combined_text = escape(
            "\n".join(str(text) for text in texts if not pd.isna(text))
        )
        source = escape(str(actor), quote_entity)
        target = escape(str(data_piece_name), quote_entity)
        collect_fragments.append(f"""
<edge source="{source}" target="{target}" id="edge_collect_{source}_{target}">
    <data key="d2">COLLECT</data>
    <data key="d3">{combined_text}</data>
</edge>
""")

    xml_actor_nodes = ''.join(actor_fragments)
    xml_data_nodes = ''.join(data_fragments)
    xml_subsum_edges = ''.join(subsum_fragments)
    xml_collect_edges = ''.join(collect_fragments)

    # Combine all nodes first and then edges in the correct order
    final_xml_structure = f"<graph>\n{xml_actor_nodes}\n{xml_data_nodes}\n{xml_subsum_edges}\n{xml_collect_edges}\n</graph>"

    # Write with an explicit encoding so non-ASCII text is written the same
    # way on every platform instead of depending on the locale default.
    with open(output_file, 'w', encoding='utf-8') as file:
        file.write(final_xml_structure)

    print(f"XML written to {output_file}")

# Example usage
# `df` is the DataFrame loaded above; it must provide the 'Source', 'Target',
# 'Text', 'Subcategory', 'Category' and 'Kind' columns read by the functions.
output_file = 'output.xml'  # Output path, relative to the current working directory
generate_xml_from_dataframe(df, output_file)


XML written to output.xml


### Generating the lists of unique actors and unique data pieces:

In [28]:
import pandas as pd

# Load the CSV file (path is relative to the current working directory)
file_path = './../draft_six_columns.csv'
df = pd.read_csv(file_path)

# Get unique actors from the 'Source' column, in order of first appearance
unique_actors = df['Source'].unique().tolist()

# Get unique data pieces from the 'Target' column, in order of first appearance
unique_data_pieces = df['Target'].unique().tolist()

# Print unique actors, one per line
print("Unique Actors:")
for actor in unique_actors:
    print(actor)

# Print unique data pieces, one per line, separated from the actors by a blank line
print("\nUnique Data Pieces:")
for data_piece in unique_data_pieces:
    print(data_piece)


Unique Actors:
We (TikTok)
Advertising, Measurement and Other Partners
Merchants, Payment and Transaction Fulfillment Providers
Third Party Platforms
Other Users
Third Party Services with TikTok Developer Tools
Third Party Providers
Organisations, Businesses, People, and Others
Service Providers
Third Party Platforms and Partners
Advertisers
Third Party Measurement Providers
Merchants, Payment and Transaction Fulfilment Providers, and Other Service Providers
Entities within our corporate group
Users and the Public
Search Engines, Content Aggregators, and News Sites
Researchers
Corporate Transaction Parties
Law Enforcement Agencies, or Other Third Parties
Public Authorities
Copyright Holders
Other Third Parties

Unique Data Pieces:
Date of Birth
Username
Email Address
Telephone Number
Password
Profile Bio
Profile Photo
Photographs
Videos
Audio Recordings
Livestreams
Comments
Hashtags
Feedback
Reviews
Creation Time
Creation Date
Location of Content Creation
Creator Identity
Clipboard Tex