In [43]:
import pandas as pd
import json
from pathlib import Path

In [29]:
current_dir = Path.cwd()  # .../src/data_processing
project_root = current_dir.parent.parent  # Go up 2 levels to main dir

output_parent =  project_root / "data" / "processed"
raw_data_folder = project_root / "data" / "raw"

In [30]:
df = pd.read_csv(raw_data_folder / "backbone.csv")
df.head()

Unnamed: 0,0,1,2,category,prevalence
0,orange_peel,orange,14,plant,0.027302
1,orange,orange_peel,14,fruit,0.081571
2,orange,orange_juice,54,fruit,0.081571
3,orange,citrus,51,fruit,0.081571
4,orange,citrus_peel,40,fruit,0.081571


In [4]:
# set column header
df.columns = ["ingredient1", "ingredient2", "num_shared_compound", "category", "prevalence"]
df.head()

Unnamed: 0,ingredient1,ingredient2,num_shared_compound,category,prevalence
0,orange_peel,orange,14,plant,0.027302
1,orange,orange_peel,14,fruit,0.081571
2,orange,orange_juice,54,fruit,0.081571
3,orange,citrus,51,fruit,0.081571
4,orange,citrus_peel,40,fruit,0.081571


In [28]:
data = {
    "ingredient_pairs": [],
    "ingredients": {}
}

processed_pairs = set()

for _, row in df.iterrows():
    # Handle ingredient pairs
    pair = tuple(sorted([row["ingredient1"], row["ingredient2"]]))  # Sort to avoid duplicates
    
    if pair not in processed_pairs:
        data["ingredient_pairs"].append({
            "ingredient1": pair[0],
            "ingredient2": pair[1],
            "num_shared_compound": row["num_shared_compound"]
        })
        processed_pairs.add(pair)
    
    # Handle ingredient info
    if row["ingredient1"] not in data["ingredients"]:
        data["ingredients"][row["ingredient1"]] = {
            "category": row["category"],
            "prevalence": row["prevalence"]
        }

# Save to JSON
with open(output_parent/"ingredient_data.json", "w", encoding='utf-8') as f:
    json.dump(data, f, indent=2, ensure_ascii=False)

In [None]:
with open(output_parent/"ingredient_data.json", "r") as file:
    ingredient_data = json.load(file)

In [None]:
ingredient_pairs = ingredient_data['ingredient_pairs']
ingredients_info = ingredient_data['ingredients']

In [18]:
ingredient_pairs

[{'ingredient1': 'orange',
  'ingredient2': 'orange_peel',
  'num_shared_compound': 14},
 {'ingredient1': 'orange',
  'ingredient2': 'orange_juice',
  'num_shared_compound': 54},
 {'ingredient1': 'citrus', 'ingredient2': 'orange', 'num_shared_compound': 51},
 {'ingredient1': 'citrus_peel',
  'ingredient2': 'orange',
  'num_shared_compound': 40},
 {'ingredient1': 'nutmeg', 'ingredient2': 'orange', 'num_shared_compound': 19},
 {'ingredient1': 'bergamot',
  'ingredient2': 'orange',
  'num_shared_compound': 9},
 {'ingredient1': 'orange',
  'ingredient2': 'strawberry',
  'num_shared_compound': 69},
 {'ingredient1': 'grapefruit',
  'ingredient2': 'orange',
  'num_shared_compound': 30},
 {'ingredient1': 'mandarin',
  'ingredient2': 'orange',
  'num_shared_compound': 57},
 {'ingredient1': 'lemon_peel',
  'ingredient2': 'orange',
  'num_shared_compound': 8},
 {'ingredient1': 'apple', 'ingredient2': 'orange', 'num_shared_compound': 67},
 {'ingredient1': 'lime', 'ingredient2': 'orange', 'num_shar

In [None]:
ingredients_info

{'orange_peel': {'category': 'plant', 'prevalence': 0.0273017994368},
 'orange': {'category': 'fruit', 'prevalence': 0.0815712853802},
 'cinnamon': {'category': 'spice', 'prevalence': 0.267433535392},
 'tarragon': {'category': 'herb', 'prevalence': 0.0227196792516},
 'anise': {'category': 'spice', 'prevalence': 0.0106438833469},
 'ginger': {'category': 'spice', 'prevalence': 0.182139277361},
 'fennel': {'category': 'herb', 'prevalence': 0.0432914896664},
 'nutmeg': {'category': 'spice', 'prevalence': 0.119898811513},
 'guava': {'category': 'fruit', 'prevalence': 0.000620495441745},
 'star_anise': {'category': 'spice', 'prevalence': 0.00434346809222},
 'cherry': {'category': 'fruit', 'prevalence': 0.0517397737578},
 'apple': {'category': 'fruit', 'prevalence': 0.115412152165},
 'roasted_pork': {'category': 'meat', 'prevalence': 0.00596630232447},
 'pork': {'category': 'meat', 'prevalence': 0.0937425421221},
 'raw_beef': {'category': 'meat', 'prevalence': 9.54608371915e-05},
 'beef': {'c

In [48]:
conn = {}

# has ingredient connections
with open(raw_data_folder / "srep00196-s3.csv", "r") as file:
    for line in file:
        s = line.strip().split(",")
        cuisine = s[0]
        ingredients = s[1:]
        
        if cuisine not in conn:
            conn[cuisine] = [ingredients]
        else:
            conn[cuisine].append(ingredients)

with open(output_parent/"cuisine_ingredient_data.json", "w") as file:
    json.dump(conn, file, indent=2, ensure_ascii=False)