In [1]:
import pandas as pd

# Load the preprocessed dataset
df = pd.read_csv('dataset_preprocessed.csv')
df.head()

Unnamed: 0,Age,Gender,Height,Looking For,Children,Education Level,Occupation,Swiping History,Frequency of Usage,cooking,hiking,movies,music,reading,sports,travel
0,30,0,5.240385,Casual Dating,0,HS,Student,96,Weekly,0.0,0.0,0.0,0.0,1.0,1.0,0.0
1,27,1,4.937625,Friendship,1,MSc,Artist,96,Monthly,0.0,0.0,0.0,0.0,0.0,1.0,0.0
2,29,1,5.806296,Casual Dating,0,BSc,Social Media Influencer,64,Daily,0.0,0.0,0.0,0.0,1.0,0.0,0.0
3,29,1,5.101402,Marriage,0,PhD,Doctor,67,Daily,0.0,1.0,1.0,1.0,1.0,1.0,0.0
4,32,0,5.98667,Long-term Relationship,1,PhD,Engineer,93,Monthly,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [14]:
import json

# Only rename columns for the users DataFrame before exporting users.json
users_df = df.rename(columns={
    'Age': 'age',
    'Gender': 'gender',
    'Looking For': 'looking_for',
    'Swiping History': 'swiping_history',
    'Frequency of Usage': 'usage_frequency',
    'Education Level': 'education_level',
    'Occupation': 'occupation',
    'Children': 'children'
})

# Convert to list of dicts
users = users_df.to_dict(orient='records')

# Save to JSON file in the web-app/public/data directory

with open('../web-app/public/data/users.json', 'w') as f:
    json.dump(users, f, indent=2)

In [5]:
# List of interest columns (one-hot encoded)
interest_columns = ["cooking", "hiking", "movies", "music", "reading", "sports", "travel"]

# Save to JSON file
import json
with open('../web-app/src/data/interests.json', 'w') as f:
    json.dump(interest_columns, f, indent=2)

In [10]:
import json
import pandas as pd

# Age distribution (example bins: 18-22, 23-27, 28-32, 33-35)
if 'Age' in df.columns:
    age_bins = [18, 23, 28, 33, 36]
    age_labels = ['18-22', '23-27', '28-32', '33-35']
    df['age_group'] = pd.cut(df['Age'], bins=age_bins, labels=age_labels, right=False)
    age_distribution = df['age_group'].value_counts().sort_index().to_dict()
else:
    age_distribution = {}

# Gender counts (try both possible encodings)
if 'Gender' in df.columns:
    gender_counts = df['Gender'].value_counts().to_dict()
else:
    gender_counts = {}

# Relationship goals counts
goal_col = None
for col in ['Looking For', 'relationship_goal', 'Relationship Goal']:
    if col in df.columns:
        goal_col = col
        break

if goal_col:
    relationship_goals = df[goal_col].value_counts().to_dict()
else:
    relationship_goals = {}


# Usage patterns computation
usage_patterns = {}
if 'Frequency of Usage' in df.columns and 'Swiping History' in df.columns and 'Gender' in df.columns:
    for freq in df['Frequency of Usage'].unique():
        group = df[df['Frequency of Usage'] == freq]
        usage_patterns[freq] = {
            "avgSwipes": group['Swiping History'].mean(),
            "count": int(len(group)),
            "byGender": {
                "male": int(len(group[group['Gender'] == 'Male'])),
                "female": int(len(group[group['Gender'] == 'Female']))
            }
        }

# Combine into one dictionary
aggregates = {
    'age_distribution': age_distribution,
    'gender_counts': gender_counts,
    'relationship_goals': relationship_goals,
    'usage_patterns': usage_patterns
}

# Print for debugging
print('Aggregates preview:', json.dumps(aggregates, indent=2))

# Save to JSON file
with open('../web-app/public/data/aggregates.json', 'w') as f:
    json.dump(aggregates, f, indent=2)

Aggregates preview: {
  "age_distribution": {
    "18-22": 116,
    "23-27": 146,
    "28-32": 143,
    "33-35": 95
  },
  "gender_counts": {
    "0": 251,
    "1": 249
  },
  "relationship_goals": {
    "Casual Dating": 137,
    "Long-term Relationship": 137,
    "Friendship": 114,
    "Marriage": 112
  },
  "usage_patterns": {
    "Weekly": {
      "avgSwipes": 48.517857142857146,
      "count": 168,
      "byGender": {
        "male": 0,
        "female": 0
      }
    },
    "Monthly": {
      "avgSwipes": 50.846666666666664,
      "count": 150,
      "byGender": {
        "male": 0,
        "female": 0
      }
    },
    "Daily": {
      "avgSwipes": 52.37362637362637,
      "count": 182,
      "byGender": {
        "male": 0,
        "female": 0
      }
    }
  }
}


In [7]:
import json

# Example statements (customize and add more as needed)
game_statements = [
    {
        "statement": "What is the most common relationship goal among users?",
        "answer": max(aggregates["relationship_goals"], key=aggregates["relationship_goals"].get),
        "explanation": f"The most common relationship goal is {max(aggregates['relationship_goals'], key=aggregates['relationship_goals'].get)} with {max(aggregates['relationship_goals'].values())} users."
    },
    {
        "statement": "Which age group has the highest number of users?",
        "answer": max(aggregates["age_distribution"], key=aggregates["age_distribution"].get),
        "explanation": f"The age group {max(aggregates['age_distribution'], key=aggregates['age_distribution'].get)} has the highest number of users: {max(aggregates['age_distribution'].values())}."
    },
    {
        "statement": "What percentage of users are female?",
        "answer": round(100 * aggregates["gender_counts"].get("Female", 0) / sum(aggregates["gender_counts"].values()), 1),
        "explanation": f"{round(100 * aggregates['gender_counts'].get('Female', 0) / sum(aggregates['gender_counts'].values()), 1)}% of users are female."
    }
    # Add more statements as you wish!
]

# Save to JSON file
with open('../web-app/src/data/game_statements.json', 'w') as f:
    json.dump(game_statements, f, indent=2)

In [13]:
import json
import itertools

# If you have one-hot columns for interests:
interest_columns = ["cooking", "hiking", "movies", "music", "reading", "sports", "travel"]

# Build nodes
nodes = []
for interest in interest_columns:
    value = int(df[interest].sum())
    nodes.append({"id": interest, "value": value})

# Build links
links = []
for interest1, interest2 in itertools.combinations(interest_columns, 2):
    # Count users who have both interests
    both = int(((df[interest1] == 1) & (df[interest2] == 1)).sum())
    if both > 0:  # Only include links with at least one shared user
        links.append({"source": interest1, "target": interest2, "value": both})


# Combine into network
network = {
    "nodes": nodes,
    "links": links
}

# Save to JSON file
with open('../web-app/public/data/interest_network.json', 'w') as f:
    json.dump(network, f, indent=2)

print("Interest network saved to ../web-app/public/data/interest_network.json")

Interest network saved to ../web-app/public/data/interest_network.json
