# Synthetic Data Generation

### Step 1: MCQ Synthetic Data Generation Using SDV
### Step 2: MRQ Synthetic Data Generation Using Frequency-Based Sampling
### Step 3: Free-Response Question Mapping

In [2]:
import os

import numpy as np
import pandas as pd
from sdv.metadata import SingleTableMetadata
from sdv.single_table import GaussianCopulaSynthesizer

In [33]:
# Tunable settings for this cell.
SEED = 42                 # seed for NumPy's global RNG (used by np.random.choice below)
N_SYNTHETIC_ROWS = 150    # number of synthetic survey rows to generate

# Load survey data and Tripadvisor data
survey_data = pd.read_csv('../data/survey_data_cleaned.csv')
tripadvisor_data = pd.read_csv('../data/tripadvisor_data_cleaned.csv')

# Create metadata for the survey dataset
metadata = SingleTableMetadata()
metadata.detect_from_dataframe(survey_data)

# Create the GaussianCopulaSynthesizer and train it on the real survey data
synthesizer = GaussianCopulaSynthesizer(metadata)
synthesizer.fit(survey_data)

# Generate N_SYNTHETIC_ROWS synthetic records
synthetic_data = synthesizer.sample(N_SYNTHETIC_ROWS)

# Seed NumPy's global RNG so the frequency-based MRQ sampling and the comment
# mapping below draw the same values on every Restart & Run All.
# NOTE(review): this seeds NumPy only; whether the SDV sample above is itself
# reproducible across runs is not established by this cell.
np.random.seed(SEED)

# For MRQ (multi-answer) columns, apply frequency-based sampling: each synthetic
# value is drawn from the observed answer combinations, weighted by how often
# each combination occurs in the real survey. value_counts() ignores missing
# answers, so the weights are relative to non-missing responses only.
mrq_columns = [
    'what are your preferred attraction(s)? select all that apply.',
    'which attraction(s) did you visit? (select all that apply)\n* includes meet-and-greets, theatre shows etc.',
    'which store(s) did you visit? (select all that apply)',
    'if your answer was "yes" above, which food outlet(s) did you visit? (select all that apply)',
    'which aspect of your visit was the most disappointing? (select all that apply)'
]

for column in mrq_columns:
    value_counts = survey_data[column].value_counts(normalize=True)
    synthetic_data[column] = np.random.choice(value_counts.index, len(synthetic_data), p=value_counts.values)

# Match TripAdvisor reviews based on the overall satisfaction rating:
# one pass over the reviews builds {rating: [review text, ...]}.
rating_reviews = (
    tripadvisor_data
    .groupby('Rating', sort=False)['Review Text Cleaned']
    .apply(list)
    .to_dict()
)

# Populate 'additional_comments' with a random review of the same rating;
# ratings that have no TripAdvisor reviews get a fixed placeholder.
synthetic_data['additional_comments'] = synthetic_data['overall_satisfaction'].apply(
    lambda rating: np.random.choice(rating_reviews.get(rating, ["No comments available"]))
)

# Add a column to indicate whether the row is real or synthetic
survey_data['source'] = 'real'
synthetic_data['source'] = 'synthetic'

# Concatenate the real data and synthetic data
final_synthetic_data = pd.concat([survey_data, synthetic_data], ignore_index=True)



### Verify the mapping of additional comments based on overall satisfaction

In [34]:
def verify_mapping(synthetic_data, tripadvisor_data):
    """Check that each synthetic comment is a TripAdvisor review of the same rating.

    Parameters
    ----------
    synthetic_data : pandas.DataFrame
        Must contain the columns 'overall_satisfaction' and 'additional_comments'.
    tripadvisor_data : pandas.DataFrame
        Must contain the columns 'Rating' and 'Review Text Cleaned'.

    Returns
    -------
    list of tuple
        One ``(row index, satisfaction rating, comment)`` tuple per row whose
        comment is neither a review with a matching rating nor the
        "No comments available" placeholder. Empty when every row is valid.
    """
    # One pass over the reviews; sets give O(1) membership checks per row.
    reviews_by_rating = {
        rating: set(reviews)
        for rating, reviews in tripadvisor_data.groupby('Rating', sort=False)['Review Text Cleaned']
    }
    no_reviews = frozenset()

    errors = []
    rows = zip(
        synthetic_data.index,
        synthetic_data['overall_satisfaction'],
        synthetic_data['additional_comments'],
    )
    for idx, satisfaction_rating, comment in rows:
        # The placeholder is always acceptable, whatever the rating.
        if comment == "No comments available":
            continue
        if comment not in reviews_by_rating.get(satisfaction_rating, no_reviews):
            errors.append((idx, satisfaction_rating, comment))

    return errors

# Run the check against the synthetic rows and the TripAdvisor reviews
errors = verify_mapping(synthetic_data, tripadvisor_data)

# Report the outcome: one success line, or a count followed by one line per mismatch
if not errors:
    print("All additional comments are correctly mapped based on overall satisfaction.")
else:
    print(f"There are {len(errors)} errors in the mapping. Here are the incorrect mappings:")
    for row_idx, rating, bad_comment in errors:
        print(f"Row: {row_idx}, Satisfaction Rating: {rating}, Incorrect Comment: {bad_comment}")


All additional comments are correctly mapped based on overall satisfaction.


### Step 4: Average Attraction Wait Time Assignment Using Real Wait Time Data

In [35]:
# Load daily average wait time data.
# The file location can be overridden with the WAIT_TIME_CSV environment variable
# so the notebook is not tied to a single machine; when the variable is unset the
# original absolute path is used unchanged.
wait_time_csv_path = os.environ.get(
    'WAIT_TIME_CSV',
    '/Users/liyuan/DSA3101/Final Synthetic Data Generation/daily_avg_wait_time_df.csv',
)
daily_avg_wait_time_df = pd.read_csv(wait_time_csv_path)

# Group the daily average wait time data by wait_time_rating:
# {wait_time_rating: [avg_wait_time_day, ...]}
wait_time_by_rating = daily_avg_wait_time_df.groupby('wait_time_rating')['avg_wait_time_day'].apply(list).to_dict()

# Function to assign random wait time based on attraction_wait_time rating
def assign_wait_time(attraction_wait_time):
    """Pick a random wait time recorded for the given wait-time rating.

    Looks the rating up in the module-level ``wait_time_by_rating`` dict and
    returns one of its values chosen with ``np.random.choice``. Returns the
    string 'No data' when the rating is absent or has no recorded values.
    """
    candidates = wait_time_by_rating.get(attraction_wait_time, [])

    # Nothing recorded for this rating: fall back to the sentinel.
    if not candidates:
        return 'No data'

    return np.random.choice(candidates)

# Draw one average wait time per row (real and synthetic alike) from the
# row's 'attraction_wait_time' rating
wait_time_ratings = final_synthetic_data['attraction_wait_time']
final_synthetic_data['avg_attraction_wait_time'] = wait_time_ratings.apply(assign_wait_time)

# Preview the first rows, now including the new column
final_synthetic_data.head()

Unnamed: 0,age,gender,occupation,visitor_profile,ticket_purchase_method,ticket_type,special_event_ticket,ticket_purchase_ease,visitor_type,what are your preferred attraction(s)? select all that apply.,...,park_cleanliness,park_seating_availability,restroom_accessibility,overall_satisfaction,which aspect of your visit was the most disappointing? (select all that apply),return_likelihood,recommendation_likelihood,additional_comments,source,avg_attraction_wait_time
0,18 - 25,Male,Student,Singaporean,"Third Party Vendor (e.g. Klook, Trip.com)",Regular ticket with Express Pass,No,4,Group,"Thrill Rides, Shows",...,4,Yes,4,4,"Long wait times, Overcrowded attractions",Maybe,Yes,install water cooler given singapore humid cli...,real,21.0
1,18 - 25,Female,Employed,Singaporean,Resort World Sentosa Website,One-day regular ticket,No,4,Group,"Thrill Rides, Special Events (e.g. Halloween H...",...,5,Yes,5,5,,Yes,Yes,everything gd,real,10.0
2,18 - 25,Female,Student,Singaporean,"Third Party Vendor (e.g. Klook, Trip.com)",One-day regular ticket,No,5,Group,Thrill Rides,...,3,No,2,4,"Long wait times, Poor weather conditions, Over...",Yes,Yes,shorter waiting time bench shade around park b...,real,34.0
3,18 - 25,Male,Student,Singaporean,"Third Party Vendor (e.g. Klook, Trip.com)",One-day regular ticket,Yes,3,Group,"Thrill Rides, Shows",...,4,Yes,4,4,Long wait times,Yes,Yes,shorter waiting time,real,40.0
4,18 - 25,Male,Student,Singaporean,Resort World Sentosa Website,One-day regular ticket,No,5,Group,"Thrill Rides, Shows",...,4,No,4,3,"Long wait times, Overcrowded attractions",Maybe,Yes,maybe put aircon waitingqueuing area sheltered...,real,16.0


In [36]:
# Save to csv
# final_synthetic_data.to_csv('final_synthetic_data.csv', index=False)