In [31]:
import pandas as pd
import numpy as np
from faker import Faker
from random import choice, choices, randrange
from joblib import Parallel, delayed
from unidecode import unidecode

In [32]:
from src import BOOK_GenBehaviour as bkbeh
from src import BOOK_Grouping as bkgroup
from src import BOOK_Trip as bktrip
from src import FLY_Booking as fb

In [33]:
df_city = pd.read_csv('data/geoCrosswalk/GeoCrossWalkMed.csv')
df_flight = pd.read_csv('data/flightData/EU_flight_new.csv')
df_hubs = pd.read_csv('data/geoCrosswalk/ReverseHubsV2.csv')
route = pd.read_csv('data/flightData/route_all.csv')
crosswalk = pd.read_csv('data/geoCrosswalk/GeoCrossWalkMed.csv')
bus_stay_day = pd.read_csv('data/business_stay.csv')['bus_stay_day'].tolist()
bus_stay_weight = pd.read_csv('data/business_stay.csv')['bus_stay_weight'].tolist()
vac_stay_day = pd.read_csv('data/vacation_stay.csv')['vac_stay_day'].tolist()
vac_stay_weight = pd.read_csv('data/vacation_stay.csv')['vac_stay_weight'].tolist()
personas = pd.read_csv('data/personas.csv')['personas'].tolist()
weight = pd.read_csv('data/personas.csv')['weight'].tolist()
agencies = pd.read_csv('data/agencies.csv')['agencies'].tolist()
agency_weight = pd.read_csv('data/agencies.csv')['agency_weight'].tolist()

In [34]:
def safe_locale_gen(locale):
    try:
        return Faker(locale)
    except AttributeError:
        return Faker('en')

def generate_households(num_households, df_city):
    """
    Generate synthetic household data for flight passenger simulation.
    
    Parameters:
    - num_households: int, number of households to generate.
    - df_city: DataFrame, contains city data including available languages.
    - available_langs: list, languages available for selection.
    
    Returns:
    - DataFrame with household information.
    """

    # HH_num	GenderHOH	AgeHOH	SizeHH	HHID	HH_ISO	HHType	Lang	Lang_P	Surname	Address	PostCode	Country	NationalityLP	NationalityNat
    households = []
    available_langs = df_city['Lang'].tolist()
    for i in range(num_households):
        HH_ISO = choice(df_city['HH_ISO'].tolist())
        HHID = f"POI_{i}"
        gender = choice(["M", "F"])
        HHType = choice(["T1", "T2"])
        age = randrange(5, 17) if HHType == 'T1' else randrange(28, 56)
        sizeHH = 1 if HHType == 'T1' else 2
        
        lang = choice(available_langs) if np.random.random() < 0.2 else 'en'  # Simplified language logic
        faker_gen = safe_locale_gen(lang)
        
        # surname = faker_gen.last_name()
        address = faker_gen.address()
        postcode = faker_gen.postcode()
        country = faker_gen.country()
        payment_vendor = faker_gen.credit_card_provider()
        payment_expiry = faker_gen.credit_card_expire(start="now", end="+10y", date_format="%d/%m/%y")
        payment_number = faker_gen.credit_card_number(card_type=None)
        
        households.append([i, gender, age, sizeHH, HHID, HH_ISO, HHType, lang, address, postcode, country, payment_vendor, payment_expiry, payment_number])
    
    columns = ['HH_num', 'GenderHOH', 'AgeHOH', 'SizeHH', 'HHID', 'HH_ISO', 'HHType', 'Lang', 'Address', 'PostCode', 'Country', 'PaymentInfo_VendorCode', 'PaymentInfo_ExpiryDate', 'PaymentInfo_AccountNbr']
    return pd.DataFrame(households, columns=columns)

In [35]:
from datetime import datetime, timedelta

def generate_dob(age):
    """
    Generate a Date of Birth for the given age.
    """
    today = datetime.today()
    start_of_year = datetime(today.year - age, 1, 1)
    end_of_year = datetime(today.year - age, 12, 31)
    random_days = timedelta(days=(end_of_year - start_of_year).days * np.random.random())
    dob = start_of_year + random_days
    return dob.strftime('%d/%m/%Y')


def generate_typ_names(faker_gen, doc_first_name, doc_surname):
    """
    Generate typology names with a 20% chance of being different from the document names.
    """
    if np.random.rand() < 0.5:  # 20% chance
        typ_first_name = faker_gen.first_name()
        typ_surname = faker_gen.last_name()
    else:
        typ_first_name = doc_first_name
        typ_surname = doc_surname
    return typ_first_name, typ_surname


def populate_passengers(household_row, df_city):
    """
    Populate passengers for a given household, ensuring diversity and generating comprehensive passenger attributes.
    Now includes a 20% chance for typology names to differ from document names.
    
    Parameters:
    - household_row: Series, a row from the household DataFrame.
    - df_city: DataFrame, contains city data for determining locales.
    
    Returns:
    - List of comprehensive passenger data for the household.
    """
    passengers = []
    HHID = household_row['HHID']
    HHType = household_row['HHType']
    base_age = household_row['AgeHOH']
    lang = household_row['Lang']
    payment_vendor = household_row['PaymentInfo_VendorCode']
    payment_expiry = household_row['PaymentInfo_ExpiryDate']
    payment_number = household_row['PaymentInfo_AccountNbr']    
    faker_gen = safe_locale_gen(lang)
    print(household_row['SizeHH'])
    for j in range(household_row['SizeHH']):
        
        P_num = j
        P_ID = f"{HHID}_{j+1}"
        print("HHID:", HHID, "j:", j, "P_ID:", P_ID, "HHType:", HHType) 
        if HHType == 'T1' or j == 0:
            age = base_age
            gender = household_row['GenderHOH']
        else:
            age = randrange(5, 17)
            gender = choices(['M', 'F'], weights=[0.3, 0.7])[0]

        first_name = faker_gen.first_name_male() if gender == 'M' else faker_gen.first_name_female()
        surname = faker_gen.last_name()
        dob = generate_dob(age)
        free_email = faker_gen.free_email()
        # payment_vendor = faker_gen.credit_card_provider()
        # payment_expiry = faker_gen.credit_card_expire(start="now", end="+10y", date_format="%d/%m/%y")
        # payment_number = faker_gen.credit_card_number(card_type=None)
        work_email = faker_gen.company_email()
        docs_expiry = (datetime.today() + timedelta(days=365 * 10)).strftime('%Y-%m-%d')  # Assuming 10 years from now
        
        doc_first_name = unidecode(first_name)
        doc_surname = unidecode(surname)  # Document names are the real names
        typ_first_name, typ_surname = generate_typ_names(faker_gen, doc_first_name, doc_surname)  # Generate TYP names
        NationalityNat = choice(df_city['HH_ISO'].tolist())


        passenger = [
            P_num, HHID, P_ID, age, 
            f"{5 * (age // 5)}-{5 * (age // 5) + 4}" if age < 100 else "100+",  # AgeRange
            f"AGE{age // 5 + 1}" if age < 100 else "AGE21",  # AgeGroup
            gender, household_row['GenderHOH'], base_age, household_row['SizeHH'],
            household_row['HH_ISO'], HHType, lang, surname, household_row['Address'],
            household_row['PostCode'], household_row['Country'], first_name, dob, free_email,
            payment_vendor, payment_expiry, payment_number, work_email, docs_expiry,
            doc_first_name, doc_surname, typ_first_name, typ_surname, NationalityNat
            ]
    # No placeholder for P_num is added here
        passengers.append(passenger)
    
    return passengers

def generate_passenger_data(df_HH, df_city):
    """
    Generate comprehensive passenger data for all households.
    
    Parameters:
    - df_HH: DataFrame, household data.
    - df_city: DataFrame, city data for locale information.
    
    Returns:
    - DataFrame with comprehensive passenger data.
    """
    # passenger_data = Parallel(n_jobs=-1)(delayed(populate_passengers)(row, df_city) for index, row in df_HH.iterrows())
    # passenger_data = [p for sublist in passenger_data for p in sublist]  # Flatten the list of lists

    passenger_data = []
    for index, row in df_HH.iterrows():
        passengers = populate_passengers(row, df_city)
        passenger_data.extend(passengers) 
    
    columns = [
        'P_num', 'HHID', 'P_ID', 'P_AGE', 'AgeRange', 'AgeGroup', 'P_Gender', 'GenderHOH', 'AgeHOH', 'SizeHH', 'HH_ISO', 'HHType', 'Lang',
        'Surname', 'Address', 'PostCode', 'Country', 'FirstName', 'DOB', 'FreeEmail',
        'PaymentInfo_VendorCode', 'PaymentInfo_ExpiryDate', 'PaymentInfo_AccountNbr',
        'WorkEmail', 'DOCS_ExpiryDate', 'DOC_FirstName', 'DOC_Surname', 'TYP_FirstName',
        'TYP_Surname', 'NationalityNat'
    ]
    df_passengers = pd.DataFrame(passenger_data, columns=columns)
    
    return df_passengers

def finalize_data(df):
    """
    Finalize the DataFrame by adding P_num and ensuring the correct column order.
    """
    df.insert(0, 'P_num', range(1, len(df) + 1))  # Insert P_num at the beginning
    column_order = [
        'P_num', 'HHID', 'P_ID', 'P_AGE', 'AgeRange', 'AgeGroup', 'P_GENDER', 'GenderHOH', 'AgeHOH', 'SizeHH', 'HH_ISO', 'HHType', 'Lang',
        'Surname', 'Address', 'PostCode', 'Country', 'FirstName', 'DOB', 'FreeEmail', 'PaymentInfo_VendorCode', 'PaymentInfo_ExpiryDate',
        'PaymentInfo_AccountNbr', 'WorkEmail', 'DOCS_ExpiryDate', 'DOC_FirstName', 'DOC_Surname', 'TYP_FirstName', 'TYP_Surname'
    ]
    return df[column_order]

# Note: Ensure that df_city and df_HH are properly set up before calling generate_passenger_data.


In [36]:
def introduce_typos(text, typo_rate):
    typo_text = list(text)
    for i in range(len(typo_text)-1):
        if np.random() < typo_rate:
            # Introduce a typo (e.g., swap with the next character)
            typo_text[i], typo_text[i+1] = typo_text[i+1], typo_text[i]
    return ''.join(typo_text)

def docIDs(row):

    data={}
    random_number = np.random()
    if random_number > 0.005:
        data['TYP_FirstName'] = row['DOC_FirstName']
        data['TYP_Surname'] = row['DOC_Surname']
        # data['TYP_DOB'] = str(row['DOB'])
    else:
        data['TYP_FirstName'] = introduce_typos(row['DOC_FirstName'], typo_rate=0.2)
        data['TYP_Surname'] = introduce_typos(row['DOC_Surname'], typo_rate=0.2)
        # data['TYP_DOB'] = introduce_dob_typos2(str(row['DOB']), typo_rate=0.2)
    
    return data

In [37]:
num_HH = 20
df_HH = generate_households(num_HH, df_city)
df_passengers = generate_passenger_data(df_HH, df_city)
# df_passengers_final = finalize_data(df_passengers)

2
HHID: POI_0 j: 0 P_ID: POI_0_1 HHType: T2
HHID: POI_0 j: 1 P_ID: POI_0_2 HHType: T2
2
HHID: POI_1 j: 0 P_ID: POI_1_1 HHType: T2
HHID: POI_1 j: 1 P_ID: POI_1_2 HHType: T2
2
HHID: POI_2 j: 0 P_ID: POI_2_1 HHType: T2
HHID: POI_2 j: 1 P_ID: POI_2_2 HHType: T2
1
HHID: POI_3 j: 0 P_ID: POI_3_1 HHType: T1
2
HHID: POI_4 j: 0 P_ID: POI_4_1 HHType: T2
HHID: POI_4 j: 1 P_ID: POI_4_2 HHType: T2
1
HHID: POI_5 j: 0 P_ID: POI_5_1 HHType: T1
2
HHID: POI_6 j: 0 P_ID: POI_6_1 HHType: T2
HHID: POI_6 j: 1 P_ID: POI_6_2 HHType: T2
1
HHID: POI_7 j: 0 P_ID: POI_7_1 HHType: T1
1
HHID: POI_8 j: 0 P_ID: POI_8_1 HHType: T1
2
HHID: POI_9 j: 0 P_ID: POI_9_1 HHType: T2
HHID: POI_9 j: 1 P_ID: POI_9_2 HHType: T2
1
HHID: POI_10 j: 0 P_ID: POI_10_1 HHType: T1
1
HHID: POI_11 j: 0 P_ID: POI_11_1 HHType: T1
1
HHID: POI_12 j: 0 P_ID: POI_12_1 HHType: T1
1
HHID: POI_13 j: 0 P_ID: POI_13_1 HHType: T1
1
HHID: POI_14 j: 0 P_ID: POI_14_1 HHType: T1
2
HHID: POI_15 j: 0 P_ID: POI_15_1 HHType: T2
HHID: POI_15 j: 1 P_ID: POI_15_2

In [38]:
df_city, df_HH = bktrip.original_city_assign_init(df_HH, df_flight, df_hubs)
df_behaviour, df_behaviour_complete = bkbeh.generate_behaviour(df_HH, df_flight, ['SOI'], [1], 3, crosswalk)
df_group = bkgroup.grouping_init(df_behaviour_complete, agencies, agency_weight, route, bus_stay_day, bus_stay_weight, vac_stay_day, vac_stay_weight)

df_HH.shape: (20, 16)


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  merged['ISO_Travel'].fillna(merged['HH_ISO'], inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_HH['IATA_O'].fillna(pd.Series(np.random.choice(['LHR', 'CDG', 'IST', 'DXB', 'AUH'], size=len(df_HH))), inplace=True)


df_behaviour_shape:  (20, 2)
df_behaviour_shape:  (20, 3)
df_behaviour_shape:  (20, 4)
df_sizeHH: 0     2
1     2
2     2
3     1
4     2
5     1
6     2
7     1
8     1
9     2
10    1
11    1
12    1
13    1
14    1
15    2
16    2
17    2
18    1
19    2
Name: SizeHH, dtype: int64
df_sizeHH: 0     2
1     2
2     2
3     1
4     2
5     1
6     2
7     1
8     1
9     2
10    1
11    1
12    1
13    1
14    1
15    2
16    2
17    2
18    1
19    2
Name: SizeHH, dtype: int64
df_sizeHH: 0     2
1     2
2     2
3     1
4     2
5     1
6     2
7     1
8     1
9     2
10    1
11    1
12    1
13    1
14    1
15    2
16    2
17    2
18    1
19    2
Name: SizeHH, dtype: int64
done business group
done leisure
done SOI
0    IST-SEN-SOI-ID0
1    IST-DUB-SOI-ID0
2    DXB-CDG-SOI-ID0
3    DXB-LIS-SOI-ID0
4    IST-CDG-SOI-ID0
Name: init_id, dtype: object
Index(['init_id', 'list_of_passengers', 'route', 'num_in_party',
       'BookingAgency', 'BookingDay', 'Unnamed: 0', 'IATA_O', 'IATA_D',
      

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[f'Leg_fixRoute_{i}'].fillna(0, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[f'Leg_fixRoute_{i}'].fillna(0, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting valu

In [40]:
df_group.to_csv('data/synthesizedData/group_soi.csv', index=False)