In [1]:
import pandas as pd
import glob

## Uploading Generated Syntatic Data

In [2]:
## Multiple User Preferences
csv_files = glob.glob(r"../synthetic_data_experiments/experiment_*.csv")

In [3]:
single_inputs = pd.read_csv(r"../synthetic_data_experiments/single_preferences_results.csv")

In [4]:
print(csv_files)

['../synthetic_data_experiments\\experiment_1.csv', '../synthetic_data_experiments\\experiment_2.csv', '../synthetic_data_experiments\\experiment_3.csv', '../synthetic_data_experiments\\experiment_4.csv', '../synthetic_data_experiments\\experiment_5.csv', '../synthetic_data_experiments\\experiment_6.csv', '../synthetic_data_experiments\\experiment_7.csv', '../synthetic_data_experiments\\experiment_8.csv', '../synthetic_data_experiments\\experiment_9.csv']


## Merging Multiple User Preference Datasets

In [5]:
dataframes = []
for file in csv_files:
    df = pd.read_csv(file)
    dataframes.append(df)
final_df = pd.concat(dataframes, ignore_index=True)

## EDA

In [6]:
final_df.head()

Unnamed: 0,Scenario,Categories,Cities,Tourism_Types,Precision@k,Preference_Coverage@k,Category_Diversity,Tourism_Type_Diversity
0,1,Mountain,"Beni Suef, Hurghada, Qena","Religious and Spiritual Attractions, Medical A...",0.714286,0.666667,0.607143,0.607143
1,2,Water Park,"Red Sea, Cairo, Sharm El Sheikh","Medical Attractions, Natural Attractions",1.0,1.0,0.714286,0.607143
2,3,"Fortress, Mosque","Qena, Sharqia, Beheira, Alexandria","Religious and Spiritual Attractions, Entertain...",1.0,0.777778,0.857143,0.619048
3,4,Theme Park,"Fayoum, Assiut, Aswan","Religious and Spiritual Attractions, Entertain...",1.0,0.285714,0.75,0.333333
4,5,Beach,"Sohag, Sharm El Sheikh, Cairo","Entertainment and Modern Attractions, Cultural...",1.0,0.833333,0.75,0.75


## Identifying Duplicate Scenarios 

In [7]:
final_df.duplicated().sum()

1

In [8]:
final_df = final_df.drop_duplicates()

In [9]:
single_inputs.head()

Unnamed: 0,Category,City,Tourism Type,Precision@k,Preference_Coverage@k,Category_Diversity,Tourism_Type_Diversity
0,Theme Park,,,0.5,1.0,0.75,0.0
1,Water Park,,,0.428571,1.0,0.714286,0.0
2,Zoo,,,0.285714,1.0,0.642857,0.0
3,Tower,,,0.428571,1.0,0.714286,0.0
4,Shopping,,,0.5,1.0,0.75,0.0


## Integrating Data from Multiple and Single Scenario Datasets

In [11]:
multiple_inputs = final_df.iloc[:,1:] 

In [12]:
single_inputs = single_inputs.rename(columns={
    'Category': 'Categories',
    'City': 'Cities',
    'Tourism Type': 'Tourism_Types'
})

single_inputs.columns == multiple_inputs.columns  

array([ True,  True,  True,  True,  True,  True,  True])

In [13]:
final_simulation =  pd.concat([multiple_inputs, single_inputs], ignore_index=True)

In [14]:
final_simulation

Unnamed: 0,Categories,Cities,Tourism_Types,Precision@k,Preference_Coverage@k,Category_Diversity,Tourism_Type_Diversity
0,Mountain,"Beni Suef, Hurghada, Qena","Religious and Spiritual Attractions, Medical A...",0.714286,0.666667,0.607143,0.607143
1,Water Park,"Red Sea, Cairo, Sharm El Sheikh","Medical Attractions, Natural Attractions",1.000000,1.000000,0.714286,0.607143
2,"Fortress, Mosque","Qena, Sharqia, Beheira, Alexandria","Religious and Spiritual Attractions, Entertain...",1.000000,0.777778,0.857143,0.619048
3,Theme Park,"Fayoum, Assiut, Aswan","Religious and Spiritual Attractions, Entertain...",1.000000,0.285714,0.750000,0.333333
4,Beach,"Sohag, Sharm El Sheikh, Cairo","Entertainment and Modern Attractions, Cultural...",1.000000,0.833333,0.750000,0.750000
...,...,...,...,...,...,...,...
500,,,Entertainment and Modern Attractions,0.857143,1.000000,0.000000,0.928571
501,,,Natural Attractions,0.785714,1.000000,0.000000,0.892857
502,,,Religious and Spiritual Attractions,0.642857,1.000000,0.000000,0.821429
503,,,Medical Attractions,0.642857,1.000000,0.000000,0.821429


## Null Value Handling in Multiple and Single Scenario Datasets 

In [15]:
columns_to_fill = ['Cities', 'Categories', 'Tourism_Types']
final_simulation[columns_to_fill] = final_simulation[columns_to_fill].fillna("No Input")

In [16]:
final_simulation

Unnamed: 0,Categories,Cities,Tourism_Types,Precision@k,Preference_Coverage@k,Category_Diversity,Tourism_Type_Diversity
0,Mountain,"Beni Suef, Hurghada, Qena","Religious and Spiritual Attractions, Medical A...",0.714286,0.666667,0.607143,0.607143
1,Water Park,"Red Sea, Cairo, Sharm El Sheikh","Medical Attractions, Natural Attractions",1.000000,1.000000,0.714286,0.607143
2,"Fortress, Mosque","Qena, Sharqia, Beheira, Alexandria","Religious and Spiritual Attractions, Entertain...",1.000000,0.777778,0.857143,0.619048
3,Theme Park,"Fayoum, Assiut, Aswan","Religious and Spiritual Attractions, Entertain...",1.000000,0.285714,0.750000,0.333333
4,Beach,"Sohag, Sharm El Sheikh, Cairo","Entertainment and Modern Attractions, Cultural...",1.000000,0.833333,0.750000,0.750000
...,...,...,...,...,...,...,...
500,No Input,No Input,Entertainment and Modern Attractions,0.857143,1.000000,0.000000,0.928571
501,No Input,No Input,Natural Attractions,0.785714,1.000000,0.000000,0.892857
502,No Input,No Input,Religious and Spiritual Attractions,0.642857,1.000000,0.000000,0.821429
503,No Input,No Input,Medical Attractions,0.642857,1.000000,0.000000,0.821429


## Extracting the Number of User Inputs for Each Preference Type For Analysing The CBRS Perfromance 

In [17]:
final_simulation['Number_of_Categories'] = final_simulation['Categories'].apply(
    lambda x: 0 if pd.isna(x) else x.count(',') + 1
)

final_simulation['Number_of_Cities'] = final_simulation['Cities'].apply(
    lambda x: 0 if pd.isna(x) else x.count(',') + 1
)

final_simulation['Number_of_Tourism_Types'] = final_simulation['Tourism_Types'].apply(
    lambda x: 0 if pd.isna(x) else x.count(',') + 1
)

In [18]:
final_simulation

Unnamed: 0,Categories,Cities,Tourism_Types,Precision@k,Preference_Coverage@k,Category_Diversity,Tourism_Type_Diversity,Number_of_Categories,Number_of_Cities,Number_of_Tourism_Types
0,Mountain,"Beni Suef, Hurghada, Qena","Religious and Spiritual Attractions, Medical A...",0.714286,0.666667,0.607143,0.607143,1,3,2
1,Water Park,"Red Sea, Cairo, Sharm El Sheikh","Medical Attractions, Natural Attractions",1.000000,1.000000,0.714286,0.607143,1,3,2
2,"Fortress, Mosque","Qena, Sharqia, Beheira, Alexandria","Religious and Spiritual Attractions, Entertain...",1.000000,0.777778,0.857143,0.619048,2,4,3
3,Theme Park,"Fayoum, Assiut, Aswan","Religious and Spiritual Attractions, Entertain...",1.000000,0.285714,0.750000,0.333333,1,3,3
4,Beach,"Sohag, Sharm El Sheikh, Cairo","Entertainment and Modern Attractions, Cultural...",1.000000,0.833333,0.750000,0.750000,1,3,2
...,...,...,...,...,...,...,...,...,...,...
500,No Input,No Input,Entertainment and Modern Attractions,0.857143,1.000000,0.000000,0.928571,1,1,1
501,No Input,No Input,Natural Attractions,0.785714,1.000000,0.000000,0.892857,1,1,1
502,No Input,No Input,Religious and Spiritual Attractions,0.642857,1.000000,0.000000,0.821429,1,1,1
503,No Input,No Input,Medical Attractions,0.642857,1.000000,0.000000,0.821429,1,1,1


In [19]:
final_simulation.to_excel('final_simulation_results_of_egyptopia_cbrs.xlsx', index=False)

## Applying Descriptive Analysis 

In [20]:
final_simulation.columns = [col.replace('@k', '') for col in final_simulation.columns]

In [21]:
final_simulation.describe()

Unnamed: 0,Precision,Preference_Coverage,Category_Diversity,Tourism_Type_Diversity,Number_of_Categories,Number_of_Cities,Number_of_Tourism_Types
count,505.0,505.0,505.0,505.0,505.0,505.0,505.0
mean,0.926025,0.852647,0.76615,0.621311,4.742574,3.061386,1.99802
std,0.192416,0.13578,0.205521,0.231746,2.376528,1.460524,0.793073
min,0.214286,0.25,0.0,0.0,1.0,1.0,1.0
25%,1.0,0.777778,0.75,0.607143,3.0,2.0,1.0
50%,1.0,0.875,0.807143,0.678571,5.0,3.0,2.0
75%,1.0,0.9375,0.880952,0.75,7.0,4.0,3.0
max,1.0,1.0,0.928571,1.0,9.0,7.0,3.0
