# Data Preparation

## Important Libraries

In [1]:
import numpy as np
import pandas as pd

## Prepare the real data for the Deep Learning Model

##### Note: The file 'data.csv' is not included in the repository because it contains confidential information.

In [2]:
# Load the raw CSV export; the file is decoded as Latin-1.
dataframe = pd.read_csv(filepath_or_buffer='data.csv', encoding='latin-1')

In [3]:
# Keep only the first occurrence of every fully identical row.
dataframe = dataframe[~dataframe.duplicated(keep="first")]

In [4]:
# Remove the columns that are not needed for the rest of the preparation.
dataframe = dataframe.drop(
    columns=[
        "Pnr",
        "Booking_Time",
        "Language_Code",
        "CUSTOMER_ID",
        "Channel_type",
        "first_Ticketnumber_1stPax",
        "Rebooking",
        "Involuntary_Rebooking",
        "Penalty_amount",
        "Flight1",
        "Flight2",
        "Flight3",
        "Flight4",
        "Flight5",
        "Flight6",
        "Marketing_Carr",
        "Operational_Carr",
        "Dep_Date",
        "Orig_Airport2",
        "Dest_Airport2",
        "Voucher",
    ]
)

In [5]:
# Keep only the rows whose "Verzilverd" flag equals 1.
# NOTE(review): "Verzilverd" is Dutch for "redeemed / cashed in" - presumably a
# redeemed-voucher flag; confirm against the data dictionary.
# The explicit .copy() turns the filtered result into an independent frame, so
# the column assignments in the following cells do not raise
# SettingWithCopyWarning.
dataframe = dataframe[dataframe["Verzilverd"] == 1].copy()

In [6]:
# Convert the 'Booking_Date' column from text to pandas datetime values.
dataframe = dataframe.assign(
    Booking_Date=lambda frame: pd.to_datetime(frame["Booking_Date"])
)

In [7]:
# Replace missing 'Tot_Fare_Incl_Tax_Eur' values with 0.
dataframe = dataframe.assign(
    Tot_Fare_Incl_Tax_Eur=lambda frame: frame["Tot_Fare_Incl_Tax_Eur"].fillna(0)
)

In [8]:
# Build the title-cased full name ("FullNm") from the first and last name.
# Rows missing either part are removed first: casting a missing first name with
# astype(str) would otherwise produce the literal text "nan" and yield names
# such as "Nan Smith", which a null check on the combined column cannot catch.
# The .copy() keeps the column assignment below from writing into a slice.
dataframe = dataframe.dropna(subset=["FirstNm", "LastNm"]).copy()

# Both parts are cast to str so that non-string values concatenate cleanly.
dataframe["FullNm"] = (
    dataframe["FirstNm"].astype(str) + " " + dataframe["LastNm"].astype(str)
).str.title()

In [9]:
# Eliminate rows that have a missing value in any of these three columns.
dataframe = dataframe.dropna(
    subset=["Number_Of_Pax", "Nb_Of_Stops", "Nb_Days_Dep_Ret"]
)

In [10]:
# Narrow the frame down to the five columns used from here on.
dataframe = dataframe.loc[
    :, ["FullNm", "Email_Address", "Booking_Date", "Orig_Airport", "Dest_Airport"]
]

### Hide the data (replace real names and e-mail addresses with generated labels):

In [11]:
# Distinct full names, in order of first appearance.
name_list = pd.unique(dataframe["FullNm"])

In [12]:
# Number of distinct names.
len(name_list)

139889

In [13]:
# Start with an empty list of replacement labels for the names.
new_name = list()

In [14]:
# Build one placeholder label ("X0", "X1", ...) per distinct name.
# The list is rebuilt from scratch on every run, so re-executing this cell
# cannot append duplicate labels to an already-filled list (which would make
# the label list longer than name_list).
new_name = ['X' + str(i) for i in range(len(name_list))]

In [15]:
# Peek at the first generated label.
new_name[0]

'X0'

In [16]:
# Lookup table pairing each real name with its generated label.
new_dataframe_name = pd.DataFrame({"RealName": name_list, "FakeName": new_name})

In [17]:
# Distinct e-mail addresses, in order of first appearance.
mail_list = pd.unique(dataframe["Email_Address"])

In [18]:
# Number of distinct e-mail addresses.
len(mail_list)

1880

In [19]:
# One placeholder label ("M0", "M1", ...) per distinct e-mail address.
new_mail = ['M' + str(position) for position, _ in enumerate(mail_list)]

In [20]:
# Lookup table pairing each real e-mail address with its generated label.
new_dataframe_mail = pd.DataFrame({"RealMail": mail_list, "FakeMail": new_mail})

In [21]:
# Attach the generated name label to every row via a left join on the real name.
df_cd = dataframe.merge(
    new_dataframe_name,
    how="left",
    left_on="FullNm",
    right_on="RealName",
)

In [22]:
# Remove both columns that still hold the real name.
dataframe = df_cd.drop(columns=["FullNm", "RealName"])

In [23]:
# Attach the generated e-mail label via a left join on the real address,
# then remove both columns that still hold the real address.
df_cd = dataframe.merge(
    new_dataframe_mail,
    how="left",
    left_on="Email_Address",
    right_on="RealMail",
)
dataframe = df_cd.drop(columns=["Email_Address", "RealMail"])

In [24]:
# Put the remaining columns into their final order.
dataframe = dataframe.loc[
    :, ["Booking_Date", "FakeName", "FakeMail", "Orig_Airport", "Dest_Airport"]
]

## The dataset that will be used in the model:

In [25]:
# Show the first ten rows of the prepared frame.
dataframe.iloc[:10]

Unnamed: 0,Booking_Date,FakeName,FakeMail,Orig_Airport,Dest_Airport
0,2021-01-01,X0,M0,MPL,BGO
1,2021-01-01,X1,M1,MRS,CFR
2,2021-01-01,X2,M2,ORY,PTP
3,2021-01-01,X3,M3,MAN,AMS
4,2021-01-01,X4,M1,MUC,AMS
5,2021-01-01,X5,M4,LHR,YYZ
6,2021-01-01,X6,M4,CPH,AUA
7,2021-01-01,X7,M1,ORY,PGF
8,2021-01-01,X8,M4,CDG,TLS
9,2021-01-01,X9,M4,CDG,NTE


In [26]:
# Write the prepared frame to 'newdata.csv' without the row index.
dataframe.to_csv(path_or_buf='newdata.csv', index=False)