# Match searches with bookings

### Prepare the data for processing

In [1]:
import pandas as pd

In [2]:
cd /home/gonzalo/Data/challenge/

/home/gonzalo/Data/challenge


In [3]:
pwd

'/home/gonzalo/Data/challenge'

In [None]:
# Stream the raw bookings export in chunks of 300000 rows and keep a running,
# de-duplicated frame in `all_chunks`.
bc = pd.read_csv('bookings.csv.bz2', sep='^', chunksize=300000, low_memory=False)

all_chunks = None

for i, chunk in enumerate(bc):
    # DataFrame.append was removed in pandas 2.0; pd.concat is the supported
    # replacement. Starting from None (instead of an empty frame) avoids
    # concatenating with an empty DataFrame, which newer pandas warns about.
    if all_chunks is None:
        all_chunks = chunk.drop_duplicates()
    else:
        all_chunks = pd.concat([all_chunks, chunk]).drop_duplicates()
    # Progress: chunk number and number of distinct rows kept so far.
    print(i, len(all_chunks))

# Preserve the original result for an input without rows: an empty DataFrame.
if all_chunks is None:
    all_chunks = pd.DataFrame()

In [14]:
# Write the de-duplicated rows held in `all_chunks` to a '^'-separated file,
# leaving the DataFrame index out of the output.
# NOTE(review): `all_chunks` is reused for bookings and searches - make sure it
# holds the bookings result when this cell runs.
all_chunks.to_csv(
    'booking_wo_duplicates.csv',
    sep='^',
    index=False,
)

In [21]:
# Stream the raw searches export in chunks of 300000 rows and keep a running,
# de-duplicated frame in `all_chunks`.
# NOTE(review): the recorded output of this cell ends with KeyboardInterrupt
# after chunk 21 - confirm the loop runs to completion before exporting.
sc = pd.read_csv('searches.csv.bz2', sep='^', chunksize=300000, low_memory=False)

all_chunks = None

for i, chunk in enumerate(sc):
    # DataFrame.append was removed in pandas 2.0; pd.concat is the supported
    # replacement. Starting from None (instead of an empty frame) avoids
    # concatenating with an empty DataFrame, which newer pandas warns about.
    if all_chunks is None:
        all_chunks = chunk.drop_duplicates()
    else:
        all_chunks = pd.concat([all_chunks, chunk]).drop_duplicates()
    # Progress: chunk number and number of distinct rows kept so far.
    print(i, len(all_chunks))

# Preserve the original result for an input without rows: an empty DataFrame.
if all_chunks is None:
    all_chunks = pd.DataFrame()

0 299999
1 358999
2 358999
3 358999
4 658992
5 718002
6 718002
7 718002
8 718002
9 718002
10 718002
11 718002
12 718002
13 718002
14 718002
15 718002
16 718002
17 718002
18 718002
19 718002
20 718002
21 718002


KeyboardInterrupt: 

In [6]:
# Write the de-duplicated rows held in `all_chunks` to a '^'-separated file,
# leaving the DataFrame index out of the output.
# NOTE(review): `all_chunks` is reused for bookings and searches - make sure it
# holds the searches result when this cell runs.
all_chunks.to_csv(
    'searches_wo_duplicates.csv',
    sep='^',
    index=False,
)

In [10]:
# Compress the exported bookings CSV with bzip2 (shell command); later cells
# read the resulting booking_wo_duplicates.csv.bz2.
! bzip2 booking_wo_duplicates.csv

In [10]:
# Compress the exported searches CSV with bzip2 (shell command); later cells
# read the resulting searches_wo_duplicates.csv.bz2.
! bzip2 searches_wo_duplicates.csv

In [14]:
# Grant read, write and execute permission on the compressed searches file to
# every user (shell command).
# NOTE(review): mode 777 is broader than a data file normally needs - confirm
# whether read access alone would be enough.
! chmod 777 searches_wo_duplicates.csv.bz2

In [15]:
% ll

total 1083116
-rwxr-x--- 1 gonzalo 554970628 mar 13  2018 [0m[01;32mbookings.csv.bz2[0m*
-rwxrwxrwx 1 gonzalo  56437955 ene 26 12:16 [01;32mbooking_wo_duplicates.csv.bz2[0m*
-rwxrwxrwx 1 gonzalo   4232732 ene 25 17:18 [01;32msample_bookings.csv[0m*
-rwxrwxrwx 1 gonzalo    244720 ene 25 17:19 [01;32msample_searches.csv.bz2[0m*
-rwxr-x--- 1 gonzalo 483188920 mar 13  2018 [01;32msearches.csv.bz2[0m*
-rwxrwxrwx 1 gonzalo  10013467 ene 26 12:29 [01;32msearches_wo_duplicates.csv.bz2[0m*
-rw-r--r-- 1 gonzalo       476 ene 25 21:23 top_airport.csv


In [4]:
# Load only the three booking columns used for matching. The creation-date
# column name is passed with its trailing spaces, exactly as requested from the file.
booking_columns = ['dep_port', 'arr_port', 'cre_date           ']
b = pd.read_csv(
    'booking_wo_duplicates.csv.bz2',
    sep='^',
    usecols=booking_columns,
)

In [5]:
# Load only the three search columns used for matching.
search_columns = ['Origin', 'Destination', 'Date']
s = pd.read_csv(
    'searches_wo_duplicates.csv.bz2',
    sep='^',
    usecols=search_columns,
)

### Make a plan

In [None]:
# bookings --> dep_port, arr_port, cre_date
# searches --> origin, destination, search_date

# 0 --> drop nan
# 1 --> check that there are no stray spaces in (note left unfinished in the original)

### Let's code

In [6]:
# Remove, in place, every row that contains a missing value - first from the
# searches frame, then from the bookings frame.
for frame in (s, b):
    frame.dropna(inplace=True)

In [7]:
# Trim surrounding whitespace from the booking column names, then from the
# values of both airport columns.
b.columns = b.columns.map(lambda name: name.strip())
for port_col in ('dep_port', 'arr_port'):
    b[port_col] = b[port_col].map(lambda code: code.strip())

In [4]:
# The code we have so far, gathered in a single cell.
b = pd.read_csv('booking_wo_duplicates.csv.bz2', sep='^', usecols=['dep_port', 'arr_port', 'cre_date           '])
s = pd.read_csv('searches_wo_duplicates.csv.bz2', sep='^', usecols=['Origin', 'Destination', 'Date'])
# reset_index() stores each search's row position in an 'index' column before
# rows with missing values are dropped.
s = s.reset_index().dropna()
# The bookings frame is not passed through dropna() here, so use the vectorised
# .str accessor: it leaves missing values as NaN instead of raising the way
# map(lambda x: x.strip()) does on a non-string value.
b.columns = b.columns.str.strip()
b['dep_port'] = b['dep_port'].str.strip()
b['arr_port'] = b['arr_port'].str.strip()
# Keep only the first 10 characters of the creation timestamp so it can be
# compared with the searches' Date column (e.g. '2013-01-01').
b['cre_date'] = b['cre_date'].str[0:10]

In [5]:
# Give every booking row a constant 1 so that summing per group counts bookings.
b = b.assign(Booked=1)

In [6]:
# Sum the remaining columns per (departure, arrival, creation date) and keep the
# grouping keys as ordinary columns.
booking_keys = ['dep_port', 'arr_port', 'cre_date']
b_gr = b.groupby(booking_keys, as_index=False).sum()

In [7]:
# Preview the first five grouped booking rows.
b_gr.head(n=5)

Unnamed: 0,dep_port,arr_port,cre_date,Booked
0,AAB,ROV,2013-05-29,2
1,AAB,ROV,2013-08-24,2
2,AAE,ALG,2013-01-07,1
3,AAE,ALG,2013-01-08,4
4,AAE,ALG,2013-01-09,2


In [8]:
# Preview the first five search rows.
s.head(n=5)

Unnamed: 0,index,Date,Origin,Destination
0,0,2013-01-01,TXL,AUH
1,1,2013-01-01,ATH,MIL
2,2,2013-01-01,ICT,SFO
3,3,2013-01-01,RNB,ARN
4,4,2013-01-01,OSL,MAD


In [10]:
# Left-join the grouped bookings onto the searches: every search row is kept and
# receives the booking columns when date, origin and destination all match.
search_keys = ['Date', 'Origin', 'Destination']
grouped_booking_keys = ['cre_date', 'dep_port', 'arr_port']
s_b = s.merge(
    b_gr,
    how='left',
    left_on=search_keys,
    right_on=grouped_booking_keys,
)

In [13]:
# Preview the first five searches that matched at least one booking.
has_booking = s_b['Booked'] > 0
s_b[has_booking].head()

Unnamed: 0,index,Date,Origin,Destination,dep_port,arr_port,cre_date,Booked
27,27,2013-01-01,RUH,JED,RUH,JED,2013-01-01,17.0
40,40,2013-01-01,DMM,MNL,DMM,MNL,2013-01-01,5.0
59,59,2013-01-01,ATL,MIA,ATL,MIA,2013-01-01,6.0
134,134,2013-01-01,MEL,SYD,MEL,SYD,2013-01-01,10.0
172,172,2013-01-01,BOM,JED,BOM,JED,2013-01-01,2.0


In [14]:
# The booking-side key columns are no longer needed after the merge; remove
# them in place.
redundant_columns = ['arr_port', 'dep_port', 'cre_date']
s_b.drop(columns=redundant_columns, inplace=True)

In [15]:
# Searches without a matching booking have NaN in 'Booked'; set them to 0.
# Assign the result back instead of calling fillna(inplace=True) on the column
# selection: the chained in-place form is deprecated and does not update the
# frame when pandas copy-on-write is enabled.
s_b['Booked'] = s_b['Booked'].fillna(value=0)

In [16]:
# Collapse the booking count into a 0/1 flag: 1 when the value is above zero,
# 0 otherwise.
s_b['Booked'] = s_b['Booked'].gt(0).astype('int64')

In [17]:
# What we have of the code so far, gathered in a single cell.
b = pd.read_csv('booking_wo_duplicates.csv.bz2', sep='^', usecols=['dep_port', 'arr_port', 'cre_date           '])
s = pd.read_csv('searches_wo_duplicates.csv.bz2', sep='^', usecols=['Origin', 'Destination', 'Date'])
# reset_index() stores each search's row position in an 'index' column before
# rows with missing values are dropped.
s = s.reset_index().dropna()
# Vectorised .str.strip() leaves missing values as NaN instead of raising the
# way map(lambda x: x.strip()) does on a non-string value.
b.columns = b.columns.str.strip()
b['dep_port'] = b['dep_port'].str.strip()
b['arr_port'] = b['arr_port'].str.strip()
# Keep only the first 10 characters of the creation timestamp so it can be
# compared with the searches' Date column.
b['cre_date'] = b['cre_date'].str[0:10]
b['Booked'] = 1
# Count bookings per (departure, arrival, creation date). This step was missing
# from the consolidated cell (a duplicated "b['Booked'] = 1" stood in its
# place), so `b_gr` silently came from an earlier cell's kernel state.
b_gr = b.groupby(['dep_port', 'arr_port', 'cre_date']).sum().reset_index()
# Left join: every search is kept; unmatched searches get NaN in 'Booked'.
s_b = s.merge(b_gr, how='left', left_on=['Date', 'Origin', 'Destination'], right_on=['cre_date', 'dep_port', 'arr_port'])
s_b = s_b.drop(['arr_port', 'dep_port', 'cre_date'], axis=1)
# Turn the booking count into a 0/1 flag. The result is assigned back rather
# than using the chained fillna(inplace=True), which is deprecated and does
# nothing under pandas copy-on-write.
s_b['Booked'] = s_b['Booked'].fillna(value=0)
s_b['Booked'] = s_b['Booked'].map(lambda x: 1 if x>0 else 0)

In [20]:
# Give the column created by reset_index() a name that says what it holds: the
# row position the search had in the searches file.
s_b = s_b.rename(columns={'index': 'old_index_from_file'})

In [25]:
# Show five randomly chosen rows as a spot check (no seed is set, so the rows
# differ between runs).
s_b.sample(n=5)

Unnamed: 0,old_index_from_file,Date,Origin,Destination,Booked
375643,375643,2013-08-08,PAR,TNR,0
292414,292414,2013-10-20,COK,DXB,0
203831,203831,2013-07-23,GRZ,BCN,0
78529,78529,2013-03-20,SNA,ORD,0
213022,213022,2013-08-02,MOW,DXB,0


In [24]:
# Load the de-duplicated searches again, this time with every column.
s_all = pd.read_csv(
    'searches_wo_duplicates.csv.bz2',
    sep='^',
    low_memory=False,
)

In [30]:
# Attach the flag to the full searches table by matching each row position in
# `s_all` with the position saved in 'old_index_from_file'. Both frames carry
# Date/Origin/Destination, so the merge suffixes them with _x/_y; the _y copies
# coming from `s_b` are dropped, leaving the _x names in the result.
duplicated_from_s_b = ['Date_y', 'Origin_y', 'Destination_y']
result = (
    s_all
    .merge(s_b, how='left', left_index=True, right_on=['old_index_from_file'])
    .drop(duplicated_from_s_b, axis=1)
)

In [31]:
# Show five randomly chosen rows of the final table as a spot check (no seed is
# set, so the rows differ between runs).
result.sample(n=5)

Unnamed: 0,Date_x,Time,TxnCode,OfficeID,Country,Origin_x,Destination_x,RoundTrip,NbSegments,Seg1Departure,...,Seg6Carrier,Seg6BookingCode,From,IsPublishedForNeg,IsFromInternet,IsFromVista,TerminalID,InternetOffice,old_index_from_file,Booked
416834,2013-12-13,18:54:40,CAL,314d2613218289efc5aec20eef9619f4,GB,MAN,DWC,1.0,2.0,MAN,...,,,1ASIWS,0,0,0,d41d8cd98f00b204e9800998ecf8427e,0,416835,0.0
260026,2013-09-18,15:43:47,MPT,094e5ecca99b7171f31b7ca5064c839d,CA,YEG,PHX,1.0,2.0,YEG,...,,,1ASIWS,0,0,0,d41d8cd98f00b204e9800998ecf8427e,0,260026,0.0
317648,2013-11-14,17:15:58,FXR,37d287d1689e7f9e40b77777c7bad712,BR,GRU,SYD,1.0,4.0,GRU,...,,,1ASI,0,0,0,81d810a3d92e768f3b1ffd7486968205,0,317648,0.0
81417,2013-03-23,20:13:51,FFP,c3b7373915e5bb351254e7b0f03bb774,BR,BSB,FOR,0.0,1.0,BSB,...,,,1ASIWS,0,0,0,d41d8cd98f00b204e9800998ecf8427e,QSB,81417,0.0
349710,2013-12-16,14:37:00,FQP,2113e4bdd2f83eb47b968332fcf7a1ed,UA,JFK,FCO,0.0,1.0,JFK,...,,,1ASI,0,0,0,d41d8cd98f00b204e9800998ecf8427e,0,349710,0.0


In [32]:
# Export the searches with their booking flag as a '^'-separated file, leaving
# the DataFrame index out of the output.
result.to_csv(
    'search_with_booking.csv',
    sep='^',
    index=False,
)

In [33]:
! cat search_with_booking.csv | head -10

Date_x^Time^TxnCode^OfficeID^Country^Origin_x^Destination_x^RoundTrip^NbSegments^Seg1Departure^Seg1Arrival^Seg1Date^Seg1Carrier^Seg1BookingCode^Seg2Departure^Seg2Arrival^Seg2Date^Seg2Carrier^Seg2BookingCode^Seg3Departure^Seg3Arrival^Seg3Date^Seg3Carrier^Seg3BookingCode^Seg4Departure^Seg4Arrival^Seg4Date^Seg4Carrier^Seg4BookingCode^Seg5Departure^Seg5Arrival^Seg5Date^Seg5Carrier^Seg5BookingCode^Seg6Departure^Seg6Arrival^Seg6Date^Seg6Carrier^Seg6BookingCode^From^IsPublishedForNeg^IsFromInternet^IsFromVista^TerminalID^InternetOffice^old_index_from_file^Booked
2013-01-01^20:25:57^MPT^624d8c3ac0b3a7ca03e3c167e0f48327^DE^TXL^AUH^1.0^2.0^TXL^AUH^2013-01-26^D2^^AUH^TXL^2013-02-02^D2^^^^^^^^^^^^^^^^^^^^^^1ASIWS^0^0^0^d41d8cd98f00b204e9800998ecf8427e^FRA^0^0.0
2013-01-01^10:15:33^MPT^b0af35b31588dc4ab06d5cf2986e8e02^MD^ATH^MIL^0.0^1.0^ATH^MIL^2013-01-04^^^^^^^^^^^^^^^^^^^^^^^^^^^^1ASIWS^0^0^0^d41d8cd98f00b204e9800998ecf8427e^KIV^1^0.0
2013-01-01^18:04:49^MPT^3561a60621de06ab1badc8ca55699ef3^US

### Final code

In [34]:
b = pd.read_csv('booking_wo_duplicates.csv.bz2', sep='^', usecols=['dep_port', 'arr_port', 'cre_date           '])
s = pd.read_csv('searches_wo_duplicates.csv.bz2', sep='^', usecols=['Origin', 'Destination', 'Date'])
s.reset_index(inplace=True)
s.dropna(inplace=True)
b.columns=b.columns.map(lambda x:x.strip())
b['dep_port'] = b['dep_port'].map(lambda x:x.strip())
b['arr_port'] = b['arr_port'].map(lambda x:x.strip())
b['cre_date'] = b['cre_date'].str[0:10]
b['Booked'] = 1
b['Booked'] = 1
s_b = s.merge(b_gr, how='left', left_on=['Date', 'Origin', 'Destination'], right_on=['cre_date', 'dep_port', 'arr_port'])
s_b.drop(['arr_port', 'dep_port', 'cre_date'], axis=1, inplace=True)
s_b['Booked'].fillna(value=0, inplace=True)
s_b['Booked'] = s_b['Booked'].map(lambda x: 1 if x>0 else 0)
s_b.rename(columns={'index':'old_index_from_file'}, inplace=True)
s_all = pd.read_csv('searches_wo_duplicates.csv.bz2', sep='^', low_memory=False)
result = s_all.merge(s_b, how='left', left_index=True, right_on=['old_index_from_file'])
result.drop(['Date_y', 'Origin_y', 'Destination_y'], inplace=True, axis=1)
result.to_csv('search_with_booking.csv', sep='^', index=False)
! bzip2 search_with_booking.csv

In [35]:
% ll

total 1094060
-rwxr-x--- 1 gonzalo 554970628 mar 13  2018 [0m[01;32mbookings.csv.bz2[0m*
-rwxrwxrwx 1 gonzalo  56437955 ene 26 12:16 [01;32mbooking_wo_duplicates.csv.bz2[0m*
-rwxrwxrwx 1 gonzalo   4232732 ene 25 17:18 [01;32msample_bookings.csv[0m*
-rwxrwxrwx 1 gonzalo    244720 ene 25 17:19 [01;32msample_searches.csv.bz2[0m*
-rwxr-x--- 1 gonzalo 483188920 mar 13  2018 [01;32msearches.csv.bz2[0m*
-rwxrwxrwx 1 gonzalo  10013467 ene 26 12:29 [01;32msearches_wo_duplicates.csv.bz2[0m*
-rw-r--r-- 1 gonzalo  11203614 ene 26 13:46 [01;31msearch_with_booking.csv.bz2[0m
-rw-r--r-- 1 gonzalo       476 ene 25 21:23 top_airport.csv
