In [1]:
from IPython.display import Image
from IPython.core.display import HTML 

### SDD : 

An SME needs flight details for a given list of dates. She/he will use this data in a subsequent task, with the goal of reducing the price of business travel done by UBS workers.

Go to `www.google.com`. Once there, you should look for the list of dates given by the SME. 

In [2]:
# Render the remote image start.png inline at 600x600.
Image(
    url="https://raw.githubusercontent.com/edgar-hdz/workshop/master/start.png",
    width=600,
    height=600,
)

Search for all the dates for the given destination as indicated below. 

The data we are interested in is highlighted in yellow: the `Flight duration` and the `Flight price`. You should extract this information, and indicate the lowest price and the lowest duration from the extracted list. The price and duration will then be used in a subsequent task (not in the scope of this exercise).

In [3]:
# Render the remote image search.png inline at 600x600.
Image(
    url="https://raw.githubusercontent.com/edgar-hdz/workshop/master/search.png",
    width=600,
    height=600,
)

_______________
### Before starting, first install the following stuff

`!pip install selenium`

`!apt install chromium-chromedriver`

`!apt-get update # to update ubuntu to correctly run apt install`

`!apt install chromium-chromedriver`

`!cp /usr/lib/chromium-browser/chromedriver /usr/bin`

`import sys`

`sys.path.insert(0,'/usr/lib/chromium-browser/chromedriver')`

### Importing some packages
 This sets everything up so that we can work in a similar way to how we would with AA

In [None]:
import datetime
import time
import calendar
import re
import pandas as pd
import numpy as np

In [None]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.keys import Keys

### Setting up google chrome to work with this script

In [None]:
# Command-line switches handed to Chrome when the driver starts it.
chrome_options = webdriver.ChromeOptions()
for flag in ('--headless', '--no-sandbox', '--disable-dev-shm-usage'):
    chrome_options.add_argument(flag)

### Let's first check if we have a working connection with google chrome
We start by opening the website

In [None]:
# Pass the options through `options=`: the older `chrome_options=` keyword and the
# positional driver path are rejected by current Selenium releases. chromedriver is
# expected to be on PATH (the setup steps above copy it to /usr/bin).
browser = webdriver.Chrome(options=chrome_options)
browser.get('http://www.google.com')

Next, we select an element based on the element's name. We can read that information from the XPath of the element

In [6]:
# Render the remote image elem.png inline at 800x600.
Image(
    url="https://raw.githubusercontent.com/edgar-hdz/workshop/master/elem.png",
    width=800,
    height=600,
)

In [None]:
# Locate the search box by its `name` attribute. The `find_element_by_*` helpers were
# removed from Selenium; `find_element(By.<strategy>, value)` is the supported form.
search = browser.find_element(By.NAME, 'q')

By doing so, we have created a link between this variable and the website, and now we can start interacting with it!

In [None]:
query = "mexico to zurich 18 december"  # What to search in google
search.send_keys(query)
search.send_keys(Keys.RETURN)  # hit return after the search text has been typed
print("Done")

In [None]:
# Grab the element with id "flun" inside the "search" container and show its text.
# `find_element_by_xpath` was removed from Selenium; use `find_element(By.XPATH, ...)`.
el = browser.find_element(By.XPATH, "//div[@id='search']//div[@id='flun']")
print("Extracted text: {}".format(el.text))

Extract one word of interest using regex. For this example, let's say we want the first quantity which is preceded by a dollar sign `$`

In [None]:
# Keep the element's text under its own name for the regex example below.
example_regx = el.text

In [None]:
# First dollar amount in the text: a "$" followed by digits and thousands separators.
# The previous pattern `[$]\w.{1,4}` stopped after five characters (so "$12,345" became
# "$12,34") and could also swallow text that follows a short amount.
re.search(r'[$][\d,]+', example_regx).group(0)

### Try it out with something different
Open a different website, search for the `XPath` linked to the item to interact with, or to extract data from. Get some data from there in the form of text, and try using `regex` to match a desired word.

In [None]:
# End the previous session first so its Chrome/chromedriver processes do not linger.
browser.quit()
# `options=` replaces the `chrome_options=` keyword and the positional driver path,
# which current Selenium releases reject; chromedriver is expected to be on PATH.
browser = webdriver.Chrome(options=chrome_options)

In [None]:
# Placeholder: complete the URL below before running this cell.
website_to_go = "http://" ## Put the website you want here
# Navigate the browser session to the chosen address.
browser.get(website_to_go)

#### Start interacting with the website
The easiest thing to do, would be to go to a website which has already some information that can be extracted. 

A second option would be to navigate using some tools, by first selecting an element in the website

[Some useful information about how to click buttons or extract text](https://towardsdatascience.com/controlling-the-web-with-python-6fceb22c5f08)


In [None]:
##Spaces for you to work here

### Defining our "Tasks/Functions"

In [None]:
def flight_details(departure, destination, date, num_days, klass):
    """Build the free-text flight query that is typed into the search box.

    Args:
        departure: Name of the departure city.
        destination: Name of the destination city.
        date: Outbound date as a ``day/month/two-digit-year`` string, e.g. ``"5/10/19"``.
        num_days: Length of the stay in days; the return date is ``date`` plus this many days.
        klass: Travel class appended at the end of the query (e.g. ``"Business"``).
            An empty string (or ``None``) omits it.

    Returns:
        A query such as ``"Zurich to Mexico City from 5 October to 20 October Business"``.

    Raises:
        ValueError: If ``date`` does not match the ``%d/%m/%y`` format.
    """
    start = datetime.datetime.strptime(date, "%d/%m/%y")
    end = start + datetime.timedelta(days=num_days)

    query = "{} to {} from {} {} to {} {}".format(
        departure,
        destination,
        start.day,
        calendar.month_name[start.month],
        end.day,
        calendar.month_name[end.month],
    )
    # Only append the class when one is given, so the query never ends in a stray space.
    if klass:
        query = "{} {}".format(query, klass)
    return query

### Testing the function we created
Try it out with different destinations, departure dates, and duration of stay

In [None]:
# Example query: 15-day stay starting 5 October 2019, no travel class given.
flight_details(
    departure="Zurich",
    destination="Mexico City",
    date="5/10/19",
    num_days=15,
    klass="",
)

In [None]:
# Departure dates to query, written as day/month/two-digit-year.
list_of_days = [
    "5/10/19",
    "5/11/19",
    "5/12/19",
]
# Raw text collected from the search results is appended here.
list_prices = list()

In [None]:
# Walk over the dates once to see what will be queried.
for day in list_of_days:
    print("Date is: {}".format(day))

In [None]:
# End the previous session first so its Chrome/chromedriver processes do not linger.
browser.quit()
# `options=` replaces the `chrome_options=` keyword and the positional driver path,
# which current Selenium releases reject; chromedriver is expected to be on PATH.
browser = webdriver.Chrome(options=chrome_options)
browser.get('http://www.google.com')

In [None]:
# Locators for the search box and for the block that holds the flight results.
# The search box is found by its `name` attribute, as in the first example above,
# rather than by auto-generated CSS class names.
SEARCH_BOX = (By.NAME, 'q')
FLIGHTS_BLOCK = (
    By.XPATH,
    "//div[cite[text()[contains(.,'www.google.com/flights')]]]//following-sibling::div[5]",
)
# Upper bound, in seconds, for each wait on the results page.
RESULT_TIMEOUT_S = 30

try:
    for ldays in list_of_days:
        search_box = browser.find_element(*SEARCH_BOX)
        search_box.clear()
        search_box.send_keys(flight_details("Zurich", "Mexico City", ldays, 15, "Business"))
        print("Found")
        search_box.send_keys(Keys.ENTER)

        waiter = WebDriverWait(browser, RESULT_TIMEOUT_S)
        # The old search box goes stale once the new results page replaces the current
        # one; waiting for that avoids reading the previous page's flight block.
        waiter.until(EC.staleness_of(search_box))
        flights_block = waiter.until(EC.presence_of_element_located(FLIGHTS_BLOCK))
        print("Found flight details")
        list_prices.append(flights_block.text)
finally:
    # quit() ends the whole session (browser and driver process), also when a lookup fails;
    # close() would only close the window.
    browser.quit()

First, let's check what the extracted data looks like. It requires some preprocessing indeed!

In [None]:
# Show the raw text collected by the scraping loop (one entry per queried date).
list_prices

Splitting the string into sentences

In [None]:
# Break the first scraped entry into its non-empty lines.
a = list_prices[0]
sentence = list(filter(None, a.split('\n')))
sentence

### Collect the data we are interested on
For this particular example, these are the `Flight duration`, `Price`, `Dates`, and `Duration of stay`. From the variable `sentence` above, we can see that the data of interest is embedded inside a sentence, meaning that we require some tools in order to get them all!

From the variable named `sentence`

In [None]:
# Display the split lines of the first scraped entry again for reference.
sentence

We can already notice a certain structure we can take advantage of, this is that we always have the `Flight duration` followed by the `Flight price`  in the subsequent sentence. 

We can use the following regular expressions (`regex`), to match for the data we are looking for. We use the following, 

 1. To catch the `Flight duration` we can use `\d(.*)\d`
 2. For `Flight price` we can use `\W(.*)\d`
 
[Click link here \d(.*)\d explanation](https://regex101.com/r/kjXHIC/1/)

[Click here for \W(.*)\d explanation](https://regex101.com/r/S5Z3F8/1)


Below is an example of matching the `Flight duration` and `Flight price` from the first two sentences

In [None]:
# First line holds the duration (digit ... digit), second line the price (symbol ... digit).
duration_match = re.search(r'\d(.*)\d', sentence[0])
print('Matching time: {}'.format(duration_match.group(0)))
price_match = re.search(r'\W(.*)\d', sentence[1])
print('Matching flight price: {}'.format(price_match.group(0)))

In [None]:
# Length of stay stored with every row. It must equal the num_days passed to
# flight_details(...) when the results were scraped (15); the earlier hard-coded 17
# did not match the search that produced the data.
STAY_DAYS = 15
# At most this many non-empty lines (duration and price alternating) are parsed per date.
MAX_LINES = 10
COLUMNS = ['price', 'duration', 'dates', 'num_of_days']

# The last non-empty line of the first scraped entry is used as the end-of-list marker.
# It is computed here so this cell does not depend on a variable from another cell.
first_entry_lines = [p for p in list_prices[0].split("\n") if p] if list_prices else []
footer = first_entry_lines[-1] if first_entry_lines else None

frame = []
for day, raw_text in zip(list_of_days, list_prices):
    # Collect the lines to parse: skip blanks, stop at the footer or at the line limit.
    lines = []
    for line in raw_text.split("\n"):
        if not line:
            continue
        if line == footer or len(lines) == MAX_LINES:
            break
        lines.append(line)

    # Lines come in (duration, price) pairs; pairing them keeps both columns the same length.
    rows = []
    for duration_line, price_line in zip(lines[0::2], lines[1::2]):
        duration_match = re.search(r'\d(.*)\d', duration_line)
        price_match = re.search(r'\W(.*)\d', price_line)
        if duration_match is None or price_match is None:
            # Not a recognisable duration/price pair; skip it instead of crashing.
            continue
        rows.append({
            'price': price_match.group(0),
            'duration': duration_match.group(0),
            'dates': day,
            'num_of_days': STAY_DAYS,
        })

    frame.append(pd.DataFrame(rows, columns=COLUMNS))

In [None]:
# Stack the per-date frames into one table and renumber the rows from 0.
stacked = pd.concat(frame)
df_save = stacked.reset_index(drop=True)
df_save.head(10)
#df_save.to_csv("df_{}.csv".format(vpn_country))

In [None]:
def _price_to_float(price):
    """Convert a scraped price string such as "$1,234" to a float (1234.0)."""
    return float(price.replace("$", "").replace(",", ""))


def _duration_to_float(duration):
    """Convert a scraped duration such as "19h 35" to a decimal number (19.583...).

    The text is split on "h" when present, otherwise on "d"; the part before the unit
    is taken as the whole number and the part after it is divided by 60.
    NOTE(review): a "d" value is therefore not converted to hours -- confirm what the
    scraped text looks like for multi-day itineraries.
    Returns NaN when neither unit letter is present.
    """
    if "h" in duration:
        parts = duration.split("h")
    elif "d" in duration:
        parts = duration.split("d")
    else:
        return float("nan")
    whole, fraction = parts[0], parts[1].strip()
    # An empty remainder (e.g. "19h") means there is nothing to add.
    return float(whole) + (float(fraction) / 60 if fraction else 0.0)


# The built-in float replaces np.float, an alias that was removed from NumPy.
df_save['price_2'] = [_price_to_float(x) for x in df_save.price]
df_save['duration_2'] = [_duration_to_float(x) for x in df_save.duration]

In [None]:
# Show the table, now including the numeric price_2 and duration_2 columns.
df_save