## Web scraping 

Goal : get a table of all the locations listed [here](https://en.wikipedia.org/wiki/Lists_of_World_Heritage_Sites), in the format \[url; name; coordinates\]

In [None]:
import requests
from bs4 import BeautifulSoup
import random
import pandas as pd
import lxml
import unicodedata2 as unicodedata

In [None]:
response = requests.get(url="https://en.wikipedia.org/wiki/Lists_of_World_Heritage_Sites")

soup = BeautifulSoup(response.content, 'html.parser')

title = soup.find(id="firstHeading")

# Get all the links
allLinks = soup.find(id="bodyContent").find_all("a")


link_list = []
str_link = str(allLinks).split('<a')

for link in str_link:
    if 'href="/wiki/' in link:
        link_list.append(link)
    

In [None]:
links1 = []
for elem in soup.select("a[href*=wiki\/List_of]"):
    links1.append(elem['href'])

In [None]:
# create a csv file
with open('../data/unesco_sites.csv', 'w') as f:

    for link in links1:
        response = requests.get(url = 'https://en.wikipedia.org' + link)

        soup = BeautifulSoup(response.text, 'html.parser')
        table = soup.find('table',{'class':"wikitable"})
        df = pd.read_html(str(table))

        # convert list to dataframe
        df = pd.DataFrame(df[0])
        newdf = df.fillna('')

        # drop the unwanted column
        if 'Site' in newdf.columns and 'Location' in newdf.columns:
            data = newdf[['Site', 'Location']]

            # add information to the csv file
            for index, value in data.iterrows():
                location_cleaned = unicodedata.normalize('NFKD', value['Location']).encode('ascii','ignore').decode('ascii').replace(';', ',')
                f.write("%s;%s;%s\n" % (link, value['Site'], location_cleaned))


## Video search

Please refer to video-search.py to get all the youtube links.

In [4]:
import numpy as np
import youtube_dl
import csv
import urllib
import urllib.request
import re
import unidecode
from tqdm.autonotebook import tqdm
import pytube
import pandas as pd
from pytube import YouTube

  from tqdm.autonotebook import tqdm


In [2]:
inputfile = csv.reader(open('../data/unesco_sites.csv','r'), delimiter=';')

ydl_opts = {
'format': 'bestaudio/best',
'outtmpl': 'tmp/%(id)s.%(ext)s',
'noplaylist': True,
'quiet': True,
'prefer_ffmpeg': True,
'audioformat': 'wav',
'forceduration':True
}

# create an empty dictionnary
video_dict = {}

In [7]:

# read line by line
for row in inputfile:
    
    print(row)
    # get second column (names of places)
    place = row[1]
    
    # clean string : remove accents
    place_clean1 = unidecode.unidecode(place)
    # clean string : remove spaces
    place_clean2 = place_clean1.replace(' ', '+')
    
    # add key words 
    search_words = place_clean2 + "+drone"
    
    # make a request in youtube, store the results in a list
    results = []
    html = urllib.request.urlopen("https://www.youtube.com/results?search_query=" + search_words)
    
    # store the results
    video_ids = re.findall(r"watch\?v=(\S{11})", html.read().decode())
  
    for video_id in tqdm(video_ids):
        try:
            ydl_opts = {'ignoreerrors': True}
            with youtube_dl.YoutubeDL(ydl_opts) as ydl:
                myVideo = YouTube("https://www.youtube.com/watch?v=%s" % video_id)
                if (myVideo.streams.filter(res="2160p") != None) :
                    if not myVideo.age_restricted:
                        dictMeta = ydl.extract_info("https://www.youtube.com/watch?v=%s" % video_id, download=False)
            
            video_dict.update({video_id : [place_clean2, dictMeta['duration'], dictMeta['title'], dictMeta['upload_date']]})
            
        except Exception as e:
            print("ERROR Catched and Passed", e)
            pass 
        
    
video_df = pd.DataFrame.from_dict(video_dict, orient='index', columns=['place', 'duration', 'title', 'date'])

video_df.to_csv('../data/video_info.csv', index_label = 'id')

['/wiki/List_of_World_Heritage_Sites_in_Africa', 'Aksum', 'Tigray Region, Ethiopia140749N 384307E / 14.130190N 38.718605E']


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=29.0), HTML(value='')))

[youtube] zH6dj_e78do: Downloading webpage
ERROR Catched and Passed name 'durations' is not defined
[youtube] 1s3ACx9N60M: Downloading webpage
ERROR Catched and Passed name 'durations' is not defined
[youtube] F0ex2LJh58M: Downloading webpage
[youtube] F0ex2LJh58M: Downloading MPD manifest
ERROR Catched and Passed name 'durations' is not defined



KeyboardInterrupt: 

## Screenshots

Select randomly 10 places, from which select 10 random videos, and take 3 screenshots for each video without downloading it for analysis purposes

In [None]:
import pandas as pd
import random
import youtube_dl
import numpy
import cv2

In [None]:
df = pd.read_csv('video_info2.csv', delimiter=',')
# we set the index of the dataframe on the 'place' columns
df.set_index('id.1', inplace = True)

places_sel = random.sample(df.index.unique().tolist(), 4)

dict_sample = {}
for place in places_sel:
    ids = df.loc[place].id.tolist()
    ids_sample = random.sample(ids, 10)
    dict_sample.update({place : ids_sample})
    
    

In [None]:
for key, value in dict_sample.items():
    
    place = key
    for youtube_id in value:
        print(youtube_id)
        video_url= 'https://www.youtube.com/watch?v=%s' % youtube_id  #The Youtube URL
        ydl_opts={}
        ydl=youtube_dl.YoutubeDL(ydl_opts)
        info_dict=ydl.extract_info(video_url, download=False)

        formats = info_dict.get('formats',None)

        f = formats[-1]
        print('test')
        url = f.get('url',None)
        cap = cv2.VideoCapture(url)

        x=0
        count=info_dict['duration']/4
        while x<3:
            ret, frame = cap.read()
            if not ret:
                break
            filename ="screenshots/%s-%d-%d.png" % (youtube_id, x, count)
            print(x)
            x+=1
            cv2.imwrite(filename.format(count), frame)
            count+=int(info_dict['duration']/4) #Skip 300 frames i.e. 10 seconds for 30 fps
            cap.set(1,count)
            if cv2.waitKey(30)&0xFF == ord('q'):
                print('test5')
                break
        cap.release()