## Web scraping 

Goal : get a table of all the locations listed [here](https://en.wikipedia.org/wiki/Lists_of_World_Heritage_Sites), in the format \[url; name; coordinates\]

In [None]:
import requests
from bs4 import BeautifulSoup
import random
import pandas as pd
import lxml
import unicodedata2 as unicodedata

In [None]:
response = requests.get(url="https://en.wikipedia.org/wiki/Lists_of_World_Heritage_Sites")

soup = BeautifulSoup(response.content, 'html.parser')

title = soup.find(id="firstHeading")

# Get all the links
allLinks = soup.find(id="bodyContent").find_all("a")


link_list = []
str_link = str(allLinks).split('<a')

for link in str_link:
    if 'href="/wiki/' in link:
        link_list.append(link)
    

In [None]:
links1 = []
for elem in soup.select("a[href*=wiki\/List_of]"):
    links1.append(elem['href'])

In [None]:
# create a csv file
with open('../data/unesco_sites_raw.csv', 'w') as f:

    for link in links1:
        response = requests.get(url = 'https://en.wikipedia.org' + link)

        soup = BeautifulSoup(response.text, 'html.parser')
        table = soup.find('table',{'class':"wikitable"})
        df = pd.read_html(str(table))

        # convert list to dataframe
        df = pd.DataFrame(df[0])
        newdf = df.fillna('')

        # drop the unwanted column
        if 'Site' in newdf.columns and 'Location' in newdf.columns:
            data = newdf[['Site', 'Location']]

            # add information to the csv file
            for index, value in data.iterrows():
                location_cleaned = unicodedata.normalize('NFKD', value['Location']).encode('ascii','ignore').decode('ascii').replace(';', ',')
                f.write("%s;%s;%s\n" % (link, value['Site'], location_cleaned))


### Clean the data 
Clean the text related to place entities. 

In [6]:
import pandas as pd
import csv
import re
from tqdm.autonotebook import tqdm
from Levenshtein import distance as lev 

In [7]:
data = pd.read_csv('../data/unesco_sites_raw.csv', delimiter=';', header = None)
data.rename(columns = {0: 'link', 1: 'place', 2: 'location'}, inplace = True)

In [8]:
data['place'] = data['place'].apply(lambda x: x.replace("-"," "))
data['place'] = data['place'].apply(lambda x: re.sub(r'[^A-Za-z0-9 ]+', '', x))
data['place'] = data['place'].apply(lambda x: x.strip())
data['place'] = data['place'].apply(lambda x: x.capitalize())

In [9]:
len(data), len(data.place.unique())

(3810, 1190)

In [10]:
data.head()

Unnamed: 0,link,place,location
0,/wiki/List_of_World_Heritage_Sites_in_Africa,Aapravasi ghat,"Port Louis District, Mauritius.mw-parser-outpu..."
1,/wiki/List_of_World_Heritage_Sites_in_Africa,Abu mena,"Abusir, Egypt305028N 293947E / 30.84098N 29.66..."
2,/wiki/List_of_World_Heritage_Sites_in_Africa,Air and tnr natural reserves,"Arlit Department, Niger18N 9E / 18N 9E"
3,/wiki/List_of_World_Heritage_Sites_in_Africa,Aksum,"Tigray Region, Ethiopia140749N 384307E / 14.13..."
4,/wiki/List_of_World_Heritage_Sites_in_Africa,Al qala of beni hammad,"Maadid, Algeria354906N 44713E / 35.818440N 4.7..."


In [11]:
for i in tqdm(range(len(data) - 1)):
    place1 = data.iloc[i].place
    place2 = data.iloc[i+1].place
    if lev(place1, place2) < 3: # if the writing of the place differs of less than 2 characters
        data.iloc[i+1].place = place1 # only the first version of the writing is kept


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=3809.0), HTML(value='')))




In [12]:
data.drop_duplicates(subset = 'place', keep='first',inplace=True)
data.sort_values('place', ascending=True, inplace = True)

In [13]:
# check that all names of places are unique
len(data), len(data.place.unique())

(1189, 1189)

In [None]:
# save the dataframe 
data.to_csv('../data/unesco_sites_cleaned.csv', sep = ';', header = False, index = False)

## Video search

Get all the links resulting from the youtube requests. Refer to video-search.py for the python script.

In [None]:
import numpy as np
import youtube_dl
import csv
import urllib
import urllib.request
import re
import unidecode
from tqdm.autonotebook import tqdm
import pytube
import pandas as pd
from pytube import YouTube

In [None]:
inputfile = csv.reader(open('../data/unesco_sites_cleaned.csv','r'), delimiter=';')

ydl_opts = {
'format': 'bestaudio/best',
'outtmpl': 'tmp/%(id)s.%(ext)s',
'noplaylist': True,
'quiet': True,
'prefer_ffmpeg': True,
'audioformat': 'wav',
'forceduration':True
}

# create an empty dictionnary
video_dict = {}

In [None]:

# read line by line
for row in inputfile:
    
    print(row)
    # get second column (names of places)
    place = row[1]
    
    # clean string : remove accents
    place_clean1 = unidecode.unidecode(place)
    # clean string : remove spaces
    place_clean2 = place_clean1.replace(' ', '+')
    
    # add key words 
    search_words = place_clean2 + "+drone"
    
    # make a request in youtube, store the results in a list
    results = []
    html = urllib.request.urlopen("https://www.youtube.com/results?search_query=" + search_words)
    
    # store the results
    video_ids = re.findall(r"watch\?v=(\S{11})", html.read().decode())
  
    for video_id in tqdm(video_ids):
        try:
            ydl_opts = {'ignoreerrors': True}
            with youtube_dl.YoutubeDL(ydl_opts) as ydl:
                myVideo = YouTube("https://www.youtube.com/watch?v=%s" % video_id)
                if (myVideo.streams.filter(res="2160p") != None) :
                    if not myVideo.age_restricted:
                        dictMeta = ydl.extract_info("https://www.youtube.com/watch?v=%s" % video_id, download=False)
            
                        video_dict.update({video_id : [place_clean1, dictMeta['duration'], dictMeta['title'], dictMeta['upload_date']]})
            
        except Exception as e:
            print("ERROR Catched and Passed", e)
            pass 
        
    
video_df = pd.DataFrame.from_dict(video_dict, orient='index', columns=['place', 'duration', 'title', 'date'])

video_df.to_csv('../data/video_info.csv', index_label = 'id')

## Screenshots

Select randomly 10 places, from which select 10 random videos, and take 3 screenshots for each video without downloading it for analysis purposes

In [None]:
import pandas as pd
import random
import youtube_dl
import numpy
import cv2

In [None]:
df = pd.read_csv('../data/video_info.csv', delimiter=',')
dict_sample = {}
places_sel = random.sample(df.place.unique().tolist(), 10)
for place in places_sel:
    ids = df.loc[df['place'] == place].id.tolist()
    ids_sample = random.sample(ids, 10)
    dict_sample.update({place : ids_sample})

In [None]:
for key, value in dict_sample.items():
    
    place = key
    for youtube_id in value:
        print(youtube_id)
        video_url = 'https://www.youtube.com/watch?v=%s' % youtube_id  #The Youtube URL
        ydl_opts = {}
        ydl = youtube_dl.YoutubeDL(ydl_opts)
        info_dict = ydl.extract_info(video_url, download=False)

        formats = info_dict.get('formats',None)

        f = formats[-1]
        url = f.get('url',None)
        cap = cv2.VideoCapture(url)

        x = 1
        count = info_dict['duration'] / 4

        while x < 4 :

            for i in range(3):

                time = x * info_dict['duration'] + ( 10 * i ) # each time, take 3 shots at 10 sec intervals
                ret, frame = cap.read()
                if not ret:
                    break
                filename = "../data/screenshots/%s-%d.png" % (youtube_id, time)

                cv2.imwrite(filename, frame)
                count += int(info_dict['duration'] / 4)
                cap.set(1,count)

            x += 1

        cap.release()