# Resolve Google links in video description

This notebook is part of the preprocessing pipeline for the links found in video descriptions. It resolves shortened [goo.gl](https://goo.gl) links by sending GET requests and following their redirects, and saves the results in a new file.

In [1]:
import os
import requests
from tqdm import tqdm
from concurrent.futures import ThreadPoolExecutor

from pyspark.sql import SparkSession
# NOTE(review): `spark` is not referenced by any other cell visible in this notebook — confirm the session is still needed.
spark = SparkSession.builder.getOrCreate()

To scrape Google links, we route our requests through [Amazon AWS API Gateway](https://aws.amazon.com/fr/) endpoints (using the `requests_ip_rotator` package) so that our IP address changes between requests.

In [2]:
from requests_ip_rotator import ApiGateway

# URL prefixes whose requests should be sent through a rotating gateway
session_urls = ['https://api.ipify.org', 'https://goo.gl', 'http://goo.gl']

# One (prefix, gateway) pair per prefix, in the same order as `session_urls`
url_gateways = list(zip(session_urls, map(ApiGateway, session_urls)))

# Start every gateway and mount it on the shared session for its prefix
sess = requests.Session()
for url, gateway in url_gateways:
    gateway.start()
    sess.mount(url, gateway)

Starting API gateways in 10 regions.
Using 10 endpoints with name 'https://api.ipify.org - IP Rotate API' (0 new).
Starting API gateways in 10 regions.
Using 10 endpoints with name 'https://goo.gl - IP Rotate API' (0 new).
Starting API gateways in 10 regions.
Using 10 endpoints with name 'http://goo.gl - IP Rotate API' (0 new).


In [3]:
def get_ip(session):
    """ Return the IP address reported by https://api.ipify.org for a GET request sent through `session`. """
    response = session.get('https://api.ipify.org')
    return response.text

# The default session is not using the proxy, so both requests report the same IP
assert get_ip(requests.Session()) == get_ip(requests.Session())

# The session with the proxy is changing IP addresses. Two consecutive requests
# can occasionally leave through the same address, so sample a few requests and
# only require that more than one distinct address shows up.
rotated_ips = {get_ip(sess) for _ in range(5)}
assert len(rotated_ips) > 1, f'The proxied session did not rotate IP addresses: {rotated_ips}'

In [4]:
MAX_FIX_RESOLVED_URLS = 1000    # fix_resolved_googl_urls flushes its pending retries every this many lines read
REQUEST_TIMEOUT = 5             # The timeout passed to each GET request

# Input file: one shortened URL per line
PATH_GOOGL_URLS = '../data/generated/googl_urls.tsv'
# Output file: one `url<TAB>resolved_url` line per processed URL ('None' when the request failed)
PATH_GOOGL_RESOLVED_URLS = '../data/generated/googl_resolved_urls.tsv'
# Scratch file written by fix_resolved_googl_urls before it replaces the output file
PATH_GOOGL_RESOLVED_URLS_TMP = '../data/generated/googl_resolved_urls_tmp.tsv'

In [5]:
def resolve_googl_url(url, session=None, timeout=None):
    """ Resolve a goo.gl URL by following its redirects with a GET request.

    Args:
        url (str): The URL to resolve.
        session (requests.Session, optional): The session used to send the request.
            Defaults to None, in which case the module-level `requests.get` is used.
        timeout (float, optional): The timeout passed to the GET request.
            Defaults to None, in which case REQUEST_TIMEOUT is used.

    Returns:
        str: The URL of the final response, or None if `url` is None or the
            request could not be completed.
    """

    if url is None:
        return None

    # NOTE(review): the callers in this notebook do not pass `session`, so the
    # requests are sent without the rotating session `sess` — confirm this is intended.
    http = requests if session is None else session
    request_timeout = REQUEST_TIMEOUT if timeout is None else timeout

    try:
        response = http.get(url, allow_redirects=True, timeout=request_timeout)
        return response.url
    except Exception:
        # Best effort: any request failure is reported as None so that a single bad
        # URL does not abort the batch. Unlike a bare `except:`, this does not
        # swallow KeyboardInterrupt or SystemExit.
        return None

In [6]:
def write_future_results(url_futures, file, use_tqdm=True):
    """ Collect the result of every future and write one `url<TAB>result` line per URL.

    Args:
        url_futures (list): (url, future) pairs, consumed in order
        file: writable text file object receiving the lines
        use_tqdm (bool): whether to wrap the iteration in a tqdm progress bar

    Returns:
        n_retrieved_urls (int): the number of futures whose result is not None
        n_errors (int): the number of futures whose result is None
    """

    pairs = tqdm(url_futures) if use_tqdm else url_futures

    # Tally keyed by "was the URL resolved?"
    tally = {True: 0, False: 0}
    for short_url, pending in pairs:
        resolved = pending.result()
        file.write(f'{short_url}\t{resolved}\n')
        tally[resolved is not None] += 1

    return tally[True], tally[False]

def resolve_googl_urls(batch_size):
    """ Resolve the next batch of goo.gl URLs and append the results to the resolved URLs file.

    The number of lines already present in PATH_GOOGL_RESOLVED_URLS is used as the
    offset into PATH_GOOGL_URLS, so repeated calls continue where the previous one stopped.

    Args:
        batch_size (int): the number of URLs to resolve in this batch
    """

    # Count the number of already resolved URLs. On the very first run the results
    # file does not exist yet, which simply means that nothing has been resolved.
    if os.path.exists(PATH_GOOGL_RESOLVED_URLS):
        with open(PATH_GOOGL_RESOLVED_URLS, 'r', encoding='utf-8') as f_res:
            n_resolved_urls = sum(1 for _ in f_res)
    else:
        n_resolved_urls = 0

    # Read the URLs once: the same list gives both the total and the batch
    with open(PATH_GOOGL_URLS, 'r', encoding='utf-8') as f:
        urls = f.read().splitlines()
    n_to_resolve_urls = len(urls)

    # Slice out the batch directly instead of iterating over the already resolved
    # URLs. max() keeps a negative batch size from turning into a from-the-end slice.
    batch_end = n_resolved_urls + max(batch_size, 0)
    batch_urls = urls[n_resolved_urls:batch_end]
    reached_max = batch_end < n_to_resolve_urls

    with open(PATH_GOOGL_RESOLVED_URLS, 'a', encoding='utf-8') as f_res, \
         ThreadPoolExecutor() as executor:

        # Get futures for the URLs
        url_futures = [(url, executor.submit(resolve_googl_url, url)) for url in batch_urls]

        # Write the results of the futures, in the order of the input file
        n_retrieved_urls, n_errors = write_future_results(url_futures, f_res)

    if reached_max:
        print(f'Reached max lines read: {batch_size}, {n_resolved_urls + batch_size} urls resolved out of {n_to_resolve_urls} in total.')
    else:
        print('Finished resolving all urls.')
    print(f'Retrieved {n_retrieved_urls} urls, {n_errors} urls could not be retrieved. If the number of errors is too high, consider waiting a few moments before retrying.')

In [10]:
def fix_resolved_googl_urls():
    """ Retry the goo.gl URLs recorded as unresolved and rewrite the resolved URLs file.

    Every `url<TAB>resolved_url` line of PATH_GOOGL_RESOLVED_URLS is copied to a
    temporary file; lines whose resolved URL is the string 'None' are requested again
    and written with their new result. The temporary file then replaces the original.

    Note:
        Lines that do not contain exactly two tab-separated fields are dropped, and
        retried URLs are only written when their batch is flushed, so the number and
        order of lines in the rewritten file can differ from the original.
        NOTE(review): resolve_googl_urls uses the line count of this file as its resume
        offset — presumably this function should only run once every URL has been
        resolved; confirm.
    """

    n_retrieved_urls = 0
    n_errors = 0

    # Opening the temporary file in 'w' mode truncates any leftover from an
    # interrupted run, so it does not need to be deleted beforehand.
    with open(PATH_GOOGL_RESOLVED_URLS, 'r', encoding='utf-8') as f_res, \
         open(PATH_GOOGL_RESOLVED_URLS_TMP, 'w', encoding='utf-8') as f_tmp, \
         ThreadPoolExecutor() as executor:

        url_futures = []
        for idx, url_rurl in tqdm(enumerate(f_res.read().splitlines())):
            fields = url_rurl.split('\t')
            if len(fields) != 2:
                continue

            url, rurl = fields
            if rurl == 'None':
                # Previously failed: request the URL again
                url_futures.append((url, executor.submit(resolve_googl_url, url)))
            else:
                f_tmp.write(f'{url}\t{rurl}\n')

            # Write the results of the futures every MAX_FIX_RESOLVED_URLS iterations
            if idx % MAX_FIX_RESOLVED_URLS == 0 and idx != 0:
                n_retrieved_urls_batch, n_errors_batch = write_future_results(url_futures, f_tmp, use_tqdm=False)
                n_retrieved_urls += n_retrieved_urls_batch
                n_errors += n_errors_batch
                url_futures = []

        # Write the results of the futures that are still pending
        n_retrieved_urls_batch, n_errors_batch = write_future_results(url_futures, f_tmp, use_tqdm=False)
        n_retrieved_urls += n_retrieved_urls_batch
        n_errors += n_errors_batch

    # Replace the old file with the new one in a single step, so that the results
    # file is never missing if the process is interrupted at this point.
    os.replace(PATH_GOOGL_RESOLVED_URLS_TMP, PATH_GOOGL_RESOLVED_URLS)

    print(f'Retrieved {n_retrieved_urls} urls, {n_errors} urls could not be retrieved.')

In [13]:
def get_none_urls_count():
    """ Count the entries of the resolved goo.gl URLs file whose resolved URL is 'None'.

    Only lines made of exactly two tab-separated fields are considered.
    """

    with open(PATH_GOOGL_RESOLVED_URLS, 'r', encoding='utf-8') as resolved_file:
        rows = (line.split('\t') for line in resolved_file.read().splitlines())
        return sum(1 for fields in rows if len(fields) == 2 and fields[1] == 'None')

In [14]:
# Count the entries currently recorded as unresolved ('None') in the resolved URLs file
n_none = get_none_urls_count()
print(f'There are {n_none} urls that could not have been resolved.')

There are 37804 urls that could not have been resolved.


The following cells resolve the shortened links in batches: each call reads the next batch of shortened links and resolves them. The requests are made in parallel using futures to speed up the process.

In [None]:
# Single execution: resolve one batch of 100 URLs, starting after the lines already present in the resolved URLs file
resolve_googl_urls(100)

In [None]:
# Multiple executions: keep resolving batches until every URL has been processed
from IPython.display import clear_output

def _count_lines(path):
    """ Return the number of lines in `path`, or 0 if the file does not exist. """
    if not os.path.exists(path):
        return 0
    with open(path, 'r', encoding='utf-8') as f:
        return sum(1 for _ in f)

n_urls_total = _count_lines(PATH_GOOGL_URLS)
n_urls_resolved = _count_lines(PATH_GOOGL_RESOLVED_URLS)

iteration = 0   # not named `iter`, which would shadow the builtin
while n_urls_resolved < n_urls_total:
    print(f'Iteration {iteration}')
    resolve_googl_urls(500)
    if iteration % 2 == 0:
        clear_output(wait=True)
    iteration += 1

    # Stop instead of spinning forever if the last batch did not add any line
    n_urls_resolved_now = _count_lines(PATH_GOOGL_RESOLVED_URLS)
    if n_urls_resolved_now <= n_urls_resolved:
        print('No new URLs were written during the last iteration, stopping.')
        break
    n_urls_resolved = n_urls_resolved_now

Finally, we aim to fix previously failed requests by requesting their links again.

In [16]:
# Retry every entry recorded as 'None' and rewrite the resolved URLs file with the new results
fix_resolved_googl_urls()

1397372it [5:11:54, 74.67it/s] 


Retrieved 7105 urls, 30699 urls could not be retrieved. If the number of errors is too high, consider waiting a few moments before retrying.


In [5]:
# Shut down every gateway that was created for the rotating session
for url, gateway in url_gateways:
    gateway.shutdown()

Deleting gateways for site 'https://goo.gl'.
Deleted 10 endpoints with for site 'https://goo.gl'.
Deleting gateways for site 'http://goo.gl'.
Deleted 10 endpoints with for site 'http://goo.gl'.


At the end of this notebook, the goo.gl links are resolved and saved in a new file named `googl_resolved_urls.tsv`.