In [None]:
import re
import os
import time
import requests
import subprocess
from tqdm import tqdm
from bs4 import BeautifulSoup
from concurrent.futures import ThreadPoolExecutor

from requests_ip_rotator import ApiGateway
# Gateway bound to the Bitly API host. It is started here and released by the
# `gateway.shutdown()` cell further down the notebook.
# NOTE(review): presumably this rotates the outgoing IP address and requires
# cloud credentials to be configured — confirm against the requests_ip_rotator docs.
gateway = ApiGateway('https://api-ssl.bitly.com')
gateway.start()
# Shared session used by resolve_bitly_url_api; the gateway is mounted as the
# adapter for every URL starting with the Bitly API prefix.
session = requests.Session()
session.mount('https://api-ssl.bitly.com', gateway)

from pyspark.sql import SparkSession
# NOTE(review): `spark` is not referenced by any other cell visible here —
# confirm that the Spark session is still needed.
spark = SparkSession.builder.getOrCreate()

In [None]:
# The Bitly token is read from the environment so that no secret lives in the notebook.
# NOTE(review): a token used to be hardcoded on this line; it must be considered leaked and be revoked.
BITLY_API_KEY = os.environ.get('BITLY_API_KEY')
if not BITLY_API_KEY:
    print('Warning: the BITLY_API_KEY environment variable is not set, the Bitly API cannot be used.')

MAX_FIX_RESOLVED_URLS = 1000    # The maximum number of URLs to fix in a single run
REQUEST_TIMEOUT = 5             # The timeout (in seconds) for get requests

NB_SECONDS_IN_HOUR = 3600
NP_SECONDS_IN_MINUTE = 60       # (sic) the 'NP_' spelling is kept because other cells reference this name

API_AVAILABLE_AT = -1           # The time (epoch seconds) at which the API will be available again
API_LIMIT_REACHED = False       # Whether the API limit has been reached

# Per-method counters updated by resolve_bitly_url and reset at the start of each run
TOTAL_API = 0
TOTAL_CURL = 0
TOTAL_GET = 0

# Input file (one bitly URL per line) and output files ('<url>\t<resolved url>' per line)
PATH_BITLY_URLS = '../data/generated/bitly_urls.tsv/part-00000-63bb4631-8cc4-485b-b648-9896615d94d4-c000.csv'
PATH_BITLY_RESOLVED_URLS = '../data/generated/bitly_resolved_urls.tsv'
PATH_BITLY_RESOLVED_URLS_TMP = '../data/generated/bitly_resolved_urls_tmp.tsv'

In [None]:
def resolve_bitly_url_api(url: str) -> str:
    """ Resolves a bitly URL to its original URL through the Bitly `expand` API endpoint

    When the API answers with a 429 status, the module-level back-off state
    (API_LIMIT_REACHED / API_AVAILABLE_AT) is updated so that no further API call
    is attempted until the back-off deadline has passed.

    Args:
        url (str): the bitly URL to resolve

    Returns:
        str: the original URL, or None if the URL could not be resolved
    """
    global API_LIMIT_REACHED
    global API_AVAILABLE_AT

    # Do not call the API if the limit has been reached not long ago
    if API_LIMIT_REACHED and API_AVAILABLE_AT > time.time():
        return None
    else:
        API_LIMIT_REACHED = False

    if url is None:
        return None

    # The endpoint expects the link without its scheme (e.g. 'bit.ly/abc')
    bitlink_id = url.replace('http://', '').replace('https://', '')
    try:
        # Post a request via the bit.ly API and retrieve the long url from the response.
        # The timeout prevents a stalled connection from blocking a worker thread forever.
        response = session.post(
            'https://api-ssl.bitly.com/v4/expand',
            headers={'Authorization': f'Bearer {BITLY_API_KEY}'},
            json={'bitlink_id': bitlink_id},
            timeout=REQUEST_TIMEOUT,
        )

        # Notify the program if the API limit has been reached
        if response.status_code == 429:
            message = response.json()['message']
            cooldown = NB_SECONDS_IN_HOUR if message == 'HOURLY_RATE_LIMIT_EXCEEDED' else NP_SECONDS_IN_MINUTE
            # The deadline is stored in the variable that the guard above reads,
            # and it is set before the flag so that the flag never exposes a stale deadline.
            API_AVAILABLE_AT = time.time() + cooldown
            API_LIMIT_REACHED = True
            return None

        return response.json()['long_url'] if response.status_code == 200 else None
    except Exception:
        # Best effort: any network or parsing failure means "not resolved by the API"
        return None

In [None]:
def resolve_bitly_url_curl(url):
    """ Resolves a bitly URL to its original URL using curl

    The page returned for `<url>+` is fetched with curl and the target link is read
    from the anchor carrying the `item-detail--url` class.

    Args:
        url (str): the bitly URL to resolve

    Returns:
        str: the original URL, or None if the URL could not be resolved
    """
    global API_LIMIT_REACHED

    # Do not call the curl method if the limit has been reached not long ago.
    # We do so also there to avoid sending too many requests to the Bitly website,
    # which blocks the IP address based on the same criteria as the API.
    if API_LIMIT_REACHED and API_AVAILABLE_AT > time.time():
        return None
    else:
        API_LIMIT_REACHED = False

    if url is None:
        return None

    # The command is passed as an argument list (no shell), so that the URL
    # can never be interpreted as shell syntax.
    cmd = ['curl', '-s', '-L', url + '+']
    try:
        output = subprocess.check_output(cmd).decode('utf-8')
        soup = BeautifulSoup(output, 'html.parser')
        anchor = soup.find('a', {'class': 'item-detail--url'})
        return anchor.get('href') if anchor is not None else None
    except Exception:
        # Best effort: curl failure or unexpected page content means "not resolved"
        return None

In [None]:
def resolve_bitly_url_get(url):
    """ Resolves a bitly URL to its original URL using get requests

    The URL is requested with redirections enabled, and the URL of the final
    response is returned.

    Args:
        url (str): the bitly URL to resolve

    Returns:
        str: the final URL after the redirections, or None if the request failed
    """
    if url is None:
        return None

    try:
        response = requests.get(url, allow_redirects=True, timeout=REQUEST_TIMEOUT)
        return response.url
    except Exception:
        # Best effort: a failed request means "not resolved". Unlike a bare `except`,
        # this lets KeyboardInterrupt / SystemExit propagate.
        return None

In [None]:
def resolve_bitly_url(url):
    """ Resolves a bitly URL to its original URL using the API, curl and get requests

    The three methods are tried in that order and the first non-None result is
    returned. The counter of the method that produced the result (TOTAL_API,
    TOTAL_CURL or TOTAL_GET) is incremented; nothing is counted when every
    method fails.

    Args:
        url (str): the bitly URL to resolve

    Returns:
        str: the original URL, or None if the URL could not be resolved
    """
    global TOTAL_API
    global TOTAL_CURL
    global TOTAL_GET

    # Nothing to resolve (e.g. the cleaning step found no bitly link)
    if url is None:
        return None

    retrieved_url = resolve_bitly_url_api(url)
    if retrieved_url is not None:
        TOTAL_API += 1
        return retrieved_url

    retrieved_url = resolve_bitly_url_curl(url)
    if retrieved_url is not None:
        TOTAL_CURL += 1
        return retrieved_url

    retrieved_url = resolve_bitly_url_get(url)
    # Only successes are counted, consistently with the API and curl counters
    if retrieved_url is not None:
        TOTAL_GET += 1
    return retrieved_url

In [None]:
def fix_bitly_url(url: str) -> str:
    """ Cleans a raw URL string before it is passed to the resolve_bitly_url function

    Every 'www.' occurrence is stripped, then the first `http(s)://bit.ly/<id>`
    link found in the remaining text is extracted.

    Args:
        url (str): the raw string that should contain a bitly URL

    Returns:
        str: the first bitly URL found, or None if the input is None or holds no bitly URL
    """
    if url is None:
        return None

    without_www = url.replace('www.', '')
    first_match = re.search(r'https?:\/\/bit\.ly\/[a-zA-Z0-9\-\_]+', without_www)
    if first_match is None:
        return None
    return first_match.group(0)

In [None]:
def write_future_results(url_futures, file, use_tqdm=True):
    """ Collects the result of each future, in order, and writes one line per URL to the file

    Each line has the form '<url>\\t<result>'; a result of None is written as 'None'.

    Args:
        url_futures (list): the (url, future) pairs to wait for
        file (file object): the opened, writable text file receiving the results
        use_tqdm (bool): whether to use the tqdm progress bar

    Returns:
        n_retrieved_urls (int): the number of URLs that have been retrieved
        n_errors (int): the number of URLs whose result is None
    """
    pairs = tqdm(url_futures) if use_tqdm else url_futures

    resolved_count = 0
    failed_count = 0
    for short_url, pending in pairs:
        # Blocks until this particular future has completed
        target = pending.result()
        file.write(f'{short_url}\t{target}\n')
        if target is not None:
            resolved_count += 1
        else:
            failed_count += 1

    return resolved_count, failed_count

def resolve_bitly_urls(batch_size):
    """ Resolves a batch of bitly URLs to their original URLs, and append the results to the resolved bitly URLs file

    The number of lines already present in the resolved file is used as the resume
    point: the batch starts at that index in the input file. The resolved file is
    created on the first run. The per-method counters (TOTAL_API, TOTAL_CURL,
    TOTAL_GET) are reset at the start of each call.

    Args:
        batch_size (int): the number of URLs to resolve in this batch
    """
    global TOTAL_API
    global TOTAL_CURL
    global TOTAL_GET
    TOTAL_API = 0
    TOTAL_CURL = 0
    TOTAL_GET = 0

    # Resume point: one line is written per processed URL, resolved or not.
    # On the very first run the file does not exist yet, so nothing is resolved.
    try:
        with open(PATH_BITLY_RESOLVED_URLS, 'r', encoding='utf-8') as f_res:
            n_resolved_urls = sum(1 for _ in f_res)
    except FileNotFoundError:
        n_resolved_urls = 0

    # The input file is read once, both for the total count and for the batch
    with open(PATH_BITLY_URLS, 'r', encoding='utf-8') as f:
        urls = f.read().splitlines()
    n_to_resolve_urls = len(urls)

    batch = urls[n_resolved_urls:n_resolved_urls + batch_size]
    # True when URLs remain after this batch
    reached_max = n_to_resolve_urls > n_resolved_urls + batch_size

    with open(PATH_BITLY_RESOLVED_URLS, 'a', encoding='utf-8') as f_res:
        with ThreadPoolExecutor() as executor:
            url_futures = [
                (url, executor.submit(resolve_bitly_url, fix_bitly_url(url)))
                for url in tqdm(batch)
            ]
            # Results are written in submission order, which keeps the line count a valid resume point
            n_retrieved_urls, n_errors = write_future_results(url_futures, f_res)

    if reached_max:
        print(f'Reached max lines read: {batch_size}, {n_resolved_urls + batch_size} urls resolved out of {n_to_resolve_urls} in total.')
    else:
        print('Finished resolving all urls.')
    print(f'Retrieved {n_retrieved_urls} urls, {n_errors} urls could not be retrieved. If the number of errors is too high, consider waiting a few moments before retrying.')
    print(f'API: {TOTAL_API}, CURL: {TOTAL_CURL}, GET: {TOTAL_GET}')

In [None]:
def fix_resolved_bitly_urls():
    """ Fixes the remaining bitly URLs that could not be resolved in the first step, and overwrite the resolved bitly URLs file

    Every line of the resolved file whose last field is 'None' is resolved again.
    All the other lines are copied unchanged. The output is first written to a
    temporary file which then replaces the resolved file, so that an interrupted
    run leaves the resolved file untouched. Already resolved lines are written
    before the retried ones of the same batch, so the line order may change.
    """
    global TOTAL_API
    global TOTAL_CURL
    global TOTAL_GET
    TOTAL_API = 0
    TOTAL_CURL = 0
    TOTAL_GET = 0

    n_retrieved_urls = 0
    n_errors = 0

    # Opening the tmp file in 'w' mode truncates any leftover of a previous run
    with open(PATH_BITLY_RESOLVED_URLS, 'r', encoding='utf-8') as f_res, \
            open(PATH_BITLY_RESOLVED_URLS_TMP, 'w', encoding='utf-8') as f_tmp, \
            ThreadPoolExecutor() as executor:
        url_futures = []
        for idx, url_rurl in tqdm(enumerate(f_res.read().splitlines())):
            # Split on the LAST tab only: the resolved URL is the last field, and a
            # line with an unexpected number of tabs must not abort the whole run.
            url, separator, rurl = url_rurl.rpartition('\t')
            if separator and rurl == 'None':
                fixed_url = fix_bitly_url(url)
                url_futures.append((url, executor.submit(resolve_bitly_url, fixed_url)))
            else:
                # Already resolved (or malformed) lines are kept as they are,
                # which preserves the line count used as a resume point.
                f_tmp.write(f'{url_rurl}\n')

            # Flush the pending futures periodically to bound their number
            if idx % MAX_FIX_RESOLVED_URLS == 0 and idx != 0:
                n_retrieved_urls_batch, n_errors_batch = write_future_results(url_futures, f_tmp, use_tqdm=False)
                n_retrieved_urls += n_retrieved_urls_batch
                n_errors += n_errors_batch
                url_futures = []

        n_retrieved_urls_batch, n_errors_batch = write_future_results(url_futures, f_tmp, use_tqdm=False)
        n_retrieved_urls += n_retrieved_urls_batch
        n_errors += n_errors_batch

    # Replace the old file with the new one in a single step (overwrites the destination)
    os.replace(PATH_BITLY_RESOLVED_URLS_TMP, PATH_BITLY_RESOLVED_URLS)

    print(f'Retrieved {n_retrieved_urls} urls, {n_errors} urls could not be retrieved. If the number of errors is too high, consider waiting a few moments before retrying.')
    print(f'API: {TOTAL_API}, CURL: {TOTAL_CURL}, GET: {TOTAL_GET}')

In [None]:
def get_none_urls_count():
    """ Returns the number of URLs that could not be resolved in the resolved bitly URLs file

    A line counts as unresolved when its last tab-separated field is 'None'.
    The file is streamed line by line, and lines with an unexpected number of
    tabs are handled instead of raising.

    Returns:
        int: the number of lines whose resolved URL is 'None'
    """
    with open(PATH_BITLY_RESOLVED_URLS, 'r', encoding='utf-8') as f_res:
        return sum(1 for line in f_res if line.rstrip('\n').endswith('\tNone'))

In [None]:
# Report how many entries of the resolved file still hold the 'None' placeholder
n_none = get_none_urls_count()
print(f'There are {n_none} urls that could not have been resolved.')

In [None]:
from IPython.display import clear_output
from time import sleep


def _count_remaining_urls():
    """ Returns the number of input URLs that do not have a line in the resolved file yet """
    with open(PATH_BITLY_URLS, 'r', encoding='utf-8') as f:
        n_total = len(f.read().splitlines())
    try:
        with open(PATH_BITLY_RESOLVED_URLS, 'r', encoding='utf-8') as f_res:
            n_done = sum(1 for _ in f_res)
    except FileNotFoundError:
        # Nothing has been resolved yet
        n_done = 0
    return max(n_total - n_done, 0)


BATCH_SIZE = 100    # The number of URLs resolved per iteration

# Resolve batch after batch until every input URL has a line in the resolved file.
# Each batch appends one line per processed URL, so the loop always terminates and
# the cells below are reached on a "Run All".
iteration = 0
while _count_remaining_urls() > 0:
    print(f'Iteration {iteration}')
    resolve_bitly_urls(BATCH_SIZE)
    clear_output(wait=True)
    iteration += 1

In [None]:
# Second pass: retry every entry still marked 'None' and rewrite the resolved file
fix_resolved_bitly_urls()

In [None]:
# Shut down the gateway that was started in the setup cell, once the resolution is over
gateway.shutdown()