In [1]:
import re
import os
import requests
import subprocess
from tqdm import tqdm
from bs4 import BeautifulSoup
from concurrent.futures import ThreadPoolExecutor

from pyspark.sql import SparkSession
# Reuse the active Spark session if one exists, otherwise start a new one.
# NOTE(review): `spark` is not referenced by any of the cells visible in this section.
spark = SparkSession.builder.getOrCreate()

In [2]:
# Bitly API token. It is read from the BITLY_API_KEY environment variable when that is set, so
# the secret does not have to live in the notebook.
# NOTE(review): the literal fallback is a credential committed to source control. It is kept only
# so existing runs keep working; rotate the token and delete the fallback as soon as possible.
BITLY_API_KEY = os.environ.get('BITLY_API_KEY', '6eb2a9c9ec5950c276bf91b89ef2b1f229408807')
# Number of rows read by fix_resolved_bitly_urls between two flushes of its pending retries
MAX_FIX_RESOLVED_URLS = 1000

# Input file: one (possibly noisy) bitly URL per line.
# NOTE(review): the name looks like a generated part file; presumably it changes whenever the
# file is regenerated and has to be updated here -- TODO confirm.
PATH_BITLY_URLS = '../data/generated/bitly_urls.tsv/part-00000-63bb4631-8cc4-485b-b648-9896615d94d4-c000.csv'
# Output file: one "<url>\t<resolved url or None>" row per processed input line
PATH_BITLY_RESOLVED_URLS = '../data/generated/bitly_resolved_urls.tsv'
# Scratch file written by fix_resolved_bitly_urls before it replaces the output file
PATH_BITLY_RESOLVED_URLS_TMP = '../data/generated/bitly_resolved_urls_tmp.tsv'

In [3]:
def resolve_bitly_url_api(url: str, timeout: float = 10.0) -> str:
    """ Resolves a bitly URL to its original URL through the bit.ly v4 `expand` endpoint

    Args:
        url (str): the bitly URL to resolve
        timeout (float): seconds to wait for the API before giving up; without it a stalled
            connection would block the calling worker thread indefinitely

    Returns:
        str: the original URL, or None if the URL could not be resolved
            (None input, non-200 response, timeout or any other error)
    """
    if url is None:
        return None

    # The request sends the link without its scheme as the `bitlink_id`
    url = url.replace('http://', '').replace('https://', '')
    try:
        # Post a request via the bit.ly API and retrieve the long url from the response
        response = requests.post(
            'https://api-ssl.bitly.com/v4/expand',
            headers={'Authorization': f'Bearer {BITLY_API_KEY}'},
            json={'bitlink_id': url},
            timeout=timeout,
        )
        return response.json()['long_url'] if response.status_code == 200 else None

    except Exception as e:
        # Best effort: report the failure and let the caller try another strategy
        print(f'Could not resolve url {url}: {e}')
        return None

In [4]:
def resolve_bitly_url_curl(url, timeout=30):
    """ Resolves a bitly URL by downloading its info page (the short URL followed by '+') with curl

    Args:
        url (str): the bitly URL to resolve
        timeout (float): seconds to wait for the curl process before giving up

    Returns:
        str: the href of the page's `a.item-detail--url` link, or None if the page could not
            be fetched or contains no such link
    """
    if url is None:
        return None

    info_url = url + '+'
    try:
        # Arguments are passed as a list and no shell is involved, so characters in the URL
        # (which comes from scraped data) can never be interpreted as shell syntax
        output = subprocess.check_output(['curl', '-s', '-L', info_url], timeout=timeout).decode('utf-8')
        soup = BeautifulSoup(output, 'html.parser')
        link = soup.find('a', {'class': 'item-detail--url'})
        return link.get('href') if link is not None else None
    except Exception as e:
        # Best effort: report the failure and let the caller try another strategy
        print(f'Could not resolve url {info_url} with curl: {e}')
        return None

In [5]:
def resolve_bitly_url_get(url, timeout=10.0):
    """ Resolves a bitly URL by issuing a GET request and following its redirects

    Args:
        url (str): the bitly URL to resolve
        timeout (float): seconds to wait for each server response before giving up

    Returns:
        str: the URL of the final response after redirects, or None if the request failed
    """
    if url is None:
        return None

    try:
        response = requests.get(url, allow_redirects=True, timeout=timeout)
        return response.url
    except Exception as e:
        # Best effort: report the failure; the caller records the URL as unresolved
        print(f'Could not resolve url {url} with GET request: {e}')
        return None

In [6]:
def resolve_bitly_url(url):
    """ Resolves a bitly URL by trying each strategy in turn until one succeeds

    The strategies are tried in this order: bit.ly API, curl on the info page, plain GET request.

    Args:
        url (str): the bitly URL to resolve

    Returns:
        str: the result of the first strategy that does not return None, or None if all fail
    """
    strategies = (resolve_bitly_url_api, resolve_bitly_url_curl, resolve_bitly_url_get)
    for strategy in strategies:
        candidate = strategy(url)
        if candidate is not None:
            return candidate
    return None

In [7]:
def fix_bitly_url(url: str) -> str:
    """ Extracts the first clean bit.ly link from a raw string, before it is passed to resolve_bitly_url

    Args:
        url (str): the raw text that should contain a bitly URL

    Returns:
        str: the first substring of the form http(s)://bit.ly/<slug>, or None if there is none
            (or if the input is None)
    """
    if url is None:
        return None

    # Scheme, the bit.ly host, then a slug made of letters, digits, '-' or '_'
    first_match = re.search(r'https?:\/\/bit\.ly\/[a-zA-Z0-9\-\_]+', url)
    return first_match.group(0) if first_match else None

In [8]:
def write_future_results(url_futures, file, use_tqdm=True):
    """ Waits for each pending resolution and writes one "<url>\\t<result>" row per entry

    Args:
        url_futures: iterable of (url, future) pairs; rows are written in iteration order
        file: writable text file object receiving the rows
        use_tqdm (bool): whether to show a progress bar while waiting

    Returns:
        tuple: (number of results that are not None, number of results that are None)
    """
    pairs = tqdm(url_futures) if use_tqdm else url_futures

    resolved_count = 0
    failed_count = 0
    for short_url, pending in pairs:
        resolved = pending.result()
        # A None result is formatted by the f-string as the literal text 'None'
        file.write(f'{short_url}\t{resolved}\n')
        if resolved is not None:
            resolved_count += 1
        else:
            failed_count += 1
    return resolved_count, failed_count

def resolve_bitly_urls(batch_size):
    """ Resolves a batch of bitly URLs to their original URLs, and append the results to the resolved bitly URLs file

    The batch starts at the first input line that has no row in the resolved file yet: the
    number of lines already in the resolved file is used as the resume offset.

    Args:
        batch_size (int): the number of URLs to resolve in this batch
    """
    # On the very first run the resolved file does not exist yet: start from the first URL
    try:
        with open(PATH_BITLY_RESOLVED_URLS, 'r', encoding='utf-8') as f_res:
            n_resolved_urls = sum(1 for _ in f_res)
    except FileNotFoundError:
        n_resolved_urls = 0

    # Read the input once; the same list gives both the total and the batch
    with open(PATH_BITLY_URLS, 'r', encoding='utf-8') as f:
        urls = f.read().splitlines()
    n_to_resolve_urls = len(urls)

    # max(..., 0) keeps a negative batch size from turning into a negative slice bound
    batch_end = n_resolved_urls + max(batch_size, 0)
    batch = urls[n_resolved_urls:batch_end]
    # True when input lines remain after this batch
    reached_max = batch_end < n_to_resolve_urls

    with open(PATH_BITLY_RESOLVED_URLS, 'a', encoding='utf-8') as f_res:
        with ThreadPoolExecutor() as executor:
            # The raw line is kept as the row key; only its cleaned-up form is resolved
            url_futures = [
                (url, executor.submit(resolve_bitly_url, fix_bitly_url(url)))
                for url in batch
            ]
            # Rows are written in input order, which keeps the line-count offset valid
            n_retrieved_urls, n_errors = write_future_results(url_futures, f_res)

    if reached_max:
        print(f'Reached max lines read: {batch_size}, {n_resolved_urls + batch_size} urls resolved out of {n_to_resolve_urls} in total.')
    else:
        print(f'Finished resolving all urls.')
    print(f'Retrieved {n_retrieved_urls} urls, {n_errors} urls could not be retrieved. If the number of errors is too high, consider waiting a few moments before retrying.')

In [9]:
def fix_resolved_bitly_urls():
    """ Fixes the remaining bitly URLs that could not be resolved in the first step, and overwrite the resolved bitly URLs file

    Rows whose resolved URL is the literal 'None' are resolved again; all other rows are copied
    unchanged. The result is written to a temporary file that replaces the resolved file only
    once every row has been processed.
    """
    n_retrieved_urls = 0
    n_errors = 0

    def _flush(url_futures, f_tmp):
        # Writes the pending retries to the tmp file and adds their counts to the totals
        nonlocal n_retrieved_urls, n_errors
        n_retrieved_urls_batch, n_errors_batch = write_future_results(url_futures, f_tmp, use_tqdm=False)
        n_retrieved_urls += n_retrieved_urls_batch
        n_errors += n_errors_batch

    with open(PATH_BITLY_RESOLVED_URLS, 'r', encoding='utf-8') as f_res:
        # Opening in 'w' mode truncates a leftover tmp file from an interrupted run,
        # so no separate delete step is needed
        with open(PATH_BITLY_RESOLVED_URLS_TMP, 'w', encoding='utf-8') as f_tmp:
            with ThreadPoolExecutor() as executor:
                url_futures = []
                for idx, url_rurl in tqdm(enumerate(f_res.read().splitlines())):
                    url, rurl = url_rurl.split('\t')
                    if rurl == 'None':
                        fixed_url = fix_bitly_url(url)
                        url_futures.append((url, executor.submit(resolve_bitly_url, fixed_url)))
                    else:
                        f_tmp.write(f'{url}\t{rurl}\n')

                    # Drain the pending retries periodically so their number stays bounded
                    if idx % MAX_FIX_RESOLVED_URLS == 0 and idx != 0:
                        _flush(url_futures, f_tmp)
                        url_futures = []

                # Retries submitted since the last periodic flush
                _flush(url_futures, f_tmp)

    # Replace the old file with the new one in a single step: os.replace overwrites an existing
    # destination on every platform, so the resolved file is never missing in between
    os.replace(PATH_BITLY_RESOLVED_URLS_TMP, PATH_BITLY_RESOLVED_URLS)

    print(f'Retrieved {n_retrieved_urls} urls, {n_errors} urls could not be retrieved. If the number of errors is too high, consider waiting a few moments before retrying.')

In [10]:
def get_none_urls_count():
    """ Counts the rows of the resolved bitly URLs file whose resolved URL is the literal 'None'

    Returns:
        int: the number of such rows

    Raises:
        ValueError: if a row does not consist of exactly two tab-separated fields
    """
    with open(PATH_BITLY_RESOLVED_URLS, 'r', encoding='utf-8') as resolved_file:
        rows = resolved_file.read().splitlines()

    unresolved = 0
    for row in rows:
        # Each row is "<url>\t<resolved url>"; only the second field matters here
        _short_url, long_url = row.split('\t')
        if long_url == 'None':
            unresolved += 1
    return unresolved

In [11]:
# Count the rows of the resolved file whose resolved URL is still the literal 'None'
n_none = get_none_urls_count()
print(f'There are {n_none} urls that could not have been resolved.')

There are 4409 urls that could not have been resolved.


In [102]:
# Resolve the next 5000 input lines that have no row in the resolved file yet and append the results
resolve_bitly_urls(5000)

39237it [00:00, 143200.39it/s]
100%|██████████| 5000/5000 [06:33<00:00, 12.71it/s]

Reached max lines read: 5000, 39237 urls resolved out of 2603507 in total.
Retrieved 1552 urls, 3448 urls could not be retrieved. If the number of errors is too high, consider waiting a few moments before retrying.





In [12]:
# Retry every row marked 'None' and rewrite the resolved file with the new results
fix_resolved_bitly_urls()

27001it [00:04, 3307.06it/s] 

Could not resolve url https://bit.ly/2HqUVNN with curl: HTTPSConnectionPool(host='bitsonar.live', port=443): Max retries exceeded with url: /215d638c185c249 (Caused by NewConnectionError('<urllib3.connection.HTTPSConnection object at 0x000002BCD546E460>: Failed to establish a new connection: [Errno 11001] getaddrinfo failed'))
Could not resolve url http://bit.ly/1YD9GBr with curl: HTTPConnectionPool(host='www.newfrog.com', port=80): Max retries exceeded with url: /p/syma-x5sw-2-4ghz-rc-quadcopter-with-2mp-camera-headless-mode-fpv-app-98544.html?utm_source=youtubere&utm_medium=rcgutt&utm_campaign=90014xu (Caused by NewConnectionError('<urllib3.connection.HTTPConnection object at 0x000002BCD64887C0>: Failed to establish a new connection: [Errno 11001] getaddrinfo failed'))
Could not resolve url http://bit.ly/TRAX0210-GanjaKidz with curl: HTTPSConnectionPool(host='hardcoreitalia.life', port=443): Max retries exceeded with url: /music/809/yoshiko-x-redrums-ganja-kidz (Caused by SSLError(Ce

27001it [00:20, 3307.06it/s]