In [1]:
## Webscraping
from bs4 import BeautifulSoup
import urllib2 as ul
import sys, os, math, shutil
import zipfile
import re
import datetime
from dateutil.parser import parse

## PostGres DB
from sqlalchemy import create_engine
from sqlalchemy_utils import database_exists, create_database
import psycopg2
import pandas as pd

%load_ext sql
%config SqlMagic.autopandas=True

## Python 3-like
from __future__ import absolute_import, division, print_function

  warn("IPython.utils.traitlets has moved to a top-level traitlets package.")


In [2]:
## ------------------------------------
def get_npages(soup):
    """
    Retrieves the total number of pages to crawl
    for a particular letter on dafont.

    Every anchor of `soup` whose href contains 'alpha.php' is inspected and
    the largest value of its `page` query parameter is returned.

    Parameters
    ----------
    soup : object exposing `find_all('a')`, whose items expose `get('href')`
        (a BeautifulSoup document in this notebook).

    Returns
    -------
    int
        The highest page number found, or 0 when no link carries a
        `page` parameter.
    """

    n = 0

    ## Locate all links in the file
    for link in soup.find_all('a'):

        ## Retrieve the link target; anchors without href are skipped
        href = link.get('href')
        if href is None or 'alpha.php' not in href:
            continue

        ## Read the `page` parameter explicitly. Taking "the last number in
        ## the link" misreads hrefs such as 'alpha.php?lettre=%23' as page 23
        ## and picks up any numeric parameter that follows the page number.
        for page in re.findall(r'[?&;]page=(\d+)', href):
            n = max(n, int(page))

    return n

In [3]:
## Request headers used when fetching dafont.com pages
USERAGENT = 'something'
HEADERS = dict([('User-Agent', USERAGENT)])

## URL template of the alphabetical listings:
## {0} is the letter, {1} is the page number
dafont_pattern = 'http://www.dafont.com/alpha.php?lettre={0}&page={1}'

## --------------------------------------
def get_dafont_page_soup(lettre, i):
    """
    Returns the soup for a particular dafont page.

    Parameters
    ----------
    lettre : value substituted for the letter in `dafont_pattern`
    i      : value substituted for the page number in `dafont_pattern`

    Returns
    -------
    The BeautifulSoup document parsed (with the 'lxml' parser) from the
    HTTP response. Errors raised by the request or the parser propagate.
    """

    request = ul.Request(dafont_pattern.format(lettre, i), headers=HEADERS)
    response = ul.urlopen(request)

    ## Close the connection even when parsing fails
    try:
        return BeautifulSoup(response, 'lxml')
    finally:
        response.close()
    

In [4]:
## --------------------------------------
def unicode_bullshit(text):
    """
    Replaces every character whose code point is 128 or above
    with an underscore, leaving all other characters untouched
    """
    sanitized = []
    for char in text:
        if ord(char) >= 128:
            sanitized.append('_')
        else:
            sanitized.append(char)
    return ''.join(sanitized)

In [5]:
class FontInfo(object):
    """
    A class to store the info scraped from dafont
    about a particular font
    """

    ## File extensions (lower case, without the dot) treated as font files
    font_exts = ['otf', 'ttf']

    ## ----------------------------------------
    def __init__(self, name, url, licensing, download_link, timestamp):
        """
        Constructor

        All text fields are passed through `unicode_bullshit` before being
        stored; `timestamp` is stored unchanged. `paths` starts empty and
        is filled by `download`.
        """

        self.name = unicode_bullshit(name)
        self.url  = unicode_bullshit(url)
        self.licensing = unicode_bullshit(licensing)
        self.download_link = unicode_bullshit(download_link)
        self.timestamp = timestamp
        self.paths = []


    ## ----------------------------------------
    def __repr__(self):
        """
        string representation for print method
        """

        return """
        {0}
        URL : {1}
        {2}
        download : {3}
        """.format(
            self.name,
            self.url,
            self.licensing,
            self.download_link
        )


    ## ----------------------------------------
    def download(self, path):
        """
        Download the font archive and unpack its font files into `path`.

        The archive is fetched from `self.download_link` into a scratch
        directory `<path>/tmp`, every member whose extension is listed in
        `font_exts` is extracted and moved directly into `path` (any
        directory structure inside the archive is dropped), and the
        resulting file paths are appended to `self.paths`.

        The working directory of the process is never changed, so relative
        values of `path` work and nothing needs restoring on failure. The
        scratch directory is removed before returning.

        Parameters
        ----------
        path : destination directory; created (with parents) when missing.

        Returns
        -------
        bool
            True when at least one font file was unpacked by this call.
            False when the download is not a valid zip file, when a member
            cannot be extracted, or when the archive holds no font file.

        Errors raised while downloading or while moving files propagate.
        """

        ## Start from an empty scratch directory (this also creates `path`)
        tmp_path = os.path.join(path, 'tmp')
        if os.path.isdir(tmp_path):
            shutil.rmtree(tmp_path)
        os.makedirs(tmp_path)

        try:
            ## download file name, assuming it's a zip file first
            fname = self.download_link.split('=')[-1] + '.zip'
            archive_path = os.path.join(tmp_path, fname)

            ## Download the file and save it to disk
            response = ul.urlopen(self.download_link)
            try:
                with open(archive_path, 'wb') as archive:
                    shutil.copyfileobj(response, archive)
            finally:
                response.close()

            ## open the zip file
            try:
                zf = zipfile.ZipFile(archive_path, 'r')
            except zipfile.BadZipfile:
                return False

            extracted = []
            with zf:
                for member in zf.namelist():
                    ext = member.split('.')[-1].lower()
                    if ext not in self.font_exts:
                        continue

                    ## extract() returns the location it actually wrote to
                    try:
                        extracted_file = zf.extract(member, tmp_path)
                    except Exception:
                        return False

                    ## Remove directory structure: keep the base name only
                    new_path = os.path.join(path, os.path.split(member)[-1])
                    new_path = unicode_bullshit(new_path)
                    shutil.move(extracted_file, new_path)

                    self.paths.append(new_path)
                    extracted.append(new_path)

            ## Report on this call only, not on paths from an earlier call
            return bool(extracted)
        finally:
            shutil.rmtree(tmp_path, ignore_errors=True)
        
             


## ----------------------------------------
def _first_seen_timestamp(date_soup):
    """
    Reads the 'First seen on DaFont' date from a font's own page.

    Returns the number of seconds between 2004-01-01 and that date, or 0.0
    when no such date is present, when it is given as 'before ...', or
    when it cannot be parsed.
    """

    for potential_date in date_soup.find_all('div', class_='dfsmall'):
        text = potential_date.get_text()
        if 'First seen on DaFont' not in text:
            continue

        ## Keep what sits between the first ':' and the next '-'
        pieces = text.split(':')
        if len(pieces) < 2:
            continue
        text = pieces[1].split('-')[0]

        if 'before' in text.lower():
            break

        ## Best effort: an unreadable date is skipped, but Ctrl-C and
        ## SystemExit are no longer swallowed
        try:
            dt = parse(text)
            return (dt - datetime.datetime(2004,1,1)).total_seconds()
        except Exception:
            continue

    return 0.0


## ----------------------------------------
def get_font_infos(soup):
    """
    Returns a list of FontInfo objects from a dafont page.

    The name, licensing and download blocks found in `soup` are paired up
    in document order. For every font one extra request is made to the
    font's own page to read its 'First seen on DaFont' date.

    Licensing falls back to 'Unknown' when the page shows none. Errors
    raised by the HTTP requests propagate.
    """

    font_infos = []

    html_names = soup.find_all('div', class_='lv1left dfbg')
    html_infos = soup.find_all('div', class_='lv2right')
    html_dnlds = soup.find_all('div', class_='dlbox')

    for html_name, html_info, html_dnld in zip(html_names, html_infos, html_dnlds):
        name      = html_name.a.get_text()
        href      = html_name.a.get('href')

        try:
            licensing = html_info.find_all('a', class_='tdn help black')[0].get_text()
        except (IndexError, AttributeError):
            licensing = 'Unknown'

        dl_link = html_dnld.a.get('href')

        url = 'http://www.dafont.com/{0}'.format(href)

        ## Go get the date; close the connection even when parsing fails
        request = ul.Request(url, headers=HEADERS)
        response = ul.urlopen(request)
        try:
            date_soup = BeautifulSoup(response, 'lxml')
        finally:
            response.close()

        timestamp = _first_seen_timestamp(date_soup)

        font_infos.append(FontInfo(name, url, licensing, dl_link, timestamp))

    return font_infos


In [16]:
## Connect to the DB
dbname = 'fontdb'

username = pswd = ''

## The credentials file holds the user name on its first line
## and the password on its second line
with open('db_credentials', 'r') as f:
    credentials = f.readlines()

username = credentials[0].rstrip()
pswd     = credentials[1].rstrip()

database_address = (
    'postgresql://{0}:{1}@fontdbinstance.c9mwqfkzqqmh.us-west-2.rds.amazonaws.com:5432/{2}'
).format(username, pswd, dbname)

## Create the database when it does not exist yet, then open the engine
if not database_exists(database_address):
    create_database(database_address)

engine = create_engine(database_address)

In [17]:
## Values accepted for the `lettre` parameter of the dafont urls:
## the 26 lower-case letters, then '%23' (the url-encoded '#')
lettres = list('abcdefghijklmnopqrstuvwxyz') + ['%23']

In [18]:
import boto
from boto.s3.key import Key
from sqlalchemy.exc import DBAPIError

## Local directory the fonts are unpacked into before being uploaded
## (override with the FONTFINDER_FONT_DIR environment variable)
FONT_DIR = os.environ.get(
    'FONTFINDER_FONT_DIR', '/Users/mtm/Projects/FontFinder/dafont_fonts2'
)

## S3 bucket receiving the font files
S3_BUCKET_NAME = 'fontfinder-fontfiles'

## Columns written to the font_metadata table
METADATA_COLUMNS = [
    'aws_bucket', 'aws_bucket_key', 'download_link', 'licensing',
    'name', 'origin', 'timestamp', 'url'
]

## Don't start from the beginning every time: set to (lettre, page),
## e.g. ('b', 67), to resume an interrupted crawl from that page
RESUME_FROM = None

s3 = boto.connect_s3()
bucket = s3.get_bucket(S3_BUCKET_NAME, validate=False)

first_lettre = lettres.index(RESUME_FROM[0]) if RESUME_FROM else 0

for lettre in lettres[first_lettre:]:

    soup = get_dafont_page_soup(lettre, 1)

    ## get_npages returns 0 when it finds no pagination link; the page
    ## that was just fetched still has to be crawled
    n_pages = max(1, get_npages(soup))

    print ('Letter {0}, ~{1} fonts ...'.format(lettre, n_pages*20))

    first_page = RESUME_FROM[1] if RESUME_FROM and lettre == RESUME_FROM[0] else 1

    for i in range(first_page, n_pages+1):

        print("Now downloading fonts for letter '{0}', page {1}/{2} ...".format(lettre, i, n_pages))

        ## Page 1 is already in `soup`; only later pages need a request
        if i > 1:
            soup = get_dafont_page_soup(lettre, i)
        font_infos = get_font_infos(soup)

        ## One record per font file uploaded from this page
        records = []

        for font_info in font_infos:
            if not font_info.download(FONT_DIR): continue

            for font_path in font_info.paths:

                font_file_name = os.path.split(font_path)[-1]
                aws_key        = 'dafont_fonts/{0}'.format(font_file_name).replace(' ', '_')

                k = Key(bucket)
                k.key = aws_key
                k.set_contents_from_filename(font_path)

                records.append({
                    'name'           : font_info.name,
                    'url'            : font_info.url,
                    'licensing'      : font_info.licensing,
                    'download_link'  : font_info.download_link,
                    'aws_bucket'     : S3_BUCKET_NAME,
                    'aws_bucket_key' : aws_key,
                    'origin'         : 'dafont.com',
                    'timestamp'      : font_info.timestamp
                })

        ## Nothing was downloaded from this page: nothing to store
        if not records: continue

        df = pd.DataFrame(records, columns=METADATA_COLUMNS)

        ## Downloading a page can take long enough for the pooled database
        ## connection to go stale ("SSL SYSCALL error: Operation timed out").
        ## Drop the pool and retry once on a fresh connection.
        try:
            df.to_sql('font_metadata', engine, if_exists='append')
        except DBAPIError:
            engine.dispose()
            df.to_sql('font_metadata', engine, if_exists='append')

Letter a, ~1860 fonts ...
Now downloading fonts for letter 'a', page 1/93 ...
Now downloading fonts for letter 'a', page 2/93 ...
Now downloading fonts for letter 'a', page 3/93 ...
Now downloading fonts for letter 'a', page 4/93 ...
Now downloading fonts for letter 'a', page 5/93 ...
Now downloading fonts for letter 'a', page 6/93 ...
Now downloading fonts for letter 'a', page 7/93 ...
Now downloading fonts for letter 'a', page 8/93 ...
Now downloading fonts for letter 'a', page 9/93 ...
Now downloading fonts for letter 'a', page 10/93 ...
Now downloading fonts for letter 'a', page 11/93 ...
Now downloading fonts for letter 'a', page 12/93 ...
Now downloading fonts for letter 'a', page 13/93 ...
Now downloading fonts for letter 'a', page 14/93 ...
Now downloading fonts for letter 'a', page 15/93 ...
Now downloading fonts for letter 'a', page 16/93 ...
Now downloading fonts for letter 'a', page 17/93 ...
Now downloading fonts for letter 'a', page 18/93 ...
Now downloading fonts for let

DatabaseError: (psycopg2.DatabaseError) SSL SYSCALL error: Operation timed out
 [SQL: 'select relname from pg_class c join pg_namespace n on n.oid=c.relnamespace where pg_catalog.pg_table_is_visible(c.oid) and relname=%(name)s'] [parameters: {'name': u'font_metadata'}]