In [3]:
## Webscraping
from bs4 import BeautifulSoup
import urllib2 as ul
import sys, os, math, shutil
import zipfile
import re

## PostGres DB
from sqlalchemy import create_engine
from sqlalchemy_utils import database_exists, create_database
import psycopg2
import pandas as pd

%load_ext sql
%config SqlMagic.autopandas=True

## Python 3-like
from __future__ import absolute_import, division, print_function

The sql extension is already loaded. To reload it, use:
  %reload_ext sql


In [4]:
## ------------------------------------
def get_npages(soup):
    """
    Retrieves the total number of pages to crawl
    for a particular letter on dafont

    soup : parsed listing page; every <a> whose href contains 'alpha.php'
           is treated as a link to another listing page

    Returns the largest value of the 'page' query parameter found in those
    links, or 0 when none of them carries a page number.
    """
    
    n = 0
    
    ## Locate all links in the file
    for link in soup.find_all('a'):
        
        ## Retrieve the link target (anchors without href are ignored)
        link_text = link.get('href')
        
        if link_text is None: continue
        
        ## Look only for the links that refer to other dafont pages
        if 'alpha.php' not in link_text: continue
        
        ## Read the page number from the 'page' parameter itself: taking the
        ## last run of digits of the link would pick up unrelated numbers,
        ## e.g. the 23 of 'lettre=%23' or the value of a trailing parameter
        for page in re.findall(r'[?&;]page=(\d+)', link_text):
            n = max(n, int(page))
                
    return n

In [5]:
## HTTP headers sent with every dafont.com page request
## NOTE(review): 'something' looks like a placeholder User-Agent -- confirm the
## value that should really be sent before running the crawl
USERAGENT = 'something'
HEADERS = {'User-Agent': USERAGENT}

## URL template of the alphabetical listings on dafont.com:
## {0} is the letter (the 'lettre' parameter), {1} the page number
dafont_pattern = 'http://www.dafont.com/alpha.php?lettre={0}&page={1}'

## --------------------------------------
def get_dafont_page_soup(lettre, i):
    """
    Returns the soup for a particular dafont page

    lettre : letter of the alphabetical listing; it is substituted verbatim
             into the URL, so it must already be URL-encoded
    i      : page number within that listing

    The page is requested with the module-level HEADERS and parsed with the
    'lxml' parser. The HTTP response is closed even when parsing fails;
    errors raised while opening the URL are propagated to the caller.
    """
    
    request = ul.Request(dafont_pattern.format(lettre, i), headers=HEADERS)
    response = ul.urlopen(request)
    
    ## Always release the connection, whether or not the parse succeeds
    try:
        soup = BeautifulSoup(response, 'lxml')
    finally:
        response.close()
    
    return soup
    

In [26]:
## --------------------------------------
def unicode_bullshit(text):
    """
    Returns a copy of text in which every character whose
    code point is 128 or above is replaced by an underscore
    """
    sanitized = []
    for char in text:
        ## Only plain ASCII characters are kept as they are
        sanitized.append('_' if ord(char) >= 128 else char)
    return ''.join(sanitized)

In [47]:
class FontInfo(object):
    """
    A class to store the info scraped from dafont
    about a particular font

    Every text attribute is passed through unicode_bullshit, so any
    non-ASCII character is stored as '_':

        name          : name of the font
        url           : URL of the font's page
        licensing     : licensing label of the font
        download_link : URL of the archive to download
        paths         : local paths of the font files extracted by download()
    """
    
    ## File extensions (lower case, without the dot) treated as font files
    font_exts = ['otf', 'ttf']
    
    ## ----------------------------------------
    def __init__(self, name, url, licensing, download_link):
        """
        Constructor
        """

        self.name = unicode_bullshit(name)
        self.url  = unicode_bullshit(url)
        self.licensing = unicode_bullshit(licensing)
        self.download_link = unicode_bullshit(download_link)
        self.paths = []
        
        
    ## ----------------------------------------
    def __repr__(self):
        """
        string representation for print method
        """
        
        return """
        {0}
        URL : {1}
        {2}
        download : {3}
        """.format(
            self.name,
            self.url,
            self.licensing,
            self.download_link
        )
    
    
    ## ----------------------------------------
    def download(self, path):
        """
        Download the font

        The archive behind download_link is fetched into a scratch directory
        '<path>/tmp', every .otf/.ttf member is extracted and moved directly
        into path (the directory structure of the archive is dropped), and
        the resulting file paths are appended to self.paths.

        path : existing directory that receives the font files

        Returns True when at least one font file is available in self.paths,
        False when the download is not a zip archive, a member cannot be
        extracted, or the archive holds no font file.

        Errors raised while opening the URL or writing to disk are propagated.
        The working directory of the process is never changed, and the
        scratch directory is removed again whatever the outcome.
        """
        
        tmp_path = os.path.join(path, 'tmp')
        
        ## Start from an empty scratch directory
        if os.path.isdir(tmp_path):
            shutil.rmtree(tmp_path)
        os.mkdir(tmp_path)
        
        try:
            archive_path = self._fetch_archive(tmp_path)
            return self._extract_fonts(archive_path, tmp_path, path)
        finally:
            ## Never leave the archive and extraction leftovers behind
            shutil.rmtree(tmp_path, ignore_errors=True)
    
    
    ## ----------------------------------------
    def _fetch_archive(self, tmp_path):
        """
        Saves the content of download_link to a file in tmp_path
        and returns the path of that file
        """
        
        ## The file only lives in the scratch directory, so a fixed name is enough
        archive_path = os.path.join(tmp_path, 'download.zip')
        
        response = ul.urlopen(self.download_link)
        try:
            with open(archive_path, 'wb') as archive:
                archive.write(response.read())
        finally:
            response.close()
        
        return archive_path
    
    
    ## ----------------------------------------
    def _extract_fonts(self, archive_path, tmp_path, path):
        """
        Moves the font files of the zip archive into path and records
        them in self.paths; returns False as soon as something fails
        """
        
        ## Assume a zip file first; anything else is reported as a failure
        try:
            zf = zipfile.ZipFile(archive_path, 'r')
        except zipfile.BadZipfile:
            return False
        
        try:
            for member in zf.namelist():
                ext = member.split('.')[-1].lower()
                if ext not in self.font_exts: continue
                
                ## extract() returns the path it really created, which may
                ## differ from the member name once the name is sanitized
                try:
                    extracted = zf.extract(member, tmp_path)
                except Exception:
                    return False
                
                ## Remove directory structure: keep only the file name
                new_path = os.path.join(path, os.path.split(member)[-1])
                new_path = unicode_bullshit(new_path)
                shutil.move(extracted, new_path)
                self.paths.append(new_path)
        finally:
            zf.close()
        
        return bool(self.paths)
        
             


## ----------------------------------------
def get_font_infos(soup):
    """
    Builds one FontInfo for each font listed on a dafont page

    The name, info and download boxes of the page are collected
    separately and paired up by position.
    """
    
    name_divs     = soup.find_all('div', class_='lv1left dfbg')
    info_divs     = soup.find_all('div', class_='lv2right')
    download_divs = soup.find_all('div', class_='dlbox')
    
    def licensing_of(info_div):
        ## Text of the first licensing link, 'Unknown' whenever it cannot be read
        try:
            return info_div.find_all('a', class_='tdn help black')[0].get_text()
        except:
            return 'Unknown'
    
    collected = []
    
    for name_div, info_div, download_div in zip(name_divs, info_divs, download_divs):
        title     = name_div.a.get_text()
        page_url  = 'http://www.dafont.com/{0}'.format(name_div.a.get('href'))
        licensing = licensing_of(info_div)
        archive   = download_div.a.get('href')
        
        collected.append(FontInfo(title, page_url, licensing, archive))
        
    return collected


In [42]:
## Connect to the DB
## Settings of the local PostgreSQL database; the password is left empty
dbname = 'fonts_db'
username = 'mtm'
pswd = ''

## Engine built from a postgresql://user:password@localhost/dbname URL
engine = create_engine('postgresql://%s:%s@localhost/%s'%(username,pswd,dbname))
## Show the URL the engine was created with
print(engine.url)

postgresql://mtm:@localhost/fonts_db


In [43]:
## Values taken by the 'lettre' parameter of the dafont urls: the 26 lower-case
## letters, followed by '%23' (the URL-encoded form of '#')
lettres = list('abcdefghijklmnopqrstuvwxyz') + ['%23']

In [48]:
## Directory receiving the extracted font files
FONT_DIR = '/Users/mtm/Projects/FontFinder/dafont_fonts'

## Rough number of fonts per listing page, only used in the progress message
FONTS_PER_PAGE = 20

## First (letter, page) that still has to be crawled, so that an interrupted
## crawl does not start from the beginning every time.
## Set to lettres[0] and 1 to crawl everything.
RESUME_LETTRE = 'r'
RESUME_PAGE   = 17

resume_index = lettres.index(RESUME_LETTRE)

for lettre_index, lettre in enumerate(lettres):
    
    ## The first page of every letter is fetched to find out how many pages it has
    soup = get_dafont_page_soup(lettre, 1)
    n_pages = get_npages(soup)
    
    print ('Letter {0}, ~{1} fonts ...'.format(lettre, n_pages*FONTS_PER_PAGE))
    
    ## Letters before the resume point are only announced, not crawled again
    if lettre_index < resume_index: continue
    
    first_page = RESUME_PAGE if lettre_index == resume_index else 1
    
    for i in range(first_page, n_pages+1):
        
        print("Now downloading fonts for letter '{0}', page {1}/{2} ...".format(lettre, i, n_pages))
    
        soup = get_dafont_page_soup(lettre, i)
        font_infos = get_font_infos(soup)
    
        ## One list per column of the font_metadata table
        names          = []
        urls           = []
        licensings     = []
        download_links = []
        paths1         = []
        paths2         = []
        paths3         = []
    
        for font_info in font_infos:
            ## Fonts whose archive could not be downloaded or unpacked are skipped
            if not font_info.download(FONT_DIR): continue
        
            names.append(font_info.name)
            urls.append(font_info.url)
            licensings.append(font_info.licensing)
            download_links.append(font_info.download_link)
        
            ## At most three extracted files are recorded, missing ones as None
            local_paths = (font_info.paths + [None, None])[:3]
            paths1.append(local_paths[0])
            paths2.append(local_paths[1])
            paths3.append(local_paths[2])
            
        df = pd.DataFrame(
            {
                'name'          : names,
                'url'           : urls,
                'licensing'     : licensings,
                'download_link' : download_links,
                'local_path1'   : paths1,
                'local_path2'   : paths2,
                'local_path3'   : paths3
            }
        )
    
        ## The metadata of every page is appended as soon as the page is done
        df.to_sql('font_metadata', engine, if_exists='append')

Letter a, ~1860 fonts ...
Letter b, ~2100 fonts ...
Letter c, ~2260 fonts ...
Letter d, ~1920 fonts ...
Letter e, ~860 fonts ...
Letter f, ~1340 fonts ...
Letter g, ~1120 fonts ...
Letter h, ~1160 fonts ...
Letter i, ~620 fonts ...
Letter j, ~680 fonts ...
Letter k, ~1240 fonts ...
Letter l, ~1200 fonts ...
Letter m, ~2080 fonts ...
Letter n, ~740 fonts ...
Letter o, ~640 fonts ...
Letter p, ~1760 fonts ...
Letter q, ~220 fonts ...
Letter r, ~1180 fonts ...
Now downloading fonts for letter 'r', page 17/59 ...
Now downloading fonts for letter 'r', page 18/59 ...
Now downloading fonts for letter 'r', page 19/59 ...
Now downloading fonts for letter 'r', page 20/59 ...
Now downloading fonts for letter 'r', page 21/59 ...
Now downloading fonts for letter 'r', page 22/59 ...
Now downloading fonts for letter 'r', page 23/59 ...
Now downloading fonts for letter 'r', page 24/59 ...
Now downloading fonts for letter 'r', page 25/59 ...
Now downloading fonts for letter 'r', page 26/59 ...
Now down

In [None]:
## Point the sql magic at the local fonts_db database
%sql postgresql://mtm:@localhost/fonts_db

In [None]:
## Read back the whole font_metadata table (SqlMagic.autopandas is enabled in
## the first cell, so the result is expected to be a pandas DataFrame)
df_font_metadata = %sql SELECT * FROM font_metadata

In [None]:
## Display the table that was read back from the database
df_font_metadata