# Regular scraper for fanworks on FFnet. Collects metadata on unrestricted works listed for a search query.

1) load the relevant python libraries
2) define the scraping function for query pages
3) set up the database name, the search query url, and the hold time 3-5 s
4) create the fanwork database (in MySQL)
*5a) If the script is interrupted, resume from the last page queried by uncommenting the variable below*
5) test the scraper on an FFnet query page
6) populate the database from FFnet 
    this runs until it breaks because of page loading issues or it reaches the end of the list of works.
7) check the status of the database and scraping 
8) cleaning up duplicates in a database

And that's it. Now you have a database of metadata on fanworks on FFnet according to some search query, like Sherlock/Harry Potter Crossovers, or Agent Carter (TV), or anything you like, really. I wouldn't try downloading the whole archive, that's just rude, but otherwise, go to town.

The query structure for FFnet fits the metadata presented.

The scraper collects the metadata available from the information listed on FFnet query pages. For each work listed, the database retains:

1) 'url': String

2) 'Node': Int

3) 'Title': String

4) 'Creator': String

5) 'Fandoms': List of strings

6) 'Summary': String

7)  'Rating': String {K, K+, T, M}

8) 'Language': String

9) 'Genre': List of strings

10) 'Relationships': List of strings

11) 'Characters': List of strings

12) 'Chapters': Int

13) 'Words': Int

14) 'Reviews': Int

15) 'Favs': Int

16) 'Follows': Int

17) 'Updated': Int (YYYYMMDD, date last updated)

18) 'Published': Int (YYYYMMDD, date first published)

19) 'Complete': Int {1 for complete, 0 for Incomplete}

Some fields may be empty if they are not reported on the query page; for example, FFnet doesn't report Relationships when the creator doesn't specify them.


In [None]:
# 1) load the relevant python libraries

# necessary libraries. One day I will use beautiful soup.
import requests
import lxml
import time
from lxml import html
from dumptruck import DumpTruck
from time import strptime
from datetime import datetime
from datetime import date
from collections import OrderedDict

from collections import Counter
from collections import deque
import math



In [None]:
# 2) define the scraping function for query pages

def _parse_count(field):
    """Return the integer from a 'Label: 1,234' stat field, ignoring thousands separators."""
    return int(field.split(': ')[-1].replace(',', ''))


def _parse_date(text, today):
    """Turn an 'M/D/YYYY' or 'M/D' string into a YYYYMMDD integer.

    A string with no '/' at all maps to `today`; an 'M/D' string takes its
    year from `today` (itself a YYYYMMDD integer).
    """
    parts = text.split('/')
    if len(parts) < 2:
        return today
    year = today // 10000 if len(parts) < 3 else int(parts[2])
    return year * 10000 + int(parts[0]) * 100 + int(parts[1])


def query_FFN_work(titles,wurls,summaries,stats,times,fandoms,today):
    """Consume the next work from the per-page deques and return its metadata.

    All five deques are popped from the left, so after the call they start at
    the following work on the page.

    Parameters:
        titles    -- deque of link texts: title, creator and, for some works,
                     a trailing 'reviews' entry left over from the previous work.
        wurls     -- deque of work urls; the node id is read from the fifth
                     '/'-separated piece of the url and kept as a string.
        summaries -- deque of summary strings.
        stats     -- deque of text fragments of the stats line; fragments are
                     concatenated until the next one that starts with 'Rated'.
        times     -- deque of date strings, one per 'Updated: ' / 'Published: '
                     field found in the stats line.
        fandoms   -- value stored (wrapped in a list) under 'Fandoms'.
        today     -- current date as a YYYYMMDD integer.

    Returns:
        [entry, titles, wurls, summaries, stats, times] where `entry` is the
        metadata dict. When `wurls` is empty the entry holds only defaults.

    Raises:
        IndexError if a deque runs out before `wurls` does (the deques are out
        of step with each other).
    """
    entry = {'url': '',
             'Node': 0,
             'Title': '',
             'Creator': '',
             'Fandoms': [fandoms],
             'Summary': '',
             'Rating': '',
             'Language': '',
             'Genre': [],
             'Relationships': [],
             'Characters': [],
             'Chapters': 0,
             'Words': 0,
             'Reviews': 0,
             'Favs': 0,
             'Follows': 0,
             'Updated': 0,
             'Published': 0,
             'Complete': 0}

    if len(wurls) > 0:
        # A 'reviews' link text belongs to the previous work; skip it.
        if titles[0] == 'reviews':
            titles.popleft()
        entry['Title'] = titles.popleft()
        entry['Creator'] = titles.popleft()
        entry['url'] = wurls.popleft()
        entry['Node'] = entry['url'].split('/')[4]
        entry['Summary'] = summaries.popleft()

        # Re-assemble this work's stats line from its text fragments, then
        # split it into ' - ' separated fields that are consumed left to right.
        stat = stats.popleft()
        while len(stats) > 0 and stats[0][:5] != 'Rated':
            stat += stats.popleft()
        details = deque(stat.split(' - '))

        def head_is(prefix):
            # Safe look-ahead: False once every field has been consumed.
            return len(details) > 0 and details[0].startswith(prefix)

        if len(details) > 0 and details[-1] == 'Complete':
            entry['Complete'] = 1
            details.pop()
        if head_is('Rated'):
            entry['Rating'] = details.popleft().split(': ')[-1]
            if len(details) > 0:
                entry['Language'] = details.popleft()
        # Labelled fields ('Chapters: 3', 'Words: 1,234', ...) all contain a
        # ':'; an unlabelled field in this position is the genre list.
        if len(details) > 0 and ':' not in details[0]:
            for g in details.popleft().split('/'):
                if g == 'Comfort' and entry['Genre']:
                    # 'Hurt/Comfort' is one genre that the '/' split broke up.
                    entry['Genre'][-1] += '/' + g
                else:
                    entry['Genre'].append(g)
        if head_is('Chap'):
            entry['Chapters'] = int(details.popleft().split(': ')[-1])
        if head_is('Word'):
            entry['Words'] = _parse_count(details.popleft())
        if head_is('Revi'):
            entry['Reviews'] = _parse_count(details.popleft())
        if head_is('Favs'):
            entry['Favs'] = _parse_count(details.popleft())
        if head_is('Foll'):
            entry['Follows'] = _parse_count(details.popleft())

        # The date fields are empty in the text ('Updated: '); the value
        # itself is the next entry of `times`.
        if len(details) > 0 and details[0] == 'Updated: ':
            details.popleft()
            entry['Updated'] = _parse_date(times.popleft(), today)
        if len(details) > 0 and details[0] == 'Published: ':
            details.popleft()
            entry['Published'] = _parse_date(times.popleft(), today)

        # Works that were never updated report only a publication date.
        if entry['Updated'] == 0:
            entry['Updated'] = entry['Published']

        # Whatever single field is left is the character list, with
        # relationships written as '[A, B]' groups.
        if len(details) == 1:
            for group in details.popleft().split(']'):
                group = group.strip()  # drop the blank that follows each ']'
                if not group:
                    continue
                if group[0] == '[':
                    group = group[1:]
                    entry['Relationships'].append(group)
                entry['Characters'] += group.split(', ')

        # check for parsing failures
        if len(details) != 0:
            print('Failed parse')
            print(details)
            print(entry['Node'])

    return [entry,titles,wurls,summaries,stats,times]

In [None]:
# 3) set up the database name, the search query url, and the hold interval (3 seconds is recommended)

# Base name of the database file; step 4 appends the current date and ".db" to it.
database_name = "FFN_StarTrek-Voyager"

# First query page to scrape.
url = 'https://www.fanfiction.net/tv/StarTrek-Voyager/?&srt=1&r=10'
# Pause between page queries, in seconds.
hold = 5

In [None]:
# 4) create the fanwork database
# NOTE(review): the header says MySQL, but DumpTruck is believed to write a
# local SQLite file named DBname -- confirm.

# Take one timestamp so the file name and the integer date always agree.
_now = datetime.today()
TODAY = _now.isoformat()
today = _now.year * 10000 + _now.month * 100 + _now.day  # YYYYMMDD as an int
fandoms = 'StarTrek: Voyager'

# Bookkeeping for the step 6 loop: LAST is its stop flag, PAGES caps the number
# of query pages followed, pages counts those followed so far.
LAST = 0
PAGES = 363
pages = 0
lastNode = 12489605 # from most recent work?

# Example: DBname = 'FFN_StarTrek-Voyager_2016-05-02.db'
DBname = database_name + "_" + TODAY[:10] + ".db"
print(DBname)

dt = DumpTruck(dbname=DBname)
if 'dumptruck' in dt.tables():
    data = dt.dump()
    print('Adding to existing database')
    print(len(data))
    # The table can exist and still be empty; only show rows when there are any.
    if data:
        print(data[0])
        print(data[-1])
    # Call dt.drop() here instead to start over from an empty table.
else:
    print('Generating new database')

In [None]:
# 5a) If the script is interrupted, call on the last page queried with the variable below (uncommented) 

#url = 'https://www.fanfiction.net/tv/Sherlock/?&srt=2&r=10&p=2003'


In [None]:
# 5) test the scraper on an FFnet query page

# access the query page and extract the list of max 25 works
page = requests.get(url)
tree = html.fromstring(page.text)
tree.make_links_absolute(url)

works = tree.xpath('//div[@class="z-list zhover zpointer "]')
if len(works) < 1:
    # The page came back without any works: wait, then try once more.
    print(url)
    time.sleep(hold)
    page = requests.get(url)
    tree = html.fromstring(page.text)
    tree.make_links_absolute(url)
    works = tree.xpath('//div[@class="z-list zhover zpointer "]')
if len(works) < 1:
    # Fail with the offending url rather than an IndexError on works[0].
    raise RuntimeError('No works found at %s after one retry' % url)

# Every expression starts with '//', so each one collects matches from the
# whole page, not just from works[0]; query_FFN_work pops them off in step.
titles = deque(works[0].xpath('//div[@class="z-list zhover zpointer "]/a/text()'))
wurls = deque(works[0].xpath('//a[@class="stitle"]/@href'))
summaries = deque(works[0].xpath('//div[@class ="z-indent z-padtop"]/text()'))
stats = deque(works[0].xpath('//div[@class ="z-padtop2 xgray"]/text()'))
times = deque(works[0].xpath('//div[@class ="z-padtop2 xgray"]/span/text()'))

# The last pagination link is taken as the next page to query.
links = works[0].xpath('//center[@style="margin-top:5px;margin-bottom:5px;"]/a/@href')
nextURL = links[-1]
print(nextURL)

# NOTE(review): this stops while 20 works are still queued, and step 6 starts
# from nextURL, so those remaining works of this first page are never
# inserted -- confirm that is intended for a test run.
while len(wurls) > 20:
    [item, titles, wurls, summaries, stats, times] = query_FFN_work(titles, wurls, summaries, stats, times, fandoms, today)
    print(item)
    dt.insert(item)


In [None]:
# 6) populate the database from FFnet
# Follows nextURL page by page, inserting every work found, until the page cap
# PAGES is reached or the pagination links stop advancing.
LAST = 0
while LAST < 1:
    page = requests.get(nextURL)
    tree = html.fromstring(page.text)
    tree.make_links_absolute(nextURL)

    works = tree.xpath('//div[@class="z-list zhover zpointer "]')
    if len(works) < 1:
        # The page came back without any works: wait the configured hold
        # time, then try once more.
        print(nextURL)
        time.sleep(hold)
        page = requests.get(nextURL)
        tree = html.fromstring(page.text)
        tree.make_links_absolute(nextURL)
        works = tree.xpath('//div[@class="z-list zhover zpointer "]')
    if len(works) < 1:
        # nextURL still points at the failing page, so the cell can be re-run.
        raise RuntimeError('No works found at %s after one retry' % nextURL)

    # Every expression starts with '//', so each one collects matches from the
    # whole page, not just from works[0].
    titles = deque(works[0].xpath('//div[@class="z-list zhover zpointer "]/a/text()'))
    wurls = deque(works[0].xpath('//a[@class="stitle"]/@href'))
    summaries = deque(works[0].xpath('//div[@class ="z-indent z-padtop"]/text()'))
    stats = deque(works[0].xpath('//div[@class ="z-padtop2 xgray"]/text()'))
    times = deque(works[0].xpath('//div[@class ="z-padtop2 xgray"]/span/text()'))

    while len(wurls) > 0:
        [item, titles, wurls, summaries, stats, times] = query_FFN_work(titles, wurls, summaries, stats, times, fandoms, today)
        dt.insert(item)
        # Track the node here so a page without works cannot report a stale item.
        lastNode = item['Node']

    # call the next page, if we haven't reached the end of PAGES or search
    links = works[0].xpath('//center[@style="margin-top:5px;margin-bottom:5px;"]/a/@href')
    # Continue only while the second-to-last link carries a higher trailing
    # number than the last link; fewer than two links also ends the run.
    if (pages < PAGES and len(links) >= 2
            and int(links[-2].split('=')[-1]) > int(links[-1].split('=')[-1])):
        nextURL = links[-1]
        print(nextURL)
        pages += 1
    else:
        LAST = 1

    print(lastNode)

    # pause before the next page
    time.sleep(hold)

print('Finnished run through archive.')

In [None]:
# if for whatever reason it's necessary to limit the number of query pages accessed...
# PAGES is compared against the running `pages` counter inside the step 6 loop,
# so this override only takes effect if it is run before that loop.

PAGES = 2295

In [None]:
# 7) check the status of the database and scraping
# `data` is kept at module level: the clean-up in step 8 reads it.

data = dt.dump()
print(url)
if data:
    print(data[-1])  # most recently stored work
else:
    print('Database is empty')
print(LAST)  # 1 once the step 6 loop has reached its end condition

In [None]:
# 8) cleaning up duplicates in a database
# Copies the rows in `data` (dumped in step 7) into a second database file,
# keeping only the first row seen for each Node.

# Only rename once, so re-running this cell does not produce
# "..._cleaned_cleaned.db".
if not DBname.endswith("_cleaned.db"):
    DBname = DBname[:-3] + "_cleaned.db"
cleanDB = DumpTruck(dbname=DBname)
if 'dumptruck' in cleanDB.tables():
    cleanDB.drop() # start from an empty table on every run

NODES = set() # node ids already copied

for work in data:
    node = work['Node']
    if node not in NODES:
        cleanDB.insert(work)
        NODES.add(node)

print(len(data))
cleandata = cleanDB.dump()
print(len(cleandata))
print(len(NODES))
print(DBname)