## Import Libraries

In [33]:
import bs4
import re
from bs4 import BeautifulSoup, SoupStrainer
import httplib2
import requests
import pandas as pd
import csv

## Some Functions

In [34]:
#################### CSV ####################
#initialize csv
def init_csv():
  """Create (or overwrite) ``bestworks.csv`` so it holds only the header row.

  Opening in ``"w"`` mode already empties an existing file and the ``with``
  block closes it on exit, so no explicit ``truncate()``/``close()`` call is
  needed. The file is written as UTF-8 so the result does not depend on the
  platform's default locale encoding.
  """
  header = ['Fic Name', 'Author', 'Relationships', 'Link', 'Kudo_Rate',
            'Warning', 'Hits', 'Chapters', 'Summary']
  # newline='' lets the csv module control line endings itself.
  with open(r'bestworks.csv', 'w', newline='', encoding='utf-8') as file:
    csv.writer(file).writerow(header)

#write csv
def _parse_count(value):
  """Convert a scraped counter to ``int``, tolerating thousands separators."""
  if isinstance(value, str):
    # "1,234" would make int() raise ValueError, so drop the separators first.
    value = value.replace(',', '')
  return int(value)


def write_csv(title, author, relationships, link_str, hits, kudo, warning, chapters, summary):
  """Append one work as a row to ``bestworks.csv``.

  The row layout is: title, author, relationships, link, kudo rate, warning,
  hits, chapters, summary.

  Args:
    title, author, warning, chapters: values written unchanged.
    relationships: iterable of strings, joined by ``concate_summary_paragraph``.
    link_str: link text; any ``[``/``]`` characters are stripped.
    hits, kudo: counts as ints or numeric strings (``"1,234"`` is accepted).
    summary: iterable of strings, joined by ``concate_summary_paragraph``.

  Raises:
    ValueError: if ``hits`` or ``kudo`` is not numeric.
  """
  # Everything is computed before the file is opened, so a bad value cannot
  # leave the file opened for nothing.
  hits = _parse_count(hits)
  kudo = _parse_count(kudo)
  if kudo == 0 or hits == 0:
    KR = 'n/a'  # the rate is meaningless without both counts
  else:
    KR = kudo / hits * 100  # kudos as a percentage of hits
  row = [
    title,
    author,
    concate_summary_paragraph(relationships),
    remove_brackets(link_str),
    KR,
    warning,
    hits,
    chapters,
    concate_summary_paragraph(summary),
  ]
  # The with-block closes the file; UTF-8 keeps non-ASCII titles and summaries
  # intact regardless of the platform's default encoding.
  with open(r'bestworks.csv', 'a', newline='', encoding='utf-8') as file:
    csv.writer(file).writerow(row)

#################### Data ####################

def remove_brackets(text):
  """Return *text* with every ``[`` and ``]`` character removed."""
  return text.replace('[', '').replace(']', '')

def parse_text_only(tag):
  """Return the whitespace-stripped text of *tag*.

  Args:
    tag: ``None``, a plain string, or any object exposing ``get_text()``
      (such as a BeautifulSoup tag).

  Returns:
    ``0`` when *tag* is ``None`` (kept for callers that pass the value to
    ``int()``), otherwise the stripped text.
  """
  if tag is None:
    return 0
  if isinstance(tag, (str, bytes)):
    return tag.strip()
  # Duck-type on get_text() rather than comparing the exact type, so
  # subclasses of the tag class are handled as well.
  return tag.get_text().strip()

def lst_parse_text_only(tags):
  """Return the whitespace-stripped text of every item in *tags*.

  Args:
    tags: ``None`` or an iterable whose items are plain strings or objects
      exposing ``get_text()`` (such as BeautifulSoup tags).

  Returns:
    ``0`` when *tags* is ``None``, otherwise a list of stripped strings in
    the same order as the input.
  """
  if tags is None:
    return 0
  # Duck-type on get_text() rather than comparing the exact type, so
  # subclasses of the tag class are handled as well.
  return [
    tag.strip() if isinstance(tag, (str, bytes)) else tag.get_text().strip()
    for tag in tags
  ]

def print_summary(sum_lst):
  """Print a ``Summary:`` heading followed by each paragraph on its own line."""
  print('Summary:')
  lines = [str(paragraph) for paragraph in sum_lst]
  if lines:
    print('\n'.join(lines))

# concatenate a list of strings into one string
def concate_summary_paragraph(sum_lst):
  """Join the strings in *sum_lst*, appending ``' | '`` after every item.

  The separator also follows the last item, so ``['a', 'b']`` becomes
  ``'a | b | '`` and an empty input yields ``''``.
  """
  return ''.join(paragraph + ' | ' for paragraph in sum_lst)



################ MAIN ################

def Generate_work_data(results):
  """Extract the fields of every work in *results* and append them to the CSV.

  Each ``<li role="article">`` found in *results* is printed, parsed, and
  passed to ``write_csv``. Works without a summary block are skipped.

  Args:
    results: object exposing ``find_all`` — presumably the parsed list of
      works for one index page (TODO confirm against the caller).
  """
  site_root = 'https://archiveofourown.org'
  stat_names = ('hits', 'language', 'chapters', 'kudos', 'bookmarks')

  for work in results.find_all('li', {'role': 'article'}):
    print(work)

    # The heading holds the link (whose text is the title) and the author.
    heading = work.find('h4', {'class': "heading"})
    link = heading.find('a', href=True)
    title = parse_text_only(link)
    author = parse_text_only(heading.find('a', {'rel': 'author'}))

    # The first tag is used as the warning; relationships are collected in full.
    tag_list = work.find('ul', {'class': "tags commas"})
    warning = parse_text_only(tag_list.find('a', {'class': 'tag'}))
    relationships = lst_parse_text_only(
      tag_list.find_all('li', {'class': 'relationships'}))

    # A work without a summary block is skipped entirely.
    summary_block = work.find('blockquote', {'class': "userstuff summary"})
    if summary_block is None:
      continue
    summary = lst_parse_text_only(summary_block.find_all('p'))

    # Look up each statistic by its <dd> class name.
    stats_block = work.find('dl', {'class': "stats"})
    stats = {
      name: parse_text_only(stats_block.find('dd', {'class': name}))
      for name in stat_names
    }

    # Wrapped in brackets here; write_csv strips them again.
    link_string = '[' + site_root + str(link['href']) + ']'

    write_csv(title, author, relationships, link_string,
              stats['hits'], stats['kudos'], warning, stats['chapters'], summary)


def Generate_work_data_test(results):
  """Extract every work in *results* and append it to the CSV, best-effort.

  Unlike a single try/except around the whole loop, errors are handled per
  work: a malformed entry prints ``'meet error'`` and is skipped, and the
  remaining works on the page are still processed. Works without a summary
  block are skipped silently.

  Args:
    results: object exposing ``find_all`` (presumably the parsed list of
      works for one page — TODO confirm), or ``None`` when the list was not
      found on the page.

  Returns:
    ``0`` if *results* was unusable or at least one work failed to parse,
    otherwise ``None``.
  """
  try:
    works = results.find_all('li', {'role': 'article'})
  except AttributeError:
    # results is None (or not a parsed element): nothing to scrape here.
    print('meet error')
    return 0

  status = None
  for work in works:
    try:
      # The heading holds the link (whose text is the title) and the author.
      work_data = work.find('h4', {'class': "heading"})
      link = work_data.find('a', href=True)
      title = parse_text_only(link)
      author = parse_text_only(work_data.find('a', {'rel': 'author'}))

      # The first tag is used as the warning; relationships are collected in full.
      work_tags = work.find('ul', {'class': "tags commas"})
      warining_tag = parse_text_only(work_tags.find('a', {'class': 'tag'}))
      relationships_tags = lst_parse_text_only(
        work_tags.find_all('li', {'class': 'relationships'}))

      # A work without a summary block is skipped entirely.
      work_summary = work.find('blockquote', {'class': "userstuff summary"})
      if work_summary is None:
        continue
      summary = lst_parse_text_only(work_summary.find_all('p'))

      # Only the statistics that end up in the CSV are looked up.
      work_stats = work.find('dl', {'class': "stats"})
      hits = parse_text_only(work_stats.find('dd', {'class': 'hits'}))
      chapters = parse_text_only(work_stats.find('dd', {'class': 'chapters'}))
      kudos = parse_text_only(work_stats.find('dd', {'class': 'kudos'}))

      # Wrapped in brackets here; write_csv strips them again.
      link_string = '[' + 'https://archiveofourown.org' + str(link['href']) + ']'

      write_csv(title, author, relationships_tags, link_string, hits, kudos,
                warining_tag, chapters, summary)
    except (AttributeError, TypeError):
      # A lookup returned None (missing heading, tags, stats or link).
      print('meet error')
      status = 0
  return status


# Scrape through pages loop

def loop_pages(general_url, start_page, end_page, timeout=30, delay=0):
  """Scrape the listing pages ``start_page`` .. ``end_page - 1`` into the CSV.

  Args:
    general_url: listing URL without the ``?page=`` query string.
    start_page: first page number to fetch (inclusive).
    end_page: page number to stop at (exclusive, as with ``range``).
    timeout: seconds to wait for each HTTP response; without a timeout a
      stalled connection would block forever.
    delay: seconds to sleep after each page; ``0`` (the default) disables it.
      A pause between requests can help when the site throttles the scraper.

  A page whose request fails is reported and skipped, so one network error
  does not abort the whole run.
  """
  import time  # local import: only needed for the optional delay

  for page_number in range(start_page, end_page):
    URL = f"{general_url}?page={page_number}"  # URL of this listing page
    try:
      page = requests.get(URL, timeout=timeout)
    except requests.RequestException as err:
      print("request for page " + str(page_number) + " failed: " + str(err))
    else:
      soup = BeautifulSoup(page.content, "html.parser")
      # Get the list holding every work on the current page (None if absent).
      results = soup.find("ol", {'class': 'work index group'})
      # Loop over the works, extract their info and write it to the CSV.
      Generate_work_data_test(results)
      print("working on page " + str(page_number) + "...")
    if delay > 0:
      time.sleep(delay)

## Main

In [37]:
# Configuration and entry point for the scrape.

# Minimum Kudo_Rate (kudos as a percentage of hits) a work needs to be kept
# by the filtering cell. Works above 9 are usually good fics.
# Reminder: long multi-chapter fics with high word counts tend to have a lower kudo rate.

kudo_rate_requirement = 9
page_num = 165 # total number of listing pages available for scraping (check on the site)
url = 'https://archiveofourown.org/tags/Steve%20Harrington*s*Eddie%20Munson/works' # works listing of the ship to scrape; replace with any other AO3 tag link of the same format (copy/paste from AO3)

init_csv() # start from a CSV that contains only the header row

# Split points for scraping the pages in several runs.
# NOTE(review): start_p, p1 and end_p are not used by the call below.
start_p = 1
p1 = int(page_num/ 1.5)
end_p = page_num

# If a NoneType error shows up:
# reload, wait several minutes and run again; it seems to happen when too many pages are requested in a short time.

# NOTE(review): loop_pages iterates range(start_page, end_page), so page 165 itself is not fetched — confirm this is intended.
# loop_pages(url, 1, 100)
loop_pages(url, 120, 165)



working on page 120...
working on page 121...
working on page 122...
working on page 123...
working on page 124...
working on page 125...
working on page 126...
working on page 127...
working on page 128...
working on page 129...
working on page 130...
working on page 131...
working on page 132...
working on page 133...
working on page 134...
working on page 135...
working on page 136...
working on page 137...
working on page 138...
working on page 139...
working on page 140...
working on page 141...
working on page 142...
working on page 143...
working on page 144...
working on page 145...
working on page 146...
working on page 147...
working on page 148...
working on page 149...
working on page 150...
working on page 151...
working on page 152...
working on page 153...
working on page 154...
working on page 155...
working on page 156...
working on page 157...
working on page 158...
working on page 159...
working on page 160...
working on page 161...
working on page 162...
working on 

In [38]:
# Read the scraped CSV and drop every row that has a missing value.
df = pd.read_csv("bestworks.csv", sep=",").dropna()

# Remove the works whose kudo rate is below the required threshold.
low_rate_mask = df['Kudo_Rate'] < kudo_rate_requirement
indexNames = df.loc[low_rate_mask].index
df = df.drop(index=indexNames)

# Order by Hits first, then Kudo_Rate (both descending), and drop duplicate rows.
# To sort by Kudo_Rate only, use by=["Kudo_Rate"] instead.
df = df.sort_values(by=["Hits", "Kudo_Rate"], ascending=False).drop_duplicates()

# The output file is tab-separated despite its .csv extension.
df.to_csv('best_works_sorted.csv', sep='\t', encoding='utf-8')
df.head(20) # show the first 20 rows of the output

Unnamed: 0,Fic Name,Author,Relationships,Link,Kudo_Rate,Warning,Hits,Chapters,Summary
224,the affliction of the feeling,nondz (pinkjook),Steve Harrington/Eddie Munson | Robin Buckley ...,https://archiveofourown.org/works/39458634,13.320681,No Archive Warnings Apply,61123,2/2,"“Hold on,” Robin interrupts. “Hold on, is this..."
150,chelsea,randomascas,Steve Harrington/Eddie Munson | Robin Buckley/...,https://archiveofourown.org/works/39402249,9.464535,No Archive Warnings Apply,45792,10/10,"“Well, I didn’t think you were a good dude unt..."
819,Crimson and Clover,Plastiktramps,Steve Harrington/Eddie Munson | Robin Buckley/...,https://archiveofourown.org/works/39430926,10.353652,No Archive Warnings Apply,41029,1/1,“There’s no immediate supernatural danger to o...
288,"Hey Babe, Your Hair's Alright (Hey Babe, Let's...",BowieBond,Steve Harrington/Eddie Munson | Eddie Munson &...,https://archiveofourown.org/works/39597543,9.37533,Creator Chose Not To Use Archive Warnings,37876,11/11,"His bed was familiar, comfortable, warm. He ne..."
603,Support Local,Ijustlikereadingcutefics,Steve Harrington/Eddie Munson | Minor or Backg...,https://archiveofourown.org/works/39294930,9.186983,No Archive Warnings Apply,36998,6/6,Eddie raised an eyebrow. “You… like metal?” | ...
746,Over and Over,Plastiktramps,Steve Harrington/Eddie Munson |,https://archiveofourown.org/works/39496527,10.064112,No Archive Warnings Apply,35095,1/1,"“I’m… sorry?” he tries, and Eddie immediately ..."
502,we're just kids in the dark,FagurFiskur,Steve Harrington/Eddie Munson | Robin Buckley/...,https://archiveofourown.org/works/39628143,16.507212,No Archive Warnings Apply,33761,2/2,“Just relax.” Eddie runs his fingers through S...
524,you shook me all night long,joehardys,Steve Harrington/Eddie Munson |,https://archiveofourown.org/works/39643800,9.385635,No Archive Warnings Apply,33498,1/1,"There’s a beat of silence before Eddie asks, “..."
801,falling in love ( is hard on the knees ),lowpoli,Steve Harrington/Eddie Munson |,https://archiveofourown.org/works/39445827,9.881744,No Archive Warnings Apply,33233,1/1,"Steve's stuck babysitting, an unsurprising rev..."
547,you're the one (that i want),wayferette,Steve Harrington/Eddie Munson |,https://archiveofourown.org/works/39289032,10.286534,No Archive Warnings Apply,32771,6/6,Steve is a hopeless romantic with a notched be...


In [39]:
# Download the sorted result file through the browser (Google Colab only).
# Open it in Excel to get direct access to the links (e.g. via HYPERLINK).
from google.colab import files
files.download('best_works_sorted.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>