In [1]:
import pandas as pd
import requests
from bs4 import BeautifulSoup

In [2]:
response = requests.get('https://www.goodreads.com/list/show/6934.Science_Fiction_Books_by_Female_Authors')
doc = BeautifulSoup(response.text)

In [3]:
all_texts = doc.find_all('tr', itemtype='http://schema.org/Book')
rows = []
for text in all_texts:
    row={}
    row['Rank'] = text.find(class_='number').text.strip()
    row['Title'] = text.find(class_='bookTitle').text.strip()
    row['Author'] = text.find(class_='authorName').text.strip()
    row['Score'] = text.find('a', href='#').text.strip()
    row['Votes'] = text.find('a', href='#').find_next_sibling('a').text.strip()
    row['Rating'] = text.find(class_='minirating').text.strip()
    rows.append(row)
rows

df = pd.DataFrame(rows, columns=['Rank', 'Title', 'Author', 'Score', 'Votes', 'Rating'])
df.head()

Unnamed: 0,Rank,Title,Author,Score,Votes,Rating
0,1,The Handmaid's Tale,Margaret Atwood,"score: 30,733",314 people voted,"4.09 avg rating — 1,103,055 ratings"
1,2,"The Hunger Games (The Hunger Games, #1)",Suzanne Collins,"score: 28,553",292 people voted,"4.33 avg rating — 5,742,381 ratings"
2,3,Frankenstein,Mary Wollstonecraft Shelley,"score: 21,909",224 people voted,"3.78 avg rating — 1,023,619 ratings"
3,4,"A Wrinkle in Time (Time Quintet, #1)",Madeleine L'Engle,"score: 18,720",196 people voted,"4.01 avg rating — 903,708 ratings"
4,5,The Left Hand of Darkness (Hainish Cycle #4),Ursula K. Le Guin,"score: 17,920",184 people voted,"4.06 avg rating — 98,822 ratings"


In [4]:
import re

In [5]:
df['Number_of_Ratings'] = df.Rating.str.extract("— (.*)")
df['Number_of_Ratings'] = df['Number_of_Ratings'].str.extract('(.*) ratings')
df.head()

Unnamed: 0,Rank,Title,Author,Score,Votes,Rating,Number_of_Ratings
0,1,The Handmaid's Tale,Margaret Atwood,"score: 30,733",314 people voted,"4.09 avg rating — 1,103,055 ratings",1103055
1,2,"The Hunger Games (The Hunger Games, #1)",Suzanne Collins,"score: 28,553",292 people voted,"4.33 avg rating — 5,742,381 ratings",5742381
2,3,Frankenstein,Mary Wollstonecraft Shelley,"score: 21,909",224 people voted,"3.78 avg rating — 1,023,619 ratings",1023619
3,4,"A Wrinkle in Time (Time Quintet, #1)",Madeleine L'Engle,"score: 18,720",196 people voted,"4.01 avg rating — 903,708 ratings",903708
4,5,The Left Hand of Darkness (Hainish Cycle #4),Ursula K. Le Guin,"score: 17,920",184 people voted,"4.06 avg rating — 98,822 ratings",98822


In [6]:
df['Number_of_Ratings'] = df.Number_of_Ratings.str.replace(',', '')
df.head()

Unnamed: 0,Rank,Title,Author,Score,Votes,Rating,Number_of_Ratings
0,1,The Handmaid's Tale,Margaret Atwood,"score: 30,733",314 people voted,"4.09 avg rating — 1,103,055 ratings",1103055
1,2,"The Hunger Games (The Hunger Games, #1)",Suzanne Collins,"score: 28,553",292 people voted,"4.33 avg rating — 5,742,381 ratings",5742381
2,3,Frankenstein,Mary Wollstonecraft Shelley,"score: 21,909",224 people voted,"3.78 avg rating — 1,023,619 ratings",1023619
3,4,"A Wrinkle in Time (Time Quintet, #1)",Madeleine L'Engle,"score: 18,720",196 people voted,"4.01 avg rating — 903,708 ratings",903708
4,5,The Left Hand of Darkness (Hainish Cycle #4),Ursula K. Le Guin,"score: 17,920",184 people voted,"4.06 avg rating — 98,822 ratings",98822


In [7]:
df['Score'] = df['Score'].str.extract(r'score:(.*)')

In [8]:
df['Score'] = df['Score'].str.replace(',', '')
df.head()

Unnamed: 0,Rank,Title,Author,Score,Votes,Rating,Number_of_Ratings
0,1,The Handmaid's Tale,Margaret Atwood,30733,314 people voted,"4.09 avg rating — 1,103,055 ratings",1103055
1,2,"The Hunger Games (The Hunger Games, #1)",Suzanne Collins,28553,292 people voted,"4.33 avg rating — 5,742,381 ratings",5742381
2,3,Frankenstein,Mary Wollstonecraft Shelley,21909,224 people voted,"3.78 avg rating — 1,023,619 ratings",1023619
3,4,"A Wrinkle in Time (Time Quintet, #1)",Madeleine L'Engle,18720,196 people voted,"4.01 avg rating — 903,708 ratings",903708
4,5,The Left Hand of Darkness (Hainish Cycle #4),Ursula K. Le Guin,17920,184 people voted,"4.06 avg rating — 98,822 ratings",98822


In [9]:
df['Name'] = df.Title.str.replace(r'[(].*[)]', '')
df['Series'] = df.Title.str.extract("\((.*),")
df['Number in Series'] = df['Title'].str.extract(r", #([\d])")
df['Rating'] = df['Rating'].str.extract(r'(\d[.]\d\d)')
df.head()

Unnamed: 0,Rank,Title,Author,Score,Votes,Rating,Number_of_Ratings,Name,Series,Number in Series
0,1,The Handmaid's Tale,Margaret Atwood,30733,314 people voted,4.09,1103055,The Handmaid's Tale,,
1,2,"The Hunger Games (The Hunger Games, #1)",Suzanne Collins,28553,292 people voted,4.33,5742381,The Hunger Games,The Hunger Games,1.0
2,3,Frankenstein,Mary Wollstonecraft Shelley,21909,224 people voted,3.78,1023619,Frankenstein,,
3,4,"A Wrinkle in Time (Time Quintet, #1)",Madeleine L'Engle,18720,196 people voted,4.01,903708,A Wrinkle in Time,Time Quintet,1.0
4,5,The Left Hand of Darkness (Hainish Cycle #4),Ursula K. Le Guin,17920,184 people voted,4.06,98822,The Left Hand of Darkness,,


In [11]:
df = df.drop(columns='Title')

In [12]:
df.head()

Unnamed: 0,Rank,Author,Score,Votes,Rating,Number_of_Ratings,Name,Series,Number in Series
0,1,Margaret Atwood,30733,314 people voted,4.09,1103055,The Handmaid's Tale,,
1,2,Suzanne Collins,28553,292 people voted,4.33,5742381,The Hunger Games,The Hunger Games,1.0
2,3,Mary Wollstonecraft Shelley,21909,224 people voted,3.78,1023619,Frankenstein,,
3,4,Madeleine L'Engle,18720,196 people voted,4.01,903708,A Wrinkle in Time,Time Quintet,1.0
4,5,Ursula K. Le Guin,17920,184 people voted,4.06,98822,The Left Hand of Darkness,,


In [15]:
column_titles = ['Rank', 'Name', 'Author', 'Series', 'Number in Series', 'Score', 'Votes', 'Rating', 'Number_of_Ratings']
df.reindex(columns=column_titles)

Unnamed: 0,Rank,Name,Author,Series,Number in Series,Score,Votes,Rating,Number_of_Ratings
0,1,The Handmaid's Tale,Margaret Atwood,,,30733,314 people voted,4.09,1103055
1,2,The Hunger Games,Suzanne Collins,The Hunger Games,1,28553,292 people voted,4.33,5742381
2,3,Frankenstein,Mary Wollstonecraft Shelley,,,21909,224 people voted,3.78,1023619
3,4,A Wrinkle in Time,Madeleine L'Engle,Time Quintet,1,18720,196 people voted,4.01,903708
4,5,The Left Hand of Darkness,Ursula K. Le Guin,,,17920,184 people voted,4.06,98822
5,6,Divergent,Veronica Roth,Divergent,1,13326,138 people voted,4.21,2603197
6,7,Catching Fire,Suzanne Collins,The Hunger Games,2,12749,133 people voted,4.29,2200976
7,8,The Giver,Lois Lowry,The Giver,1,12399,129 people voted,4.12,1535394
8,9,Kindred,Octavia E. Butler,,,11070,116 people voted,4.23,72373
9,10,The Dispossessed,Ursula K. Le Guin,,,10731,112 people voted,4.21,71658


In [16]:
df = df.reindex(columns=column_titles)

In [17]:
df.head()

Unnamed: 0,Rank,Name,Author,Series,Number in Series,Score,Votes,Rating,Number_of_Ratings
0,1,The Handmaid's Tale,Margaret Atwood,,,30733,314 people voted,4.09,1103055
1,2,The Hunger Games,Suzanne Collins,The Hunger Games,1.0,28553,292 people voted,4.33,5742381
2,3,Frankenstein,Mary Wollstonecraft Shelley,,,21909,224 people voted,3.78,1023619
3,4,A Wrinkle in Time,Madeleine L'Engle,Time Quintet,1.0,18720,196 people voted,4.01,903708
4,5,The Left Hand of Darkness,Ursula K. Le Guin,,,17920,184 people voted,4.06,98822
