# Stardew Valley Wiki scraping tool

This code scrapes the official Stardew Valley wiki for all of the fruits, veggies, and fish. It then performs some data cleanup.

In [1]:
import requests
from bs4 import BeautifulSoup
import re

In [2]:
req = requests.get('https://stardewvalleywiki.com/Fruits')
soup = BeautifulSoup(req.text, 'html.parser')

# Matches out the fruit name from the <a> tag
pattern = r'<a href="/(\w*(?:\s\w*)?)" title="\1">\1</a>'

# Isolate the big table at the start of the page
table = soup.find_all('table', class_ = 'wikitable')[0].tbody

fruitlist = []
# Loop over the rows of the table non-recursively, skipping the first two as they're just table metadata
for row in table.find_all('tr', recursive = False)[2:]:
    # Isolate the table data element, which contains a single <a> tag
    fruit = row.find_all('td', recursive = False)[1]
    # Pick out the <a> tag, convert it to a string, and replace the potential "_" in it (from the wiki URL) with a space for Regex ease
    fruit = fruit.a.__str__().replace('_', ' ')
    # Take the Regex match out, getting our fruit name!
    fruit = re.search(pattern, fruit).group(1)
    print(fruit)
    fruitlist.append(fruit)

Ancient Fruit
Apple
Apricot
Banana
Blackberry
Blueberry
Cactus Fruit
Cherry
Coconut
Cranberries
Crystal Fruit
Grape
Hot Pepper
Mango
Melon
Orange
Peach
Pineapple
Pomegranate
Qi Fruit
Rhubarb
Salmonberry
Spice Berry
Starfruit
Strawberry
Wild Plum


In [3]:
req = requests.get('https://stardewvalleywiki.com/Vegetables')
soup = BeautifulSoup(req.text, 'html.parser')

# Matches out the veg name from the <a> tag
pattern = r'<a href="/(\w*(?:\s\w*)?)" title="\1">\1</a>'

# Isolate the big table at the start of the page
table = soup.find_all('table', class_ = 'wikitable')[0].tbody

veglist = []
# Loop over the rows of the table non-recursively, skipping the first two as they're just table metadata
for row in table.find_all('tr', recursive = False)[2:]:
    # Isolate the table data element, which contains a single <a> tag
    veg = row.find_all('td', recursive = False)[1]
    # Pick out the <a> tag, convert it to a string, and replace the potential "_" in it (from the wiki URL) with a space for Regex ease
    veg = veg.a.__str__().replace('_', ' ')
    # Take the Regex match out, getting our veg name!
    veg = re.search(pattern, veg).group(1)
    print(veg)
    veglist.append(veg)


Amaranth
Artichoke
Beet
Bok Choy
Cauliflower
Corn
Eggplant
Fiddlehead Fern
Garlic
Green Bean
Hops
Kale
Parsnip
Potato
Pumpkin
Radish
Red Cabbage
Taro Root
Tea Leaves
Tomato
Unmilled Rice
Wheat
Yam


In [4]:
req = requests.get('https://stardewvalleywiki.com/Fish')
soup = BeautifulSoup(req.text, 'html.parser')

# Matches out the fish name from the <a> tag
pattern = r'<a href="/(\w*(?:\s\w*)?)" title="\1">\1</a>'

# Isolate the big table at the start of the page
table = soup.find_all('table', class_ = 'wikitable')[0].tbody

fishlist = []
# Loop over the rows of the table non-recursively, skipping the first two as they're just table metadata
for row in table.find_all('tr', recursive = False)[2:]:
    # Isolate the table data element, which contains a single <a> tag
    fish = row.find_all('td', recursive = False)[1]
    # Pick out the <a> tag, convert it to a string, and replace the potential "_" in it (from the wiki URL) with a space for Regex ease
    fish = fish.a.__str__().replace('_', ' ')
    # Take the Regex match out, getting our fish name!
    fish = re.search(pattern, fish).group(1)
    print(fish)
    fishlist.append(fish)


Anchovy
Tuna
Sardine
Bream
Largemouth Bass
Smallmouth Bass
Rainbow Trout
Salmon
Walleye
Perch
Carp
Catfish
Pike
Sunfish
Red Mullet
Herring
Eel
Octopus
Red Snapper
Squid
Sea Cucumber
Super Cucumber
Ghostfish
Stonefish
Ice Pip
Lava Eel
Sandfish
Scorpion Carp
Flounder
Midnight Carp
Sturgeon
Tiger Trout
Bullhead
Tilapia
Chub
Dorado
Albacore
Shad
Lingcod
Halibut
Woodskip
Void Salmon
Slimejack
Stingray
Lionfish
Blue Discus


Now we've got all our three lists of fruits, veggies, and fish in Stardew. Let's get parsing...

In [5]:
fruitraw = []
for fruit in fruitlist:
    fruitraw.append((fruit, BeautifulSoup(requests.get(f'https://stardewvalleywiki.com/{fruit.replace(" ", "_")}').text)))

In [6]:
fruitdata = []
for name, fruit in fruitraw:
    values = {}
    values['name'] = name

    table = fruit.find_all('table', {'id': 'infoboxtable'})[0].tbody
    rows = table.find_all('tr', recursive = False)

    offset = 0
    while True:
        try:
            t1 = rows[9 + offset].find_all('td', recursive = False)[0].table.tbody
            values['price'] = t1.find_all('table')[0].tbody.tr.find_all('td')[1].text.strip()[:-1]
            break
        except:
            offset += 1
            if offset == 20:
                break
    if offset == 20:
        continue

    pattern = r'(blue|red|orange|yellow|green|brown|purple|pink) dye'

    for line in fruit.find_all(string = re.compile('dye')):
        if match := re.search(pattern, line):
            values['color'] = match.group(1)
            break

    fruitdata.append(values)

In [7]:
vegraw = []
for veg in veglist:
    vegraw.append((veg, BeautifulSoup(requests.get(f'https://stardewvalleywiki.com/{veg.replace(" ", "_")}').text)))

In [8]:
vegdata = []
for name, veg in vegraw:
    values = {}
    values['name'] = name

    table = veg.find_all('table', {'id': 'infoboxtable'})[0].tbody
    rows = table.find_all('tr', recursive = False)

    offset = 0
    while True:
        try:
            t1 = rows[9 + offset].find_all('td', recursive = False)[0].table.tbody
            values['price'] = t1.find_all('table')[0].tbody.tr.find_all('td')[1].text.strip()[:-1]
            break
        except:
            offset += 1
            if offset == 20:
                break
    if offset == 20:
        continue

    pattern = r'(blue|red|orange|yellow|green|brown|purple|pink|pale violet|white) dye'

    for line in veg.find_all(string = re.compile('dye')):
        if match := re.search(pattern, line):
            values['color'] = match.group(1)
            break

    vegdata.append(values)

In [9]:
fishraw = []
for fish in fishlist:
    fishraw.append((fish, BeautifulSoup(requests.get(f'https://stardewvalleywiki.com/{fish.replace(" ", "_")}').text)))

In [26]:
fishdata = []
# scorpion carp not actually legendary but needs the same adjustment
legendaries = ['Legend', 'Crimsonfish', 'Angler', 'Glacierfish', 'Mutant Carp', 'Scorpion Carp'] 
for name, fish in fishraw:
    values = {}

    leg = 0
    if name in legendaries:
        leg = 1

    table = fish.find_all('table', {'id': 'infoboxtable'})[0].tbody
    rows = table.find_all('tr', recursive = False)

    values['name'] = name
    values['location'] = [a.text.strip() for a in rows[4].find_all('td', recursive = False)[1].find_all('a')]
    if values['location'] == '':
        values['location'] = rows[4].find_all('td', recursive = False)[1].text.strip()
    values['time'] = rows[5].find_all('td', recursive = False)[1].text.strip()
    values['time'] = re.match(r'.*(\d{1,2}[ap]m [-\u2013] \d{1,2}[ap]m|Any).*', values['time']).group(1)
    
    # Season needs some extra work for multiple seasons
    seasons = [a.text for a in rows[6].find_all('td', recursive = False)[1].find_all('a')]
    stops = ['Ginger Island', 'Secret Woods', 'Rain Totem']
    for stop in stops:
        if stop in seasons:
            seasons = seasons[:seasons.index(stop)]

    values['seasons'] = seasons

    # Price needs... even more
    pricetable = (rows[13 + leg].find_all('td', recursive = False)[0].table.tbody # Get main price chart
        .find_all('tr', recursive = False)[2] # Skip over headers
        .find_all('table')[0].tbody) # Get weird internal table that actually holds price values

    pricerows = pricetable.find_all('tr', recursive = False)

    values['price'] = pricerows[0].find_all('td')[1].text.strip()[:-1]

    fishdata.append(values)

In [27]:
newfish = []
for fish in fishdata:
    values = {}
    values['Name'] = fish['name'].lower()
    values['Sell'] = fish['price']

    locations = fish['location']
    values['Where/How'] = fish['location']
    for location in locations:
        if location == 'Ocean' or location == 'Pirate Cove' or location == 'Ginger Island':
            values['Where/How'] = 'Sea'
        elif location == 'Mountain Lake' or location == 'Witch\'s Swamp' or location == 'Secret Woods' or location == 'The Desert':
            values['Where/How'] = 'Pond'
        elif location == 'Town' or location == 'Forest River' or location == 'Mutant Bug Lair':
            values['Where/How'] = 'River'
        elif location == 'The Mines':
            values['Where/How'] = 'Mines'

    time = fish['time']
    if time == 'Any':
        time = 'All day'
    else:
        pattern = r'(\d{1,2})(\w{2}).*(\d{1,2})(\w{2})'
        time = '{} {} \u2013 {} {}'.format(*re.search(pattern, time).groups()).upper()

    seasons = fish['seasons']
    if seasons == ['All']:
        seasons = ['Winter', 'Spring', 'Summer', 'Fall']
    
    for month in ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']:
        values[f'NH {month}'] = 'NA'
        values[f'SH {month}'] = 'NA'

    seasonmap = {
        'Winter': ['Dec', 'Jan', 'Feb'],
        'Spring': ['Mar', 'Apr', 'May'],
        'Summer': ['Jun', 'Jul', 'Aug'],
        'Fall': ['Sep', 'Oct', 'Nov'],
    }

    for season in seasons:
        for month in seasonmap[season]:
            values[f'NH {month}'] = time
            values[f'SH {month}'] = time
        
    newfish.append(values)

KeyError: 'seasons'

In [16]:
import os
import csv

OUTPUT_DIR = 'stardewvalley'

if os.path.exists(OUTPUT_DIR) and os.path.isfile(OUTPUT_DIR):
    os.remove(OUTPUT_DIR)

if not os.path.exists(OUTPUT_DIR):
    os.mkdir(OUTPUT_DIR)

In [25]:
# Create fish.csv
with open(os.path.join(OUTPUT_DIR, 'fish.csv'), 'w') as fout:
    writer = csv.DictWriter(fout, fieldnames = list(fishdata[0].keys()))
    writer.writeheader()
    writer.writerows(fishdata)

with open(os.path.join(OUTPUT_DIR, 'newfish.csv'), 'w') as fout:
    writer = csv.DictWriter(fout, fieldnames = list(newfish[0].keys()))
    writer.writeheader()
    writer.writerows(newfish)

In [None]:
# Create fruit.csv
with open(os.path.join(OUTPUT_DIR, 'fruit.csv'), 'w') as fout:
    writer = csv.DictWriter(fout, fieldnames = list(fruitdata[0].keys()))
    writer.writeheader()
    writer.writerows(fruitdata)

In [None]:
# Create veg.csv
with open(os.path.join(OUTPUT_DIR, 'veg.csv'), 'w') as fout:
    writer = csv.DictWriter(fout, fieldnames = list(vegdata[0].keys()))
    writer.writeheader()
    writer.writerows(vegdata)