<h1 style = "font-size:25px">Nexus site scraping</h1>
<h2 style="font-size:15px">  Author: Jacob Massengill, et al. (add your name if you edit this)</h2>

<p>This notebook defines the classes and functions that are used to scrape the Nexus Skyrim site.
</p>

In [24]:
#define a timer which will be used to 
#regulate the speed at which pages will be
#retrieved

#pages retrieved/sec = (1/self.mtime)
import time
class StopWatch:
    """Fixed-length delay used to throttle how fast pages are retrieved.

    Each call to start() blocks for ``mtime`` seconds, so at most
    ``1/mtime`` pages are retrieved per second.
    """

    def __init__(self, time):
        """Remember the delay length.

        time -- number of seconds each call to start() should block for.
                (Inside this method the parameter hides the ``time``
                module; the name is kept so existing callers still work.)
        """
        self.mtime = time

    def start(self):
        """Block the caller until ``self.mtime`` seconds have passed.

        Returns immediately when ``self.mtime`` is zero or negative.
        """
        # Sleep rather than spin in a ``while ...: pass`` loop, so the wait
        # does not keep a CPU core fully busy. The monotonic clock is not
        # affected by system clock adjustments during the wait.
        deadline = time.monotonic() + self.mtime
        remaining = self.mtime
        while remaining > 0:
            time.sleep(remaining)
            # sleep() may return early; re-check against the deadline.
            remaining = deadline - time.monotonic()

In [25]:
#demo of stop_watch: count down from 10 to 1,
#waiting one second before each number is printed
timer = StopWatch(1)
for remaining in range(10, 0, -1):
    timer.start()
    print(remaining, end=" ")

10 9 8 7 6 5 4 3 2 1 

In [68]:
#container class for information on mod
class ModBlock:
    """Holds the details of one mod listed on the Nexus site.

    Fields (filled in by from_html or from_list): url, likes, downloads,
    name, des (description), created, update (last-updated text) and
    creator. They are not defined until one of those two methods has run.
    """

    def __init__(self, html=None):
        """Create a mod record, optionally filling it from a page entry.

        html -- an entry from get_nexus_mods, or None to leave the fields
                unset until from_html/from_list is called.
        """
        if html is not None:
            self.from_html(html)

    def print_mod(self):
        """Print every field as a 'label: value' line."""
        fields = (
            ('url', self.url),
            ('likes', self.likes),
            ('downloads', self.downloads),
            ('name', self.name),
            ('description', self.des),
            ('created', self.created),
            ('updated', self.update),
            ('created by', self.creator),
        )
        for label, value in fields:
            print('{0}: {1}'.format(label, value))

    #should be an entry from get_nexus_mods
    def from_html(self, html):
        """Fill the fields from one parsed page entry.

        html -- object exposing find(tag, class_=...) as used below; the
                url comes from the link's href, every other field from the
                text of the matching element.
        """
        self.url = html.find('a', class_='image bubble-open pb-hover pb-left pb-ajax pb-forceclose', href=True)['href']
        self.likes = html.find('span', class_='likes').text
        self.downloads = html.find('span', class_='downloads').text
        self.name = html.find('a', class_='title').text
        self.des = html.find('div', class_=None).text
        self.created = html.find('div', class_='category-file-hover-released').text
        self.update = html.find('div', class_='category-file-hover-updated').text
        self.creator = html.find('a', class_='user').text

    #len(mlist) = 8
    def from_list(self, mlist):
        """Fill the fields from a list laid out like the result of to_list().

        mlist -- [url, likes, downloads, name, description, created,
                 updated, creator].

        to_list() wraps the description in single quotes. One surrounding
        pair is removed here so that to_list -> from_list -> to_list gives
        the same list back instead of adding another layer of quotes on
        every save/load cycle.
        """
        self.url = mlist[0]
        self.likes = mlist[1]
        self.downloads = mlist[2]
        self.name = mlist[3]
        description = mlist[4]
        if (isinstance(description, str) and len(description) >= 2
                and description[0] == '\'' and description[-1] == '\''):
            description = description[1:-1]
        self.des = description
        self.created = mlist[5]
        self.update = mlist[6]
        self.creator = mlist[7]

    def get_id(self):
        """Return the first all-digit segment of the url path as a string.

        Raises IndexError when the url has no all-digit segment.
        """
        # named digit_parts rather than ``id`` to avoid hiding the builtin
        digit_parts = [s for s in self.url.split('/') if s.isdigit()]
        return digit_parts[0]

    def to_list(self):
        """Return the fields as a list of eight values.

        The description is wrapped in single quotes; from_list() removes
        them again when the list is loaded back.
        """
        data = [self.url, self.likes, self.downloads, self.name, '\''+self.des+'\'', self.created, self.update, self.creator]
        return data

In [27]:
import requests
from bs4 import BeautifulSoup
#gets a list of the nexus mods at url
#currently just gets the html.
#change this to put things into a class
def get_nexus_mods(url, timeout=30):
    """Download one page and return the mod entries found on it.

    url     -- address of the page to fetch.
    timeout -- seconds to wait for the server before requests gives up
               with a timeout error; without a timeout a stalled
               connection would block the scrape forever.

    Returns the list of <li class="popbox"> elements inside the page's
    <ul class="block-list">, an empty list when the page has no such
    block list, or None when the response status is not 200.
    """
    page = requests.get(url, timeout=timeout)

    if page.status_code != 200:
        print('bad error code')
        return None

    soup = BeautifulSoup(page.text, 'html')

    blockList = soup.find('ul', class_="block-list")
    if blockList is None:
        # find() gives None when nothing matches; report "no entries"
        # instead of failing on None.find_all
        return []

    return blockList.find_all('li', class_='popbox')

In [30]:
#returns a list of modblocks from pages start to end (inclusive)
#waits rr seconds before each page request (at most 1/rr pages per second)
#both start and end should be a positive integer, start <= end
#verbose decides whether to print status or not
def get_nexus_mods_from_pages(start = 1, end = 1, rr = 1, verbose=False):
    """Scrape search-result pages start..end and return their mods.

    start, end -- first and last page number (both included).
    rr         -- delay in seconds applied before every page request.
    verbose    -- when True, print a progress line and a final 'done'.

    Pages for which get_nexus_mods returns None are reported and skipped.
    Returns a list with one ModBlock per entry found.
    """
    timer = StopWatch(rr)
    mods = []
    for i in range(start, end+1):
        if verbose:
            print('\rgetting page {0}/{1}'.format(i,end), end=' ')

        timer.start()
        url = 'http://www.nexusmods.com/skyrim/mods/searchresults/?src_order=3&src_sort=0&src_view=1&src_tab=1&src_language=0&page='+str(i)+'&pUp=1'

        modList = get_nexus_mods(url)
        if modList is None:
            print('no mods on page {0}\nare you sure you\'re in range?'.format(url))
            # nothing to iterate for this page; carry on with the next one
            continue
        mods.extend(ModBlock(e) for e in modList)
    if verbose:
        print('\ndone')
    return mods

In [69]:
#demo: scrape result pages 1 to 3 (start defaults to 1) and print progress
mods = get_nexus_mods_from_pages(end=3, verbose=True)

getting page 3/3 
done


In [70]:
#show the fields of the first mod in the scraped list
mods[0].print_mod()

url: http://www.nexusmods.com/skyrim/mods/607/
likes: 131,006
downloads: 17,062,541
name: Skyrim HD - 2K Texture...
description: 
						The most downloaded high resolution texture mod for skyrim.						
created: Released: 19/11/2011 - 01:03AM
updated: Updated: 04/10/2015 - 12:49PM
created by: NebuLa1


In [63]:
import json, os

#mlist is a list of ModBlocks.
#This function writes mlist to a json file
def modblock_to_json(mlist=None, name = 'mods.json', mode='w'):
    """Write mods to a JSON file as one object mapping mod id -> field list.

    mlist -- iterable of ModBlocks (objects with get_id() and to_list());
             None is treated as an empty list.
    name  -- path of the JSON file.
    mode  -- 'w' replaces the file. 'a' merges the new mods with what the
             file already holds; for an id present in both, the entry
             already in the file is kept. Any other value is passed to
             open() unchanged.
    """
    if mlist is None:
        mlist = []
    jall = {mod.get_id(): mod.to_list() for mod in mlist}

    open_mode = mode
    if mode == 'a':
        if os.path.isfile(name):
            # Merge the stored entries as they are. Rebuilding ModBlocks
            # and calling to_list() again would re-quote every description.
            with open(name, 'r') as infile:
                jall.update(json.load(infile))
        # Rewrite the whole file: appending a second JSON document after
        # the existing one would leave a file that can no longer be parsed.
        open_mode = 'w'

    with open(name, open_mode) as outfile:
        json.dump(jall, outfile)

In [57]:
import json

#returns a list of modblocks generated from name
def json_to_modblock(name='mods.json'):
    """Load the JSON file `name` and rebuild one ModBlock per stored entry.

    The file is read as a single JSON object; each of its values is passed
    to ModBlock.from_list. Returns the rebuilt ModBlocks as a list.
    """
    with open(name, 'r') as source:
        entries = json.load(source)

    def _rebuild(fields):
        # from_list fills the block in place and returns nothing
        block = ModBlock()
        block.from_list(fields)
        return block

    return [_rebuild(fields) for fields in entries.values()]

<h1 style="color: red; font-size: 60px;">Don't run</h1>
<p style="font-size: 16px">The cells below should be run only once, to grab every mod on the Skyrim site.
   Use that result as the base for further site grabs.<br>
   Otherwise, load the mod lists from the JSON files instead.</p>

In [None]:
#get all the mods from the skyrim site
#save them to json files: 13 files of 100 result pages each (pages 1-1300)
for batch in range(1, 14):
    first_page = 100*(batch-1)+1
    last_page = 100*batch
    allmods = get_nexus_mods_from_pages(start=first_page, end=last_page, verbose=True)
    modblock_to_json(allmods, name=('skyrim_mods_'+str(batch)+'.json'))
        

In [None]:
#scrape the remaining pages 1301-1322 and save them as a 14th json file
allmods = get_nexus_mods_from_pages(start=1301, end=1322, verbose=True)
modblock_to_json(allmods, name='skyrim_mods_14.json')

<p style="font-size: 24px">the script below shows how to use the json files with modblocks</p>

In [110]:
#load the saved mods back into a list of ModBlocks
modsj = json_to_modblock('skyrim_mods_all.json')
#fields are read as plain attributes (this value is not displayed,
#only the last expression of the cell is)
modsj[1].url
#number of mods that were loaded
len(modsj)

39654