<h1 style = "font-size:25px">Nexus site scraping</h1>
<h2 style="font-size:15px">  Author: Jacob Massengill, et al. (add your name if you edit this)</h2>

<p>This notebook defines the classes and functions used to scrape mod listings from the Nexus Mods Skyrim site.
</p>

In [3]:
#define a timer which will be used to 
#regulate the speed at which pages will be
#retrieved

#pages retrieved/sec = (1/self.mtime)
import time
class StopWatch:
    """Fixed-delay timer used to throttle how fast pages are retrieved.

    Calling ``start()`` once per request blocks for ``mtime`` seconds, which
    limits the caller to roughly ``1 / mtime`` pages per second.
    """

    def __init__(self, time):
        """Store the delay.

        Args:
            time: delay in seconds applied by every ``start()`` call. The
                parameter name shadows the ``time`` module inside this method
                only; it is kept so existing callers are unaffected.
        """
        self.mtime = time

    def start(self):
        """Block the caller for ``self.mtime`` seconds.

        Sleeps instead of spinning in a ``while`` loop, so waiting does not
        keep a CPU core busy. A zero or negative delay returns immediately.
        """
        if self.mtime > 0:
            time.sleep(self.mtime)

In [None]:
#StopWatch demo: count down from 10 to 1,
#waiting one second before each number is printed
timer = StopWatch(1)
for i in range(9, -1, -1):
    timer.start()
    print (i+1, end=" ")

In [4]:
#container class for information on mod
class ModBlock:
    """Container for the information scraped about one mod.

    The attributes ``url``, ``likes``, ``downloads``, ``name``, ``des``,
    ``created``, ``update`` and ``creator`` are set by ``from_html`` or
    ``from_list``. A block created with ``html=None`` has none of them until
    one of those methods is called.
    """

    # (label printed by print_mod, attribute name), in display order
    _PRINT_FIELDS = (
        ('url', 'url'),
        ('likes', 'likes'),
        ('downloads', 'downloads'),
        ('name', 'name'),
        ('description', 'des'),
        ('created', 'created'),
        ('updated', 'update'),
        ('created by', 'creator'),
    )

    def __init__(self,html=None):
        """Create a block, filling it from ``html`` when one is given."""
        # Identity test: we only care whether an element was passed at all.
        if html is not None:
            self.from_html(html)

    def print_mod(self):
        """Print every field on its own ``label: value`` line."""
        for label, attr in self._PRINT_FIELDS:
            print('{0}: {1}'.format(label, getattr(self, attr)))

    #should be an entry from get_nexus_mods
    def from_html(self, html):
        """Fill the block from one parsed mod entry.

        ``html`` must support ``find(name, class_=..., href=...)`` the way the
        entries returned by ``get_nexus_mods`` do. A missing element makes
        ``find`` return None, so the following subscript / ``.text`` access
        raises (TypeError / AttributeError).
        """
        self.url = html.find('a', class_='image bubble-open pb-hover pb-left pb-ajax pb-forceclose', href=True)['href']
        self.likes = html.find('span', class_='likes').text
        self.downloads = html.find('span', class_='downloads').text
        self.name = html.find('a', class_='title')['title']
        # NOTE(review): class_=None presumably selects the first <div> that has
        # no class attribute -- confirm against the parser's documentation.
        self.des = html.find('div', class_=None).text
        self.created = html.find('div', class_='category-file-hover-released').text
        self.update = html.find('div', class_='category-file-hover-updated').text
        self.creator = html.find('a', class_='user').text

    @staticmethod
    def _strip_wrapping_quotes(text):
        """Remove the one pair of single quotes that ``to_list`` adds."""
        if isinstance(text, str) and len(text) >= 2 and text[0] == '\'' and text[-1] == '\'':
            return text[1:-1]
        return text

    #len(mlist) = 8
    def from_list(self, mlist):
        """Fill the block from a list laid out like the result of ``to_list``.

        Order: url, likes, downloads, name, description, created, updated,
        creator. ``to_list`` wraps the description in single quotes, so one
        surrounding pair is stripped here. Without that, every save/load
        round trip would add another layer of quotes.

        Raises:
            IndexError: if ``mlist`` has fewer than 8 items.
        """
        self.url = mlist[0]
        self.likes = mlist[1]
        self.downloads = mlist[2]
        self.name = mlist[3]
        self.des = self._strip_wrapping_quotes(mlist[4])
        self.created = mlist[5]
        self.update = mlist[6]
        self.creator = mlist[7]

    def get_id(self):
        """Return the first all-digit path segment of ``self.url`` as a string.

        Raises:
            IndexError: if the url contains no all-digit segment.
        """
        numeric_parts = [s for s in self.url.split('/') if s.isdigit()]
        return numeric_parts[0]

    def to_list(self):
        """Return the fields as a list, with the description single-quoted.

        ``from_list`` is the inverse of this method.
        """
        data = [self.url, self.likes, self.downloads, self.name, '\''+self.des+'\'', self.created, self.update, self.creator]
        return data

In [5]:
import requests
from bs4 import BeautifulSoup
#gets a list of the nexus mods at url
#currently just gets the html. 
#change this to put things into a class
def get_nexus_mods(url, timeout=30):
    """Download ``url`` and return the mod entries found on the page.

    Args:
        url: address of a search-results page.
        timeout: seconds to wait for the server before ``requests`` raises a
            timeout error. Without a timeout a stalled connection would block
            the scrape forever.

    Returns:
        The ``li.popbox`` elements inside the page's ``ul.block-list``;
        an empty list when the page has no ``ul.block-list``;
        ``None`` (after printing a message) when the HTTP status is not 200.
    """
    page = requests.get(url, timeout=timeout)

    if page.status_code != 200:
        print('bad error code')
        return None

    soup = BeautifulSoup(page.text, 'html5lib')

    blockList = soup.find('ul', class_="block-list")
    if blockList is None:
        # find() returns None when nothing matches; calling find_all on it
        # would raise AttributeError.
        return []

    return blockList.find_all('li', class_='popbox')

In [6]:
#returns a list of modblocks from pages start to end (inclusive)
#waits rr seconds before requesting each page (about 1/rr pages per second)
#both start and end should be positive integers, start <= end
#verbose decides whether to print status or not
#site is the name of the site you wish to gather from. multi-word names should be pushed
#together. ex) 'worldoftanks'
def get_nexus_mods_from_pages(start = 1, end = 1, rr = 1, site='skyrim',verbose=False):
    """Collect a ModBlock for every mod listed on pages ``start`` to ``end``.

    Args:
        start: first results page to fetch (inclusive).
        end: last results page to fetch (inclusive).
        rr: seconds to wait before each page request.
        site: site name as it appears in the url, e.g. 'skyrim' or
            'worldoftanks'.
        verbose: when True, print progress on a single updating line.

    Returns:
        A list of ModBlock objects in page order. A page for which
        ``get_nexus_mods`` returns None is reported and skipped.
    """
    timer = StopWatch(rr)
    mods = []
    for i in range(start, end+1):
        if verbose:
            # '\r' returns to the line start so the counter overwrites itself
            print('\rgetting page {0}/{1}'.format(i,end), end=' ')

        timer.start()
        url = 'http://www.nexusmods.com/'+site+'/mods/searchresults/?src_order=3&src_sort=0&src_view=1&src_tab=1&src_language=0&page='+str(i)+'&pUp=1'

        modList = get_nexus_mods(url)
        if modList is None:
            print('no mods on page {0}\nare you sure you\'re in range?'.format(url))
            # Nothing to iterate for this page; looping over None would raise
            # TypeError and discard everything collected so far.
            continue
        for e in modList:
            mods.append(ModBlock(e))
    if verbose:
        print('\ndone')
    return mods

In [7]:
#demo: fetch result pages 1-3 from the skyrim site, then from the oblivion site
mods, obmods = [
    get_nexus_mods_from_pages(end=3, verbose=True, site=site_name)
    for site_name in ('skyrim', 'oblivion')
]

getting page 3/3 
done
getting page 3/3 
done


In [8]:
#show the first mod scraped from each site
for first_mod in (mods[0], obmods[0]):
    first_mod.print_mod()

url: http://www.nexusmods.com/skyrim/mods/607/
likes: 132,727
downloads: 17,114,406
name: Skyrim HD - 2K Textures
description: 
						The most downloaded high resolution texture mod for skyrim.						
created: Released: 19/11/2011 - 01:03AM
updated: Updated: 12/10/2015 - 01:47AM
created by: NebuLa1
url: http://www.nexusmods.com/oblivion/mods/5296/
likes: 27,446
downloads: 1,527,021
name: Unofficial Oblivion Patch
description: 
						An Oblivion mod that fixes over 2,500 bugs (and 70,000 object placement errors.)						
created: Released: 02/07/2006 - 11:34AM
updated: Updated: 10/08/2015 - 06:53AM
created by: QuarnAndKivan


In [9]:
import json, os

#mlist is a list of ModBlocks.
#This function writes mlist to a JSON file
def modblock_to_json(mlist=None, name = 'mods.json', mode='w'):
    """Write ModBlocks to ``name`` as one JSON object keyed by mod id.

    Args:
        mlist: iterable of objects providing ``get_id()`` and ``to_list()``.
            ``None`` is treated as empty.
        name: path of the JSON file to write.
        mode: 'w' replaces the file. 'a' merges ``mlist`` with the entries
            already stored in ``name``; for an id present in both, the stored
            entry is kept. Any other value is passed straight to ``open``.
    """
    jall = {}
    for mod in (mlist if mlist is not None else ()):
        jall[mod.get_id()] = mod.to_list()

    if mode == 'a' and os.path.isfile(name):
        for mod in json_to_modblock(name):
            jall[mod.get_id()] = mod.to_list()

    # The merged dict already holds the old entries, so the file has to be
    # rewritten. Opening it with 'a' would append a second JSON document
    # after the first and leave the file unparseable.
    write_mode = 'w' if mode == 'a' else mode
    with open(name, write_mode) as outfile:
        json.dump(jall, outfile)

In [10]:
import json

#returns a list of modblocks generated from name
def json_to_modblock(name='mods.json'):
    """Load the JSON file ``name`` and rebuild one ModBlock per stored entry.

    The file must hold a JSON object whose values are field lists accepted by
    ``ModBlock.from_list``; the keys are ignored. Blocks are returned in the
    order the entries appear in the file.
    """
    with open(name, 'r') as source:
        stored = json.load(source)

    def _rebuild(fields):
        #an empty ModBlock is created first, then filled from the list
        block = ModBlock()
        block.from_list(fields)
        return block

    return [_rebuild(fields) for fields in stored.values()]

<h1 style="color: red; font-size: 60px;">Don't run</h1>
<p style="font-size: 16px">The cells below only need to be run once, to grab every mod on the Skyrim site.
   Use them as a basis for further site grabs.<br>
   Otherwise, load the mod lists from the saved JSON files instead.</p>

In [None]:
#scrape the skyrim site in 13 batches of 100 result pages each
#and save every batch to its own json file (skyrim_mods_1.json, skyrim_mods_2.json, ...)
for i in range(13):
    first_page = 100*i + 1
    last_page = first_page + 99
    allmods = get_nexus_mods_from_pages(start=first_page, end=last_page, verbose=True, rr=0.1)
    modblock_to_json(allmods, name='skyrim_mods_{0}.json'.format(i+1))
        

In [None]:
#pages 1301-1327 are scraped separately and saved as batch 14
allmods = get_nexus_mods_from_pages(rr=0.1, verbose=True, start=1301, end=1327)
modblock_to_json(name='skyrim_mods_14.json', mlist=allmods)

In [None]:
import json
#merge the 14 batch files (skyrim_mods_1.json ... skyrim_mods_14.json) into
#one dict keyed by mod id and save it as skyrim_mods_all.json
allmods = {}
for i in range(14):
    #'with' closes each batch file as soon as it is read; a bare open()
    #inside json.load would leave the handles open
    with open('skyrim_mods_'+str(i+1)+'.json','r') as batch_file:
        mods = json.load(batch_file)
    allmods.update(mods)
with open('skyrim_mods_all.json', 'w') as file:
    json.dump(allmods, file)

<p style="font-size: 24px">the script below shows how to use the json files with modblocks</p>

In [11]:
import locale
#load every mod, order them from most to least downloaded, and save them back.
#download counts are strings such as '17,114,406'; the thousands separators are
#removed directly instead of switching the process-wide locale to en_US.UTF-8,
#which raises locale.Error on systems where that locale is not installed
modsj = json_to_modblock('skyrim_mods_all.json')
modsj.sort(key=lambda i: int(i.downloads.replace(',', '')))
#ascending sort followed by reverse(): descending order
modsj.reverse()

modblock_to_json(modsj, 'skyrim_mods_all.json')

In [None]:
#Script to count how many mods have a graph (a "Stats" tab on the mod page)
import json
import requests
from bs4 import BeautifulSoup

print("Hi")
allmods = json_to_modblock('skyrim_mods_all.json')

#running total of "Stats" tabs seen across all mod pages
statscount = 0

#wait 0.1 seconds before each page request
timer = StopWatch(.1)

for i, mod in enumerate(allmods):
    timer.start()

    page = requests.get(mod.url)
    soup = BeautifulSoup(page.text, 'html')

    tabs = soup.find_all('span', class_="desc")
    print(i)

    #every span whose text is exactly "Stats" adds one to the total
    statscount += sum(1 for tab in tabs if tab.text == "Stats")

print(statscount)