# Part 1: Gathering Forum Data

Author: David Skarbrevik

W266 NLP course 

Fall 2017

Part 1 of Final Project

### Make a class to gather the data

In [3]:
from bs4 import BeautifulSoup as soup
from urllib.request import urlopen
import re
import os
import ast
import sys
import pandas as pd
from collections import defaultdict

class Blizzard_Forum_Scraper():
    """Scrape posts from the Blizzard StarCraft II or Overwatch forums.

    Scraped posts are buffered in ``tmp_data`` (one list per column). The
    buffer is converted to the ``data_pd`` DataFrame and, when requested,
    appended to ``<forum_type>_database.csv`` in the current working
    directory. URLs that could not be opened after all retries are recorded
    in ``url_open_errors``.
    """

    overwatch_forum_url = "overwatch/22813879"
    sc2_forum_url = "sc2/40568"

    # column names of the scraped data, in the order they are stored
    _columns = ("user", "date", "time", "upvotes", "downvotes",
                "op?", "topic", "text", "user_posts", "topic_id")

    def __init__(self, forum_type):
        """Create a scraper for one forum.

        forum_type: "sc2" or "overwatch".
        Raises NameError for any other value.
        """
        if forum_type not in ["sc2","overwatch"]:
            raise NameError("please set 'forum_type' paramater to 'sc2' or 'overwatch'")

        self.forum_type = forum_type
        self.data_pd = pd.DataFrame()
        self.url_open_errors = []
        # this is where data will be stored until it is transfered to our Pandas Dataframe
        self.tmp_data = self._empty_buffer()

    @classmethod
    def _empty_buffer(cls):
        """Return a fresh column buffer: one empty list per column name."""
        return {column: [] for column in cls._columns}

    @staticmethod
    def _read_url(url, attempts=10):
        """Return the raw bytes served at url, or None if every attempt to open it fails."""
        for _ in range(attempts):
            try:
                uClient = urlopen(url)
            except Exception:
                # network hiccup or HTTP error: try again
                continue
            try:
                return uClient.read()
            finally:
                uClient.close()
        return None

    def _save_buffer(self, save_location):
        """Convert the buffer to data_pd and write it to save_location (append if the file exists)."""
        self.data_pd = pd.DataFrame.from_dict(self.tmp_data)
        if os.path.isfile(save_location):
            # file already has a header row, so only append the data rows
            self.data_pd.to_csv(save_location, mode='a', header=False, index=False)
        else:
            # first time saving so include headers
            self.data_pd.to_csv(save_location, index=False)

    # helper function for get_forum_data()
    def get_post_data(self, url):
        """Buffer every post of the topic at url and return the number of pages read.

        Pages are requested as "<url>?page=N" starting at N=1 until the site
        answers with its error page (most topics have a single page). A page
        that cannot be opened after all retries is recorded in
        url_open_errors and ends the topic, instead of being retried forever.
        """
        index = 1

        # loop over pages of a specific post until the post has no more pages
        while True:
            paged_url = "{0}?page={1}".format(url,index)
            page_html = self._read_url(paged_url)
            if page_html is None:
                self.url_open_errors.append(paged_url)
                return index-1

            page_soup = soup(page_html, "html.parser")

            # if no more pages, we will see an error page
            if page_soup.find("section",{"class":"Error"}):
                return index-1

            # topic-level values are the same for every reply on this page
            topic_data = page_soup.find("section",{"class":"Topic"})["data-topic"]
            topic_id = re.search(r'(?<="id":)\d*', topic_data).group()
            title_tag = page_soup.find("span",{"class":"Topic-title"})

            for reply in page_soup.find_all("div", {"class":"TopicPost"}):
                some_data = ast.literal_eval(reply["data-topic-post"])
                time_data = reply.find("a",{"class":"TopicPost-timestamp"})["data-tooltip-content"]
                stamp = time_data.split(" ")

                # build the whole row first so a parsing error never leaves
                # the column lists with unequal lengths
                row = {
                    "user": some_data["author"]["name"],
                    "date": stamp[0],
                    "time": stamp[1],
                    "upvotes": some_data["rank"]["voteUp"],
                    "downvotes": some_data["rank"]["voteDown"],
                    "op?": reply["id"] == "post-1",
                    "topic": title_tag.text,
                    "text": reply.find("div",{"class":"TopicPost-bodyContent"}).text,
                    "user_posts": reply.find("span",{"class":"Author-posts"}).text.strip(),
                    "topic_id": topic_id,
                }
                for key, value in row.items():
                    self.tmp_data[key].append(value)
            index += 1

    def get_forum_data(self, initial_page=1, max_page=10000000, output=True):
        """Scrape forum listing pages initial_page..max_page and every topic linked from them.

        initial_page: first page to scrape (int greater than 0).
        max_page: final page to scrape (int, at least initial_page).
        output: if True, save to file, else just keep data in Pandas DF.

        Raises TypeError for arguments of the wrong type, Exception for an
        invalid page range, and Exception after five listing pages in a row
        could not be opened.
        """
        ##########################################
        # Step 1) try to catch user input errors #
        ##########################################
        if type(initial_page) is not int:
            raise TypeError("type(initial_page) was set to a '{}' but must be type 'int'".format(type(initial_page).__name__))
        elif initial_page <= 0:
            raise Exception("initial_page must be an 'int' greater than 0")

        if type(max_page) is not int:
            raise TypeError("type(max_page) was set to a '{}' but must be type 'int'".format(type(max_page).__name__))
        elif max_page < initial_page:
            raise Exception("max_page must be greater than or equal to initial_page")
        if output not in (True, False):
            raise TypeError("output parameter must be a boolean.")

        save_location = None
        if output:
            file_name = "{0}_database.csv".format(self.forum_type)
            save_location = os.path.join(os.getcwd(),file_name)
            if os.path.isfile(save_location):
                print("\x1b[1;31m" + "WARNING: " + "\x1b[0m" + "Webscrape will be appended to {0}. \
                      \n Close program now and rename file if that is not what you wanted.".format(save_location))

        ##########################################################
        # Step 2) set up important variables for data collection #
        ##########################################################
        # start from an empty buffer even if an earlier run left data behind
        self.tmp_data = self._empty_buffer()

        if self.forum_type == "sc2":
            forum_of_interest = self.sc2_forum_url
        elif self.forum_type == "overwatch":
            forum_of_interest = self.overwatch_forum_url

        error_count = 0

        ###########################################################
        # Step 3) loop over all desired pages to gather html data #
        ###########################################################
        for curr_page in range(initial_page, max_page+1):

            ##############################
            # Step 3.1) download webpage #
            ##############################
            forum_url = 'https://us.battle.net/forums/en/{specific_forum}/?page={curr_page}'.format(specific_forum = forum_of_interest,
                                                                                             curr_page = curr_page)
            # in case network connection issue, retry up to 10 times
            page_html = self._read_url(forum_url)
            if page_html is None:
                self.url_open_errors.append(forum_url)
                error_count += 1
                if error_count >= 5:
                    raise Exception("Five failed webpages in a row. Possible IP ban. Check then restart web scrapper at last successfully visited page ({}).".format(curr_page-5))
                continue
            error_count = 0
            print("{} was opened and read successfully.".format(forum_url)) # for progress checking during runtime
            page_soup = soup(page_html, "html.parser")

            # if user page range exceeds pages of forum this will end the loop
            no_more_pages = page_soup.find("span",{"class":"Forum-createFirstTopic"})
            if no_more_pages:
                max_page = curr_page-1
                break

            ###################################
            # Step 3.2) get data from webpage #
            ###################################
            soup_details = page_soup.find_all("a",{"class":"ForumTopic"})
            pages_in_post = 0
            for topic_link in soup_details:
                url = "https://us.battle.net"+topic_link['href']
                pages_in_post += self.get_post_data(url)

            print("There were {0} posts in page {1} with {2} total pages of replies".format(len(soup_details), curr_page, pages_in_post)) # for quality checking

            ############################################################
            # Step 3.3) save data in batches to disk or just Pandas df #
            ############################################################
            if output and (curr_page % 10) == 0:
                print("Finished reading pages {0} to {1}. Saving to disk.".format(curr_page-9, curr_page))
                self._save_buffer(save_location)
                self.tmp_data = self._empty_buffer()

        #########################################################################
        # Step 4) save to disk anything that wasn't already batch saved earlier #
        #########################################################################
        if output:
            self._save_buffer(save_location)
            print("Web scrape complete! Database was saved to {}".format(save_location))
        else:
            # nothing was written to disk, so the whole scrape is still buffered
            self.data_pd = pd.DataFrame.from_dict(self.tmp_data)
            print("Web scrape complete! Data was kept in the data_pd attribute.")

        unopened_links = len(self.url_open_errors)
        print("The last page read was {}".format(max_page))
        print("There were {} pages that could not be opened.".format(unopened_links))
        if unopened_links > 0 and unopened_links < 200:
            print("These are the pages that could not be opened:")
            for link in self.url_open_errors:
                print(link)

    
        

### Using our new scraper class

In [1]:
# Switch to the folder that holds the database file; the scraper builds its
# save location from os.getcwd().
import os
os.chdir("path\\to\\folder")

In [2]:
# Report whether the Overwatch database file is already present in the working directory.
file_name = "overwatch_database.csv"
if os.path.isfile(os.path.join(os.getcwd(), file_name)):
    print("there's already a file there")
else:
    print("no file")

there's already a file there


In [4]:
# Create a scraper bound to the Overwatch forum.
WebScraper = Blizzard_Forum_Scraper(forum_type="overwatch")

In [5]:
# Scrape forum listing pages 9691 through 10000; output defaults to True, so
# results are appended to overwatch_database.csv in the working directory.
WebScraper.get_forum_data(initial_page=9691, max_page=10000)

 Close program now and rename file if that is not what you wanted.
https://us.battle.net/forums/en/overwatch/22813879/?page=9691 was opened and read successfully.
There were 50 posts in page 9691 with 57 total replies
https://us.battle.net/forums/en/overwatch/22813879/?page=9692 was opened and read successfully.
There were 50 posts in page 9692 with 60 total replies
https://us.battle.net/forums/en/overwatch/22813879/?page=9693 was opened and read successfully.
There were 50 posts in page 9693 with 59 total replies
https://us.battle.net/forums/en/overwatch/22813879/?page=9694 was opened and read successfully.
There were 50 posts in page 9694 with 52 total replies
https://us.battle.net/forums/en/overwatch/22813879/?page=9695 was opened and read successfully.
There were 50 posts in page 9695 with 64 total replies
https://us.battle.net/forums/en/overwatch/22813879/?page=9696 was opened and read successfully.
There were 50 posts in page 9696 with 57 total replies
https://us.battle.net/forum

https://us.battle.net/forums/en/overwatch/22813879/?page=9742 was opened and read successfully.
There were 50 posts in page 9742 with 53 total replies
https://us.battle.net/forums/en/overwatch/22813879/?page=9743 was opened and read successfully.
There were 50 posts in page 9743 with 68 total replies
https://us.battle.net/forums/en/overwatch/22813879/?page=9744 was opened and read successfully.
There were 50 posts in page 9744 with 59 total replies
https://us.battle.net/forums/en/overwatch/22813879/?page=9745 was opened and read successfully.
There were 50 posts in page 9745 with 57 total replies
https://us.battle.net/forums/en/overwatch/22813879/?page=9746 was opened and read successfully.
There were 50 posts in page 9746 with 62 total replies
https://us.battle.net/forums/en/overwatch/22813879/?page=9747 was opened and read successfully.
There were 50 posts in page 9747 with 69 total replies
https://us.battle.net/forums/en/overwatch/22813879/?page=9748 was opened and read successfully

There were 50 posts in page 9794 with 77 total replies
https://us.battle.net/forums/en/overwatch/22813879/?page=9795 was opened and read successfully.
There were 50 posts in page 9795 with 61 total replies
https://us.battle.net/forums/en/overwatch/22813879/?page=9796 was opened and read successfully.
There were 50 posts in page 9796 with 56 total replies
https://us.battle.net/forums/en/overwatch/22813879/?page=9797 was opened and read successfully.
There were 50 posts in page 9797 with 55 total replies
https://us.battle.net/forums/en/overwatch/22813879/?page=9798 was opened and read successfully.
There were 50 posts in page 9798 with 64 total replies
https://us.battle.net/forums/en/overwatch/22813879/?page=9799 was opened and read successfully.
There were 50 posts in page 9799 with 56 total replies
https://us.battle.net/forums/en/overwatch/22813879/?page=9800 was opened and read successfully.
There were 50 posts in page 9800 with 57 total replies
Finished reading pages 9791 to 9800. Sa

There were 50 posts in page 9847 with 61 total replies
https://us.battle.net/forums/en/overwatch/22813879/?page=9848 was opened and read successfully.
There were 50 posts in page 9848 with 84 total replies
https://us.battle.net/forums/en/overwatch/22813879/?page=9849 was opened and read successfully.
There were 50 posts in page 9849 with 95 total replies
https://us.battle.net/forums/en/overwatch/22813879/?page=9850 was opened and read successfully.
There were 50 posts in page 9850 with 60 total replies
Finished reading pages 9841 to 9850. Saving to disk.
https://us.battle.net/forums/en/overwatch/22813879/?page=9851 was opened and read successfully.
There were 50 posts in page 9851 with 56 total replies
https://us.battle.net/forums/en/overwatch/22813879/?page=9852 was opened and read successfully.
There were 50 posts in page 9852 with 66 total replies
https://us.battle.net/forums/en/overwatch/22813879/?page=9853 was opened and read successfully.
There were 50 posts in page 9853 with 60 

There were 50 posts in page 9900 with 57 total replies
Finished reading pages 9891 to 9900. Saving to disk.
https://us.battle.net/forums/en/overwatch/22813879/?page=9901 was opened and read successfully.
There were 50 posts in page 9901 with 66 total replies
https://us.battle.net/forums/en/overwatch/22813879/?page=9902 was opened and read successfully.
There were 50 posts in page 9902 with 63 total replies
https://us.battle.net/forums/en/overwatch/22813879/?page=9903 was opened and read successfully.
There were 50 posts in page 9903 with 60 total replies
https://us.battle.net/forums/en/overwatch/22813879/?page=9904 was opened and read successfully.
There were 50 posts in page 9904 with 54 total replies
https://us.battle.net/forums/en/overwatch/22813879/?page=9905 was opened and read successfully.
There were 50 posts in page 9905 with 57 total replies
https://us.battle.net/forums/en/overwatch/22813879/?page=9906 was opened and read successfully.
There were 50 posts in page 9906 with 60 

https://us.battle.net/forums/en/overwatch/22813879/?page=9953 was opened and read successfully.
There were 50 posts in page 9953 with 54 total replies
https://us.battle.net/forums/en/overwatch/22813879/?page=9954 was opened and read successfully.
There were 50 posts in page 9954 with 51 total replies
https://us.battle.net/forums/en/overwatch/22813879/?page=9955 was opened and read successfully.
There were 50 posts in page 9955 with 53 total replies
https://us.battle.net/forums/en/overwatch/22813879/?page=9956 was opened and read successfully.
There were 50 posts in page 9956 with 54 total replies
https://us.battle.net/forums/en/overwatch/22813879/?page=9957 was opened and read successfully.
There were 50 posts in page 9957 with 52 total replies
https://us.battle.net/forums/en/overwatch/22813879/?page=9958 was opened and read successfully.
There were 50 posts in page 9958 with 57 total replies
https://us.battle.net/forums/en/overwatch/22813879/?page=9959 was opened and read successfully

***

## Testing Playground

In [51]:
# Open one forum listing page and print the URL of every topic linked from it.
forum_url = 'https://us.battle.net/forums/en/overwatch/22813879/?page=2'
try:
    uClient = urlopen(forum_url)
except Exception:
    print("Error opening {}".format(forum_url))
else:
    # only touch the response when the page actually opened; otherwise
    # uClient would be undefined (or left over from an earlier cell)
    page_html = uClient.read()
    uClient.close()
    print("{} was opened and read successfully.".format(forum_url)) # for progress checking during runtime
    page_soup = soup(page_html, "html.parser")

    # if user page range exceeds pages of forum this will end the loop
    error_check = page_soup.find("span",{"class":"Forum-createFirstTopic"})
    if error_check:
        print("404 page??")

    soup_details = page_soup.find_all("a",{"class":"ForumTopic"})
    num_post_per_page = 0
    for topic_link in soup_details:
        num_post_per_page += 1

        url = "https://us.battle.net"+topic_link['href']
        print(url)

https://us.battle.net/forums/en/overwatch/22813879/?page=2 was opened and read successfully.
https://us.battle.net/forums/en/overwatch/topic/20761856054
https://us.battle.net/forums/en/overwatch/topic/20762016016
https://us.battle.net/forums/en/overwatch/topic/20760756896
https://us.battle.net/forums/en/overwatch/topic/20761687589
https://us.battle.net/forums/en/overwatch/topic/20761767391
https://us.battle.net/forums/en/overwatch/topic/20762016058
https://us.battle.net/forums/en/overwatch/topic/20761636637
https://us.battle.net/forums/en/overwatch/topic/20761687047
https://us.battle.net/forums/en/overwatch/topic/20761707402
https://us.battle.net/forums/en/overwatch/topic/20761886048
https://us.battle.net/forums/en/overwatch/topic/20762046057
https://us.battle.net/forums/en/overwatch/topic/20761787334
https://us.battle.net/forums/en/overwatch/topic/20761856049
https://us.battle.net/forums/en/overwatch/topic/20760947874
https://us.battle.net/forums/en/overwatch/topic/20762046066
https:/

In [17]:
# Check how two list-valued defaultdicts combine into a single plain dict.
tmp_data = defaultdict(list)
tmp_data2 = defaultdict(list)

tmp_data["something"].append("some value")
tmp_data2["something_else"].append("some other value")

# keys from tmp_data2 are added after (and would override) those of tmp_data
tmp_data = dict(tmp_data, **tmp_data2)
tmp_data

#                 date = soup_details[i]['data-created-date']
#                 time = soup_details[i]['data-created-time']
#                 user = soup_details[i].find('span',{"ForumTopic-author"}).text
#                 replies = soup_details[i].find('span',{"ForumTopic-replies"}).text
#                 last_reply_time = soup_details[i].find('span',{"ForumTopic-timestamp"}).text.strip()
#                 topic = soup_details[i].find('span',{"ForumTopic-title"}).text.strip()
#                 text = soup_details[i].find('span',{"ForumTopic--preview"}).text
                
#                 # make sure that we're not missing data
#                 tmp_list = [tmp_date, tmp_time, tmp_user, tmp_replies, tmp_last_reply_time, tmp_text]
#                 for val in tmp_list:
#                     if val is None:
#                         raise Exception("Received None type data")        
                
#                 self.tmp_data["user"].append(user)
#                 self.tmp_data["topic"].append(topic)
#                 self.tmp_data["time"].append(time)
#                 self.tmp_data["date"].append(date)
#                 self.tmp_data["text"].append(text)
#                 self.tmp_data["downvotes"].append(replies)
#                 self.tmp_data["upvotes"].append(last_reply_time)

{'something': ['some value'], 'something_else': ['some other value']}

In [33]:
# Download the first page of a single topic and print one record per post.
# (alternative URL without the page parameter:
#  https://us.battle.net/forums/en/overwatch/topic/20761647418)
uClient = urlopen("https://us.battle.net/forums/en/overwatch/topic/20761647418?page=1") # test posts page
page_html = uClient.read()
uClient.close()

page_soup = soup(page_html, "html.parser")

all_replies = page_soup.find_all("div", {"class":"TopicPost"})

# one record, reused and overwritten for every post; key order fixes the print order
user_data = dict.fromkeys(["user", "date", "time", "upvotes", "downvotes",
                           "op?", "topic", "text", "user_posts", "topic_id"], "")

# the topic id sits in the JSON-like "data-topic" attribute of the Topic section
topic_data = page_soup.find("section",{"class":"Topic"})["data-topic"]
user_data["topic_id"] = re.search(r'(?<="id":)\d*', topic_data).group()

for reply in all_replies:
    some_data = ast.literal_eval(reply["data-topic-post"])
    time_data = reply.find("a",{"class":"TopicPost-timestamp"})["data-tooltip-content"]
    stamp = time_data.split(" ")

    user_data["user"] = some_data["author"]["name"]
    user_data["date"] = stamp[0]
    user_data["time"] = stamp[1]
    user_data["upvotes"] = some_data["rank"]["voteUp"]
    user_data["downvotes"] = some_data["rank"]["voteDown"]
    user_data["topic"] = page_soup.find("span",{"class":"Topic-title"}).text
    user_data["text"] = reply.find("div",{"class":"TopicPost-bodyContent"}).text
    user_data["user_posts"] = reply.find("span",{"class":"Author-posts"}).text.strip()
    # the opening post of a topic carries the element id "post-1"
    user_data["op?"] = reply["id"] == "post-1"

    print(user_data)

{'user': 'Terranguard', 'date': '02/12/2018', 'time': '06:58', 'upvotes': 36, 'downvotes': 24, 'op?': True, 'topic': 'make that Moira healing through barriers a feature', 'text': "I don't care if it's a bug or not It's perfect for her as a main healer. Her healing is already on a recourse and she's a main healer. She needs to be able to heal through barriers.https://www.overbuff.com/heroes go here and click competitive and you'll see Mercy is still on top.people couldn't figure out how to click the comp button where I posted this below so I had to clarify it here.", 'user_posts': '9761 posts', 'topic_id': '20761647418'}
{'user': 'alfislegend', 'date': '02/12/2018', 'time': '07:00', 'upvotes': 18, 'downvotes': 29, 'op?': False, 'topic': 'make that Moira healing through barriers a feature', 'text': "If Mei can't freeze through barriers, why the hell should Moira be able to heal through them? It's not like this bug fix will ruin her, she's the strongest healer in the game right now.", 'us

In [30]:
# Verify that the look-behind pattern pulls the first "id" value out of the topic metadata.
test = '''{ "id":20761647418, "lastPosition":90,"forum":{"id":22813879},
 "isSticky":false,"isFeatured":false,"isLocked":false,"isHidden":false,
 "isFrozen":false, "isSpam":false, "pollId":0 }'''

id_match = re.search(r'(?<="id":)\d*', test)
test = id_match.group()

print(test)

20761647418


In [12]:
# Check that str.strip() removes the newlines surrounding the post-count text.
test = '\n\n9759 posts\n\n'.strip()
test

'9759 posts'

In [44]:
# Collect the topic links found in the current page_soup, then open each topic.
# The cell is unfinished: the last statement has no right-hand side.
test = page_soup.find_all("a", {"class":'ForumTopic'})
links = []
print(len(test))
for i in range(len(test)):
    # NOTE(review): the printed hrefs start with "/", so this yields a double slash after the host
    links.append("https://us.battle.net/"+test[i]['href'])
    print(test[i]['href'])

for link in links:
    uClient = urlopen(link)
    page_html = uClient.read()
    uClient.close()
    
    page_soup = soup(page_html, "html.parser")
    
    # NOTE(review): other cells look up TopicPost on "div" elements, not "section" - confirm
    all_replies = page_soup.find_all("section", {"class":"TopicPost"})
    
    for reply in all_replies:
        tmp = 

48
/forums/en/overwatch/topic/20744265634
/forums/en/overwatch/topic/20744215778
/forums/en/overwatch/topic/20761088072
/forums/en/overwatch/topic/20761046611
/forums/en/overwatch/topic/20761806957
/forums/en/overwatch/topic/20761727407
/forums/en/overwatch/topic/20761627391
/forums/en/overwatch/topic/20761667418
/forums/en/overwatch/topic/20761717466
/forums/en/overwatch/topic/20761807478
/forums/en/overwatch/topic/20761827450
/forums/en/overwatch/topic/20761607422
/forums/en/overwatch/topic/20761817498
/forums/en/overwatch/topic/20761637373
/forums/en/overwatch/topic/20761727406
/forums/en/overwatch/topic/20761687591
/forums/en/overwatch/topic/20761667157
/forums/en/overwatch/topic/20761717477
/forums/en/overwatch/topic/20761777431
/forums/en/overwatch/topic/20761767349
/forums/en/overwatch/topic/20761647486
/forums/en/overwatch/topic/20761627390
/forums/en/overwatch/topic/20761687589
/forums/en/overwatch/topic/20761687614
/forums/en/overwatch/topic/20761727410
/forums/en/overwatch/t

In [69]:
# Check that a failed urlopen can be caught and that execution continues afterwards.
import sys
try:
    uClient = urlopen("https://us.battle.net/forums/en/overwatch/topic/20761727407/?page=2")
except Exception:
    # catch ordinary errors only, so Ctrl-C still interrupts the cell
    print("oh nooo")
else:
    #     page_html = uClient.read()
    # release the connection when the page did open
    uClient.close()
print("we made it here")

oh nooo
we made it here


### What we want from posts pages (for each reply/post):


* author
* number of likes and dislikes [DONE]
* number of posts by the author
* time of posting
* text of the post
* is it the OP (original post)?
* text of the OP


In [5]:
# Walk the pages of one topic and print the vote score (upvotes - downvotes)
# of every post, followed by the number of posts on each page.
index = 1
while True:
    uClient = urlopen("https://us.battle.net/forums/en/overwatch/topic/20761727407?page={}".format(index))
    page_html = uClient.read()
    uClient.close()
    page_soup = soup(page_html, "html.parser")
    error_check = page_soup.find("section",{"class":"Error"})
    if error_check:
        # the site answers with an error section once we run past the last page
        print("There's nothing on page {}!".format(index))
        break

    print("there's stuff on page {}!".format(index))

    all_replies = page_soup.find_all("div", {"class":"TopicPost"})
    # raw vote/author metadata of each post, stored as a dict literal in the attribute
    likes = [reply["data-topic-post"] for reply in all_replies]

    for like in likes:
        tmp = ast.literal_eval(like)
        upvotes = tmp["rank"]["voteUp"]
        downvotes= tmp["rank"]["voteDown"]
        print("This post got {} score".format(upvotes - downvotes))
    count = len(likes)
    print(count)
    index += 1

print("now we're out here!")



This post got 9 score
This post got 14 score
This post got -2 score
This post got 7 score
This post got 3 score
This post got 2 score
This post got 2 score
This post got 0 score
This post got -2 score
This post got 0 score
This post got 0 score
This post got 1 score
This post got -2 score
This post got 0 score
This post got 0 score
This post got -1 score
This post got 0 score
17
0
0
0
0
now we're out here!


In [12]:
# Probe consecutive pages of one topic until the site serves its error page.
index = 0
error_check = None
while not error_check:
    index += 1
    uClient = urlopen("https://us.battle.net/forums/en/overwatch/topic/20761727407?page={}".format(index))
    page_html = uClient.read()
    uClient.close()
    page_soup = soup(page_html, "html.parser")
    # an Error section marks a page past the end of the topic
    error_check = page_soup.find("section",{"class":"Error"})
    if not error_check:
        print("there's stuff on page {}!".format(index))
    else:
        print("There's nothing on page {}!".format(index))


there's stuff on page 1!
There's nothing on page 2!


In [53]:
# Display the TopicPost elements currently held in all_replies.
all_replies

[<div class="TopicPost " data-topic='{ "sticky":"false","featured":"false","locked":"true","frozen":"false","hidden":"false","pollId":"0"}' data-topic-post='{"id":"207601793405","valueVoted":0,"rank":{"voteUp":21,"voteDown":12},"author":{"id":"207601283573","name":"Dancewknives"}}' id="post-1">
 <!-- Deprecated: Deeplink for existing Quotes 02/19/2016 -->
 <span id="1"></span>
 <div class="TopicPost-content">
 <aside class="TopicPost-author">
 <div class="Author-block">
 <div class="Author" data-topic-post-body-content="true" id=""><a class="Author-avatar " href="https://playoverwatch.com/en-us/career/xbl/Dancewithknives"><img alt="" src="https://blzgdapipro-a.akamaihd.net/game/unlocks/0x0250000000000C40.png"/></a><div class="Author-details"> <span class="Author-name">
 <a class="Author-name--profileLink" href="https://playoverwatch.com/en-us/career/xbl/Dancewithknives">Dancewknives</a>
 </span>
 <span class="Author-posts">
 <a class="Author-posts" data-toggle="tooltip" data-tooltip-co

In [80]:
# Report whether the user is already a key of data, then (re)set that user's
# entry and rebuild a DataFrame with one row per user.
new_user = "user5"

print("he's there" if new_user in data else "nope")

data[new_user] = {"text":["hiiii"],"time":["11/24/2017"]}
data_pd_again = pd.DataFrame.from_dict(data, orient="index")

he's there


In [81]:
# Display the DataFrame rebuilt from data.
data_pd_again

Unnamed: 0,text,time
user1,"[this, is some text they posted, some other po...","[11/2/2017, 11/3/2017]"
user2,[someone else said this thing],[11/3/2017]
user5,[hiiii],[11/24/2017]


In [None]:
new_user = 

In [105]:
# Build a DataFrame from data with each top-level key as a row (orient="index"), then display it.
new_data_pd = pd.DataFrame.from_dict(data, orient="index")
new_data_pd

Unnamed: 0,text,time
user1,"[this, is some text they posted, some other po...","[11/2/2017, 11/3/2017]"
user2,[someone else said this thing],[11/3/2017]


In [106]:
# Write the DataFrame to test_file.json to try out JSON output.
new_data_pd.to_json("test_file.json")

In [99]:
# Display the current contents of new_data_pd.
new_data_pd

Unnamed: 0,text,time
user1,"[this, is some text they posted, some other po...","[11/2/2017, 11/3/2017]"
user2,"[someone else said this thing, some good ol text]","[11/3/2017, 11/4/2017]"


In [24]:
# Display the first rows of data_pd.
data_pd.head()

Unnamed: 0,text,time
user1,"[this, is some text they posted, some other po...","[11/2/2017, 11/3/2017]"
user2,someone else said this thing,11/3/2017


In [26]:
# Display the row labels of data_pd.
data_pd.index

Index(['user1', 'user2'], dtype='object')

In [50]:
# Add a new user's row to the existing data_pd DataFrame and display the result.
new_data = {"user3":{"text":["making a new forum post"],"time":["11/4/2017"]}}
new_data_pd = pd.DataFrame.from_dict(new_data, orient="index")
# DataFrame.append was removed in pandas 2.0; pd.concat stacks the rows the same way
data_pd = pd.concat([data_pd, new_data_pd])

data_pd

Unnamed: 0,text,time
user1,"[this, is some text they posted, some other po...","[11/2/2017, 11/3/2017]"
user2,[someone else said this thing],[11/3/2017]
user3,[making a new forum post],[11/4/2017]


In [45]:
# Display the current contents of data_pd.
data_pd

Unnamed: 0,text,time
user1,"[this, is some text they posted, some other po...","[11/2/2017, 11/3/2017]"
user2,someone else said this thing,11/3/2017
