# Part 1: Gathering Forum Data

Author: David Skarbrevik

W266 NLP course 

Fall 2017

Part 1 of Final Project

### Make a class to gather the data

In [9]:
from bs4 import BeautifulSoup as soup
from urllib.request import urlopen
import re
import os
import pandas as pd
from collections import defaultdict

class Blizzard_Forum_Scraper():
    
    overwatch_forum_url = "overwatch/22813879"
    sc2_forum_url = "sc2/40568"
    
    def __init__(self, forum_type):
        
        if forum_type not in ["sc2","overwatch"]:
            raise NameError("please set 'forum_type' paramater to 'sc2' or 'overwatch'") 
        
        self.forum_type = forum_type
        self.tmp_data = defaultdict(lambda: []) # all data will be stored here and converted to Pandas Dataframe
        self.data_pd = pd.DataFrame()

    def get_forum_pages(self, initial_page = 1, max_page = 10000, output=True):
        
        ##########################################
        # Step 1) try to catch user input errors #
        ##########################################
        if type(initial_page) != int:
            raise TypeError("type(initial_page) was set to a '{}' but must be type 'int'".format(type(initial_page.__name__)))
        elif initial_page <= 0:
            raise Exception("initial_page must be an 'int' greater than 0")
    
        if type(max_page) != int:
            raise TypeError("type(max_page) was set to a '{}' but must be type 'int'".format(type(max_page.__name__)))
        elif max_page < initial_page:
            raise Exception("max_page must be greater than or equal to initial_page")
        if output not in (True, False):
            raise TypeError("output parameter must be a boolean.")
        
        if output:
            file_name = "{0}_database.csv".format(self.forum_type)
            save_location = os.getcwd()+"\{}".format(file_name)
            if os.path.isfile(file_name):
                raise Exception("{} already exists. Please remove that file before calling this function.".format(file_name)) 
        else:
            raise NameError('set output parameter to "csv" or "json" or False') 

        
        ##########################################################
        # Step 2) set up important variables for data collection #
        ##########################################################
        if self.tmp_data:
            self.tmp_data = defaultdict(lambda: [])
        
        if self.forum_type == "sc2":
            forum_of_interest = self.sc2_forum_url
        elif self.forum_type == "overwatch":
            forum_of_interest = self.overwatch_forum_url

        url_page_num = initial_page
        forum_data_class = re.compile(r'^ForumTopic[^(-timestamp*)]*$')
        num_page_count = 0 # important for batch saving to disk


        ###########################################################
        # Step 3) loop over all desired pages to gather html data #
        ###########################################################
        while True:
            num_page_count += 1
#             if url_page_num % 10 == 0:
#                 print("Currently reading page {}".format(url_page_num))
            forum_url = 'https://us.battle.net/forums/en/{specific_forum}/?page={curr_page}'.format(specific_forum = forum_of_interest,                            
                                                                                            curr_page = url_page_num)
            
            uClient = urlopen(forum_url)
            page_html = uClient.read()
            uClient.close()
            
            print("{} was opened and read successfully.".format(forum_url)) # for progress checking during runtime
            
            # parse the HTML
            page_soup = soup(page_html, "html.parser")
            
            # stop if no more posts
            if len(page_soup) == 0:
                break
            
            soup_details = page_soup.findAll("a",{"class":forum_data_class})
            num_post_per_page = 0
            # extract all meaningful data from soup and save in a dictionary
            for i in range(len(soup_details)):
            
                tmp_date = soup_details[i]['data-created-date']
                tmp_time = soup_details[i]['data-created-time']
                tmp_user = soup_details[i].find('span',{"ForumTopic-author"}).text
                tmp_replies = soup_details[i].find('span',{"ForumTopic-replies"}).text
                tmp_last_reply_time = soup_details[i].find('span',{"ForumTopic-timestamp"}).text.strip()
                tmp_topic = soup_details[i].find('span',{"ForumTopic-title"}).text.strip()
                tmp_text = soup_details[i].find('span',{"ForumTopic--preview"}).text
                
                tmp_list = [tmp_date, tmp_time, tmp_user, tmp_replies, tmp_last_reply_time, tmp_text]
                for val in tmp_list:
                    if val is None:
                        raise Exception("Received None type data")
                num_post_per_page += 1
                self.tmp_data["user"].append(tmp_user)
                self.tmp_data["topic"].append(tmp_topic)
                self.tmp_data["time"].append(tmp_time)
                self.tmp_data["date"].append(tmp_date)
                self.tmp_data["text"].append(tmp_text)
                self.tmp_data["num_replies"].append(tmp_replies)
                self.tmp_data["time_last_reply"].append(tmp_last_reply_time)
                
            
            print("There were {0} posts in page {1}.".format(num_post_per_page, url_page_num )) # for quality checking
            
            # save in batches if parsing a lot of pages
            if output is not False and (num_page_count % 100) == 0:
                if self.data_pd.empty: # first time saving so include headers
                    self.data_pd = pd.DataFrame.from_dict(self.tmp_data)
                    self.data_pd.to_csv(file_name, index=False)
                else: 
                    self.data_pd = pd.DataFrame.from_dict(self.tmp_data)
                    self.data_pd.to_csv(file_name, mode='a', header=False, index=False)
             
                self.tmp_data = defaultdict(lambda: [])    
 
            # stop if max page reached
            if url_page_num >= max_page:
                break
             
            url_page_num += 1
        
        #########################################################################
        # Step 4) save to disk anything that wasn't already batch saved earlier #
        #########################################################################
        if output is not False:
            if self.data_pd.empty:
                self.data_pd = pd.DataFrame.from_dict(self.tmp_data)  
                self.data_pd.to_csv(file_name, index=False)
            else:
                self.data_pd = pd.DataFrame.from_dict(self.tmp_data)
                self.data_pd.to_csv(file_name, mode='a', index=False, header=False)
        
        return "Web scrape complete! Database was saved to {}".format(save_location)


### Using our new scraper class

In [23]:
WebScraper = Blizzard_Forum_Scraper(forum_type="overwatch")

In [24]:
WebScraper.get_forum_pages(initial_page=7692, max_page=8655)

https://us.battle.net/forums/en/overwatch/22813879/?page=7692 was opened and read successfully.
There were 50 posts in page 7692.
https://us.battle.net/forums/en/overwatch/22813879/?page=7693 was opened and read successfully.
There were 50 posts in page 7693.
https://us.battle.net/forums/en/overwatch/22813879/?page=7694 was opened and read successfully.
There were 50 posts in page 7694.
https://us.battle.net/forums/en/overwatch/22813879/?page=7695 was opened and read successfully.
There were 50 posts in page 7695.
https://us.battle.net/forums/en/overwatch/22813879/?page=7696 was opened and read successfully.
There were 50 posts in page 7696.
https://us.battle.net/forums/en/overwatch/22813879/?page=7697 was opened and read successfully.
There were 50 posts in page 7697.
https://us.battle.net/forums/en/overwatch/22813879/?page=7698 was opened and read successfully.
There were 50 posts in page 7698.
https://us.battle.net/forums/en/overwatch/22813879/?page=7699 was opened and read successf

'Web scrape complete! Database was saved to C:\\Users\\skarb\\Desktop\\GitHub\\Analyzing-Forum-Text\\overwatch_database.csv'

***

## Testing Playground

In [167]:
tester = defaultdict(lambda: defaultdict(lambda: []))

tester["something"]["something_else"].append("some more text")
tester["something"]["something_else"].append("some other text")

tester["something"]["something_else"]

['some more text', 'some other text']

In [23]:
fake_data =  pd.DataFrame()

if fake_data.empty:
    print("no df")
else:
    print("there is a df")

no df


In [26]:
test = False

if test is not False:
    print("hi")

In [97]:
class test_class():
    def __init__(self):
        self.test_val = test_val
        
    def change_val(self):
        self.test_val = 4
    
    def print_val(self):
        print (self.test_val)

TypeError: int() takes at most 2 arguments (3 given)

In [27]:
test = defaultdict(lambda:0)

test["user"] = ["user1"]
test["user"].append("user2")
test["rating"] = ["3.0"]
test["rating"].append("2.0")

test_to_pd = pd.DataFrame.from_dict(test)
test_to_pd."{}".format("to_csv")("test_file.csv", index=False)

SyntaxError: invalid syntax (<ipython-input-27-2ff6dca1a916>, line 9)

In [22]:
test = defaultdict(lambda:0)
test["user"] = ["user3"]
test["user"].append("user4")
test["rating"] = ["4.0"]
test["rating"].append("2.0")
df = pd.DataFrame.from_dict(test)
df.to_csv("test_file.csv", mode='a', header=False, index=False)
# with open('test_file.csv', 'a') as file:
#     df.to_csv(file, header=False, index=False)

In [4]:
!dir

 Volume in drive C has no label.
 Volume Serial Number is 6EC3-2C9D

 Directory of C:\Users\skarb\Desktop\GitHub\Analyzing-Forum-Text

11/17/2017  12:24 PM    <DIR>          .
11/17/2017  12:24 PM    <DIR>          ..
11/16/2017  10:57 AM    <DIR>          .ipynb_checkpoints
11/16/2017  04:29 PM            66,757 EDA_on_SC2.ipynb
11/15/2017  11:50 AM                53 README.md
11/16/2017  10:44 AM       136,349,248 sc2_database.csv
11/17/2017  12:24 PM                40 test_file.csv
11/17/2017  12:22 PM           314,189 WebScrapping.ipynb
               5 File(s)    136,730,287 bytes
               3 Dir(s)  15,519,404,032 bytes free


In [95]:
tester = test_class(3)
tester.print_val()
tester.change_val()
tester.print_val()
print(tester.test_val)

3
4
4


In [119]:
data = {"user1":{"text":["this, is some text they posted","some other post from user1"],"time":["11/2/2017","11/3/2017"]} , 
        "user2":{"text":["someone else said this thing"],"time":["11/3/2017"]}}

#data_pd = pd.DataFrame.from_dict(data, orient="index")

In [122]:
from collections import defaultdict
data = defaultdict(lambda: 0)
data["someone else"] = "hi again"
print(data)

defaultdict(<function <lambda> at 0x000002D21F3EC048>, {'someone else': 'hi again'})


In [80]:
new_user = "user5"

if new_user in data:
    print("he's there")
else:
    print("nope")
    
data[new_user] = {"text":["hiiii"],"time":["11/24/2017"]}
data_pd_again = pd.DataFrame.from_dict(data, orient="index")

he's there


In [81]:
data_pd_again

Unnamed: 0,text,time
user1,"[this, is some text they posted, some other po...","[11/2/2017, 11/3/2017]"
user2,[someone else said this thing],[11/3/2017]
user5,[hiiii],[11/24/2017]


In [None]:
new_user = 

In [105]:
new_data_pd = pd.DataFrame.from_dict(data, orient="index")
new_data_pd

Unnamed: 0,text,time
user1,"[this, is some text they posted, some other po...","[11/2/2017, 11/3/2017]"
user2,[someone else said this thing],[11/3/2017]


In [106]:
new_data_pd.to_json("test_file.json")

In [99]:
new_data_pd

Unnamed: 0,text,time
user1,"[this, is some text they posted, some other po...","[11/2/2017, 11/3/2017]"
user2,"[someone else said this thing, some good ol text]","[11/3/2017, 11/4/2017]"


In [24]:
data_pd.head()

Unnamed: 0,text,time
user1,"[this, is some text they posted, some other po...","[11/2/2017, 11/3/2017]"
user2,someone else said this thing,11/3/2017


In [26]:
data_pd.index

Index(['user1', 'user2'], dtype='object')

In [50]:
new_data = {"user3":{"text":["making a new forum post"],"time":["11/4/2017"]}}
new_data_pd = pd.DataFrame.from_dict(new_data, orient="index")
data_pd = data_pd.append(new_data_pd)

data_pd

Unnamed: 0,text,time
user1,"[this, is some text they posted, some other po...","[11/2/2017, 11/3/2017]"
user2,[someone else said this thing],[11/3/2017]
user3,[making a new forum post],[11/4/2017]


In [45]:
data_pd

Unnamed: 0,text,time
user1,"[this, is some text they posted, some other po...","[11/2/2017, 11/3/2017]"
user2,someone else said this thing,11/3/2017
