In [5]:
import wikichatter as wc
import mwapi
import requests
import json
import csv

In [6]:
# Open a single mwapi session at the top of the notebook and reuse it for every
# API call that follows. The user_agent string identifies this client to the API;
# per the original author's note, working through a session with a user_agent is
# what lets you make repeated calls without being shut out of the API.
session = mwapi.Session(
    'https://en.wikipedia.org',
    user_agent='ewhit',
)

#Creating a query
>`params` holds the parameters you want to use when calling mwapi. This makes it easy to  
manipulate each parameter, particularly "titles". Here, we can see that "titles"  
has been set to ["Wikipedia:List of controversial issues"](https://en.wikipedia.org/wiki/Wikipedia:List_of_controversial_issues).  
These parameters will do the following:
    >>Tell mwapi to make a query (action), to get all of the links present (prop) on the page 
    >>"Wikipedia:List of controversial issues" (titles), and to return this in json format (format)
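>The query will end up looking roughly like this (the code below additionally sends `format=json`, and sets `pllimit` to 1 rather than `max`, so each response carries a single link):  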
    >>[https://en.wikipedia.org/w/api.php?action=query&prop=links&titles=Wikipedia:List%20of%20controversial%20issues&pllimit=max](https://en.wikipedia.org/w/api.php?action=query&prop=links&titles=Wikipedia:List%20of%20controversial%20issues&pllimit=max)

In [7]:
def params():
    """Build the API parameters that list links on the controversial-issues page.

    Returns:
        dict: a ``query`` action asking for the ``links`` property of
        "Wikipedia:List of controversial issues" in JSON format, with
        ``pllimit`` set to "1" (one link per response).
    """
    # Keyword order mirrors the order in which the parameters are sent.
    link_query = dict(
        action="query",
        titles="Wikipedia:List of controversial issues",
        prop="links",
        format="json",
        pllimit="1",
    )
    return link_query


# Build the parameter dict once; it is unpacked into session.get just below.
kwargs = params()

# continuation=True tells session.get to keep requesting the next batch whenever
# the API reports that more results remain, so `query` is a generator of
# responses rather than a single response. Each response holds at most `pllimit`
# links (params() sets that to "1"), and iterating the generator to the end
# walks through *all* of the links on the page.
query = session.get(continuation=True, **kwargs)

In [8]:
# pageList will ultimately hold the talk-page title ("Talk:<article title>") of
# every page linked from the list of controversial issues.
pageList = []

# Each item yielded by `query` is one decoded JSON response shaped like
#   {"query": {"pages": {"<pageid>": {"links": [{"ns": 0, "title": "..."}], ...}}}, ...}
# The titles are read straight from that structure. Slicing the output of
# json.dumps (the previous approach) kept JSON escapes such as "\u00e9" inside
# the title, only ever saw the first link of a response, and produced a garbage
# entry whenever a response carried no "links" at all.
for request in query:
    pages = request.get("query", {}).get("pages", {})
    # "pages" is keyed by page id in the default response format; accept a plain
    # list as well in case the session is ever switched to formatversion=2.
    page_infos = pages.values() if isinstance(pages, dict) else pages
    for page_info in page_infos:
        for link in page_info.get("links", []):
            # NOTE(review): links outside the article namespace (ns != 0) also get
            # the "Talk:" prefix here, as before -- confirm whether they should be
            # skipped instead.
            pageList.append("Talk:" + link["title"])

print("end for loop")

end for loop


#Queries
> Each query will return something like this: 
>> {"query": {"pages": {"27985631": {"links": [{"ns": 0, "title": "1919 World Series"}], "ns": 4, "title": "Wikipedia:List of controversial issues", "pageid": 27985631}}}, "continue": {"plcontinue": "27985631|0|1953_Iranian_coup_d'\u00e9tat", "continue": "||"}}  

>Because we want to query each individual talk page, we will need to parse the title of the talk page from what each query returns, which is what the above code does. After this loop has run, we should have a list of things that look like "Talk:1919 World Series" 

>After parsing the titles and setting the query parameters accordingly, a query that looks like this should be generated:  
>>[https://en.wikipedia.org/w/api.php?action=query&prop=revisions&rvprop=content&titles=Talk:1919%20World%20Series&format=json](https://en.wikipedia.org/w/api.php?action=query&prop=revisions&rvprop=content&titles=Talk:1919%20World%20Series&format=json)  
>This will get the content of the page queried  
>This content is then passed to wc.parse so that it can be parsed - it looks quite unreadable initially  
>This appears to be where errors are happening

In [None]:
# Maps each talk-page title to the parsed structure wikichatter produced for it.
content = {}

# Talk pages that could not be fetched or parsed, mapped to a short reason, so
# failures stay visible instead of being silently dropped.
failed_pages = {}


def param(page):
    """Build the API parameters that fetch the current content of one page.

    Args:
        page: Title of the page to query, e.g. "Talk:1919 World Series".

    Returns:
        dict: a ``query`` action asking for the ``content`` of the page's
        revisions (``prop=revisions``, ``rvprop=content``) in JSON format.
    """
    return {
        "action": "query",
        "titles": page,
        "prop": "revisions",
        "rvprop": "content",
        "format": "json",
    }


def _extract_wikitext(response):
    """Return the wikitext of the first revision in an API response, or None.

    Args:
        response: One decoded JSON response from a ``prop=revisions`` query.

    Returns:
        The revision text as a string, or None when the response holds no
        revision content (for example because the page does not exist).
    """
    pages = response.get("query", {}).get("pages", {})
    # "pages" is keyed by page id in the default format; accept a list as well.
    page_infos = pages.values() if isinstance(pages, dict) else pages
    for page_info in page_infos:
        for revision in page_info.get("revisions", []):
            # Without rvslots the text sits directly on the revision; with
            # rvslots it is nested under slots -> main. The key is "*" in the
            # default format and "content" under formatversion=2.
            body = revision.get("slots", {}).get("main", revision)
            text = body.get("*", body.get("content"))
            if text is not None:
                return text
    return None


# Fetch and parse every talk page collected in pageList.
for page in pageList:
    # "titles": page queries the page with the corresponding title, which is why
    # the talk-page titles were stored earlier.
    kwargs2 = param(page)
    query = session.get(**kwargs2, continuation=True)

    for request in query:
        # Hand wikichatter the page's wikitext itself, not the JSON envelope the
        # API wraps it in.
        text = _extract_wikitext(request)
        if text is None:
            failed_pages[page] = "no revision content returned"
            continue

        try:
            parsed_text = wc.parse(text)
        except Exception as error:
            # Best effort: a talk page wikichatter cannot parse is recorded and
            # skipped, so one bad page neither aborts the run nor stores a stale
            # result left over from the previous page.
            failed_pages[page] = repr(error)
            continue

        # Only successfully parsed pages are added to the dictionary.
        content[page] = parsed_text

print("finished querrying")

In [None]:
#Disabled: this would write each page title and its corresponding parsed text to a CSV file.
#Uncomment the lines below to enable the export.
#with open("parsedTalkPages.csv", "w", newline = '') as w:
#    writer = csv.writer(w, delimiter = ',')
#    for key, value in content.items():
#        writer.writerow([key, value])