In [34]:
import copy
import json
import re
from urllib.parse import quote

import requests

In [26]:
def create_wiki_query(query):
    """Build the MediaWiki API URL that requests the revision content of a page.

    Parameters
    ----------
    query : str
        Title of the Wikipedia page, e.g. ``'List_of_South_Park_characters'``.

    Returns
    -------
    str
        Full ``api.php`` URL with ``action=query``, ``prop=revisions``,
        ``rvprop=content`` and ``format=json``.
    """
    baseurl = "https://en.wikipedia.org/w/api.php?"
    action = "action=query"
    # Percent-encode the title so that characters such as '&', '#', '?' or
    # spaces cannot cut short or corrupt the ``titles`` parameter.
    title = f"titles={quote(query, safe='')}"
    content = "prop=revisions&rvprop=content"
    dataformat = "format=json"
    return "{}{}&{}&{}&{}".format(baseurl, action, content, title, dataformat)

In [27]:
print(create_wiki_query('List_of_South_Park_characters'))

https://en.wikipedia.org/w/api.php?action=query&prop=revisions&rvprop=content&titles=List_of_South_Park_characters&format=json


In [28]:
# a = json.loads(wikitext)

# page_id = list(a['query']['pages'].keys())[0]

# content = a['query']['pages'][page_id]['revisions'][0]['*']

# content


def get_wikicontent(query_link):
    '''
    Download the wikitext of a Wikipedia page through the MediaWiki API.

    query link: string passed to create_wiki_query as the page title,
    e.g. 'List_of_South_Park_characters'.

    Returns the '*' field (the wikitext) of the first revision in the response.

    Raises requests.HTTPError on an unsuccessful HTTP status and KeyError when
    the response contains no page with revisions (e.g. unknown title).
    '''
    # Create query
    query = create_wiki_query(query_link)

    # Get wiki response; the timeout keeps the notebook from hanging forever
    wikiresponse = requests.get(query, timeout=30)
    wikiresponse.raise_for_status()

    # Use json to read wiki response
    payload = json.loads(wikiresponse.text)

    # 'pages' is keyed by page id; take the first entry that came back
    pages = payload.get('query', {}).get('pages', {})
    page = next(iter(pages.values()), None)
    if page is None or 'revisions' not in page:
        # Same exception type as a failed dict lookup, but with a usable message
        raise KeyError(f"No wikitext returned for {query_link!r}")
    return page['revisions'][0]['*']
    
    
    
    

In [31]:
# Fetch the wikitext of the character overview page; the regex cells below parse it.
main_page = get_wikicontent('List_of_South_Park_characters')

From the main page, we initially focus on the main and secondary characters. The idea is to get the wikitext for both groups and then scrape the main article of each character, which is referenced by a `main` link. Careful: this link is enclosed in `{{}}`.

In [64]:
# Main characters
pattern = r'[\{]{2}(main)\|([\w\s.]*)[\}]{2}'
matches = re.findall(pattern, main_page)

Create a dictionary with the name as the key and wikilink as the value

In [65]:
# Every match is a ('main', character name) tuple; only the name is needed.
# The value is the article slug: the name with spaces turned into underscores.
characters_dict = {name: name.replace(' ', '_') for tag, name in matches}

In [67]:
characters_dict

{'Stan Marsh': 'Stan_Marsh',
 'Kyle Broflovski': 'Kyle_Broflovski',
 'Eric Cartman': 'Eric_Cartman',
 'Kenny McCormick': 'Kenny_McCormick',
 'Butters Stotch': 'Butters_Stotch',
 'Randy and Sharon Marsh': 'Randy_and_Sharon_Marsh',
 'Mr. Garrison': 'Mr._Garrison',
 'Gerald and Sheila Broflovski': 'Gerald_and_Sheila_Broflovski',
 'Jimmy Valmer': 'Jimmy_Valmer',
 'Wendy Testaburger': 'Wendy_Testaburger'}

Now go for the recurring characters. Some of them follow a specific pattern in the wiki page content.

In [68]:
# Recurring characters appear as '|[[Target|Label]]}}' or '|[[Target]]}}'.
# Group 1 is the link target (may include a '#section' or parentheses),
# group 2 is the display label and is '' when the link has no label.
pattern = r'\|[\[]{2}([\w\s.\(\)#]+)\|*([\w\s.]*)[\]]{2}[\}]{2}'
matches = re.compile(pattern).findall(main_page)
matches

[('List of recurring South Park characters#Scott Malkinson',
  'Scott Malkinson'),
 ('Mr. Mackey', ''),
 ('PC Principal', ''),
 ('List of South Park Elementary staff#Strong Woman', 'Strong Woman'),
 ('Shelley Marsh', 'Shelly Marsh'),
 ('Grandpa Marsh', 'Grandpa Marvin Marsh'),
 ('Jimbo Kern', ''),
 ('Ike Broflovski', ''),
 ('Liane Cartman', ''),
 ('Stuart McCormick', ''),
 ('Stuart and Carol McCormick', 'Carol McCormick'),
 ('Karen McCormick (South Park)', 'Karen McCormick'),
 ('Stephen and Linda Stotch', 'Stephen Stotch'),
 ('Stephen and Linda Stotch', 'Linda Stotch'),
 ('Officer Barbrady', ''),
 ('Ned Gerblansky', ''),
 ('Tuong Lu Kim', ''),
 ('Father Maxi', ''),
 ('Mayor McDaniels', 'Mayor Mary McDaniels'),
 ('Alphonse Mephisto', 'Dr. Alphonse Mephesto'),
 ('Mr. Slave', ''),
 ('List of recurring South Park characters#Sergeant Harrison Yates',
  'Harrison Yates'),
 ('Betsy Donovan', ''),
 ('List of staff at South Park Elementary#Ms. Choksondik', 'Ms. Choksondik'),
 ('List of staff at

In [69]:
for target, label in matches:
    # Each match is (link target, display label). The label is '' when the wiki
    # link has no '|label' part; the target then doubles as the character name.
    # The target group of the pattern requires at least one character, so it is
    # never empty and no "both empty" case needs handling.
    characters_dict[label or target] = target.replace(' ', '_')

In [70]:
characters_dict

{'Stan Marsh': 'Stan_Marsh',
 'Kyle Broflovski': 'Kyle_Broflovski',
 'Eric Cartman': 'Eric_Cartman',
 'Kenny McCormick': 'Kenny_McCormick',
 'Butters Stotch': 'Butters_Stotch',
 'Randy and Sharon Marsh': 'Randy_and_Sharon_Marsh',
 'Mr. Garrison': 'Mr._Garrison',
 'Gerald and Sheila Broflovski': 'Gerald_and_Sheila_Broflovski',
 'Jimmy Valmer': 'Jimmy_Valmer',
 'Wendy Testaburger': 'Wendy_Testaburger',
 'Scott Malkinson': 'List_of_recurring_South_Park_characters#Scott_Malkinson',
 'Mr. Mackey': 'Mr._Mackey',
 'PC Principal': 'PC_Principal',
 'Strong Woman': 'List_of_South_Park_Elementary_staff#Strong_Woman',
 'Shelly Marsh': 'Shelley_Marsh',
 'Grandpa Marvin Marsh': 'Grandpa_Marsh',
 'Jimbo Kern': 'Jimbo_Kern',
 'Ike Broflovski': 'Ike_Broflovski',
 'Liane Cartman': 'Liane_Cartman',
 'Stuart McCormick': 'Stuart_McCormick',
 'Carol McCormick': 'Stuart_and_Carol_McCormick',
 'Karen McCormick': 'Karen_McCormick_(South_Park)',
 'Stephen Stotch': 'Stephen_and_Linda_Stotch',
 'Linda Stotch': 'S

There are still some characters left. Same procedure, different pattern

In [71]:
# Remaining characters: a pipe, one whitespace character and a plain
# '[[Name]]' link that is directly followed by a line break.
pattern = r'\|\s{1}[\[]{2}([\w\s.-]+)\]\][\r\n]+'
matches = re.compile(pattern).findall(main_page)
matches

['Bebe Stevens',
 'Clyde Donovan',
 'Craig Tucker',
 'Heidi Turner',
 'Timmy Burch',
 'Tolkien Black',
 'Tweek Tweak',
 'Big Gay Al',
 'Terrance and Phillip',
 'Pip Pirrup']

In [73]:
# These matches are plain titles, so the title is both the key and the slug source.
characters_dict.update((name, name.replace(' ', '_')) for name in matches)

In [77]:
characters_dict

{'Stan Marsh': 'Stan_Marsh',
 'Kyle Broflovski': 'Kyle_Broflovski',
 'Eric Cartman': 'Eric_Cartman',
 'Kenny McCormick': 'Kenny_McCormick',
 'Butters Stotch': 'Butters_Stotch',
 'Randy and Sharon Marsh': 'Randy_and_Sharon_Marsh',
 'Mr. Garrison': 'Mr._Garrison',
 'Gerald and Sheila Broflovski': 'Gerald_and_Sheila_Broflovski',
 'Jimmy Valmer': 'Jimmy_Valmer',
 'Wendy Testaburger': 'Wendy_Testaburger',
 'Scott Malkinson': 'List_of_recurring_South_Park_characters#Scott_Malkinson',
 'Mr. Mackey': 'Mr._Mackey',
 'PC Principal': 'PC_Principal',
 'Strong Woman': 'List_of_South_Park_Elementary_staff#Strong_Woman',
 'Shelly Marsh': 'Shelley_Marsh',
 'Grandpa Marvin Marsh': 'Grandpa_Marsh',
 'Jimbo Kern': 'Jimbo_Kern',
 'Ike Broflovski': 'Ike_Broflovski',
 'Liane Cartman': 'Liane_Cartman',
 'Stuart McCormick': 'Stuart_McCormick',
 'Carol McCormick': 'Stuart_and_Carol_McCormick',
 'Karen McCormick': 'Karen_McCormick_(South_Park)',
 'Stephen Stotch': 'Stephen_and_Linda_Stotch',
 'Linda Stotch': 'S

In [85]:
# Split couple entries such as 'Randy and Sharon Marsh' into one entry per
# person ('Randy Marsh', 'Sharon Marsh'), both pointing at the shared link.
# Iterate over a snapshot of the items because the dictionary is modified
# inside the loop; the values are plain strings, so no deep copy is needed.
for character, link in list(characters_dict.items()):
    name_parts = character.split(" ")
    # Only the 'First and Second ... Surname' shape is split. Names such as
    # 'Terrance and Phillip' (three tokens, no shared surname) are kept, and
    # requiring 'and' as the second token avoids building garbage keys from
    # names where it sits elsewhere.
    if len(name_parts) > 3 and name_parts[1] == 'and':
        surname = name_parts[-1]
        characters_dict[" ".join([name_parts[0], surname])] = link
        characters_dict[" ".join([name_parts[2], surname])] = link
        del characters_dict[character]

In [89]:
# Also make the link a full wikilink
base_wiki_url = r'https://en.wikipedia.org/wiki/'

for character, link in characters_dict.items():
    characters_dict[character] = base_wiki_url+link

In [90]:
characters_dict

{'Stan Marsh': 'https://en.wikipedia.org/wiki/Stan_Marsh',
 'Kyle Broflovski': 'https://en.wikipedia.org/wiki/Kyle_Broflovski',
 'Eric Cartman': 'https://en.wikipedia.org/wiki/Eric_Cartman',
 'Kenny McCormick': 'https://en.wikipedia.org/wiki/Kenny_McCormick',
 'Butters Stotch': 'https://en.wikipedia.org/wiki/Butters_Stotch',
 'Mr. Garrison': 'https://en.wikipedia.org/wiki/Mr._Garrison',
 'Jimmy Valmer': 'https://en.wikipedia.org/wiki/Jimmy_Valmer',
 'Wendy Testaburger': 'https://en.wikipedia.org/wiki/Wendy_Testaburger',
 'Scott Malkinson': 'https://en.wikipedia.org/wiki/List_of_recurring_South_Park_characters#Scott_Malkinson',
 'Mr. Mackey': 'https://en.wikipedia.org/wiki/Mr._Mackey',
 'PC Principal': 'https://en.wikipedia.org/wiki/PC_Principal',
 'Strong Woman': 'https://en.wikipedia.org/wiki/List_of_South_Park_Elementary_staff#Strong_Woman',
 'Shelly Marsh': 'https://en.wikipedia.org/wiki/Shelley_Marsh',
 'Grandpa Marvin Marsh': 'https://en.wikipedia.org/wiki/Grandpa_Marsh',
 'Jimbo 

We also probably want to start a `pandas` dataframe to store this info. Name, firstname/nickname, full link, texts etc.

In [96]:
import pandas as pd

# One row per character: the display name and the full article URL.
characters_df = pd.DataFrame(
    {
        'name': list(characters_dict),
        'wiki_link': list(characters_dict.values()),
    }
)

In [107]:
# Index the frame by character name. The guard keeps the cell re-runnable:
# once 'name' is the index it is no longer a column, and calling set_index
# again would raise a KeyError.
if 'name' in characters_df.columns:
    characters_df = characters_df.set_index('name')
characters_df

Unnamed: 0_level_0,wiki_link
name,Unnamed: 1_level_1
Stan Marsh,https://en.wikipedia.org/wiki/Stan_Marsh
Kyle Broflovski,https://en.wikipedia.org/wiki/Kyle_Broflovski
Eric Cartman,https://en.wikipedia.org/wiki/Eric_Cartman
Kenny McCormick,https://en.wikipedia.org/wiki/Kenny_McCormick
Butters Stotch,https://en.wikipedia.org/wiki/Butters_Stotch
Mr. Garrison,https://en.wikipedia.org/wiki/Mr._Garrison
Jimmy Valmer,https://en.wikipedia.org/wiki/Jimmy_Valmer
Wendy Testaburger,https://en.wikipedia.org/wiki/Wendy_Testaburger
Scott Malkinson,https://en.wikipedia.org/wiki/List_of_recurrin...
Mr. Mackey,https://en.wikipedia.org/wiki/Mr._Mackey


In [119]:
# Create a column of first names or nicknames. Could be a list for cartman, but if I had to use only one, I would use the surname
first_names = []

titles = ['mr.', 'ms.', 'mrs.', 'dr.', 'pc', 'officer', 'father']

for character in characters_df.index.to_list():
    split_list = character.split(' ')
    if len(split_list)==2:
        if split_list[0]=='Eric':
            first_names.append(split_list[1])
        else:
            first_names.append(split_list[0])
    else:
        first_names.append('')

In [120]:
first_names

['Stan',
 'Kyle',
 'Cartman',
 'Kenny',
 'Butters',
 'Mr.',
 'Jimmy',
 'Wendy',
 'Scott',
 'Mr.',
 'PC',
 'Strong',
 'Shelly',
 '',
 'Jimbo',
 'Ike',
 'Liane',
 'Stuart',
 'Carol',
 'Karen',
 'Stephen',
 'Linda',
 'Officer',
 'Ned',
 '',
 'Father',
 '',
 '',
 'Mr.',
 'Harrison',
 'Betsy',
 'Ms.',
 'Ms.',
 'Saddam',
 'Principal',
 'Mr.',
 'Jason',
 'Mrs.',
 'Bebe',
 'Clyde',
 'Craig',
 'Heidi',
 'Timmy',
 'Tolkien',
 'Tweek',
 '',
 '',
 'Pip',
 'Randy',
 'Sharon',
 'Gerald',
 'Sheila']