In [28]:
import json
import requests
import re

In [29]:
def create_wiki_query(query):
    baseurl = "https://en.wikipedia.org/w/api.php?"
    action = "action=query"
    title = f"titles={query}"
    content = "prop=revisions&rvprop=content"
    dataformat ="format=json"
    query = "{}{}&{}&{}&{}".format(baseurl, action, content, title, dataformat)
    return query

In [30]:
print(create_wiki_query('List_of_South_Park_characters'))

https://en.wikipedia.org/w/api.php?action=query&prop=revisions&rvprop=content&titles=List_of_South_Park_characters&format=json


In [31]:
# a = json.loads(wikitext)

# page_id = list(a['query']['pages'].keys())[0]

# content = a['query']['pages'][page_id]['revisions'][0]['*']

# content


def get_wikicontent(query_link):
    '''
    query link: string of the wikipedia query. Can be the link to a page
    '''
    # Create query
    query = create_wiki_query(query_link)
    
    # Get wiki response
    wikiresponse = requests.get(query)
    wikitext = wikiresponse.text
    
    # Use json to read wiki response
    a = json.loads(wikitext)
    
    # Get the page id unique for each page
    page_id = list(a['query']['pages'].keys())[0]
    content = a['query']['pages'][page_id]['revisions'][0]['*'] # Hope there is a revisions key and a *
    return content
    
    
    
    

In [32]:
main_page= get_wikicontent('List_of_South_Park_characters')

From the main page, we initially focus on the main and secondary characters. The idea is to get the wikitext for the two groups. Scrape the main article for each character from the `main` link. Careful, this link is enclosed in `{{}}`.

In [33]:
# Main characters
pattern = r'[\{]{2}(main)\|([\w\s.]*)[\}]{2}'
matches = re.findall(pattern, main_page)

Create a dictionary with the name as the key and wikilink as the value

In [34]:
characters_dict = {}

for match in matches:
    #match is of the form ('main', character name)
    characters_dict[match[1]]=match[1].replace(' ', '_')

In [35]:
characters_dict

{'Stan Marsh': 'Stan_Marsh',
 'Kyle Broflovski': 'Kyle_Broflovski',
 'Eric Cartman': 'Eric_Cartman',
 'Kenny McCormick': 'Kenny_McCormick',
 'Butters Stotch': 'Butters_Stotch',
 'Randy and Sharon Marsh': 'Randy_and_Sharon_Marsh',
 'Mr. Garrison': 'Mr._Garrison',
 'Mr. Mackey': 'Mr._Mackey',
 'Gerald and Sheila Broflovski': 'Gerald_and_Sheila_Broflovski',
 'Liane Cartman': 'Liane_Cartman',
 'Jimmy Valmer': 'Jimmy_Valmer',
 'Tolkien Black': 'Tolkien_Black',
 'Wendy Testaburger': 'Wendy_Testaburger'}

Now go for the reccuring characters. Some of them, who have a specific pattern in the wiki page content.

In [36]:
pattern = r'\|[\[]{2}([\w\s.\(\)#]+)\|*([\w\s.]*)[\]]{2}[\}]{2}'
matches = re.findall(pattern, main_page)
matches

[('List of recurring South Park characters#Scott Malkinson',
  'Scott Malkinson'),
 ('PC Principal', ''),
 ('List of South Park Elementary staff#Strong Woman', 'Strong Woman'),
 ('Shelley Marsh', 'Shelly Marsh'),
 ('Grandpa Marsh', 'Grandpa Marvin Marsh'),
 ('Jimbo Kern', ''),
 ('Ike Broflovski', ''),
 ('Stuart McCormick', ''),
 ('Stuart and Carol McCormick', 'Carol McCormick'),
 ('Karen McCormick (South Park)', 'Karen McCormick'),
 ('Stephen and Linda Stotch', 'Stephen Stotch'),
 ('Stephen and Linda Stotch', 'Linda Stotch'),
 ('Officer Barbrady', ''),
 ('Ned Gerblansky', ''),
 ('Tuong Lu Kim', ''),
 ('Father Maxi', ''),
 ('Mayor McDaniels', 'Mayor Mary McDaniels'),
 ('Alphonse Mephisto', 'Dr. Alphonse Mephesto'),
 ('Mr. Slave', ''),
 ('List of recurring South Park characters#Sergeant Harrison Yates',
  'Harrison Yates'),
 ('Betsy Donovan', ''),
 ('List of staff at South Park Elementary#Ms. Choksondik', 'Ms. Choksondik'),
 ('List of staff at South Park Elementary#Ms. Crabtree', 'Ms. Cr

In [37]:
for match in matches:
    #match is of the form ('main', character name)
    if match[1]:
        characters_dict[match[1]]=match[0].replace(' ', '_')
    elif not match[1] and not match[0]:
        continue
    else:
        characters_dict[match[0]]=match[0].replace(' ', '_')

In [38]:
characters_dict

{'Stan Marsh': 'Stan_Marsh',
 'Kyle Broflovski': 'Kyle_Broflovski',
 'Eric Cartman': 'Eric_Cartman',
 'Kenny McCormick': 'Kenny_McCormick',
 'Butters Stotch': 'Butters_Stotch',
 'Randy and Sharon Marsh': 'Randy_and_Sharon_Marsh',
 'Mr. Garrison': 'Mr._Garrison',
 'Mr. Mackey': 'Mr._Mackey',
 'Gerald and Sheila Broflovski': 'Gerald_and_Sheila_Broflovski',
 'Liane Cartman': 'Liane_Cartman',
 'Jimmy Valmer': 'Jimmy_Valmer',
 'Tolkien Black': 'Tolkien_Black',
 'Wendy Testaburger': 'Wendy_Testaburger',
 'Scott Malkinson': 'List_of_recurring_South_Park_characters#Scott_Malkinson',
 'PC Principal': 'PC_Principal',
 'Strong Woman': 'List_of_South_Park_Elementary_staff#Strong_Woman',
 'Shelly Marsh': 'Shelley_Marsh',
 'Grandpa Marvin Marsh': 'Grandpa_Marsh',
 'Jimbo Kern': 'Jimbo_Kern',
 'Ike Broflovski': 'Ike_Broflovski',
 'Stuart McCormick': 'Stuart_McCormick',
 'Carol McCormick': 'Stuart_and_Carol_McCormick',
 'Karen McCormick': 'Karen_McCormick_(South_Park)',
 'Stephen Stotch': 'Stephen_and

There are still some characters left. Same procedure, different pattern

In [45]:
pattern = r'\|\s{1}[\[]{2}([\w\s.-]+)\]\][\r\n]+'
matches = re.findall(pattern, main_page)
matches

['Bebe Stevens',
 'Heidi Turner',
 'Timmy Burch',
 'Tweek Tweak',
 'Big Gay Al',
 'Terrance and Phillip',
 'Pip Pirrup']

In [40]:
for match in matches:
    characters_dict[match]=match.replace(' ', '_')

In [41]:
characters_dict

{'Stan Marsh': 'Stan_Marsh',
 'Kyle Broflovski': 'Kyle_Broflovski',
 'Eric Cartman': 'Eric_Cartman',
 'Kenny McCormick': 'Kenny_McCormick',
 'Butters Stotch': 'Butters_Stotch',
 'Randy and Sharon Marsh': 'Randy_and_Sharon_Marsh',
 'Mr. Garrison': 'Mr._Garrison',
 'Mr. Mackey': 'Mr._Mackey',
 'Gerald and Sheila Broflovski': 'Gerald_and_Sheila_Broflovski',
 'Liane Cartman': 'Liane_Cartman',
 'Jimmy Valmer': 'Jimmy_Valmer',
 'Tolkien Black': 'Tolkien_Black',
 'Wendy Testaburger': 'Wendy_Testaburger',
 'Scott Malkinson': 'List_of_recurring_South_Park_characters#Scott_Malkinson',
 'PC Principal': 'PC_Principal',
 'Strong Woman': 'List_of_South_Park_Elementary_staff#Strong_Woman',
 'Shelly Marsh': 'Shelley_Marsh',
 'Grandpa Marvin Marsh': 'Grandpa_Marsh',
 'Jimbo Kern': 'Jimbo_Kern',
 'Ike Broflovski': 'Ike_Broflovski',
 'Stuart McCormick': 'Stuart_McCormick',
 'Carol McCormick': 'Stuart_and_Carol_McCormick',
 'Karen McCormick': 'Karen_McCormick_(South_Park)',
 'Stephen Stotch': 'Stephen_and

Finally, some cases that were not handled, to get as many characters possible:

In [55]:
pattern = r"\|{1}\s{1}[\[]{2}([\s\w\(\)]+)\|{1}([\w\s]+)[\]]{2}[\r\n]{1}"

matches = re.findall(pattern, main_page)
matches

[('Jesus and Pals', 'Jesus'),
 ('Priesthood in the Catholic Church', 'Catholic priest'),
 ('Santa Claus (South Park)', 'Santa'),
 ('Towelie (character)', 'Towelie'),
 ('Chef (South Park)', 'Chef'),
 ('Satan (South Park)', 'Satan')]

In [56]:
for match in matches:
    #match is of the form ('main', character name)
    if match[1]:
        characters_dict[match[1]]=match[0].replace(' ', '_')
    elif not match[1] and not match[0]:
        continue
    else:
        characters_dict[match[0]]=match[0].replace(' ', '_')


In [57]:
import copy

tmp_dict = copy.deepcopy(characters_dict)

for character, link in tmp_dict.items():
    if ' and ' in character:
        split_list = character.split(" ")
        if len(split_list)>3:
            characters_dict[" ".join([split_list[0], split_list[-1]])] = link
            characters_dict[" ".join([split_list[2], split_list[-1]])] = link
            del characters_dict[character]

del tmp_dict

In [58]:
# Also make the link a full wikilink
base_wiki_url = r'https://en.wikipedia.org/wiki/'

for character, link in characters_dict.items():
    characters_dict[character] = base_wiki_url+link

In [59]:
characters_dict

{'Stan Marsh': 'https://en.wikipedia.org/wiki/Stan_Marsh',
 'Kyle Broflovski': 'https://en.wikipedia.org/wiki/Kyle_Broflovski',
 'Eric Cartman': 'https://en.wikipedia.org/wiki/Eric_Cartman',
 'Kenny McCormick': 'https://en.wikipedia.org/wiki/Kenny_McCormick',
 'Butters Stotch': 'https://en.wikipedia.org/wiki/Butters_Stotch',
 'Mr. Garrison': 'https://en.wikipedia.org/wiki/Mr._Garrison',
 'Mr. Mackey': 'https://en.wikipedia.org/wiki/Mr._Mackey',
 'Liane Cartman': 'https://en.wikipedia.org/wiki/Liane_Cartman',
 'Jimmy Valmer': 'https://en.wikipedia.org/wiki/Jimmy_Valmer',
 'Tolkien Black': 'https://en.wikipedia.org/wiki/Tolkien_Black',
 'Wendy Testaburger': 'https://en.wikipedia.org/wiki/Wendy_Testaburger',
 'Scott Malkinson': 'https://en.wikipedia.org/wiki/List_of_recurring_South_Park_characters#Scott_Malkinson',
 'PC Principal': 'https://en.wikipedia.org/wiki/PC_Principal',
 'Strong Woman': 'https://en.wikipedia.org/wiki/List_of_South_Park_Elementary_staff#Strong_Woman',
 'Shelly Marsh

We also probably want to start a `pandas` dataframe to store this info. Name, firstname/nickname, full link, texts etc.

In [60]:
import pandas as pd

characters_df = pd.DataFrame.from_dict({'name':list(characters_dict.keys()), 'wiki_link':list(characters_dict.values())})

In [61]:
characters_df = characters_df.set_index('name')
characters_df

Unnamed: 0_level_0,wiki_link
name,Unnamed: 1_level_1
Stan Marsh,https://en.wikipedia.org/wiki/Stan_Marsh
Kyle Broflovski,https://en.wikipedia.org/wiki/Kyle_Broflovski
Eric Cartman,https://en.wikipedia.org/wiki/Eric_Cartman
Kenny McCormick,https://en.wikipedia.org/wiki/Kenny_McCormick
Butters Stotch,https://en.wikipedia.org/wiki/Butters_Stotch
Mr. Garrison,https://en.wikipedia.org/wiki/Mr._Garrison
Mr. Mackey,https://en.wikipedia.org/wiki/Mr._Mackey
Liane Cartman,https://en.wikipedia.org/wiki/Liane_Cartman
Jimmy Valmer,https://en.wikipedia.org/wiki/Jimmy_Valmer
Tolkien Black,https://en.wikipedia.org/wiki/Tolkien_Black


In [62]:
# Create a column of first names or nicknames. Could be a list for cartman, but if I had to use only one, I would use the surname
first_names = []

titles = ['mr.', 'ms.', 'mrs.', 'dr.', 'pc', 'officer', 'father', 'catholic']


for character in characters_df.index.to_list():
    split_list = character.split(' ')
    if split_list[0].lower() in titles:
        first_names.append(split_list[-1].title())
    elif len(split_list)==2:
        if split_list[0]=='Eric':
            first_names.append(split_list[1])
        else:
            first_names.append(split_list[0])
    else:
        first_names.append(character)

In [63]:
first_names

['Stan',
 'Kyle',
 'Cartman',
 'Kenny',
 'Butters',
 'Garrison',
 'Mackey',
 'Liane',
 'Jimmy',
 'Tolkien',
 'Wendy',
 'Scott',
 'Principal',
 'Strong',
 'Shelly',
 'Grandpa Marvin Marsh',
 'Jimbo',
 'Ike',
 'Stuart',
 'Carol',
 'Karen',
 'Stephen',
 'Linda',
 'Barbrady',
 'Ned',
 'Tuong Lu Kim',
 'Maxi',
 'Mayor Mary McDaniels',
 'Mephesto',
 'Slave',
 'Harrison',
 'Betsy',
 'Choksondik',
 'Crabtree',
 'Saddam',
 'Principal',
 'Hankey',
 'Jason',
 'Nelson',
 'Bebe',
 'Heidi',
 'Timmy',
 'Tweek',
 'Big Gay Al',
 'Terrance and Phillip',
 'Pip',
 'Jesus',
 'Priest',
 'Santa',
 'Towelie',
 'Chef',
 'Satan',
 'Randy',
 'Sharon',
 'Gerald',
 'Sheila']

In [64]:
characters_df['first_name'] = first_names

In [65]:
characters_df

Unnamed: 0_level_0,wiki_link,first_name
name,Unnamed: 1_level_1,Unnamed: 2_level_1
Stan Marsh,https://en.wikipedia.org/wiki/Stan_Marsh,Stan
Kyle Broflovski,https://en.wikipedia.org/wiki/Kyle_Broflovski,Kyle
Eric Cartman,https://en.wikipedia.org/wiki/Eric_Cartman,Cartman
Kenny McCormick,https://en.wikipedia.org/wiki/Kenny_McCormick,Kenny
Butters Stotch,https://en.wikipedia.org/wiki/Butters_Stotch,Butters
Mr. Garrison,https://en.wikipedia.org/wiki/Mr._Garrison,Garrison
Mr. Mackey,https://en.wikipedia.org/wiki/Mr._Mackey,Mackey
Liane Cartman,https://en.wikipedia.org/wiki/Liane_Cartman,Liane
Jimmy Valmer,https://en.wikipedia.org/wiki/Jimmy_Valmer,Jimmy
Tolkien Black,https://en.wikipedia.org/wiki/Tolkien_Black,Tolkien


In [66]:
characters_df.to_csv('characters_csv.csv')

In [72]:
import pickle

try:
    with open('wiki_characters_df.pickle', 'rb') as f:
        pass
except FileNotFoundError as e:
    print(e)
    with open('wiki_characters_df.pickle', 'wb') as f:
        pickle.dump(characters_df, f)

[Errno 2] No such file or directory: 'wiki_characters_df.pickle'
