In [1]:
%%html
<style>
  td {text-align: left !important;}
</style>

# Processing South Park Wikia
This Python notebook processes the raw script data available on the South Park Wikia. It does so by looping through the scripts season by season, episode by episode.

The script for each episode is parsed by going through the main table on the episode's webpage. The main table contains two columns: the left column holds the character speaking the line and the right column holds the line spoken.

| Character       | Line                                                                  |
|:----------------|:----------------------------------------------------------------------|
| Kyle Broflovski | Ah, damn it! My little brother's trying to follow me to school again. |
| Ike Broflovski  | Zeeponaner.                                                           |
| Kyle            | Ike, you can't come to school with me. *[Ike Chortles.]*              |
|                 | *At the bus stop*                                                     |

The following is done for processing:
1. The asides (e.g. '[Ike Chortles]') are removed.
2. Characters with shortened names (e.g. Kyle and Kyle Broflovski) have their lines under the same name.
3. Rows where actions are written (e.g. 'At the bus stop') are removed (rows with no character cell).

In [2]:
# necessary imports
import os
import re

import bs4
import pandas as pd
import requests
from bs4 import BeautifulSoup

### URL and Script Processing
Processes the season URLs to navigate to the correct link for each season, and removes the on-screen actions shown in the captions (e.g. '[Ike Chortles]' or '[Cartman bunny hops]').

In [3]:
# Base URL of the South Park Wikia; the portal path and scraped hrefs are appended to it.
url = "http://southpark.wikia.com"
# Path (relative to `url`) of the portal page whose gallery links to each season's scripts.
portal = "/wiki/Portal:Scripts"

def strip_season_url(link):
    """Return the tail of ``link`` starting at its first '/Season' marker.

    ``str.find`` yields -1 when the marker is absent, in which case the slice
    below keeps only the final character of ``link``.
    """
    marker_position = link.find('/Season')
    season_suffix = link[marker_position:]
    return season_suffix

def strip_extras(text):
    """Blank out bracketed asides such as '[Ike Chortles]' and trim the ends.

    Every '[...]' span is replaced by a single space, so spacing inside the
    text is left as-is; only leading and trailing whitespace is removed.
    """
    aside_pattern = re.compile(r'\[[^\]]*\]')
    without_asides = aside_pattern.sub(' ', text)
    return without_asides.strip()

### File Output
Writes the given character and line data to a given CSV file. This allows CSV files to be written per episode, per season, or for the entire script data in one file.

In [4]:
def write_data(dictionary, file_name):
    """Write script rows to ``<file_name>.csv`` with a fixed column order.

    Parameters
    ----------
    dictionary : list of dict (or anything ``pd.DataFrame`` accepts)
        Rows carrying the keys 'season', 'episode', 'character' and 'line'.
        Other keys are dropped from the output.
    file_name : str
        Output path without the '.csv' extension. Missing parent directories
        are created.

    Returns
    -------
    None
    """
    columns = ['season', 'episode', 'character', 'line']
    # Giving the columns to the constructor (instead of indexing afterwards)
    # means an empty row list yields a header-only frame rather than a KeyError.
    df = pd.DataFrame(dictionary, columns=columns)

    target = file_name + '.csv'
    parent = os.path.dirname(target)
    if parent:
        # e.g. 'data/season 3/' may not exist yet on a fresh checkout.
        os.makedirs(parent, exist_ok=True)
    df.to_csv(target, encoding='utf-8')

### Character Processing
Processes the cast list on the Wikia page for each episode and returns a list containing all the characters that appear in the episode. The character for each line is also processed because, after their first occurrence, character names are shortened (e.g. Kyle Broflovski to Kyle and Mr. Herbert Garrison to Mr. Garrison).

In [5]:
# creates a list of the characters from the episode
def get_characters(ul_list):
    """Collect the whitespace-trimmed text of every <li> found under ``ul_list``.

    The returned list keeps the order in which ``findAll('li')`` yields the items.
    """
    return [item.get_text().strip() for item in ul_list.findAll('li')]

In [6]:
# matches a shorter character name to the full name from the episode cast
def match_character(name, char_list):
    """Resolve a (possibly shortened) speaker name to a full cast name.

    A cast entry matches when every word of ``name`` also occurs in it. Words
    are compared with punctuation removed, so a name whose punctuation was
    already stripped (e.g. 'Mr Garrison') still matches a cast entry such as
    'Mr. Herbert Garrison'.

    Parameters
    ----------
    name : str
        Speaker name as found next to a script line.
    char_list : list of str
        Full character names from the episode's cast list.

    Returns
    -------
    str
        The first entry of ``char_list`` containing all words of ``name``;
        ``name`` itself when nothing matches or when it has no words.
    """
    def _words(text):
        # Split on anything that is not an ASCII letter or digit; drop empties.
        return {word for word in re.split(r'[^a-zA-Z0-9]+', text) if word}

    wanted = _words(name)
    if not wanted:
        # An empty word set is a subset of everything; never match on it.
        return name
    for char in char_list:
        if wanted.issubset(_words(char)):
            return char
    return name

### Episode Processing
Processes the main table on the page, which contains two columns: 'character' and 'line'. Each row contains one line said by a character. Rows containing on-screen actions (e.g. 'At the bus stop') are removed by checking for rows whose character cell is empty but whose line cell is not. Lines that multiple characters say at the same time are split into separate, adjacent rows, one per character.

In [7]:
# parse the episode by getting all characters and matching lines for each character
def parse_episode(link, season, episode):
    """Scrape one episode's script page into a list of row dicts.

    Downloads ``link``, reads the cast list and the script table, and builds
    one dict per (character, line) pair. The rows are also written to
    'data/season <season>/episode <episode>.csv' via ``write_data``.

    Parameters
    ----------
    link : str
        Absolute URL of the episode's script page.
    season, episode : int
        Numbers stored on every row and used in the output file name.

    Returns
    -------
    list of dict
        Dicts with the keys 'season', 'episode', 'character' and 'line'.
    """
    print('----------Episode:', link)
    episode_page = BeautifulSoup(requests.get(link).content, 'html.parser')
    # The script is taken from the second <table> on the page.
    script_table = episode_page.findAll('table')[1]
    # The cast is read from the first <ul> inside the main content element.
    chars = get_characters(episode_page.find(id='mw-content-text').find('ul'))

    all_rows = script_table.find_all('tr')
    print('-------# of Lines:', len(all_rows))

    temp = []

    # The first and last <tr> are skipped (presumably header/footer rows —
    # TODO confirm against the page markup).
    for row in all_rows[1:-1]:
        cell = row.contents
        if len(cell) < 3:
            continue
        speaker_text = str(cell[1].text)
        stripped_line = str(cell[2].text).strip()

        # Keep letters, all digits (0 included), spaces and commas; everything
        # else becomes a space, then runs of whitespace are collapsed.
        name = re.sub('[^a-zA-Z0-9 ,]', ' ', speaker_text)
        name = re.sub('\\s+', ' ', name).strip()

        # A comma-separated speaker cell yields one row per speaker. Blank
        # entries are skipped, which also drops action rows that have no
        # speaker at all.
        for n in name.split(','):
            n = n.strip()
            if not n:
                continue
            temp.append({
                'season': season,
                'episode': episode,
                'character': match_character(n, chars),
                'line': stripped_line,
            })
    write_data(temp, 'data/season ' + str(season) + '/episode ' + str(episode))
    return temp

### Season Processing
Processes the main table on the page that contains a link to each of the South Park episodes of a season. A loop goes through the link to each episode and returns a list containing each character and the line they said across all episodes of the season.

In [8]:
# loops through all episodes of a single season
def parse_season(link, season):
    """Scrape every episode linked from a season page and pool the rows.

    Episodes are numbered from 1 in the order their links appear in the
    'gallery-0' element. The pooled rows are written to
    'data/seasons/season <season>.csv' via ``write_data`` and returned.
    For season 21 the loop stops once episode 9 has been processed.
    """
    print('\n-----------Season:', link)

    response = requests.get(link)
    season_page = BeautifulSoup(response.content, 'html.parser')
    gallery = season_page.find('div', id='gallery-0')
    episode_links = gallery.find_all('a', class_='link-internal')

    print('----------# of Ep:', len(episode_links))
    season_rows = []
    for episode_number, anchor in enumerate(episode_links, start=1):
        episode_rows = parse_episode(url + anchor['href'], season, episode_number)
        season_rows.extend(episode_rows)
        # Season 21 is cut off after its ninth episode.
        if season == 21 and episode_number == 9:
            break
    write_data(season_rows, 'data/seasons/season ' + str(season))
    return season_rows

In [9]:
# Fetch the portal page; its 'gallery-0' element holds one link per season.
main_page = BeautifulSoup(requests.get(url + portal).content, 'html.parser')
main_table = main_page.find('div', id='gallery-0')

# Rows from every season, accumulated in scrape order.
temp = []

# Seasons are numbered 1..N by the position of their link in the gallery.
for count, a in enumerate(main_table.findAll('a', class_='link-internal'), start=1):
    season_url = strip_season_url(a['href'])
    temp.extend(parse_season(url + portal + season_url, count))

df = pd.DataFrame(temp)
df = df[['season', 'episode', 'character', 'line']]
# Raw lines: once comma-separated, once with '}' as the delimiter.
df.to_csv('data/all-seasons.csv', encoding='utf-8')
df.to_csv('data/all-seasons-sep.csv', encoding='utf-8', sep='}')
# From here on `df['line']` holds the lines with bracketed asides removed.
df['line'] = df['line'].apply(strip_extras)
df.to_csv('data/all-seasons-cleaned-sep.csv', sep='}', encoding='utf-8')


-----------Season: http://southpark.wikia.com/wiki/Portal:Scripts/Season_One
----------# of Ep: 13
----------Episode: http://southpark.wikia.com/wiki/Cartman_Gets_an_Anal_Probe/Script
-------# of Lines: 507
----------Episode: http://southpark.wikia.com/wiki/Weight_Gain_4000/Script
-------# of Lines: 399
----------Episode: http://southpark.wikia.com/wiki/Volcano/Script
-------# of Lines: 406
----------Episode: http://southpark.wikia.com/wiki/Big_Gay_Al%27s_Big_Gay_Boat_Ride/Script
-------# of Lines: 528
----------Episode: http://southpark.wikia.com/wiki/An_Elephant_Makes_Love_to_a_Pig/Script
-------# of Lines: 139
----------Episode: http://southpark.wikia.com/wiki/Death/Script
-------# of Lines: 528
----------Episode: http://southpark.wikia.com/wiki/Pinkeye/Script
-------# of Lines: 523
----------Episode: http://southpark.wikia.com/wiki/Starvin%27_Marvin/Script
-------# of Lines: 472
----------Episode: http://southpark.wikia.com/wiki/Mr._Hankey,_the_Christmas_Poo/Script
-------# of Lin

-------# of Lines: 355
----------Episode: http://southpark.wikia.com/wiki/Here_Comes_the_Neighborhood/Script
-------# of Lines: 376
----------Episode: http://southpark.wikia.com/wiki/Kenny_Dies/Script
-------# of Lines: 274
----------Episode: http://southpark.wikia.com/wiki/Butters%27_Very_Own_Episode/Script
-------# of Lines: 324

-----------Season: http://southpark.wikia.com/wiki/Portal:Scripts/Season_Six
----------# of Ep: 17
----------Episode: http://southpark.wikia.com/wiki/Jared_Has_Aides/Script
-------# of Lines: 387
----------Episode: http://southpark.wikia.com/wiki/Asspen/Script
-------# of Lines: 317
----------Episode: http://southpark.wikia.com/wiki/Freak_Strike/Script
-------# of Lines: 365
----------Episode: http://southpark.wikia.com/wiki/Fun_with_Veal/Script
-------# of Lines: 422
----------Episode: http://southpark.wikia.com/wiki/The_New_Terrance_and_Phillip_Movie_Trailer/Script
-------# of Lines: 345
----------Episode: http://southpark.wikia.com/wiki/Professor_Chaos/Sc

----------# of Ep: 14
----------Episode: http://southpark.wikia.com/wiki/With_Apologies_to_Jesse_Jackson/Script
-------# of Lines: 249
----------Episode: http://southpark.wikia.com/wiki/Cartman_Sucks/Script
-------# of Lines: 290
----------Episode: http://southpark.wikia.com/wiki/Lice_Capades/Script
-------# of Lines: 260
----------Episode: http://southpark.wikia.com/wiki/The_Snuke/Script
-------# of Lines: 366
----------Episode: http://southpark.wikia.com/wiki/Fantastic_Easter_Special/Script
-------# of Lines: 264
----------Episode: http://southpark.wikia.com/wiki/D-Yikes!/Script
-------# of Lines: 286
----------Episode: http://southpark.wikia.com/wiki/Night_of_the_Living_Homeless/Script
-------# of Lines: 298
----------Episode: http://southpark.wikia.com/wiki/Le_Petit_Tourette/Script
-------# of Lines: 270
----------Episode: http://southpark.wikia.com/wiki/More_Crap/Script
-------# of Lines: 247
----------Episode: http://southpark.wikia.com/wiki/Imaginationland/Script
-------# of Lin

-------# of Lines: 204
----------Episode: http://southpark.wikia.com/wiki/Insecurity/Script
-------# of Lines: 272
----------Episode: http://southpark.wikia.com/wiki/Going_Native/Script
-------# of Lines: 222
----------Episode: http://southpark.wikia.com/wiki/A_Nightmare_on_Face_Time/Script
-------# of Lines: 285
----------Episode: http://southpark.wikia.com/wiki/A_Scause_For_Applause/Script
-------# of Lines: 248
----------Episode: http://southpark.wikia.com/wiki/Obama_Wins!/Script
-------# of Lines: 274

-----------Season: http://southpark.wikia.com/wiki/Portal:Scripts/Season_Seventeen
----------# of Ep: 10
----------Episode: http://southpark.wikia.com/wiki/Let_Go,_Let_Gov/Script
-------# of Lines: 243
----------Episode: http://southpark.wikia.com/wiki/Informative_Murder_Porn/Script
-------# of Lines: 258
----------Episode: http://southpark.wikia.com/wiki/World_War_Zimmerman/Script
-------# of Lines: 290
----------Episode: http://southpark.wikia.com/wiki/Goth_Kids_3:_Dawn_of_the_Pose