## Raspagem de textos de episódios

In [10]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
import re
import json

In [11]:
def is_line(line):
    """Tell whether ``line`` contains a ``<b>`` element.

    ``parse_episode_text`` uses this to separate spoken lines (which carry a
    bold speaker label) from everything else.

    Args:
        line: An object exposing ``find(name)``; in this notebook, an element
            returned by ``soup.find_all('dl')``.

    Returns:
        bool: True when ``line.find('b')`` returns something other than None.
    """
    # Identity comparison is the idiomatic None check (PEP 8) and cannot be
    # fooled by a custom __eq__/__ne__ on the object that was found.
    return line.find('b') is not None

def get_character(line):
    """Return the text of the ``<b>`` element found in ``line``.

    Args:
        line: An object exposing ``find(name)``; the result of ``find('b')``
            must have a ``text`` attribute.

    Returns:
        The ``text`` of the bold element (the raw speaker label).
    """
    bold = line.find('b')
    return bold.text

def get_character_text(line):
    """Return what is said in ``line``, without the leading speaker label.

    Args:
        line: An object accepted by ``get_character`` that also has a
            ``text`` attribute.

    Returns:
        str: ``line.text`` with the first ``len(label) + 2`` characters cut
        off and surrounding whitespace stripped.
    """
    speaker = get_character(line)
    # Skip the label plus two more characters (presumably the ": " separator;
    # this assumes the label sits at the very start of the text).
    start = len(speaker) + 2
    spoken = line.text[start:]
    return spoken.strip()

def get_noncharacter_text(line):
    """Return the unmodified text of a line that has no speaker label.

    Args:
        line: An object with a ``text`` attribute.

    Returns:
        The value of ``line.text``.
    """
    content = line.text
    return content

def get_map_character_names():
    """Return the lookup table from character aliases to canonical names.

    Returns:
        dict: A freshly built ``{alias: canonical name}`` mapping.
    """
    alias_pairs = (
        ("LSP", "Lumpy Space Princess"),
        ("PB", "Princess Bubblegum"),
        ("Bubblegum", "Princess Bubblegum"),
        ("Susan", "Susan Strong"),
        ("Hunson", "Hunson Abadeer"),
        ("Starchie", "Starchy"),
        ("Gunther", "Gunter"),
        ("Lich", "The Lich"),
        ("Simon", "Simon Petrikov"),
    )
    return dict(alias_pairs)


def parse_episode_text(lines):
    """Convert transcript elements into ``{'origin', 'text'}`` records.

    Elements for which ``is_line`` is true become dialogue records: the
    speaker label is cleaned (quotes and newlines removed, whitespace
    stripped) and mapped to its canonical name when it is a known alias.
    Every other element becomes a record whose origin is ``'action'``.

    Args:
        lines: Iterable of elements accepted by ``is_line``,
            ``get_character``, ``get_character_text`` and
            ``get_noncharacter_text``.

    Returns:
        list[dict]: One ``{'origin': ..., 'text': ...}`` dict per element, in
        input order.
    """
    # The alias table never changes, so build it once instead of up to twice
    # for every line.
    character_names = get_map_character_names()

    text = []
    for line in lines:
        if is_line(line):
            origin = get_character(line).strip().replace("\"", "").replace("\n", "").strip()
            # Fall back to the cleaned label when it is not a known alias.
            origin = character_names.get(origin, origin)
            line_dict = {'origin': origin, 'text': get_character_text(line)}
        else:
            line_dict = {'origin': 'action', 'text': get_noncharacter_text(line)}
        text.append(line_dict)
    return text

def parse_episode_text_special(soup):
    """Parse a transcript laid out as ``<p>`` paragraphs.

    The first paragraph is skipped. Each remaining paragraph is classified as:

    1. dialogue, when its text contains a run of letters/whitespace followed
       by ``": "`` (that run is the speaker label);
    2. dialogue, when it contains a ``<b>`` element (the first one is the
       speaker label and is removed from the paragraph);
    3. ``'action'`` otherwise.

    Speaker labels are mapped to canonical names when they are known aliases.

    Args:
        soup: Parsed page exposing ``find_all('p')``. Paragraphs of the second
            kind are mutated: their first ``<b>`` element is extracted.

    Returns:
        list[dict]: One ``{'origin': ..., 'text': ...}`` dict per paragraph.
    """
    pattern = r"([A-Za-z\s]+):\s.*?"
    # The alias table never changes, so build it once for the whole page.
    character_names = get_map_character_names()

    lines = soup.find_all('p')
    text = []
    for line in lines[1:]:
        raw_text = line.text
        match = re.search(pattern, raw_text)
        b = line.find_all('b')

        if match:
            origin = match.group(1).strip().replace("\"", "").replace("\n", "").strip()
            # Cut out only the matched speaker label and its colon. Replacing
            # the name and ":" everywhere in the paragraph would also delete
            # later mentions of the speaker and every colon in the dialogue.
            remainder = raw_text[:match.start(1)] + raw_text[match.end(1) + 1:]
            line_text = remainder.replace("\n", "").strip()
        elif b:
            origin = b[0].text
            origin = origin.replace(": ", "")
            origin = origin.replace(":", "")
            origin = origin.strip().replace("\"", "").replace("\n", "").strip()
            # Remove the label element so it is not part of the spoken text.
            b[0].extract()
            line_text = line.text.strip()
        else:
            origin = 'action'
            line_text = raw_text.strip()

        text.append({
            'origin': character_names.get(origin, origin),
            'text': line_text,
        })
    return text

In [12]:
def parse_episode_data(url, timeout=30):
    """Download one transcript page and return its parsed episode record.

    Args:
        url: Address of the transcript page.
        timeout: Seconds to wait for the server before ``requests`` gives up.
            Without it a stalled connection would block the scrape forever.

    Returns:
        dict: ``title`` (str), ``season_number`` (int), ``episode_number``
        (int) and ``text`` (list of ``{'origin', 'text'}`` dicts).

    Raises:
        requests.exceptions.RequestException: If the page cannot be fetched
            (including a timeout).
        AttributeError: If the page has no ``<h1>`` element.
        ValueError: If a scraped season/episode number is not numeric.
    """
    episode = {}
    response = requests.get(url, timeout=timeout)
    html_content = response.text

    # Build a BeautifulSoup object for parsing
    soup = BeautifulSoup(html_content, 'html.parser')

    # Locate the elements of interest by tag name
    metadata = soup.find('aside')
    lines = soup.find_all('dl')

    title = soup.find('h1').text.split('/')[0].strip()
    # Pages with more than ten <dl> elements use the regular transcript
    # layout; the others are parsed from their <p> paragraphs.
    if len(lines) > 10:
        text = parse_episode_text(lines)
    else:
        text = parse_episode_text_special(soup)
    season_number, episode_number = parse_episode_number(metadata)

    # "Distant" refers to "Adventure Time: Distant Lands", which is not part of
    # any official season of the show and is therefore outside this analysis.
    # Such episodes are tagged as season 99 so they can be removed later.
    if season_number == 'Distant':
        season_number = 99

    episode['title'] = title.replace("(episode)", "").strip()
    episode['season_number'] = int(season_number)
    episode['episode_number'] = int(episode_number)
    episode['text'] = text

    return episode


def parse_episode_number(metadata):
    """Extract the season and episode numbers from a page's info box.

    The text of the ``<nav>`` element inside ``metadata`` is split on ``", "``;
    the second whitespace-separated word of the first part is taken as the
    season and the second word of the second part as the episode.

    Args:
        metadata: Object exposing ``find('nav')``, or None when the page has
            no info box.

    Returns:
        tuple: ``(season_number, episode_number)`` as the strings found on the
        page, or ``(0, 0)`` when the box is missing or not in that format.
    """
    try:
        index = metadata.find('nav').text.split(', ')
        season_number = index[0].split()[1]
        episode_number = index[1].split()[1]
    except (AttributeError, IndexError):
        # AttributeError: no info box / no <nav>; IndexError: unexpected text
        # layout. Anything else (e.g. KeyboardInterrupt) must not be hidden.
        season_number = 0
        episode_number = 0
    return season_number, episode_number

In [13]:
# Download the wiki category page that lists the episode transcripts.
url = 'https://adventuretime.fandom.com/wiki/Category:Transcripts'
response = requests.get(url)
html_content = response.text

# Build a BeautifulSoup object for parsing
soup = BeautifulSoup(html_content, 'html.parser')

main = soup.find('main')
# NOTE(review): the hard-coded position 16 and the skipped first anchor depend
# on the page layout at scraping time - confirm they still select the link list.
links = main.find_all('div')[16].find_all('a')[1:]
# Keep only links whose text mentions "Transcript", as absolute URLs.
urls = []
for link in links:
    if 'Transcript' in link.text:
        urls.append('https://adventuretime.fandom.com' + link['href'])


# Fetch and parse every transcript page (one HTTP request per episode).
# Note that the loop rebinds the notebook-level name `url`.
episodes = []
for url in urls:
    episode = parse_episode_data(url)
    episodes.append(episode)


### Criação do Dataset

In [14]:
# One row per scraped episode dict (title, season_number, episode_number, text).
df = pd.DataFrame(episodes)

### Tratamento de casos especiais

In [15]:
# Show the episodes whose season or episode number could not be scraped
# (parse_episode_number falls back to 0 in that case).
df[df[['season_number', 'episode_number']].eq(0).any(axis=1)]


Unnamed: 0,title,season_number,episode_number,text
0,Abstract,0,0,"[{'origin': 'Jake', 'text': 'Hey, brother. Pas..."
6,Animated short,0,0,"[{'origin': 'action', 'text': '[The episode st..."
57,Destiny,0,0,[]
58,Diamonds and Lemons,0,279,"[{'origin': 'action', 'text': '[The episode op..."
107,Hero Heart,0,0,"[{'origin': 'action', 'text': '[The episode be..."
138,Jerry,0,0,[]


In [16]:
# Manually set season/episode numbers for rows selected by positional label.
# NOTE(review): the listing displayed before this cell shows 'Abstract' at index 0
# with season/episode 0 and does not include index 1; the later `df.head(3)`
# output shows index 1 ('All the Little People') carrying 9/10 while 'Abstract'
# is still 0/0. Confirm whether index 0 was intended. Changing it also changes
# which rows the later season-0 filter removes, and therefore the positional
# indices used by the cells that follow.
df.loc[1, ['season_number', 'episode_number']] = [9, 10]
# This episode is a special: it is not part of any official season of the show
# and is therefore outside this analysis (season 99 marks rows to be removed).
df.loc[58, ['season_number', 'episode_number']] = [99, 0]
df.loc[107, ['season_number', 'episode_number']] = [9, 8]

In [17]:
# Remove the row labelled 7 and renumber the index from 0.
# NOTE(review): which episode sits at index 7 is not shown in the notebook;
# re-running this cell removes a further row each time.
df = df.drop(index=7).reset_index(drop=True)

## Raspagem de metadados de episódios

In [18]:
# Download the Wikipedia episode list, used as the source of credits metadata.
url = 'https://en.wikipedia.org/wiki/List_of_Adventure_Time_episodes'
response = requests.get(url)
html_content = response.text

# Build a BeautifulSoup object for parsing
soup = BeautifulSoup(html_content, 'html.parser')

# Strip every <sup> element so its text does not end up in the cell contents.
elements_to_remove = soup.find_all('sup')
for element in elements_to_remove:
    element.extract()

# Keep ten of the episode tables, skipping the first one.
# NOTE(review): presumably one table per season (1-10) - the slice depends on
# the page layout at scraping time; confirm.
tables = soup.find_all(class_='wikiepisodetable')[1:11]

In [19]:
def parse_storyboarders_writers(cell):
    """Split the credits text of a table cell into a list of names.

    Names may be separated by ``", "``, ``" & "`` or ``", & "``, in any
    combination, so ``"A, B & C"`` yields three names. Previously the ``&``
    was merely deleted once a comma was present, which glued the last two
    names together as ``"B C"``.

    Args:
        cell: An object with a ``text`` attribute holding the credits string.

    Returns:
        list[str]: The names in their original order; a text without any
        separator is returned as a single-element list.
    """
    st_text = cell.text
    # A single pass over every separator form; ", & " is matched as a whole by
    # the first alternative, so it never leaves an empty or "&"-prefixed name.
    storyboarded = re.split(r", (?:& )?| & ", st_text)
    return storyboarded

In [20]:
# Build {season number (str): {episode number text: {"directors", "writers"}}}
# from the scraped tables; table i is stored under the key str(i + 1).
# Note: this cell rebinds the notebook-level names `episodes` and `episode`.
seasons = {}
for i, table in enumerate(tables):
    # Skip the first row of each table (presumably the header row).
    episodes = table.find_all('tr')[1:]
    season = {}
    for episode in episodes:
        ep = {}

        info = episode.find_all('td')

        try:
            number = info[0].text
            directed = list(info[2].stripped_strings)
            writers = parse_storyboarders_writers(info[3])

            ep["directors"] = directed
            ep["writers"] = writers

            season[number] = ep
        except IndexError:
            # Rows with fewer than four <td> cells carry no credits and are
            # skipped. Only this expected failure is silenced, so genuine
            # errors (and Ctrl-C) are no longer hidden by a bare `except`.
            continue
    seasons[str(i+1)] = season

In [21]:
def get_episode_metadata(season_number, episode_number, metadata):
    """Look up the directors and writers of one episode.

    Args:
        season_number: Season identifier; converted with ``str`` before lookup.
        episode_number: Episode identifier; converted with ``str`` before
            lookup.
        metadata: Nested mapping ``{season: {episode: {"directors": ...,
            "writers": ...}}}`` keyed by strings.

    Returns:
        tuple: ``(directors, writers)``; ``(None, None)`` when the season or
        the episode is not in ``metadata``. A field missing from an existing
        entry is returned as None.
    """
    season_key, episode_key = str(season_number), str(episode_number)
    try:
        entry = metadata[season_key][episode_key]
    except KeyError:
        return None, None
    return entry.get("directors"), entry.get("writers")

In [22]:
# Attach the credits to every episode; rows without a match in `seasons`
# receive (None, None).
df[["directors", "writers"]] = df.apply(
    lambda row: get_episode_metadata(row['season_number'], row['episode_number'], seasons),
    axis='columns',
    result_type='expand',
)


## Tratamento individual de casos especiais
Alguns episódios específicos têm o texto ou o html em outro formato e portanto não podem ser obtidos com o código abstraído executado anteriormente. Estes casos serão tratados individualmente nas células a seguir.

In [23]:
# Preview the first three rows after the metadata columns were added.
df.head(3)

Unnamed: 0,title,season_number,episode_number,text,directors,writers
0,Abstract,0,0,"[{'origin': 'Jake', 'text': 'Hey, brother. Pas...",,
1,All the Little People,9,10,"[{'origin': 'action', 'text': '[Finn and Jake ...",[Adam Muto],"[Graham Falk, Laura Knetzger]"
2,All Your Fault,5,9,"[{'origin': 'action', 'text': '[Princess Bubbl...","[Larry Leichliter, Nate Cash]","[Tom Herpich, Steve Wolfhard]"


In [24]:
# List the episodes whose transcript has fewer than three entries.
df[df['text'].map(len) < 3]

Unnamed: 0,title,season_number,episode_number,text,directors,writers
51,Daddy-Daughter Card Wars,8,11,"[{'origin': 'action', 'text': ''}, {'origin': ...",[Andres Salaff],"[Steve Wolfhard, Adam Muto]"
56,Destiny,0,0,[],,
108,High Strangeness,8,17,"[{'origin': 'action', 'text': ''}, {'origin': ...",[Elizabeth Ito],"[Pendleton Ward, Sam Alden]"
137,Jerry,0,0,[],,


In [25]:
# "Daddy-Daughter Card Wars" has no transcript, so its row is removed.
# The row is selected by title instead of a hard-coded position: the listing
# displayed before this cell shows the episode at index 51, so dropping
# index 52 removed the episode after it and kept the empty one.
df = df[df['title'] != 'Daddy-Daughter Card Wars'].reset_index(drop=True)

In [26]:
# Print the text of the second transcript entry of the row at position 107
# (per the note below, the "High Strangeness" episode).
print(df.iloc[107]['text'][1]['text'])

(SNORING) (SNORING LOUDLY) (SNORING LIGHTLY) Oh, it's you. Meh. Meh. (SLOW MOTION) Hi, everyone! My babies! (POWER SAW WHIRRING) (CRICKETS CHIRPING) Mashed potato? Pie pie? (SNORING) (SNORING LIGHTLY)  (LAUGHS) Okay, bye! Hey, where'd you go? I think I was visited by aliens again. Every few years, they stop by and show me my hybrid children. (LAUGHS) What? Don't be jealous of my alien consorts, Mr. Pig. I love you the most. (CHUCKLES) All right, sweetie. Thank you. (SMOOCHES) You gonna come with me to Princess Bubblegum's fireworks show tomorrow? No, she wizzed on our wedding. I'm sorry, but she's a fat fish in the sand. I want to see free fireworks, though, and I want you to come with me. Eh, all right. (SMACK! SQUEAL!) (MUSIC PLAYING) (MUSIC STOPS) Science rules! Here's some proof. (CAMERA SHUTTER CLICKING) I'm doing the right thing. (WARBLING) Free fireworks! Right! Here we go! (APPLAUSE) (WHIR!) (WHISTLE!) (LAUGHING) Whoa-ho-ho-ho! Whoa! Whoa-ho-ho! Hey, where's my wife? (SLOW MOTI

No caso do episódio High Strangeness, pode-se reparar que a transcrição não possui separação entre linhas e, por isso, ocupa apenas uma posição na lista. Este tipo de caso não será removido, pois o seu texto ainda poderá ser usado para modelagem de tópicos.


## Tratamento de valores nulos

Há ainda casos em que existem valores nulos no dataset. A célula a seguir encontrará as linhas em que isto acontece.

In [27]:
# Show every row that still has a missing value in any column.
df[df.isnull().any(axis='columns')]

Unnamed: 0,title,season_number,episode_number,text,directors,writers
0,Abstract,0,0,"[{'origin': 'Jake', 'text': 'Hey, brother. Pas...",,
6,Animated short,0,0,"[{'origin': 'action', 'text': '[The episode st...",,
27,BMO,99,1,"[{'origin': 'action', 'text': '[The episode st...",,
45,Come Along With Me,10,13,"[{'origin': 'action', 'text': '[Outside a very...",,
55,Destiny,0,0,[],,
56,Diamonds and Lemons,99,0,"[{'origin': 'action', 'text': '[The episode op...",,
110,Holly Jolly Secrets Part I,3,19,"[{'origin': 'action', 'text': '[The episode be...",,
111,Holly Jolly Secrets Part II,3,20,"[{'origin': 'action', 'text': '[The episode op...",,
136,Jerry,0,0,[],,
144,Lemonhope Part 2,5,51,"[{'origin': 'action', 'text': '[A small cloud ...",,


Como se pode ver, isto acontece em apenas 10 linhas.

> Vale ainda ressaltar que, entre estes 10 casos, 6 são de especiais ou de episódios sem temporada identificada (aqui marcados como parte da temporada 99 ou 0). Estas linhas serão removidas por não fazerem parte do escopo deste trabalho, restando 4 casos a serem de fato tratados. A célula a seguir removerá estas linhas.

In [31]:
# Discard the specials (season 99) ...
df = df.drop(df.index[df['season_number'] == 99])
# ... and the episodes whose season could not be determined (season 0),
# then renumber the index from 0.
rows_to_remove = df.loc[df['season_number'] == 0]
df = df.drop(rows_to_remove.index).reset_index(drop=True)

# Rows that still have missing values after the removal.
df[df.isna().any(axis=1)]

Unnamed: 0,title,season_number,episode_number,text,directors,writers
42,Come Along With Me,10,13,"[{'origin': 'action', 'text': '[Outside a very...",,
138,Lemonhope Part 2,5,51,"[{'origin': 'action', 'text': '[A small cloud ...",,


Sendo apenas poucos, pode-se preencher manualmente os dados faltantes. Isto é feito a seguir:

In [29]:
def fill_cell(index, directors, writers):
    """Set the credits of one row of the notebook-level ``df``.

    The values are stored as Python lists, the same type the scraped rows
    hold. Storing ``json.dumps`` strings left the column with a mix of lists
    and strings.

    Args:
        index: Index label of the row to fill.
        directors: List of director names.
        writers: List of writer names.
    """
    # `.at` writes a single cell, so a list is stored as one object instead of
    # being spread over several cells as `.loc` would attempt.
    df.at[index, 'directors'] = directors
    df.at[index, 'writers'] = writers

# Manually fill the credits of the two rows still missing them. Indices 42 and
# 138 are the positions of "Come Along With Me" and "Lemonhope Part 2" in the
# listing displayed before this cell; they change if earlier cells drop
# different rows.
fill_cell(42, ["Cole Sanchez", "Diana Lafyatis"], ["Tom Herpich", "Steve Wolfhard", "Somvilay Xayaphone", "Seo Kim", "Aleks Sennwald", "Hanna K. Nyström", "Sam Alden", "Graham Falk"])
fill_cell(138, ["Nate Cash"], ["Tom Herpich", "Steve Wolfhard"])

In [30]:
import pickle

# Output path, relative to the notebook's working directory.
file_name = 'data.pkl'

# Save the DataFrame to disk using pickle
with open(file_name, 'wb') as file:
    pickle.dump(df, file)