# Exploring Steven Universe

This notebook explores the data I scraped from the [Steven Universe Fandom Wiki](https://steven-universe.fandom.com/wiki/Episode_Guide). I want to see how large the datasets are so that I can decide whether or not to create a database to store the data or just use pandas DataFrames. 

## To-do list:
- Create additional columns e.g., calculating length of a season
- Find most salient tokens using TF-IDF
    - Create a dictionary that counts the number of times tokens are in the summary column
    - Write a function that lists the top K tokens, in descending order
    - Write a function that lists the tokens that occur at least some specified minimum number of times
    - Write a function that maps each token to its TF-IDF

In [201]:
import csv
import pandas as pd
import numpy as np

In [202]:
data_pathname = '../data/'

seasons_df = pd.read_csv(data_pathname + 'seasons.csv')

In [203]:
seasons_df

Unnamed: 0,name,num_episodes,start_date,end_date
0,1,52,2013-11-04 00:00:00,2015-03-12 00:00:00
1,2,26,2015-03-13 00:00:00,2016-01-08 00:00:00
2,3,25,2016-05-12 00:00:00,2016-08-10 00:00:00
3,4,25,2016-08-11 00:00:00,2017-05-11 00:00:00
4,5,32,2017-05-29 00:00:00,2019-01-21 00:00:00
5,Movie,1,2019-09-02 00:00:00,2019-09-02 00:00:00
6,Future,20,2019-12-07 00:00:00,2020-03-27 00:00:00


In [204]:
seasons_df.dtypes

name            object
num_episodes     int64
start_date      object
end_date        object
dtype: object

In [205]:
# Need to convert date columns to datetime objects
for col in ['start_date', 'end_date']:
    seasons_df[col] = pd.to_datetime(seasons_df[col])
    
seasons_df.dtypes

name                    object
num_episodes             int64
start_date      datetime64[ns]
end_date        datetime64[ns]
dtype: object

In [206]:
seasons_df['length'] = seasons_df['end_date'] - seasons_df['start_date']
seasons_df[seasons_df['name'] != "Movie"]['length'].describe()

count                           6
mean            311 days 16:00:00
std      204 days 01:15:45.296281
min              90 days 00:00:00
25%             151 days 12:00:00
50%             287 days 00:00:00
75%             445 days 00:00:00
max             602 days 00:00:00
Name: length, dtype: object

In [209]:
# Visualize the length of each season
import matplotlib.pyplot as plt
import seaborn as sns

# LATER

In [210]:
episodes_df = pd.read_csv(data_pathname + 'episodes.csv')

In [211]:
episodes_df.head()

Unnamed: 0,title,season,num_series,num_season,airdate,summary
0,Gem Glow,1,1,1,,Steven thinks his favorite ice cream sandwiche...
1,Laser Light Cannon,1,2,2,,"A magical comet hurtles toward Beach City, and..."
2,Cheeseburger Backpack,1,3,3,,A mission to the Lunar Sea Spire takes a treac...
3,Together Breakfast,1,4,4,,Steven tries to get the Crystal Gems to take p...
4,Frybo,1,5,5,,Steven helps a Boardie kid with his grueling j...


In [212]:
episodes_df.shape

(175, 6)

In [213]:
import re

def create_list_tokens(input_string):
    '''
    Takes input (should be a string) and returns a list of tokens 
    '''
    if not isinstance(input_string, str):
        string = str(input_string)
    else:
        string = input_string
        
    list_of_tokens = string.split()
    cleaned_list = []
    
    for token in list_of_tokens:
        stripped_token = re.sub(r'[^\w\s]', '', token).lower() 
        cleaned_token = re.sub(r'[0-9]', '', stripped_token)
        
        if cleaned_token != '':
            cleaned_list.append(cleaned_token)
        
    return cleaned_list

In [214]:
# Create a helper function that counts the distinct number of tokens
def count_distinct_tokens(list_of_tokens, token_counts):
    '''
    Takes as input a list of tokens and updates a dictionary with each 
    distinct string and the number of times it appears in the list.
    
    Inputs:
        - list_of_tokens (list): List of tokens
        - token_counts (dict): Dictionary mapping token to counts
        
    Returns None, updates dictionary in place
    '''
    for token in list_of_tokens:
        if token not in token_counts:
            token_counts[token] = 1
        else:
            token_counts[token] += 1
    
    return None

In [215]:
episodes_df['list_of_tokens'] = episodes_df.apply(lambda row: create_list_tokens(row['summary']), axis = 1)
episodes_df['list_of_tokens'].head()

0    [steven, thinks, his, favorite, ice, cream, sa...
1    [a, magical, comet, hurtles, toward, beach, ci...
2    [a, mission, to, the, lunar, sea, spire, takes...
3    [steven, tries, to, get, the, crystal, gems, t...
4    [steven, helps, a, boardie, kid, with, his, gr...
Name: list_of_tokens, dtype: object

In [216]:
episodes_df.head()

Unnamed: 0,title,season,num_series,num_season,airdate,summary,list_of_tokens
0,Gem Glow,1,1,1,,Steven thinks his favorite ice cream sandwiche...,"[steven, thinks, his, favorite, ice, cream, sa..."
1,Laser Light Cannon,1,2,2,,"A magical comet hurtles toward Beach City, and...","[a, magical, comet, hurtles, toward, beach, ci..."
2,Cheeseburger Backpack,1,3,3,,A mission to the Lunar Sea Spire takes a treac...,"[a, mission, to, the, lunar, sea, spire, takes..."
3,Together Breakfast,1,4,4,,Steven tries to get the Crystal Gems to take p...,"[steven, tries, to, get, the, crystal, gems, t..."
4,Frybo,1,5,5,,Steven helps a Boardie kid with his grueling j...,"[steven, helps, a, boardie, kid, with, his, gr..."


In [250]:
episodes_df.loc[episodes_df.title == 'Gem Glow']['season'].values[0]

'1'

In [217]:
# Need to create a dictionary of tokens for the summaries
summary_token_map = {}

episodes_df.apply(lambda row: count_distinct_tokens(row['list_of_tokens'], summary_token_map), axis=1)

0      None
1      None
2      None
3      None
4      None
       ... 
170    None
171    None
172    None
173    None
174    None
Length: 175, dtype: object

In [218]:
summary_token_map

{'steven': 153,
 'thinks': 3,
 'his': 34,
 'favorite': 3,
 'ice': 1,
 'cream': 2,
 'sandwiches': 1,
 'are': 6,
 'the': 119,
 'trick': 1,
 'to': 115,
 'summoning': 1,
 'magic': 2,
 'shield': 1,
 'but': 15,
 'learns': 5,
 'otherwise': 1,
 'when': 11,
 'facing': 1,
 'off': 6,
 'with': 26,
 'an': 11,
 'acidspewing': 1,
 'insect': 1,
 'monster': 6,
 'a': 106,
 'magical': 9,
 'comet': 1,
 'hurtles': 1,
 'toward': 1,
 'beach': 14,
 'city': 15,
 'and': 81,
 'must': 2,
 'dig': 1,
 'through': 3,
 'fathers': 1,
 'collection': 1,
 'of': 33,
 'old': 4,
 'junk': 1,
 'past': 2,
 'find': 9,
 'weapon': 1,
 'that': 14,
 'can': 8,
 'save': 1,
 'town': 3,
 'mission': 6,
 'lunar': 2,
 'sea': 3,
 'spire': 3,
 'takes': 7,
 'treacherous': 1,
 'turn': 2,
 'has': 11,
 'packed': 1,
 'totally': 1,
 'amazing': 1,
 'cheeseburger': 1,
 'backpack': 1,
 'anything': 1,
 'they': 5,
 'could': 1,
 'ever': 1,
 'need': 1,
 'tries': 8,
 'get': 11,
 'crystal': 16,
 'gems': 35,
 'take': 8,
 'part': 2,
 'in': 29,
 'special': 5,

## Calculating TF-IDF
According to Wikipedia, term frequency–inverse document frequency (aka tf–idf) is a statistic designed to reflect how important a word is to a document in a collection or corpus and is often used as a weighting factor in information retrieval and text mining. A word or term is considered salient to a particular document if it occurs frequently in that document, but not in the document corpus over all.

Term frequency–inverse document frequency is defined as:

$$\text{tf-idf}(t,d,D)= \text{tf}(t,d) ⋅ \text {idf}(t,D)$$

where _t_ is a term, _d_ is a document (collection of terms), _D_ is the collection of documents, and _tf_ and _idf_ are defined below.

There are several variants of both term frequency (tf) and inverse document frequency (idf) that can be used to compute tf-idf. We will be using augumented frequency as our measure of term frequency, and we will use vanilla inverse document frequency.

The augmented frequency of a term _t_ in a document _d_ is defined as

$$\text {tf}(t,d)=0.5+0.5⋅\frac {f_{t,d}}{\text {max}({f_{t′,d}}:t′∈d)}$$

where $f_{t,d}$ is the number of times the term _t_ appears in the document _d_.

The vanilla inverse document frequency of a term _t_ in a document collection _D_ is defined as

$$\text {idf} (t,D)=\text {log} \frac{N}{| d∈D:t∈d |}$$
where _N_ is the number of documents in the document collection D.

Use the natural log (math.log) in the _idf_ computation.

In [219]:
def sort_by_count(token_count):
    '''
    Takes a dictionary of token and counts, and returns a list
    of tuples, sorted as expected
    '''
    return sorted(token_count.items(), key=lambda x: x[1], reverse=True)


def calculate_tf(term, token_count):
    '''
    Calculates the term frequency in a specific document
    
    Inputs:
        - term (str): Term of interest
        - token_count (dict): Dictionary mapping terms to 
                              their frequencies in the doc
    '''
    f_td = token_count[term]
    max_ftd = sort_by_count(token_count)[0][1] 
    
    return 0.5 + (0.5 * (f_td / max_ftd))

In [220]:
test_d = {"a": 1, "b": 2, "c": 3}
test_sorted = sort_by_count(test_d)
test_sorted

[('c', 3), ('b', 2), ('a', 1)]

In [221]:
calculate_tf("a", test_d)

0.6666666666666666

In [222]:
calculate_tf("b", test_d)

0.8333333333333333

In [223]:
calculate_tf("c", test_d)

1.0

In [224]:
import math

def count_documents_with_term(term, corpus):
    '''
    Counts the number of documents that contain the term
    
    Inputs:
        - term (str): The term of interest
        - corpus (dict): The collection of documents where the 
            key is some identifier, and the value is a list of
            tokens
    '''
    num_documents=0
    
    for document, tokens in corpus.items():
        if term in tokens:
             num_documents += 1
    
    return num_documents


def calculate_idf(term, corpus):
    '''
    Calculates the vanilla inverse document frequency of a term
    
    Inputs:
        - term (str): The term of interest
        - corpus (dict): The collection of documents where the 
            key is some identifier, and the value is a list of
            tokens
    '''
    docs_with_term = count_documents_with_term(term, corpus)
    
    return math.log(len(corpus) / docs_with_term)


def calculate_tfidf(term, document_tokens, corpus):
    '''
    Calculate the tf-idf of a term in a document, given a corpus
    
    Inputs:
    - term (str): The term of interest
    - document_tokens (list): List of tokens in a document
    - corpus (dict): The collection of documents where the 
            key is some identifier, and the value is a list of
            tokens
    '''
    token_frequency_map = {}
    count_distinct_tokens(document_tokens, token_frequency_map)
    
    tf = calculate_tf(term, token_frequency_map)
    idf = calculate_idf(term, corpus)
    tfidf = tf * idf
        
    return tfidf

In [225]:
# Test on episode summaries
# Every episode summary is a document
# All episode summaries can be the corpus

corpus = {}

for i in range(len(episodes_df)):
    corpus[episodes_df.loc[i, 'title']] = episodes_df.loc[i, 'list_of_tokens']

In [226]:
corpus

{'Gem Glow': ['steven',
  'thinks',
  'his',
  'favorite',
  'ice',
  'cream',
  'sandwiches',
  'are',
  'the',
  'trick',
  'to',
  'summoning',
  'his',
  'magic',
  'shield',
  'but',
  'learns',
  'otherwise',
  'when',
  'facing',
  'off',
  'with',
  'an',
  'acidspewing',
  'insect',
  'monster'],
 'Laser Light Cannon': ['a',
  'magical',
  'comet',
  'hurtles',
  'toward',
  'beach',
  'city',
  'and',
  'steven',
  'must',
  'dig',
  'through',
  'his',
  'fathers',
  'collection',
  'of',
  'old',
  'junk',
  'and',
  'the',
  'past',
  'to',
  'find',
  'the',
  'weapon',
  'that',
  'can',
  'save',
  'the',
  'town'],
 'Cheeseburger Backpack': ['a',
  'mission',
  'to',
  'the',
  'lunar',
  'sea',
  'spire',
  'takes',
  'a',
  'treacherous',
  'turn',
  'but',
  'steven',
  'has',
  'packed',
  'his',
  'totally',
  'amazing',
  'cheeseburger',
  'backpack',
  'with',
  'anything',
  'they',
  'could',
  'ever',
  'need'],
 'Together Breakfast': ['steven',
  'tries',
  

In [227]:
# Find most salient

def build_dictionary(document, corpus):
    '''
    Build a dictionary that maps tokens in a specific doc to their
    tf-idf values.

    Inputs:
        - document (list): a list of tokens
        - corpus (dict): maps an identifier to a list of tokens

    Returns: dictionary i.e., {token: tf-idf}
    '''
    token_to_tfidf = {}

    for token in document:
        if token not in token_to_tfidf:
            token_to_tfidf[token] = calculate_tfidf(token, document, corpus)

    return token_to_tfidf


def find_most_salient(corpus, k):
    '''
    Takes a collection of documents and an integer k and returns 
    a dictionary of the k most salient terms, that is, the terms with 
    the highest tf–idf, for each document. 
    
    Inputs:
        - corpus (dict): maps an identifier to a list of tokens
        - k (int): number of terms per document to pull
        
    Returns a dictionary where the key is the identifier, and the 
        value, a list of the k most salient terms
    '''
    
    k_most_salient = {}

    for doc_id, tokens in corpus.items():
        most_salient_by_doc = []

        if tokens:
            token_to_tfidf = build_dictionary(tokens, corpus)
            sorted_list = sort_by_count(token_to_tfidf)

            for token, count in sorted_list[:k]:
                most_salient_by_doc.append(token)

        k_most_salient[doc_id] = most_salient_by_doc

    return k_most_salient


In [228]:
most_salient = find_most_salient(corpus, 3)

In [229]:
most_salient

{'Gem Glow': ['ice', 'sandwiches', 'trick'],
 'Laser Light Cannon': ['comet', 'hurtles', 'toward'],
 'Cheeseburger Backpack': ['treacherous', 'packed', 'totally'],
 'Together Breakfast': ['breakfast', 'tradition', 'instead'],
 'Frybo': ['boardie', 'kid', 'grueling'],
 'Cat Fingers': ['shape', 'master', 'shifting'],
 'Bubble Buddies': ['looks', 'like', 'hero'],
 'Serious Steven': ['prove', 'worthy', 'navigate'],
 'Tiger Millionaire': ['league', 'joins', 'underground'],
 "Steven's Lion": ['quite', 'why', 'makes'],
 'Arcade Mania': ['arcade', 'allure', 'video'],
 'Giant Woman': ['during', 'sky', 'convince'],
 'So Many Birthdays': ['thousands', 'years', 'birthdays'],
 'Lars and the Cool Kids': ['teenage', 'shenanigans', 'hang'],
 'Onion Trade': ['toy', 'trade', 'escalates'],
 'Steven the Sword Fighter': ['art', 'swordfighting', 'holographic'],
 'Lion 2: The Movie': ['movie', 'theater', 'other'],
 'Beach Party': ['pizza', 'battle', 'damages'],
 "Rose's Room": ['desire', 'alonetime', 'every'

In [230]:
## Test with transcripts?
transcripts_df = pd.read_csv(data_pathname + 'transcripts.csv')

In [231]:
transcripts_df.head()

Unnamed: 0,episode,speaker,actions,quote,location,description
0,Gem Glow,,[],,Open Overview of Beach City,
1,Gem Glow,,[],,Trans. Ext. Big Donut,
2,Gem Glow,Steven,[],NOOOOOOOOOOOOO!!,,
3,Gem Glow,,[],,Trans. Int. Big Donut,
4,Gem Glow,Steven,"['looking at an empty freezer', 'grabs Lars ar...",This can't be happening! This has to be a drea...,,


In [232]:
transcripts_df.shape

(30377, 6)

In [233]:
valid_transcripts = transcripts_df.loc[transcripts_df['quote'].notna(), :]

In [234]:
valid_transcripts['list_of_tokens'] = valid_transcripts.apply(lambda row: create_list_tokens(row['quote']), axis = 1)
valid_transcripts['list_of_tokens'].head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  valid_transcripts['list_of_tokens'] = valid_transcripts.apply(lambda row: create_list_tokens(row['quote']), axis = 1)


2                                     [nooooooooooooo]
4    [this, cant, be, happening, this, has, to, be,...
5              [get, off, me, man, im, stocking, here]
6    [im, sorry, steven, i, guess, they, stopped, m...
7    [stopped, making, them, why, in, the, world, w...
Name: list_of_tokens, dtype: object

In [235]:
valid_transcripts.head()

Unnamed: 0,episode,speaker,actions,quote,location,description,list_of_tokens
2,Gem Glow,Steven,[],NOOOOOOOOOOOOO!!,,,[nooooooooooooo]
4,Gem Glow,Steven,"['looking at an empty freezer', 'grabs Lars ar...",This can't be happening! This has to be a drea...,,,"[this, cant, be, happening, this, has, to, be,..."
5,Gem Glow,Lars,['shakes Steven off'],"Get off me man, I'm stocking here!",,,"[get, off, me, man, im, stocking, here]"
6,Gem Glow,Sadie,[],"I'm sorry, Steven. I guess they stopped making...",,,"[im, sorry, steven, i, guess, they, stopped, m..."
7,Gem Glow,Steven,[],Stopped making them?! Why in the world would t...,,,"[stopped, making, them, why, in, the, world, w..."


In [236]:
# NOTE:
# Can't just use the episode name as a key becuase these are not unique
# The transcript corpus has to have one long list of tokens 
# I can create a corpus where each episode is a document
# Alternatively, one corpus per episode, where each document represents a speaker
# Alternatively, one corpus per speaker, where each document represents an episode

# TEST-RUN: Try limiting transcripts to just Gem Glow
# Each speaker will have their own document

gem_glow_corpus = {}
gem_glow_transcripts = valid_transcripts.loc[valid_transcripts['episode'] == 'Gem Glow', :]

gem_glow_transcripts.head()

Unnamed: 0,episode,speaker,actions,quote,location,description,list_of_tokens
2,Gem Glow,Steven,[],NOOOOOOOOOOOOO!!,,,[nooooooooooooo]
4,Gem Glow,Steven,"['looking at an empty freezer', 'grabs Lars ar...",This can't be happening! This has to be a drea...,,,"[this, cant, be, happening, this, has, to, be,..."
5,Gem Glow,Lars,['shakes Steven off'],"Get off me man, I'm stocking here!",,,"[get, off, me, man, im, stocking, here]"
6,Gem Glow,Sadie,[],"I'm sorry, Steven. I guess they stopped making...",,,"[im, sorry, steven, i, guess, they, stopped, m..."
7,Gem Glow,Steven,[],Stopped making them?! Why in the world would t...,,,"[stopped, making, them, why, in, the, world, w..."


In [237]:
speaker_tokens = {}

for i in range(len(gem_glow_transcripts)):
    speaker = gem_glow_transcripts.iloc[i]['speaker']
    tokens = gem_glow_transcripts.iloc[i]['list_of_tokens']
    
    if speaker not in speaker_tokens:
        speaker_tokens[speaker] = tokens
    else:
        speaker_tokens[speaker].extend(tokens)
        
speaker_tokens

{'Steven': ['nooooooooooooo',
  'this',
  'cant',
  'be',
  'happening',
  'this',
  'has',
  'to',
  'be',
  'a',
  'dream',
  'lars',
  'lars',
  'please',
  'tell',
  'me',
  'im',
  'dreaming',
  'stopped',
  'making',
  'them',
  'why',
  'in',
  'the',
  'world',
  'would',
  'they',
  'stop',
  'making',
  'cookie',
  'cats',
  'theyre',
  'only',
  'the',
  'most',
  'scrumptious',
  'and',
  'delicious',
  'icecream',
  'sandwich',
  'ever',
  'made',
  'dont',
  'they',
  'have',
  'laws',
  'for',
  'this',
  'not',
  'lion',
  'lickers',
  'nobody',
  'likes',
  'them',
  'they',
  'dont',
  'even',
  'look',
  'like',
  'lions',
  'kids',
  'these',
  'days',
  'ill',
  'tell',
  'ya',
  'what',
  'thats',
  'not',
  'how',
  'it',
  'works',
  'lars',
  'right',
  'oh',
  'sweet',
  'cookie',
  'cats',
  'with',
  'your',
  'crunchy',
  'cookie',
  'outside',
  'and',
  'your',
  'icy',
  'creamy',
  'insides',
  'you',
  'were',
  'too',
  'good',
  'for',
  'this',
  'w

In [238]:
most_salient_gem_glow = {}
most_salient_gem_glow = find_most_salient(speaker_tokens, 2)

In [239]:
most_salient_gem_glow

{'Steven': ['cookie', 'cat'],
 'Lars': ['stocking', 'tough'],
 'Sadie': ['want', 'freezer'],
 'Amethyst': ['fun', 'youre'],
 'Pearl': ['petals', 'properties'],
 'Garnet': ['weapons', 'means'],
 'Garnet, Pearl & Amethyst': ['no', 'steven']}

In [240]:
def build_corpus(df, index, doc_col, filter_by=None):
    '''
    Takes a pandas dataframe and returns a corpus i.e., a dictionary where the
        key is some identifier and the value is the document

    Inputs:
        - df (pandas DataFrame): dataset with columns for index and document
        - index (str): name of the column to use as key
        - doc_col (str): name of the column containing the document i.e., a list
                         of strings
        - filter_by (tuple of strings): column, value to limit the data by

    Returns a dictionary of documents mapped to their identifier
    '''
    corpus = {}

    if filter_by:
        filter_col, filter_val = limit_by
        df = df.loc[df[filter_col] == filter_val, :]

    for i in range(len(df)):
        id = df.iloc[i][index]
        doc = df.iloc[i][doc_col]

        if id not in corpus:
            corpus[id] = doc
        else:
            corpus[id].extend(doc)

    return corpus


In [241]:
by_episode_corpus = build_corpus(valid_transcripts, 'episode', 'list_of_tokens')

In [242]:
by_episode_corpus

{'Gem Glow': ['nooooooooooooo',
  'this',
  'cant',
  'be',
  'happening',
  'this',
  'has',
  'to',
  'be',
  'a',
  'dream',
  'lars',
  'lars',
  'please',
  'tell',
  'me',
  'im',
  'dreaming',
  'stopped',
  'making',
  'them',
  'why',
  'in',
  'the',
  'world',
  'would',
  'they',
  'stop',
  'making',
  'cookie',
  'cats',
  'theyre',
  'only',
  'the',
  'most',
  'scrumptious',
  'and',
  'delicious',
  'icecream',
  'sandwich',
  'ever',
  'made',
  'dont',
  'they',
  'have',
  'laws',
  'for',
  'this',
  'not',
  'lion',
  'lickers',
  'nobody',
  'likes',
  'them',
  'they',
  'dont',
  'even',
  'look',
  'like',
  'lions',
  'kids',
  'these',
  'days',
  'ill',
  'tell',
  'ya',
  'what',
  'thats',
  'not',
  'how',
  'it',
  'works',
  'lars',
  'right',
  'oh',
  'sweet',
  'cookie',
  'cats',
  'with',
  'your',
  'crunchy',
  'cookie',
  'outside',
  'and',
  'your',
  'icy',
  'creamy',
  'insides',
  'you',
  'were',
  'too',
  'good',
  'for',
  'this',
  

In [243]:
most_salient = {}
most_salient = find_most_salient(by_episode_corpus, 3)

In [244]:
most_salient

{'Gem Glow': ['duper', 'gurgens', 'petals'],
 'Laser Light Cannon': ['su', 'rotates', 'infect'],
 'Cheeseburger Backpack': ['cheeseburger', 'shrimp', 'hamburger'],
 'Together Breakfast': ['doggy', 'balanced', 'apps'],
 'Frybo': ['frybo', 'sock', 'seahorse'],
 'Cat Fingers': ['arp', 'sloop', 'womp'],
 'Bubble Buddies': ['trawler', 'harpoon', 'woooh'],
 'Serious Steven': ['teacups', 'butterflies', 'swarm'],
 'Tiger Millionaire': ['boos', 'loch', 'ness'],
 "Steven's Lion": ['column', 'pregnant', 'wwater'],
 'Arcade Mania': ['mania', 'mutt', 'loses'],
 'Giant Woman': ['opal', 'goat', 'squawking'],
 'So Many Birthdays': ['aging', 'burrito', 'piñata'],
 'Lars and the Cool Kids': ['moss', 'planted', 'lala'],
 'Onion Trade': ['ranger', 'dave', 'replicator'],
 'Steven the Sword Fighter': ['parry', 'thrust', 'grr'],
 'Lion 2: The Movie': ['helicopters', 'missile', 'forehand'],
 'Beach Party': ['pufferfish', 'outfits', 'cheatin'],
 "Rose's Room": ['colonel', 'kernels', 'minigolf'],
 'Coach Steven

In [245]:
# Analyzing most salient by episode alone doesn't seem to make much sense.
by_speaker_corpus = build_corpus(valid_transcripts, 'speaker', 'list_of_tokens')

In [246]:
most_salient_speaker = {}
most_salient_speaker = find_most_salient(by_speaker_corpus, 10)
most_salient_speaker

{'Steven': ['wo',
  'bing',
  'ninja',
  'stack',
  'quiz',
  'miiiiiiind',
  'duper',
  'gurgens',
  'gun',
  'sock'],
 'Lars': ['napalm',
  'bozo',
  'superweird',
  'thisweird',
  'ditch',
  'ahaao',
  'lit',
  'possess',
  'wormhole',
  'deserved'],
 'Sadie': ['shifts',
  'deadend',
  'minimum',
  'wage',
  'whoaohoh',
  'gggghost',
  'alaska',
  'biodegradable',
  'potluuuck',
  'poootluuuck'],
 'Amethyst': ['wshh',
  'ghem',
  'ultra',
  'betty',
  'boringer',
  'bleating',
  'distress',
  'commenting',
  'posture',
  'sync'],
 'Pearl': ['lala',
  'synthetic',
  'fossa',
  'outbursts',
  'hardens',
  'playfully',
  'heehuh',
  'reeks',
  'alloy',
  'tad'],
 'Garnet': ['determined',
  'panels',
  'trigger',
  'gunshow',
  'giggling',
  'anticipation',
  'revive',
  'spoiled',
  'appetite',
  'avenge'],
 'Garnet, Pearl & Amethyst': ['steven', 'no'],
 'Mr. Fryman': ['frybo',
  'valued',
  'ketchup',
  'woahho',
  'ti',
  'ensnares',
  'tendril',
  'peeped',
  'affiliates',
  'lighte

In [251]:
# See how tokens change over episodes and seasons
# Create a document for each speaker, where the corpus is a episode

merged_df = transcripts_df.merge(episodes_df, left_on='episode', right_on='title')
merged_df.head()

Unnamed: 0,episode,speaker,actions,quote,location,description,title,season,num_series,num_season,airdate,summary,list_of_tokens
0,Gem Glow,,[],,Open Overview of Beach City,,Gem Glow,1,1,1,,Steven thinks his favorite ice cream sandwiche...,"[steven, thinks, his, favorite, ice, cream, sa..."
1,Gem Glow,,[],,Trans. Ext. Big Donut,,Gem Glow,1,1,1,,Steven thinks his favorite ice cream sandwiche...,"[steven, thinks, his, favorite, ice, cream, sa..."
2,Gem Glow,Steven,[],NOOOOOOOOOOOOO!!,,,Gem Glow,1,1,1,,Steven thinks his favorite ice cream sandwiche...,"[steven, thinks, his, favorite, ice, cream, sa..."
3,Gem Glow,,[],,Trans. Int. Big Donut,,Gem Glow,1,1,1,,Steven thinks his favorite ice cream sandwiche...,"[steven, thinks, his, favorite, ice, cream, sa..."
4,Gem Glow,Steven,"['looking at an empty freezer', 'grabs Lars ar...",This can't be happening! This has to be a drea...,,,Gem Glow,1,1,1,,Steven thinks his favorite ice cream sandwiche...,"[steven, thinks, his, favorite, ice, cream, sa..."


In [252]:
season = "1"
filtered = merged_df.loc[merged_df['season']==season, :]
filtered.head()

Unnamed: 0,episode,speaker,actions,quote,location,description,title,season,num_series,num_season,airdate,summary,list_of_tokens
0,Gem Glow,,[],,Open Overview of Beach City,,Gem Glow,1,1,1,,Steven thinks his favorite ice cream sandwiche...,"[steven, thinks, his, favorite, ice, cream, sa..."
1,Gem Glow,,[],,Trans. Ext. Big Donut,,Gem Glow,1,1,1,,Steven thinks his favorite ice cream sandwiche...,"[steven, thinks, his, favorite, ice, cream, sa..."
2,Gem Glow,Steven,[],NOOOOOOOOOOOOO!!,,,Gem Glow,1,1,1,,Steven thinks his favorite ice cream sandwiche...,"[steven, thinks, his, favorite, ice, cream, sa..."
3,Gem Glow,,[],,Trans. Int. Big Donut,,Gem Glow,1,1,1,,Steven thinks his favorite ice cream sandwiche...,"[steven, thinks, his, favorite, ice, cream, sa..."
4,Gem Glow,Steven,"['looking at an empty freezer', 'grabs Lars ar...",This can't be happening! This has to be a drea...,,,Gem Glow,1,1,1,,Steven thinks his favorite ice cream sandwiche...,"[steven, thinks, his, favorite, ice, cream, sa..."


In [289]:
all_speakers = transcripts_df.speaker.unique()

In [285]:
test_speaker = all_speakers[7]

In [286]:
test_speaker

'Garnet, Pearl & Amethyst'

In [287]:
set([speaker.strip() for speaker in re.split(', |& ', test_speaker)])

['Garnet', 'Pearl', 'Amethyst']

In [283]:
test_speaker = test_speaker.strip()

In [284]:
test_speaker

'Garnet, Pearl & Amethyst'

In [265]:
clean_speaker_list = test_speaker.lower().strip().split()

In [266]:
speaker_set = set(clean_speaker_list)
speaker_set

{'amethyst', 'garnet', 'pearl'}

In [267]:
all_speakers

array([nan, 'Steven', 'Lars', 'Sadie', 'Amethyst', 'Pearl', 'Garnet',
       'Garnet, Pearl & Amethyst', 'Mr. Fryman', 'Steven & Amethyst',
       'Greg', 'Pearl, Amethyst, and Garnet', 'Jamie', 'Mr. Queasy',
       'Garnet, Amethyst & Pearl', 'Pearl & Amethyst',
       'Amethyst & Garnet', 'Dumb Police', 'Gems', 'Peedee', 'Mr. Smiley',
       'The Frymans', 'Mayor Dewey', 'Steven and Greg', 'Cat Finger',
       'Sadie and Lars', 'Cat Fingers', 'Peedee and Mr. Fryman',
       'Ronaldo', 'Connie', 'Chunk Truck', 'Random Man', 'Kiki',
       'Road Killer', 'Teens of Rage', 'Punch Buddy', 'Meat Beat Mania',
       'Opal', 'Whacker Man', 'Amethyst & Pearl', 'Jenny', 'Buck',
       'Sour Cream', 'Jenny, Sour Cream & Buck', 'Jenny & Buck',
       'G.U.Y.S. Machine', 'Lonely Blade', 'Samurai President',
       'Holo-Pearl', 'Garnet & Amethyst', "Lonely Blade's Brother",
       'Movie Narrator', 'Steven & Connie', 'Ticket Booth Lady', 'Kofi',
       'Nanefua', 'Jenny & Kiki', 'Amethyst & Jenny

In [1]:
cleaned_speakers = []
for speaker in all_speakers:
    if isinstance(speaker, str):
        speaker_set = set([speaker.strip() for speaker in re.split(', |& |and ', speaker) if speaker != ''])
        if speaker_set not in cleaned_speakers:
            cleaned_speakers.append(speaker_set)
        
cleaned_speakers

NameError: name 'all_speakers' is not defined