In [1]:
import os
import re
import json
import pandas as pd
from collections import Counter
import spacy
from spacy.tokens import Doc

from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk.corpus import stopwords
from nltk.corpus import sentiwordnet as swn

sentiment_analyzer = SentimentIntensityAnalyzer()
def polarity_scores(doc):
    return sentiment_analyzer.polarity_scores(doc.text)
Doc.set_extension('polarity_scores', getter=polarity_scores)
nlp = spacy.load("en_core_web_lg")    

from tqdm import tqdm


In [2]:
convTimelineCell = 42
minLines = 50
stripPersonnas = True
main = ['jerry', 'summer', 'beth', 'rick', 'morty', 'pickle rick', 'unity']
keysToRemove = ['fullText', 'pron', 'lines']
replace = {
    'rick sanches': 'rick',
    'drunk rick': 'rick',
    'morty smith': 'morty',
    'gromfomite': 'gromflomite',
    'other gromflomite': 'gromflomite',
    'gromflomite captain': 'gromflomite',
    'gromflomite worker': 'gromflomite',
    'gromflomite guard': 'gromflomite',
    'gromflomite office employee': 'gromflomite',
    'rick salseman': 'rick salesman',
    'armed ricks': 'armed rick',
    'all religious mortys': 'religious morty',
    'little tommy': 'tommy',
    'meeseek': 'meeseeks',
    'all the meeseeks': 'meeseeks',
    'all other meeseeks': 'meeseeks',
    'mailmen': 'mailman',
    'gerry': 'jerry'
    
}
ignore = [
    'a mexican armada shows up. with weapons made from two', 
    'pa system', 
    'all classmates except morty',
    'another voice',
    'sexualized s',
    'both']

#main = ['beth']
data = {
    'characters': {},
}

In [3]:

 
def do_nlp(character, line, file, section):
    non_actions = re.sub(r'\[.*\]|\(.*\)', '', line)
    non_actions = re.sub(r'[!\.\?\;\"\-\*]', ' ', non_actions)
    non_actions = re.sub(r'([a-z])\1{2,}',r'\1', non_actions)

    text = '';
    
    doc = nlp(non_actions)
    
    for token in doc:
        if (token.lemma_ != '-PRON-'):        
            text = text + token.lemma_ + ' '        
    
    character['fullText'] += non_actions    
    score = doc._.polarity_scores['compound']

    actualVal =  character['sentiment'][section] if character['sentiment'][section] != False else 0
    character['sentiment'][section] = (actualVal + score) / 2
    
    actualVal =  character['episodes'][file]['sentiment'][section] if character['episodes'][file]['sentiment'][section] != False else 0
    character['episodes'][file]['sentiment'][section] = (actualVal + score) / 2
    
    if abs(score)>.2:
        character['episodes'][file]['sentimentLine'][section] += non_actions;

    
#     s = flair.data.Sentence(line)
#     flair_sentiment.predict(s)
    
#     try:
#         score = s.labels[0].to_dict() 
#         score = score['confidence'] if score['value'] == 'POSITIVE' else -score['confidence']
        
#         actualVal =  character['sentiment'][section] if character['sentiment'][section] != False else 0
#         character['sentiment'][section] = (actualVal + score) / 2
        
#         actualVal =  character['episodes'][file]['sentiment'][section] if character['episodes'][file]['sentiment'][section] != False else 0
#         character['episodes'][file]['sentiment'][section] = (actualVal + score) / 2
#     except:
#         pass
#         print('Error')

In [4]:
def getCharacter(characterName):
    characters = data['characters']
    try:
        character = characters[characterName]
    except KeyError:
        character = characters[characterName] = {
            'linesCount': 0, 
            'lines': [], 
            'time': [0] * convTimelineCell, 
            
            'episodes': {},
            'pron': {},
            'fullText': ''
        }
        if characterName in main:
            character['sentiment'] = ([False] * convTimelineCell)
            character['sentimentLine'] = ([''] * convTimelineCell)

    return character

def parseLine(characterName, line, file, section):
    character = getCharacter(characterName)    
    character['linesCount'] += 1    
    episodes = character['episodes']
    
    try: 
        episode = episodes[file]
    except:
        episode = episodes[file] = {
            'linesCount': 0,
            'time': [0]*convTimelineCell,
            
        }
        if characterName in main:
            episode['sentiment'] = ([False] * convTimelineCell)
            episode['sentimentLine'] = ([''] * convTimelineCell)
        
    character['time'][section] += 1
    episode['time'][section] += 1
    episode['linesCount'] += 1
    
    if characterName in main:
        do_nlp(character, line, file, section)

In [5]:
files = os.listdir("./parsed")
for filename in files:
    print('Processing: ', filename)
    lines = 0
    with open("./parsed/" + filename, "r") as f:             
        for line in f:            
            lines += 1
            
    with open("./parsed/" + filename, "r") as f:                    
        index = 0
        for line in tqdm(f, total=lines): 
            if stripPersonnas == True:
                find = re.search(r"^([a-zA-Z][a-zA-Z \.]+).*\:(.*)", line) 
            else:
                find = re.search(r"^([a-z][a-z0-9 \(\)]+):(.*)", line)

            if find:
                name = find.group(1).strip()
                text = find.group(2).strip()
                if name not in ignore:
                    try:
                        actualName = replace[name]
                    except:
                        actualName = name
                    
                    parseLine(actualName, text, filename[6:len(filename)-4], int(index * convTimelineCell / lines))
            index += 1

  8%|▊         | 19/245 [00:00<00:01, 185.56it/s]

Processing:  011 - Ricksy Business.txt


100%|██████████| 245/245 [00:01<00:00, 237.93it/s]
 10%|█         | 27/262 [00:00<00:00, 260.45it/s]

Processing:  021 - The Wedding Squanchers.txt


100%|██████████| 262/262 [00:01<00:00, 226.31it/s]
  7%|▋         | 23/319 [00:00<00:01, 228.02it/s]

Processing:  015 - Total Rickall.txt


100%|██████████| 319/319 [00:01<00:00, 276.41it/s]
  5%|▌         | 19/347 [00:00<00:01, 187.61it/s]

Processing:  020 - Look Who's Purging Now.txt


100%|██████████| 347/347 [00:01<00:00, 227.04it/s]
  6%|▋         | 14/224 [00:00<00:01, 137.95it/s]

Processing:  029 - Morty's Mind Blowers.txt


100%|██████████| 224/224 [00:01<00:00, 163.46it/s]
  6%|▋         | 18/287 [00:00<00:01, 177.91it/s]

Processing:  001 - Pilot.txt


100%|██████████| 287/287 [00:01<00:00, 196.86it/s]
  7%|▋         | 24/340 [00:00<00:01, 225.64it/s]

Processing:  010 - Close Rick-counters of the Rick Kind.txt


100%|██████████| 340/340 [00:01<00:00, 311.96it/s]
  6%|▌         | 16/287 [00:00<00:01, 151.34it/s]

Processing:  026 - The Whirly Dirly Conspiracy.txt


100%|██████████| 287/287 [00:01<00:00, 187.47it/s]
  7%|▋         | 26/358 [00:00<00:01, 247.80it/s]

Processing:  002 - Lawnmower Dog.txt


100%|██████████| 358/358 [00:01<00:00, 305.54it/s]
 13%|█▎        | 53/411 [00:00<00:00, 516.63it/s]

Processing:  016 - Get Schwifty.txt


100%|██████████| 411/411 [00:00<00:00, 525.25it/s]
  8%|▊         | 19/235 [00:00<00:01, 188.49it/s]

Processing:  024 - Pickle Rick.txt


100%|██████████| 235/235 [00:00<00:00, 287.52it/s]
  8%|▊         | 31/365 [00:00<00:01, 308.99it/s]

Processing:  022 - The Rickshank Redemption.txt


100%|██████████| 365/365 [00:01<00:00, 313.39it/s]
  8%|▊         | 44/559 [00:00<00:01, 428.91it/s]

Processing:  007 - Raising Gazorpazorp.txt


100%|██████████| 559/559 [00:00<00:00, 758.61it/s]
  6%|▌         | 16/270 [00:00<00:01, 158.06it/s]

Processing:  018 - Big Trouble In Little Sanchez.txt


100%|██████████| 270/270 [00:01<00:00, 264.89it/s]
 11%|█         | 37/337 [00:00<00:00, 353.51it/s]

Processing:  014 - Auto Erotic Assimilation.txt


100%|██████████| 337/337 [00:01<00:00, 230.62it/s]
  7%|▋         | 21/323 [00:00<00:01, 202.55it/s]

Processing:  030 - The ABCs of Beth.txt


100%|██████████| 323/323 [00:01<00:00, 221.73it/s]
 35%|███▌      | 40/113 [00:00<00:00, 374.76it/s]

Processing:  006 - Rick Potion 9.txt


100%|██████████| 113/113 [00:00<00:00, 343.32it/s]
  6%|▌         | 25/432 [00:00<00:01, 245.19it/s]

Processing:  008 - Rixty Minutes.txt


100%|██████████| 432/432 [00:00<00:00, 1491.65it/s]
  6%|▌         | 26/444 [00:00<00:01, 254.72it/s]

Processing:  003 - Anatomy Park.txt


100%|██████████| 444/444 [00:00<00:00, 485.18it/s]
  8%|▊         | 31/378 [00:00<00:01, 292.31it/s]

Processing:  009 - Something Ricked This Way Comes.txt


100%|██████████| 378/378 [00:01<00:00, 296.48it/s]
  6%|▌         | 22/377 [00:00<00:01, 209.98it/s]

Processing:  013 - Mortynight Run.txt


100%|██████████| 377/377 [00:00<00:00, 378.75it/s]
  9%|▊         | 28/321 [00:00<00:01, 268.76it/s]

Processing:  005 - Meeseeks and Destroy.txt


100%|██████████| 321/321 [00:01<00:00, 313.42it/s]
  9%|▉         | 27/295 [00:00<00:01, 262.53it/s]

Processing:  004 - M. Night Shaym-Aliens!.txt


100%|██████████| 295/295 [00:01<00:00, 285.30it/s]
  6%|▌         | 21/376 [00:00<00:01, 201.13it/s]

Processing:  023 - Rickmancing the Stone.txt


100%|██████████| 376/376 [00:01<00:00, 293.44it/s]
  4%|▍         | 18/411 [00:00<00:02, 177.40it/s]

Processing:  012 - A Rickle in Time.txt


100%|██████████| 411/411 [00:01<00:00, 312.72it/s]


In [6]:
# toRemove = []
# characters = data['characters']
# data['allCharacterNames'] = set(characters.keys())
# for key in characters.keys():
#     count = characters[key]['linesCount']
#     if count < minLines:
#         toRemove.append(key)
        
# for keyToRemove in toRemove:
#     del characters[keyToRemove]
    
# print(characters.keys())

In [7]:
for key in main:
    try:
        character = data['characters'][key]
        full_text = character['fullText']
        doc = nlp(full_text)
        nouns = [token.text for token in doc if token.is_stop != True and token.is_punct != True and token.pos_ == "NOUN"]
        word_freq = Counter(nouns)
        character['common_words'] = word_freq.most_common(50)    
        print('-'.join(['-']*10))
        print(key)
        print(character['common_words'])
    except KeyError:
        pass

for key in data['characters'].keys():
    
    character = data['characters'][key]
    for keyToRemove in keysToRemove:
        try:
            del character[keyToRemove]
        except KeyError:
            pass
    episodes = character['episodes']
    
    for episode in episodes.keys():
        print(episode)
        for keyToRemove in keysToRemove:
            try:
                del character['episodes'][episode][keyToRemove]
            except KeyError:
                pass

    
    
    

-------------------
jerry
[('morty', 24), ('planet', 23), ('guys', 20), ('son', 19), ('thing', 18), ('summer', 18), ('time', 18), ('family', 16), ('father', 16), ('rick', 13), ('jerry', 12), ('house', 11), ('science', 11), ('way', 10), ('life', 8), ('honey', 8), ('wife', 8), ('people', 7), ('fun', 7), ('hell', 7), ('gary', 7), ('guy', 7), ('christmas', 7), ('kids', 7), ('things', 7), ('pluto', 7), ('idea', 6), ('school', 6), ('friend', 6), ('head', 6), ('dad', 6), ('man', 6), ('apples', 6), ('deer', 6), ('tv', 5), ('device', 5), ('job', 5), ('love', 5), ('milk', 5), ('home', 5), ('face', 5), ('kind', 5), ('dog', 5), ('mom', 5), ('cloud', 4), ('atlas', 4), ('tru', 4), ('surgeon', 4), ('game', 4), ('movie', 4)]
-------------------
summer
[('dad', 39), ('grandpa', 39), ('morty', 25), ('rick', 19), ('mom', 12), ('guys', 11), ('people', 8), ('head', 7), ('time', 7), ('unity', 7), ('drum', 7), ('pants', 6), ('school', 6), ('place', 6), ('parents', 5), ('thing', 5), ('family', 5), ('god', 5),

In [8]:
with open('data.json', 'w') as fp:
    json.dump(data, fp)