In [1]:
import os
import re
import json
import pprint
import random

import pandas as pd
from collections import Counter

import spacy
from spacy.tokens import Doc
from spacy.gold import GoldParse
from spacy.language import EntityRecognizer
from spacy.util import minibatch, compounding

from tqdm import tqdm

pp = pprint.PrettyPrinter(indent=4)
nlp = spacy.load("en_core_web_lg") 

if "ner" not in nlp.pipe_names:
    ner = nlp.create_pipe("ner")
    nlp.add_pipe(ner, last=True)
    # otherwise, get it so we can add labels
else:
    ner = nlp.get_pipe("ner")
    
stripPersonnas = True
replace = {
    'rick sanches': 'rick',
    'drunk rick': 'rick',
    'morty smith': 'morty',
    'gromfomite': 'gromflomite',
    'other gromflomite': 'gromflomite',
    'gromflomite captain': 'gromflomite',
    'gromflomite worker': 'gromflomite',
    'gromflomite guard': 'gromflomite',
    'gromflomite office employee': 'gromflomite',
    'rick salseman': 'rick salesman',
    'armed ricks': 'armed rick',
    'all religious mortys': 'religious morty',
    'little tommy': 'tommy',
    'meeseek': 'meeseeks',
    'all the meeseeks': 'meeseeks',
    'all other meeseeks': 'meeseeks',
    'mailmen': 'mailman',
    'gerry': 'jerry'
    
}

ignore = [
    'a mexican armada shows up. with weapons made from two', 
    'pa system', 
    'all classmates except morty',
    'another voice',
    'sexualized s',
    'both']

In [2]:
json_data = None
characters = None
with open('data.json') as json_file:
    json_data = json.load(json_file)
    characters = json_data['characters'].keys()
    
    



In [3]:
def parseLine(characterName, line, file, lastSpeaker):
    non_actions = re.sub(r'[{\W}]', ' ', line.lower())
    non_actions = re.sub(r'[-{2,}]', '', non_actions)
    non_actions = re.sub(r'([a-z])\1{2,}', r'\1', non_actions)
    doc = nlp(line)
    lastToken = None
    index = 0
    foundSpokenTo = None
    
    ens = 'ensquanche' in line
    
    for token in doc:
       
        if lastToken != None and token.text in characters:
            if ens and lastToken != None:
                print(lastToken, token,lastToken.is_punct, token.is_punct)
            if token.is_punct == True and lastToken.text in characters:            
                foundSpokenTo = lastToken
            elif lastToken.is_punct == True and token.text in characters:
                foundSpokenTo = token
                    
        index += 1
        lastToken = token
    
#     print(line)
#     if foundSpokenTo != None:
#         print(characterName, '->', foundSpokenTo.text)
#     else:
#         print(characterName, '->', lastSpeaker)
#     print()
    return [characterName, lastSpeaker]
    


In [4]:
document = {}
files = os.listdir("./parsed")
for filename in files:
    print('Processing: ', filename)
    episodeName = filename[6:len(filename)-4];
    interaction = []
    lines = 0
    with open("./parsed/" + filename, "r") as f:             
        for line in f:            
            lines += 1
    
    with open("./parsed/" + filename, "r") as f:         
        lastSpeaker = ''
        for line in tqdm(f, total=lines): 
            if stripPersonnas == True:
                find = re.search(r"^([a-zA-Z][a-zA-Z \.]+).*\:(.*)", line) 
            else:
                find = re.search(r"^([a-z][a-z0-9 \(\)]+):(.*)", line)
            if find:
                name = find.group(1).strip()
                text = find.group(2).strip()
                if name not in ignore:
                    try:
                        actualName = replace[name]
                    except:
                        actualName = name
                    
                interaction.append(parseLine(actualName, text, filename[6:len(filename)-4], lastSpeaker))
                lastSpeaker = name
    document[episodeName] = interaction

  7%|▋         | 18/245 [00:00<00:01, 171.42it/s]

Processing:  011 - Ricksy Business.txt


100%|██████████| 245/245 [00:01<00:00, 160.41it/s]
  7%|▋         | 19/262 [00:00<00:01, 181.92it/s]

Processing:  021 - The Wedding Squanchers.txt


 21%|██        | 54/262 [00:00<00:01, 176.52it/s]

, summer True False
be beth False False


100%|██████████| 262/262 [00:01<00:00, 177.87it/s]
  6%|▌         | 19/319 [00:00<00:01, 187.64it/s]

Processing:  015 - Total Rickall.txt


100%|██████████| 319/319 [00:01<00:00, 189.78it/s]
  5%|▍         | 16/347 [00:00<00:02, 157.69it/s]

Processing:  020 - Look Who's Purging Now.txt


100%|██████████| 347/347 [00:01<00:00, 175.61it/s]
  8%|▊         | 17/224 [00:00<00:01, 163.86it/s]

Processing:  029 - Morty's Mind Blowers.txt


100%|██████████| 224/224 [00:01<00:00, 153.11it/s]
  6%|▌         | 17/287 [00:00<00:01, 166.65it/s]

Processing:  001 - Pilot.txt


100%|██████████| 287/287 [00:01<00:00, 147.43it/s]
  7%|▋         | 23/340 [00:00<00:01, 223.88it/s]

Processing:  010 - Close Rick-counters of the Rick Kind.txt


100%|██████████| 340/340 [00:01<00:00, 206.88it/s]
  5%|▌         | 15/287 [00:00<00:01, 147.11it/s]

Processing:  026 - The Whirly Dirly Conspiracy.txt


100%|██████████| 287/287 [00:01<00:00, 167.74it/s]
  7%|▋         | 26/358 [00:00<00:01, 246.64it/s]

Processing:  002 - Lawnmower Dog.txt


100%|██████████| 358/358 [00:01<00:00, 213.34it/s]
  7%|▋         | 29/411 [00:00<00:01, 281.67it/s]

Processing:  016 - Get Schwifty.txt


100%|██████████| 411/411 [00:01<00:00, 238.70it/s]
  8%|▊         | 19/235 [00:00<00:01, 185.17it/s]

Processing:  024 - Pickle Rick.txt


100%|██████████| 235/235 [00:01<00:00, 173.15it/s]
  4%|▍         | 16/365 [00:00<00:02, 157.02it/s]

Processing:  022 - The Rickshank Redemption.txt


100%|██████████| 365/365 [00:01<00:00, 183.01it/s]
  6%|▋         | 36/559 [00:00<00:01, 355.51it/s]

Processing:  007 - Raising Gazorpazorp.txt


100%|██████████| 559/559 [00:00<00:00, 693.18it/s]
  6%|▌         | 16/270 [00:00<00:01, 156.82it/s]

Processing:  018 - Big Trouble In Little Sanchez.txt


100%|██████████| 270/270 [00:01<00:00, 177.09it/s]
  8%|▊         | 26/337 [00:00<00:01, 258.50it/s]

Processing:  014 - Auto Erotic Assimilation.txt


100%|██████████| 337/337 [00:01<00:00, 187.04it/s]
  6%|▌         | 18/323 [00:00<00:01, 170.68it/s]

Processing:  030 - The ABCs of Beth.txt


100%|██████████| 323/323 [00:01<00:00, 173.68it/s]
 21%|██        | 24/113 [00:00<00:00, 230.78it/s]

Processing:  006 - Rick Potion 9.txt


100%|██████████| 113/113 [00:00<00:00, 218.16it/s]
  5%|▍         | 21/432 [00:00<00:02, 203.76it/s]

Processing:  008 - Rixty Minutes.txt


100%|██████████| 432/432 [00:00<00:00, 961.64it/s]
  6%|▌         | 25/444 [00:00<00:01, 241.98it/s]

Processing:  003 - Anatomy Park.txt


100%|██████████| 444/444 [00:01<00:00, 256.18it/s]
  7%|▋         | 26/378 [00:00<00:01, 251.26it/s]

Processing:  009 - Something Ricked This Way Comes.txt


100%|██████████| 378/378 [00:01<00:00, 216.84it/s]
  6%|▌         | 22/377 [00:00<00:01, 209.35it/s]

Processing:  013 - Mortynight Run.txt


100%|██████████| 377/377 [00:01<00:00, 230.70it/s]
  9%|▉         | 29/321 [00:00<00:01, 273.90it/s]

Processing:  005 - Meeseeks and Destroy.txt


100%|██████████| 321/321 [00:01<00:00, 205.89it/s]
  7%|▋         | 20/295 [00:00<00:01, 199.07it/s]

Processing:  004 - M. Night Shaym-Aliens!.txt


100%|██████████| 295/295 [00:01<00:00, 201.63it/s]
  6%|▌         | 21/376 [00:00<00:01, 206.11it/s]

Processing:  023 - Rickmancing the Stone.txt


100%|██████████| 376/376 [00:01<00:00, 206.37it/s]
  6%|▌         | 24/411 [00:00<00:01, 223.59it/s]

Processing:  012 - A Rickle in Time.txt


100%|██████████| 411/411 [00:02<00:00, 196.93it/s]


In [5]:
with open('interaction.json', 'w') as fp:
    json.dump(document, fp)