# Organizing parts of speech
This notebook identifies the part of speech (pos) of each word spoken by Jerry, George, Kramer, and Elaine. The words are then grouped by pos tag and saved in one dataframe per character.

## Libraries

In [2]:
#importing libraries
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize, TweetTokenizer, PunktSentenceTokenizer
import pandas as pd
import numpy as np
import pickle

## Sentence tokenizing

In [3]:
#loading the parsed script
#NOTE: despite the ".csv" extension, the file is read with read_pickle, so it
#must be a pickled pandas object rather than comma-separated text.
#Unpickling can execute arbitrary code, so only load files from a trusted source.
script_df = pd.read_pickle("Lines.csv")

In [39]:
#defining sentence tokenizer for each character
def sent_tokenize(char):
    """Split every line spoken by ``char`` into sentences.

    NOTE: this name shadows ``nltk.tokenize.sent_tokenize`` imported at the
    top of the notebook; it is kept because later cells call it by this name.

    Parameters
    ----------
    char : str
        Speaker label compared against the first column of the global
        ``script_df`` (e.g. ``"JERRY"``).

    Returns
    -------
    list of str
        Sentences from all matching rows, in the row order of ``script_df``.
        Empty when no row matches ``char``.
    """
    #pulling all the lines pertaining to the specified character in one
    #vectorized pass; columns are addressed by position (first column is
    #compared with `char`, second column holds the text that gets tokenized)
    is_char = (script_df.iloc[:, 0] == char).to_numpy()
    lines = script_df.iloc[is_char, 1].tolist()

    #nothing to train on or tokenize (indexing lines[0] used to raise IndexError)
    if not lines:
        return []

    #training the tokenizer on everything the character says; training on
    #the first line alone gives Punkt almost no text to learn from
    tokenizer = PunktSentenceTokenizer("\n".join(lines))

    #tokenizing each line and adding all the sentences to a list
    sent = []
    for line in lines:
        sent += tokenizer.tokenize(line)
    return sent

In [8]:
#tokenizing the characters
#one list of sentences per main character, unpacked in the order the names are listed
jerry_sent, elaine_sent, george_sent, kramer_sent = (
    sent_tokenize(name) for name in ("JERRY", "ELAINE", "GEORGE", "KRAMER")
)

## Tagging the words

In [22]:
#tagging each word for each character's lines with its associated part-of-speech
def tagging(char_sent):
    """Tag every word in a character's sentences with its part of speech.

    Parameters
    ----------
    char_sent : iterable of str
        Sentences spoken by one character.

    Returns
    -------
    words_tagged : list
        The entries returned by ``nltk.pos_tag`` for every sentence,
        concatenated in sentence order. Each entry is indexed as
        ``entry[0]`` (word) and ``entry[1]`` (tag).
    pos : list
        The distinct tags found in ``words_tagged``, in order of first
        appearance.
    """
    #tokenizing the words in each sentence,
    #tagging them with pos,
    #and adding the result to a list
    words_tagged = []
    for sentence in char_sent:
        words_tagged += nltk.pos_tag(word_tokenize(sentence))

    #creating a list of the types of pos used in the lines.
    #This must be derived from this call's own result: reading a
    #notebook-level variable here would either raise NameError on a fresh
    #kernel or report another character's tags.
    pos = []
    seen = set()
    for entry in words_tagged:
        tag = entry[1]
        if tag not in seen:
            seen.add(tag)
            pos.append(tag)
    return words_tagged, pos


## Organizing pos

In [28]:
#organizing the words in the lines based on their type of pos
def pos_organizer(tagged,pos):
    """Group tagged words by part-of-speech tag.

    Parameters
    ----------
    tagged : sequence
        Entries indexed as ``entry[0]`` (word) and ``entry[1]`` (tag).
    pos : sequence
        Tags to collect words for.

    Returns
    -------
    list of list
        One inner list per entry of ``pos``, in the same order, holding the
        words of ``tagged`` whose tag equals that entry (in their original
        order). A tag with no matching word yields an empty list.
    """
    #for each requested tag, keep the words carrying exactly that tag
    return [
        [entry[0] for entry in tagged if entry[1] == wanted_tag]
        for wanted_tag in pos
    ]

In [32]:
#tagging, organizing, and tabulating the pos for one character;
#the same pipeline is applied to each of the four characters below
def _build_pos_table(char_sent):
    """Run the tag -> organize -> dataframe pipeline on one character's sentences.

    Parameters
    ----------
    char_sent : list of str
        Sentences spoken by one character.

    Returns
    -------
    tuple
        ``(tagged, tags, organized, frame)``: the output of ``tagging``
        (tagged words and tag list), the output of ``pos_organizer``, and a
        dataframe with one column per tag holding the words for that tag.
    """
    #creating tagged sentences
    tagged, tags = tagging(char_sent)

    #organizing pos
    organized = pos_organizer(tagged, tags)

    #creating pos dataframe: pos_organizer yields one list per tag, so the
    #frame is transposed to get one column per tag before naming the columns
    frame = pd.DataFrame(organized).T
    frame.columns = tags
    return tagged, tags, organized, frame

#building the tagged words, pos lists, and pos dataframe for each character
#(`pos` is reassigned on every line and ends up holding Kramer's tag list)
jerry_sent_tagged, pos, jerry_pos_org, jerry_pos_df = _build_pos_table(jerry_sent)
george_sent_tagged, pos, george_pos_org, george_pos_df = _build_pos_table(george_sent)
elaine_sent_tagged, pos, elaine_pos_org, elaine_pos_df = _build_pos_table(elaine_sent)
kramer_sent_tagged, pos, kramer_pos_org, kramer_pos_df = _build_pos_table(kramer_sent)


## Saving dataframes

In [38]:
#writing each character's pos dataframe to its own file
#(the files are pickles even though the paths end in ".csv")
for _frame, _path in (
    (jerry_pos_df, 'pos/jerry_pos.csv'),
    (elaine_pos_df, 'pos/elaine_pos.csv'),
    (george_pos_df, 'pos/george_pos.csv'),
    (kramer_pos_df, 'pos/kramer_pos.csv'),
):
    _frame.to_pickle(_path)