In [1]:
# Import dependencies
import pandas as pd
import numpy as np

In [2]:
# Read in the Scripts CSV file
cleaned_df = pd.read_csv('../rawData/cleaned_scripts.csv')

# Drop the "Unnamed: 0" and "Unnamed: 0.1" columns.
# The columns are named through the `columns=` keyword because passing the axis
# positionally (drop(label, 1)) is rejected by pandas 2.0 and later.
cleaned_df = cleaned_df.drop(columns=["Unnamed: 0", "Unnamed: 0.1"])
cleaned_df.head()

Unnamed: 0,Character,Dialogue,EpisodeNo,SEID,Season,Sentiment
0,JERRY,Do you know what this is all about? Do you kno...,1,S01E01,1,0.9029
1,JERRY,"(pointing at Georges shirt) See, to me, that b...",1,S01E01,1,-0.4389
2,GEORGE,Are you through?,1,S01E01,1,0.0
3,JERRY,"You do of course try on, when you buy?",1,S01E01,1,0.0
4,GEORGE,"Yes, it was purple, I liked it, I dont actuall...",1,S01E01,1,0.6705


In [3]:
# Create variables for WordCount and LineCount
# LineCount is easy - just make it equal to 1, then when we group by episode, take the sum

In [4]:
# Every row is one line of dialogue, so each row gets a LineCount of 1;
# summing this column within a group then gives the number of lines.
cleaned_df["LineCount"] = np.ones(len(cleaned_df), dtype=int)
cleaned_df.head()

Unnamed: 0,Character,Dialogue,EpisodeNo,SEID,Season,Sentiment,LineCount
0,JERRY,Do you know what this is all about? Do you kno...,1,S01E01,1,0.9029,1
1,JERRY,"(pointing at Georges shirt) See, to me, that b...",1,S01E01,1,-0.4389,1
2,GEORGE,Are you through?,1,S01E01,1,0.0,1
3,JERRY,"You do of course try on, when you buy?",1,S01E01,1,0.0,1
4,GEORGE,"Yes, it was purple, I liked it, I dont actuall...",1,S01E01,1,0.6705,1


In [5]:
# Function for counting the number of words in one line of dialogue
def word_count(text):
    """Return the number of whitespace-separated words in ``text``.

    Parameters
    ----------
    text : str
        The line of dialogue to count.

    Returns
    -------
    int
        Number of tokens produced by ``text.split()``; 0 for an empty or
        whitespace-only string.

    Raises
    ------
    AttributeError
        If ``text`` has no ``split`` method (for example, it is not a string).
    """
    # split() with no argument splits on any run of whitespace and ignores
    # leading/trailing whitespace, so the token count is the word count.
    return len(text.split())

In [10]:
# WordCount -- count the words in every row's Dialogue

# Collected in a list that is later added to the dataframe as a column
WordCount = []

total_rows = cleaned_df.shape[0]

# Iterate over the column's values in row order. Looking each value up with
# cleaned_df["Dialogue"][i] is a label lookup, which only matches row order
# while the dataframe still has its default 0..n-1 index.
for i, dialogue in enumerate(cleaned_df["Dialogue"]):

    # A missing Dialogue counts as zero words. It has to be checked explicitly:
    # str() turns a missing value into the text "nan", which would be counted
    # as one word and would never reach an error-based fallback.
    if pd.isna(dialogue):
        count = 0
    else:
        # str() keeps any non-text value countable, as before.
        # Lower-casing is not needed because case does not affect the count.
        count = word_count(str(dialogue))

    # Append count to WordCount list
    WordCount.append(count)

    #########################
    # Print notifications to make sure the code is running
    #########################

    # Report progress once every 10,000 rows
    if i % 10000 == 0:
        perc_complete = round(i / total_rows * 100, 0)
        print("Percent Complete: " + str(perc_complete) + "%")

    # Announce completion on the last row
    if i == (total_rows - 1):
        print("-----------------------------------")
        print("Loop Complete!")

Percent Complete: 0.0%
Percent Complete: 18.0%
Percent Complete: 37.0%
Percent Complete: 55.0%
Percent Complete: 73.0%
Percent Complete: 92.0%
-----------------------------------
Loop Complete!


In [11]:
# Attach the per-row word counts to the dataframe as the "WordCount" column.
# The counts are in row order, so they are aligned to the frame's own index.
cleaned_df["WordCount"] = pd.Series(WordCount, index=cleaned_df.index)
cleaned_df.tail()

Unnamed: 0,Character,Dialogue,EpisodeNo,SEID,Season,Sentiment,LineCount,WordCount
54611,JERRY,Grand theft auto - don't steal any of my jokes.,23,S09E23,9,0.7669,1,10
54612,OTHER,You suck - I'm gonna cut you.,23,S09E23,9,-0.6124,1,7
54613,JERRY,"Hey, I don't come down to where you work, and ...",23,S09E23,9,0.4939,1,18
54614,OTHER,"Alright, Seinfeld, that's it. Let's go. Come on.",23,S09E23,9,0.25,1,8
54615,JERRY,"Alright, hey, you've been great! See you in th...",23,S09E23,9,0.75,1,10


In [12]:
# Weighted sentiment score for each line: Sentiment x WordCount.
# Summing this per group and dividing by the group's total WordCount later
# gives a word-weighted average sentiment.
cleaned_df["WtdSentiment"] = cleaned_df["Sentiment"].mul(cleaned_df["WordCount"])
cleaned_df.tail(10)

Unnamed: 0,Character,Dialogue,EpisodeNo,SEID,Season,Sentiment,LineCount,WordCount,WtdSentiment
54606,OTHER,I am.,23,S09E23,9,0.0,1,2,0.0
54607,JERRY,I'll talk slower. I'm kidding - I love Cellblo...,23,S09E23,9,0.8316,1,22,18.2952
54608,OTHER,Murder one.,23,S09E23,9,-0.6908,1,2,-1.3816
54609,JERRY,"Murder one? Oooooo, watch out everybody. Bette...",23,S09E23,9,0.1926,1,26,5.0076
54610,OTHER,Grand theft auto.,23,S09E23,9,0.4588,1,3,1.3764
54611,JERRY,Grand theft auto - don't steal any of my jokes.,23,S09E23,9,0.7669,1,10,7.669
54612,OTHER,You suck - I'm gonna cut you.,23,S09E23,9,-0.6124,1,7,-4.2868
54613,JERRY,"Hey, I don't come down to where you work, and ...",23,S09E23,9,0.4939,1,18,8.8902
54614,OTHER,"Alright, Seinfeld, that's it. Let's go. Come on.",23,S09E23,9,0.25,1,8,2.0
54615,JERRY,"Alright, hey, you've been great! See you in th...",23,S09E23,9,0.75,1,10,7.5


In [None]:
##### Create columns for Line Count, Word Count and Wtd Sentiment for each character / character group
# 1) Main Characters
# 2) Jerry
# 3) George
# 4) Kramer
# 5) Elaine
# 6) Secondary Characters
# 7) Other Characters

In [None]:
# Use loc to separate out each CHARACTER into separate dfs

In [22]:
# Names of the four main characters, as they appear in the "Character" column
mainChars = [
    "JERRY",
    "GEORGE",
    "KRAMER",
    "ELAINE",
]

In [34]:
# Create separate dataframes

# One dataframe per main character
jerry_df = cleaned_df.loc[cleaned_df["Character"] == "JERRY", :]
george_df = cleaned_df.loc[cleaned_df["Character"] == "GEORGE", :]
kramer_df = cleaned_df.loc[cleaned_df["Character"] == "KRAMER", :]
elaine_df = cleaned_df.loc[cleaned_df["Character"] == "ELAINE", :]

# All main characters together. The names come from the mainChars list so the
# definition of "main character" lives in one place instead of being repeated.
mainChars_df = cleaned_df.loc[cleaned_df["Character"].isin(mainChars), :]

# Rows whose Character value is the "SECONDARY" / "OTHER" label
secondary_df = cleaned_df.loc[cleaned_df["Character"] == "SECONDARY", :]
other_df = cleaned_df.loc[cleaned_df["Character"] == "OTHER", :]

In [None]:
# Group dataframes (including the original cleaned_df) by Episode

In [38]:
# Preview the first five rows, now including LineCount, WordCount and WtdSentiment
cleaned_df.head(5)

Unnamed: 0,Character,Dialogue,EpisodeNo,SEID,Season,Sentiment,LineCount,WordCount,WtdSentiment
0,JERRY,Do you know what this is all about? Do you kno...,1,S01E01,1,0.9029,1,189,170.6481
1,JERRY,"(pointing at Georges shirt) See, to me, that b...",1,S01E01,1,-0.4389,1,41,-17.9949
2,GEORGE,Are you through?,1,S01E01,1,0.0,1,3,0.0
3,JERRY,"You do of course try on, when you buy?",1,S01E01,1,0.0,1,9,0.0
4,GEORGE,"Yes, it was purple, I liked it, I dont actuall...",1,S01E01,1,0.6705,1,14,9.387


In [49]:
# Group by SEID column and sum the numeric columns within each episode.
# numeric_only=True keeps the text columns (Character, Dialogue) out of the
# result; without it, newer pandas versions "sum" strings by concatenating them.
overall_grp = cleaned_df.groupby(by="SEID").sum(numeric_only=True)

# Re-create columns for EpisodeNo, Season and Sentiment.
# EpisodeNo and Season are assumed constant within a SEID, so each sum is
# value * number of lines; dividing by LineCount recovers the value (as a float).
overall_grp["EpisodeNo"] = overall_grp["EpisodeNo"] / overall_grp["LineCount"]
overall_grp["Season"] = overall_grp["Season"] / overall_grp["LineCount"]

# Episode sentiment is the word-weighted average of the line sentiments:
# sum(Sentiment * WordCount) / sum(WordCount)
overall_grp["Sentiment"] = overall_grp["WtdSentiment"] / overall_grp["WordCount"]

# Drop column for WtdSentiment since we no longer need it.
# The `columns=` keyword is used because pandas 2.0 and later reject a
# positional axis argument.
overall_grp = overall_grp.drop(columns="WtdSentiment")

# Rename Sentiment, LineCount, and WordCount variables to have "Total" at the end of them
# (index=str maps the SEID index labels through str, as before)
overall_grp = overall_grp.rename(
    index=str,
    columns={
        "Sentiment": "SentimentTotal",
        "LineCount": "LineCountTotal",
        "WordCount": "WordCountTotal",
    },
)

overall_grp.tail()

Unnamed: 0_level_0,EpisodeNo,Season,SentimentTotal,LineCountTotal,WordCountTotal
SEID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
S09E17,17.0,9.0,0.050767,301,3074
S09E18,18.0,9.0,0.08521,299,2697
S09E19,19.0,9.0,0.056641,364,2740
S09E20,20.0,9.0,0.072543,316,2784
S09E23,23.0,9.0,0.035249,713,6986
