In [1]:
# Import dependencies
import pandas as pd
import numpy as np

In [2]:
# Read in the Scripts CSV file
cleaned_df = pd.read_csv('../rawData/cleaned_scripts.csv')

# Drop the "Unnamed: 0" and "Unnamed: 0.1" columns
# (presumably row indexes written out by earlier to_csv calls -- verify upstream).
# `columns=` is used because newer pandas versions no longer accept the axis
# as a second positional argument to drop().
cleaned_df = cleaned_df.drop(columns=["Unnamed: 0", "Unnamed: 0.1"])
cleaned_df.head()

Unnamed: 0,Character,Dialogue,EpisodeNo,SEID,Season,Sentiment
0,JERRY,Do you know what this is all about? Do you kno...,1,S01E01,1,0.9029
1,JERRY,"(pointing at Georges shirt) See, to me, that b...",1,S01E01,1,-0.4389
2,GEORGE,Are you through?,1,S01E01,1,0.0
3,JERRY,"You do of course try on, when you buy?",1,S01E01,1,0.0
4,GEORGE,"Yes, it was purple, I liked it, I dont actuall...",1,S01E01,1,0.6705


In [3]:
# Create variables for WordCount and LineCount
# LineCount is easy - just make it equal to 1, then when we group by episode, take the sum

In [4]:
# Every row is exactly one line of dialogue, so give each row a LineCount of 1;
# summing this column after a groupby then yields the number of lines per group
cleaned_df["LineCount"] = np.ones(len(cleaned_df), dtype=int)
cleaned_df.head(n=5)

Unnamed: 0,Character,Dialogue,EpisodeNo,SEID,Season,Sentiment,LineCount
0,JERRY,Do you know what this is all about? Do you kno...,1,S01E01,1,0.9029,1
1,JERRY,"(pointing at Georges shirt) See, to me, that b...",1,S01E01,1,-0.4389,1
2,GEORGE,Are you through?,1,S01E01,1,0.0,1
3,JERRY,"You do of course try on, when you buy?",1,S01E01,1,0.0,1
4,GEORGE,"Yes, it was purple, I liked it, I dont actuall...",1,S01E01,1,0.6705,1


In [5]:
#Create a function for counting the # of words per line of dialogue
def word_count(text):
    """Return the number of whitespace-separated words in ``text``.

    Parameters
    ----------
    text : str
        A line of dialogue. Runs of whitespace count as a single separator
        and leading/trailing whitespace is ignored (``str.split()`` semantics).

    Returns
    -------
    int
        Number of words; 0 for an empty or whitespace-only string.

    Raises
    ------
    AttributeError
        If ``text`` has no ``split`` method (e.g. ``None`` or a float).
    """
    return len(text.split())

In [10]:
# WordCount -- count the words in every row's Dialogue

# Create a list that will later be added to dataframe
WordCount = []

# The row total is constant, so compute it once for the progress messages
total_rows = cleaned_df.shape[0]

# Iterate over the column values directly (rather than looking each row up by
# label) so the loop does not depend on the dataframe having a 0..n-1 index
for i, dialogue in enumerate(cleaned_df["Dialogue"]):

    if pd.isna(dialogue):
        # A missing line has no words. Converting it with str() would turn it
        # into the one-word string "nan" and inflate the count.
        count = 0
    else:
        # str() guards against non-string cells (e.g. a purely numeric line)
        count = word_count(str(dialogue))

    # Append count to WordCount list
    WordCount.append(count)

    #########################
    # Print notifications to make sure the code is running
    #########################

    if i % 10000 == 0:
        perc_complete = round(i / total_rows * 100, 0)
        print("Percent Complete: " + str(perc_complete) + "%")

    if i == (total_rows - 1):
        print("-----------------------------------")
        print("Loop Complete!")

Percent Complete: 0.0%
Percent Complete: 18.0%
Percent Complete: 37.0%
Percent Complete: 55.0%
Percent Complete: 73.0%
Percent Complete: 92.0%
-----------------------------------
Loop Complete!


In [11]:
# Attach the per-row word counts (in row order) as a new "WordCount" column
cleaned_df = cleaned_df.assign(WordCount=WordCount)
cleaned_df.tail(n=5)

Unnamed: 0,Character,Dialogue,EpisodeNo,SEID,Season,Sentiment,LineCount,WordCount
54611,JERRY,Grand theft auto - don't steal any of my jokes.,23,S09E23,9,0.7669,1,10
54612,OTHER,You suck - I'm gonna cut you.,23,S09E23,9,-0.6124,1,7
54613,JERRY,"Hey, I don't come down to where you work, and ...",23,S09E23,9,0.4939,1,18
54614,OTHER,"Alright, Seinfeld, that's it. Let's go. Come on.",23,S09E23,9,0.25,1,8
54615,JERRY,"Alright, hey, you've been great! See you in th...",23,S09E23,9,0.75,1,10


In [12]:
# Weight each line's sentiment by its length in words (Sentiment x WordCount),
# so that long lines can count for more than short ones when aggregated later
cleaned_df["WtdSentiment"] = cleaned_df["Sentiment"].mul(cleaned_df["WordCount"])
cleaned_df.tail(n=10)

Unnamed: 0,Character,Dialogue,EpisodeNo,SEID,Season,Sentiment,LineCount,WordCount,WtdSentiment
54606,OTHER,I am.,23,S09E23,9,0.0,1,2,0.0
54607,JERRY,I'll talk slower. I'm kidding - I love Cellblo...,23,S09E23,9,0.8316,1,22,18.2952
54608,OTHER,Murder one.,23,S09E23,9,-0.6908,1,2,-1.3816
54609,JERRY,"Murder one? Oooooo, watch out everybody. Bette...",23,S09E23,9,0.1926,1,26,5.0076
54610,OTHER,Grand theft auto.,23,S09E23,9,0.4588,1,3,1.3764
54611,JERRY,Grand theft auto - don't steal any of my jokes.,23,S09E23,9,0.7669,1,10,7.669
54612,OTHER,You suck - I'm gonna cut you.,23,S09E23,9,-0.6124,1,7,-4.2868
54613,JERRY,"Hey, I don't come down to where you work, and ...",23,S09E23,9,0.4939,1,18,8.8902
54614,OTHER,"Alright, Seinfeld, that's it. Let's go. Come on.",23,S09E23,9,0.25,1,8,2.0
54615,JERRY,"Alright, hey, you've been great! See you in th...",23,S09E23,9,0.75,1,10,7.5


In [None]:
##### Create columns for Line Count, Word Count and Wtd Sentiment for each character / character group
# 1) Main Characters
# 2) Jerry
# 3) George
# 4) Kramer
# 5) Elaine
# 6) Secondary Characters
# 7) Other Characters

In [None]:
# Use loc to separate out each CHARACTER into separate dfs

In [22]:
# Names (as they appear in the "Character" column) of the four main characters
mainChars = [
    "JERRY",
    "GEORGE",
    "KRAMER",
    "ELAINE",
]

In [34]:
# Create separate dataframes, one per character / character group

# One dataframe per main character
jerry_df = cleaned_df.loc[cleaned_df["Character"] == "JERRY", :]
george_df = cleaned_df.loc[cleaned_df["Character"] == "GEORGE", :]
kramer_df = cleaned_df.loc[cleaned_df["Character"] == "KRAMER", :]
elaine_df = cleaned_df.loc[cleaned_df["Character"] == "ELAINE", :]

# All main characters together; reuse the mainChars list so the set of names
# is maintained in a single place
mainChars_df = cleaned_df.loc[cleaned_df["Character"].isin(mainChars), :]

# Rows whose Character value is the literal label "SECONDARY" / "OTHER"
secondary_df = cleaned_df.loc[cleaned_df["Character"] == "SECONDARY", :]
other_df = cleaned_df.loc[cleaned_df["Character"] == "OTHER", :]

In [None]:
# Group dataframes (including the original cleaned_df) by Episode

In [38]:
# Preview the first five rows to confirm the new columns before grouping
cleaned_df.head(n=5)

Unnamed: 0,Character,Dialogue,EpisodeNo,SEID,Season,Sentiment,LineCount,WordCount,WtdSentiment
0,JERRY,Do you know what this is all about? Do you kno...,1,S01E01,1,0.9029,1,189,170.6481
1,JERRY,"(pointing at Georges shirt) See, to me, that b...",1,S01E01,1,-0.4389,1,41,-17.9949
2,GEORGE,Are you through?,1,S01E01,1,0.0,1,3,0.0
3,JERRY,"You do of course try on, when you buy?",1,S01E01,1,0.0,1,9,0.0
4,GEORGE,"Yes, it was purple, I liked it, I dont actuall...",1,S01E01,1,0.6705,1,14,9.387


In [71]:
# Grouping by Episode -- Overall (a.k.a. all characters)
# Sum only the numeric columns. The text columns (Character, Dialogue) are left
# out explicitly because newer pandas versions no longer drop them automatically.
overall_grp = cleaned_df.groupby(by="SEID", as_index=False)[
    ["EpisodeNo", "Season", "Sentiment", "LineCount", "WordCount", "WtdSentiment"]
].sum()

# Re-create columns for EpisodeNo, Season and Sentiment
# EpisodeNo / Season: sum divided by the row count gives back the per-episode value
# (assumes both are constant within an SEID). The result is a float.
overall_grp["EpisodeNo"] = overall_grp["EpisodeNo"] / overall_grp["LineCount"]
overall_grp["Season"] = overall_grp["Season"] / overall_grp["LineCount"]
# Sentiment: word-count-weighted average of the per-line sentiment scores
overall_grp["Sentiment"] = overall_grp["WtdSentiment"] / overall_grp["WordCount"]

# Drop column for WtdSentiment since we no longer need it
overall_grp = overall_grp.drop(columns="WtdSentiment")

# Rename Sentiment, LineCount, and WordCount variables to have "Total" at the end of them
# (index=str also converts the row labels to strings)
overall_grp = overall_grp.rename(index=str, columns={"Sentiment": "SentimentTotal",
                                                     "LineCount": "LineCountTotal",
                                                     "WordCount": "WordCountTotal"
                                                    })
overall_grp.tail()

Unnamed: 0,SEID,EpisodeNo,Season,SentimentTotal,LineCountTotal,WordCountTotal
168,S09E17,17.0,9.0,0.050767,301,3074
169,S09E18,18.0,9.0,0.08521,299,2697
170,S09E19,19.0,9.0,0.056641,364,2740
171,S09E20,20.0,9.0,0.072543,316,2784
172,S09E23,23.0,9.0,0.035249,713,6986


In [72]:
# Grouping by Episode -- Jerry
# Sum only the numeric columns. The text columns (Character, Dialogue) are left
# out explicitly because newer pandas versions no longer drop them automatically.
jerry_grp = jerry_df.groupby(by="SEID", as_index=False)[
    ["EpisodeNo", "Season", "Sentiment", "LineCount", "WordCount", "WtdSentiment"]
].sum()

# Re-create columns for EpisodeNo, Season and Sentiment
# EpisodeNo / Season: sum divided by the row count gives back the per-episode value
# (assumes both are constant within an SEID). The result is a float.
jerry_grp["EpisodeNo"] = jerry_grp["EpisodeNo"] / jerry_grp["LineCount"]
jerry_grp["Season"] = jerry_grp["Season"] / jerry_grp["LineCount"]
# Sentiment: word-count-weighted average of the per-line sentiment scores
jerry_grp["Sentiment"] = jerry_grp["WtdSentiment"] / jerry_grp["WordCount"]

# Drop column for WtdSentiment since we no longer need it
jerry_grp = jerry_grp.drop(columns="WtdSentiment")

# Rename Sentiment, LineCount, and WordCount variables to have "Jerry" at the end of them
# (index=str also converts the row labels to strings)
jerry_grp = jerry_grp.rename(index=str, columns={"Sentiment": "SentimentJerry",
                                                 "LineCount": "LineCountJerry",
                                                 "WordCount": "WordCountJerry"
                                                })
jerry_grp.tail()

Unnamed: 0,SEID,EpisodeNo,Season,SentimentJerry,LineCountJerry,WordCountJerry
168,S09E17,17.0,9.0,-0.052106,76,678
169,S09E18,18.0,9.0,0.11163,72,579
170,S09E19,19.0,9.0,0.06453,105,619
171,S09E20,20.0,9.0,0.011879,75,585
172,S09E23,23.0,9.0,0.071567,151,1334


In [73]:
# Grouping by Episode -- George
# Sum only the numeric columns. The text columns (Character, Dialogue) are left
# out explicitly because newer pandas versions no longer drop them automatically.
george_grp = george_df.groupby(by="SEID", as_index=False)[
    ["EpisodeNo", "Season", "Sentiment", "LineCount", "WordCount", "WtdSentiment"]
].sum()

# Re-create columns for EpisodeNo, Season and Sentiment
# EpisodeNo / Season: sum divided by the row count gives back the per-episode value
# (assumes both are constant within an SEID). The result is a float.
george_grp["EpisodeNo"] = george_grp["EpisodeNo"] / george_grp["LineCount"]
george_grp["Season"] = george_grp["Season"] / george_grp["LineCount"]
# Sentiment: word-count-weighted average of the per-line sentiment scores
george_grp["Sentiment"] = george_grp["WtdSentiment"] / george_grp["WordCount"]

# Drop column for WtdSentiment since we no longer need it
george_grp = george_grp.drop(columns="WtdSentiment")

# Rename Sentiment, LineCount, and WordCount variables to have "George" at the end of them
# (index=str also converts the row labels to strings)
george_grp = george_grp.rename(index=str, columns={"Sentiment": "SentimentGeorge",
                                                   "LineCount": "LineCountGeorge",
                                                   "WordCount": "WordCountGeorge"
                                                  })
george_grp.tail()

Unnamed: 0,SEID,EpisodeNo,Season,SentimentGeorge,LineCountGeorge,WordCountGeorge
167,S09E17,17.0,9.0,0.025762,36,343
168,S09E18,18.0,9.0,0.123639,54,520
169,S09E19,19.0,9.0,0.055882,45,329
170,S09E20,20.0,9.0,0.038388,49,433
171,S09E23,23.0,9.0,-0.022855,126,975


In [74]:
# Grouping by Episode -- Kramer
# Sum only the numeric columns. The text columns (Character, Dialogue) are left
# out explicitly because newer pandas versions no longer drop them automatically.
kramer_grp = kramer_df.groupby(by="SEID", as_index=False)[
    ["EpisodeNo", "Season", "Sentiment", "LineCount", "WordCount", "WtdSentiment"]
].sum()

# Re-create columns for EpisodeNo, Season and Sentiment
# EpisodeNo / Season: sum divided by the row count gives back the per-episode value
# (assumes both are constant within an SEID). The result is a float.
kramer_grp["EpisodeNo"] = kramer_grp["EpisodeNo"] / kramer_grp["LineCount"]
kramer_grp["Season"] = kramer_grp["Season"] / kramer_grp["LineCount"]
# Sentiment: word-count-weighted average of the per-line sentiment scores
kramer_grp["Sentiment"] = kramer_grp["WtdSentiment"] / kramer_grp["WordCount"]

# Drop column for WtdSentiment since we no longer need it
kramer_grp = kramer_grp.drop(columns="WtdSentiment")

# Rename Sentiment, LineCount, and WordCount variables to have "Kramer" at the end of them
# (index=str also converts the row labels to strings)
kramer_grp = kramer_grp.rename(index=str, columns={"Sentiment": "SentimentKramer",
                                                   "LineCount": "LineCountKramer",
                                                   "WordCount": "WordCountKramer"
                                                  })
kramer_grp.tail()

Unnamed: 0,SEID,EpisodeNo,Season,SentimentKramer,LineCountKramer,WordCountKramer
166,S09E17,17.0,9.0,0.241623,37,479
167,S09E18,18.0,9.0,-0.058961,33,351
168,S09E19,19.0,9.0,0.095594,56,557
169,S09E20,20.0,9.0,0.124037,57,717
170,S09E23,23.0,9.0,0.170964,38,411


In [75]:
# Grouping by Episode -- Elaine
# Sum only the numeric columns. The text columns (Character, Dialogue) are left
# out explicitly because newer pandas versions no longer drop them automatically.
elaine_grp = elaine_df.groupby(by="SEID", as_index=False)[
    ["EpisodeNo", "Season", "Sentiment", "LineCount", "WordCount", "WtdSentiment"]
].sum()

# Re-create columns for EpisodeNo, Season and Sentiment
# EpisodeNo / Season: sum divided by the row count gives back the per-episode value
# (assumes both are constant within an SEID). The result is a float.
elaine_grp["EpisodeNo"] = elaine_grp["EpisodeNo"] / elaine_grp["LineCount"]
elaine_grp["Season"] = elaine_grp["Season"] / elaine_grp["LineCount"]
# Sentiment: word-count-weighted average of the per-line sentiment scores
elaine_grp["Sentiment"] = elaine_grp["WtdSentiment"] / elaine_grp["WordCount"]

# Drop column for WtdSentiment since we no longer need it
elaine_grp = elaine_grp.drop(columns="WtdSentiment")

# Rename Sentiment, LineCount, and WordCount variables to have "Elaine" at the end of them
# (index=str also converts the row labels to strings)
elaine_grp = elaine_grp.rename(index=str, columns={"Sentiment": "SentimentElaine",
                                                   "LineCount": "LineCountElaine",
                                                   "WordCount": "WordCountElaine"
                                                  })
elaine_grp.tail()

Unnamed: 0,SEID,EpisodeNo,Season,SentimentElaine,LineCountElaine,WordCountElaine
166,S09E17,17.0,9.0,0.060135,36,412
167,S09E18,18.0,9.0,0.042037,41,438
168,S09E19,19.0,9.0,0.018077,58,453
169,S09E20,20.0,9.0,0.086721,48,449
170,S09E23,23.0,9.0,0.140492,75,585


In [76]:
# Grouping by Episode -- Secondary characters
# Sum only the numeric columns. The text columns (Character, Dialogue) are left
# out explicitly because newer pandas versions no longer drop them automatically.
secondary_grp = secondary_df.groupby(by="SEID", as_index=False)[
    ["EpisodeNo", "Season", "Sentiment", "LineCount", "WordCount", "WtdSentiment"]
].sum()

# Re-create columns for EpisodeNo, Season and Sentiment
# EpisodeNo / Season: sum divided by the row count gives back the per-episode value
# (assumes both are constant within an SEID). The result is a float.
secondary_grp["EpisodeNo"] = secondary_grp["EpisodeNo"] / secondary_grp["LineCount"]
secondary_grp["Season"] = secondary_grp["Season"] / secondary_grp["LineCount"]
# Sentiment: word-count-weighted average of the per-line sentiment scores
secondary_grp["Sentiment"] = secondary_grp["WtdSentiment"] / secondary_grp["WordCount"]

# Drop column for WtdSentiment since we no longer need it
secondary_grp = secondary_grp.drop(columns="WtdSentiment")

# Rename Sentiment, LineCount, and WordCount variables to have "Secondary" at the end of them
# (index=str also converts the row labels to strings)
secondary_grp = secondary_grp.rename(index=str, columns={"Sentiment": "SentimentSecondary",
                                                         "LineCount": "LineCountSecondary",
                                                         "WordCount": "WordCountSecondary"
                                                        })
secondary_grp.tail()

Unnamed: 0,SEID,EpisodeNo,Season,SentimentSecondary,LineCountSecondary,WordCountSecondary
91,S09E13,13.0,9.0,0.59765,2,34
92,S09E15,15.0,9.0,0.112005,41,373
93,S09E16,16.0,9.0,-0.036034,21,120
94,S09E17,17.0,9.0,0.09715,36,320
95,S09E23,23.0,9.0,0.017131,45,527


In [77]:
# Grouping by Episode -- Other characters
# Sum only the numeric columns. The text columns (Character, Dialogue) are left
# out explicitly because newer pandas versions no longer drop them automatically.
other_grp = other_df.groupby(by="SEID", as_index=False)[
    ["EpisodeNo", "Season", "Sentiment", "LineCount", "WordCount", "WtdSentiment"]
].sum()

# Re-create columns for EpisodeNo, Season and Sentiment
# EpisodeNo / Season: sum divided by the row count gives back the per-episode value
# (assumes both are constant within an SEID). The result is a float.
other_grp["EpisodeNo"] = other_grp["EpisodeNo"] / other_grp["LineCount"]
other_grp["Season"] = other_grp["Season"] / other_grp["LineCount"]
# Sentiment: word-count-weighted average of the per-line sentiment scores
other_grp["Sentiment"] = other_grp["WtdSentiment"] / other_grp["WordCount"]

# Drop column for WtdSentiment since we no longer need it
other_grp = other_grp.drop(columns="WtdSentiment")

# Rename Sentiment, LineCount, and WordCount variables to have "Other" at the end of them
# (index=str also converts the row labels to strings)
other_grp = other_grp.rename(index=str, columns={"Sentiment": "SentimentOther",
                                                 "LineCount": "LineCountOther",
                                                 "WordCount": "WordCountOther"
                                                })
other_grp.tail()

Unnamed: 0,SEID,EpisodeNo,Season,SentimentOther,LineCountOther,WordCountOther
168,S09E17,17.0,9.0,0.013003,80,842
169,S09E18,18.0,9.0,0.127525,99,809
170,S09E19,19.0,9.0,0.045311,100,782
171,S09E20,20.0,9.0,0.084193,87,600
172,S09E23,23.0,9.0,0.00367,278,3154


In [94]:
# Grouping by Episode -- Main characters only
# Sum only the numeric columns. The text columns (Character, Dialogue) are left
# out explicitly because newer pandas versions no longer drop them automatically.
mainChars_grp = mainChars_df.groupby(by="SEID", as_index=False)[
    ["EpisodeNo", "Season", "Sentiment", "LineCount", "WordCount", "WtdSentiment"]
].sum()

# Re-create columns for EpisodeNo, Season and Sentiment
# EpisodeNo / Season: sum divided by the row count gives back the per-episode value
# (assumes both are constant within an SEID). The result is a float.
mainChars_grp["EpisodeNo"] = mainChars_grp["EpisodeNo"] / mainChars_grp["LineCount"]
mainChars_grp["Season"] = mainChars_grp["Season"] / mainChars_grp["LineCount"]
# Sentiment: word-count-weighted average of the per-line sentiment scores
mainChars_grp["Sentiment"] = mainChars_grp["WtdSentiment"] / mainChars_grp["WordCount"]

# Drop column for WtdSentiment since we no longer need it
mainChars_grp = mainChars_grp.drop(columns="WtdSentiment")

# Rename Sentiment, LineCount, and WordCount variables to have "MainChars" at the end of them
# (index=str also converts the row labels to strings)
mainChars_grp = mainChars_grp.rename(index=str, columns={"Sentiment": "SentimentMainChars",
                                                         "LineCount": "LineCountMainChars",
                                                         "WordCount": "WordCountMainChars"
                                                        })
mainChars_grp.shape

(173, 6)

In [None]:
##### Merge all grouped dataframes into one #####
# overall_grp
# jerry_grp
# george_grp
# kramer_grp
# elaine_grp
# mainChars_grp
# secondary_grp
# other_grp

In [88]:
# Merging Overall and Jerry
# EpisodeNo and Season already exist on the left frame, so they are removed from
# the right frame before merging. This avoids the suffixed EpisodeNo_x/_y and
# Season_x/_y duplicates that would otherwise have to be dropped and renamed.
# how="left" keeps every episode of overall_grp; episodes missing from
# jerry_grp get NaN in the Jerry columns.
merged_df = pd.merge(
    overall_grp,
    jerry_grp.drop(columns=["EpisodeNo", "Season"]),
    on="SEID",
    how="left",
)

print(merged_df.shape)
print(merged_df.head())

(173, 9)
     SEID  EpisodeNo  Season  SentimentTotal  LineCountTotal  WordCountTotal  \
0  S01E01        1.0     1.0        0.173524             557            6582   
1  S01E02        2.0     1.0       -0.002193             280            3268   
2  S01E03        3.0     1.0        0.131158             241            3376   
3  S01E04        4.0     1.0        0.039517             230            3277   
4  S02E01        1.0     2.0        0.104272             245            3448   

   SentimentJerry  LineCountJerry  WordCountJerry  
0        0.235595             250            3902  
1       -0.134020             110            1552  
2        0.199045             102            1698  
3        0.108037             102            1730  
4        0.184650             107            1569  


In [89]:
# Merging above with George
# EpisodeNo and Season already exist on the left frame, so they are removed from
# the right frame before merging. This avoids the suffixed EpisodeNo_x/_y and
# Season_x/_y duplicates that would otherwise have to be dropped and renamed.
# how="left" keeps every episode of merged_df; episodes missing from
# george_grp get NaN in the George columns.
merged_df = pd.merge(
    merged_df,
    george_grp.drop(columns=["EpisodeNo", "Season"]),
    on="SEID",
    how="left",
)

print(merged_df.shape)
print(merged_df.head())

(173, 12)
     SEID  EpisodeNo  Season  SentimentTotal  LineCountTotal  WordCountTotal  \
0  S01E01        1.0     1.0        0.173524             557            6582   
1  S01E02        2.0     1.0       -0.002193             280            3268   
2  S01E03        3.0     1.0        0.131158             241            3376   
3  S01E04        4.0     1.0        0.039517             230            3277   
4  S02E01        1.0     2.0        0.104272             245            3448   

   SentimentJerry  LineCountJerry  WordCountJerry  SentimentGeorge  \
0        0.235595             250            3902         0.105266   
1       -0.134020             110            1552        -0.008671   
2        0.199045             102            1698         0.029603   
3        0.108037             102            1730        -0.161307   
4        0.184650             107            1569        -0.071880   

   LineCountGeorge  WordCountGeorge  
0             98.0            969.0  
1           

In [90]:
# Merging above with Kramer
# EpisodeNo and Season already exist on the left frame, so they are removed from
# the right frame before merging. This avoids the suffixed EpisodeNo_x/_y and
# Season_x/_y duplicates that would otherwise have to be dropped and renamed.
# how="left" keeps every episode of merged_df; episodes missing from
# kramer_grp get NaN in the Kramer columns.
merged_df = pd.merge(
    merged_df,
    kramer_grp.drop(columns=["EpisodeNo", "Season"]),
    on="SEID",
    how="left",
)

print(merged_df.shape)
print(merged_df.head())

(173, 15)
     SEID  EpisodeNo  Season  SentimentTotal  LineCountTotal  WordCountTotal  \
0  S01E01        1.0     1.0        0.173524             557            6582   
1  S01E02        2.0     1.0       -0.002193             280            3268   
2  S01E03        3.0     1.0        0.131158             241            3376   
3  S01E04        4.0     1.0        0.039517             230            3277   
4  S02E01        1.0     2.0        0.104272             245            3448   

   SentimentJerry  LineCountJerry  WordCountJerry  SentimentGeorge  \
0        0.235595             250            3902         0.105266   
1       -0.134020             110            1552        -0.008671   
2        0.199045             102            1698         0.029603   
3        0.108037             102            1730        -0.161307   
4        0.184650             107            1569        -0.071880   

   LineCountGeorge  WordCountGeorge  SentimentKramer  LineCountKramer  \
0             9

In [91]:
# Merging above with Elaine
# EpisodeNo and Season already exist on the left frame, so they are removed from
# the right frame before merging. This avoids the suffixed EpisodeNo_x/_y and
# Season_x/_y duplicates that would otherwise have to be dropped and renamed.
# how="left" keeps every episode of merged_df; episodes missing from
# elaine_grp get NaN in the Elaine columns.
merged_df = pd.merge(
    merged_df,
    elaine_grp.drop(columns=["EpisodeNo", "Season"]),
    on="SEID",
    how="left",
)

print(merged_df.shape)
print(merged_df.head())

(173, 18)
     SEID  EpisodeNo  Season  SentimentTotal  LineCountTotal  WordCountTotal  \
0  S01E01        1.0     1.0        0.173524             557            6582   
1  S01E02        2.0     1.0       -0.002193             280            3268   
2  S01E03        3.0     1.0        0.131158             241            3376   
3  S01E04        4.0     1.0        0.039517             230            3277   
4  S02E01        1.0     2.0        0.104272             245            3448   

   SentimentJerry  LineCountJerry  WordCountJerry  SentimentGeorge  \
0        0.235595             250            3902         0.105266   
1       -0.134020             110            1552        -0.008671   
2        0.199045             102            1698         0.029603   
3        0.108037             102            1730        -0.161307   
4        0.184650             107            1569        -0.071880   

   LineCountGeorge  WordCountGeorge  SentimentKramer  LineCountKramer  \
0             9

In [92]:
# Merging above with MainChars
# EpisodeNo and Season already exist on the left frame, so they are removed from
# the right frame before merging. This avoids the suffixed EpisodeNo_x/_y and
# Season_x/_y duplicates that would otherwise have to be dropped and renamed.
# how="left" keeps every episode of merged_df; episodes missing from
# mainChars_grp get NaN in the MainChars columns.
merged_df = pd.merge(
    merged_df,
    mainChars_grp.drop(columns=["EpisodeNo", "Season"]),
    on="SEID",
    how="left",
)

print(merged_df.shape)
print(merged_df.head())

(173, 21)
     SEID  EpisodeNo  Season  SentimentTotal  LineCountTotal  WordCountTotal  \
0  S01E01        1.0     1.0        0.173524             557            6582   
1  S01E02        2.0     1.0       -0.002193             280            3268   
2  S01E03        3.0     1.0        0.131158             241            3376   
3  S01E04        4.0     1.0        0.039517             230            3277   
4  S02E01        1.0     2.0        0.104272             245            3448   

   SentimentJerry  LineCountJerry  WordCountJerry  SentimentGeorge  \
0        0.235595             250            3902         0.105266   
1       -0.134020             110            1552        -0.008671   
2        0.199045             102            1698         0.029603   
3        0.108037             102            1730        -0.161307   
4        0.184650             107            1569        -0.071880   

          ...          WordCountGeorge  SentimentKramer  LineCountKramer  \
0         ..

In [93]:
# Merging above with Secondary
# EpisodeNo and Season already exist on the left frame, so they are removed from
# the right frame before merging. This avoids the suffixed EpisodeNo_x/_y and
# Season_x/_y duplicates that would otherwise have to be dropped and renamed.
# how="left" keeps every episode of merged_df; episodes missing from
# secondary_grp get NaN in the Secondary columns.
merged_df = pd.merge(
    merged_df,
    secondary_grp.drop(columns=["EpisodeNo", "Season"]),
    on="SEID",
    how="left",
)

print(merged_df.shape)
print(merged_df.head())

(173, 24)
     SEID  EpisodeNo  Season  SentimentTotal  LineCountTotal  WordCountTotal  \
0  S01E01        1.0     1.0        0.173524             557            6582   
1  S01E02        2.0     1.0       -0.002193             280            3268   
2  S01E03        3.0     1.0        0.131158             241            3376   
3  S01E04        4.0     1.0        0.039517             230            3277   
4  S02E01        1.0     2.0        0.104272             245            3448   

   SentimentJerry  LineCountJerry  WordCountJerry  SentimentGeorge  \
0        0.235595             250            3902         0.105266   
1       -0.134020             110            1552        -0.008671   
2        0.199045             102            1698         0.029603   
3        0.108037             102            1730        -0.161307   
4        0.184650             107            1569        -0.071880   

          ...          WordCountKramer  SentimentElaine  LineCountElaine  \
0         ..

In [95]:
# Merging above with Other
# EpisodeNo and Season already exist on the left frame, so they are removed from
# the right frame before merging. This avoids the suffixed EpisodeNo_x/_y and
# Season_x/_y duplicates that would otherwise have to be dropped and renamed.
# how="left" keeps every episode of merged_df; episodes missing from
# other_grp get NaN in the Other columns.
merged_df = pd.merge(
    merged_df,
    other_grp.drop(columns=["EpisodeNo", "Season"]),
    on="SEID",
    how="left",
)

print(merged_df.shape)
print(merged_df.head())

(173, 27)
     SEID  EpisodeNo  Season  SentimentTotal  LineCountTotal  WordCountTotal  \
0  S01E01        1.0     1.0        0.173524             557            6582   
1  S01E02        2.0     1.0       -0.002193             280            3268   
2  S01E03        3.0     1.0        0.131158             241            3376   
3  S01E04        4.0     1.0        0.039517             230            3277   
4  S02E01        1.0     2.0        0.104272             245            3448   

   SentimentJerry  LineCountJerry  WordCountJerry  SentimentGeorge  \
0        0.235595             250            3902         0.105266   
1       -0.134020             110            1552        -0.008671   
2        0.199045             102            1698         0.029603   
3        0.108037             102            1730        -0.161307   
4        0.184650             107            1569        -0.071880   

        ...        WordCountElaine  SentimentMainChars  LineCountMainChars  \
0       ..

In [96]:
# Write the merged per-episode table to disk.
# index=True (the pandas default, spelled out here) also writes the row index
# as the first, unnamed column of the file.
merged_df.to_csv(path_or_buf='../rawData/merged_scripts.csv', index=True)