In [1]:
# Import dependencies
import pandas as pd
import numpy as np

In [2]:
# Read in the Scripts CSV file
cleaned_df = pd.read_csv('../outputData/cleaned_scripts.csv')

# Drop the "Unnamed: 0" and "Unnamed: 0.1" columns
cleaned_df = cleaned_df.drop("Unnamed: 0", 1)
cleaned_df = cleaned_df.drop("Unnamed: 0.1", 1)
cleaned_df.head()

Unnamed: 0,Character,Dialogue,EpisodeNo,SEID,Season,PosSent,NegSent,CompSent
0,JERRY,Do you know what this is all about? Do you kno...,1,S01E01,1,0.072,0.011,0.9029
1,JERRY,"(pointing at Georges shirt) See, to me, that b...",1,S01E01,1,0.054,0.096,-0.4389
2,GEORGE,Are you through?,1,S01E01,1,0.0,0.0,0.0
3,JERRY,"You do of course try on, when you buy?",1,S01E01,1,0.0,0.0,0.0
4,GEORGE,"Yes, it was purple, I liked it, I dont actuall...",1,S01E01,1,0.355,0.0,0.6705


In [3]:
# Create variables for WordCount and LineCount
# LineCount is easy - just make it equal to 1, then when we group by episode, take the sum

In [4]:
# LineCount variable created
cleaned_df["LineCount"] = np.repeat(1, cleaned_df.shape[0])
cleaned_df.head()

Unnamed: 0,Character,Dialogue,EpisodeNo,SEID,Season,PosSent,NegSent,CompSent,LineCount
0,JERRY,Do you know what this is all about? Do you kno...,1,S01E01,1,0.072,0.011,0.9029,1
1,JERRY,"(pointing at Georges shirt) See, to me, that b...",1,S01E01,1,0.054,0.096,-0.4389,1
2,GEORGE,Are you through?,1,S01E01,1,0.0,0.0,0.0,1
3,JERRY,"You do of course try on, when you buy?",1,S01E01,1,0.0,0.0,0.0,1
4,GEORGE,"Yes, it was purple, I liked it, I dont actuall...",1,S01E01,1,0.355,0.0,0.6705,1


In [5]:
#Create a function for counting the # of words per line of dialogue
def word_count(text):
    count = 0
    words = text.split()

    for word in words:
        count += 1
    
    return count

In [6]:
# WordCount -- create a loop through each row and each Dialogue...

# Create a list that will later be added to dataframe
WordCount = []

for i in range(cleaned_df.shape[0]):
    
    # Assign the current line of text to variable 'line'
    line = str(cleaned_df["Dialogue"][i]).lower()
    
    # Run the word_count function on line i
    try:
        count = word_count(line)
    except:
        count = 0
    
    # Append count to WordCount list
    WordCount.append(count)
    
    #########################
    # Print notifications to make sure the code is running
    #########################

    if i % 10000 == 0:
        perc_complete = round(i / cleaned_df.shape[0] * 100, 0)
        print("Percent Complete: " + str(perc_complete) + "%")
    
    if i == (cleaned_df.shape[0] - 1):
        print("-----------------------------------")
        print("Loop Complete!")

Percent Complete: 0.0%
Percent Complete: 18.0%
Percent Complete: 37.0%
Percent Complete: 55.0%
Percent Complete: 73.0%
Percent Complete: 92.0%
-----------------------------------
Loop Complete!


In [7]:
#Add WordCount as a column in dataframe
cleaned_df["WordCount"] = WordCount
cleaned_df.tail()

Unnamed: 0,Character,Dialogue,EpisodeNo,SEID,Season,PosSent,NegSent,CompSent,LineCount,WordCount
54611,JERRY,Grand theft auto - don't steal any of my jokes.,23,S09E23,9,0.56,0.0,0.7669,1,10
54612,OTHER,You suck - I'm gonna cut you.,23,S09E23,9,0.0,0.556,-0.6124,1,7
54613,JERRY,"Hey, I don't come down to where you work, and ...",23,S09E23,9,0.167,0.0,0.4939,1,18
54614,OTHER,"Alright, Seinfeld, that's it. Let's go. Come on.",23,S09E23,9,0.222,0.0,0.25,1,8
54615,JERRY,"Alright, hey, you've been great! See you in th...",23,S09E23,9,0.444,0.0,0.75,1,10


In [8]:
# Create a weighted sentiment score (Sentiment x WordCount)
cleaned_df["WtdPos"] = cleaned_df["PosSent"] * cleaned_df["WordCount"]
cleaned_df["WtdNeg"] = cleaned_df["CompSent"] * cleaned_df["WordCount"]
cleaned_df["WtdComp"] = cleaned_df["NegSent"] * cleaned_df["WordCount"]
cleaned_df.tail(10)

Unnamed: 0,Character,Dialogue,EpisodeNo,SEID,Season,PosSent,NegSent,CompSent,LineCount,WordCount,WtdPos,WtdNeg,WtdComp
54606,OTHER,I am.,23,S09E23,9,0.0,0.0,0.0,1,2,0.0,0.0,0.0
54607,JERRY,I'll talk slower. I'm kidding - I love Cellblo...,23,S09E23,9,0.341,0.0,0.8316,1,22,7.502,18.2952,0.0
54608,OTHER,Murder one.,23,S09E23,9,0.0,0.825,-0.6908,1,2,0.0,-1.3816,1.65
54609,JERRY,"Murder one? Oooooo, watch out everybody. Bette...",23,S09E23,9,0.225,0.142,0.1926,1,26,5.85,5.0076,3.692
54610,OTHER,Grand theft auto.,23,S09E23,9,0.6,0.0,0.4588,1,3,1.8,1.3764,0.0
54611,JERRY,Grand theft auto - don't steal any of my jokes.,23,S09E23,9,0.56,0.0,0.7669,1,10,5.6,7.669,0.0
54612,OTHER,You suck - I'm gonna cut you.,23,S09E23,9,0.0,0.556,-0.6124,1,7,0.0,-4.2868,3.892
54613,JERRY,"Hey, I don't come down to where you work, and ...",23,S09E23,9,0.167,0.0,0.4939,1,18,3.006,8.8902,0.0
54614,OTHER,"Alright, Seinfeld, that's it. Let's go. Come on.",23,S09E23,9,0.222,0.0,0.25,1,8,1.776,2.0,0.0
54615,JERRY,"Alright, hey, you've been great! See you in th...",23,S09E23,9,0.444,0.0,0.75,1,10,4.44,7.5,0.0


In [9]:
##### Create columns for Line Count, Word Count and Wtd Sentiment for each character / character group
# 1) Jerry
# 2) George
# 3) Kramer
# 4) Elaine
# 5) Secondary Characters
# 6) Other Characters

In [10]:
# Use loc to separate out each CHARACTER into separate dfs

In [11]:
#Add list for main characters
mainChars = ["JERRY", "GEORGE", "KRAMER", "ELAINE"]

In [12]:
# Create separate dataframes 
jerry_df = cleaned_df.loc[cleaned_df["Character"] == "JERRY", :]
george_df = cleaned_df.loc[cleaned_df["Character"] == "GEORGE", :]
kramer_df = cleaned_df.loc[cleaned_df["Character"] == "KRAMER", :]
elaine_df = cleaned_df.loc[cleaned_df["Character"] == "ELAINE", :]

mainChars_df = cleaned_df.loc[(cleaned_df["Character"] == "JERRY") | 
                             (cleaned_df["Character"] == "GEORGE") |
                             (cleaned_df["Character"] == "KRAMER") |
                             (cleaned_df["Character"] == "ELAINE"), :]

secondary_df = cleaned_df.loc[cleaned_df["Character"] == "SECONDARY", :]
other_df = cleaned_df.loc[cleaned_df["Character"] == "OTHER", :]

In [13]:
# Group dataframes (including the original cleaned_df) by Episode

In [14]:
# Grouping by Episode -- Overall (a.k.a. all characters)
overall_grp = cleaned_df.groupby(by = "SEID", as_index=False).sum()

# Drop the EpisodeNo, Season, and Sentiment columns since we don't need them anymore

# Re-create columns for EpisodeNo, Season and Sentiment
overall_grp["EpisodeNo"] = overall_grp["EpisodeNo"] / overall_grp["LineCount"]
overall_grp["Season"] = overall_grp["Season"] / overall_grp["LineCount"]
overall_grp["PosSent"] = overall_grp["WtdPos"] / overall_grp["WordCount"]
overall_grp["NegSent"] = overall_grp["WtdNeg"] / overall_grp["WordCount"]
overall_grp["CompSent"] = overall_grp["WtdComp"] / overall_grp["WordCount"]

# Drop column for WtdSentiments since we no longer need it
overall_grp = overall_grp.drop("WtdPos", 1)
overall_grp = overall_grp.drop("WtdNeg", 1)
overall_grp = overall_grp.drop("WtdComp", 1)

# Rename Sentiment, LineCount, and WordCount variables to have "Total" at the end of them
overall_grp = overall_grp.rename(index=str, columns = {"PosSent": "PosTotal",
                                                       "NegSent": "NegTotal",
                                                       "CompSent": "CompTotal",
                                                       "LineCount": "LineCountTotal",
                                                       "WordCount": "WordCountTotal"
                                                      })
overall_grp.tail()

Unnamed: 0,SEID,EpisodeNo,Season,PosTotal,NegTotal,CompTotal,LineCountTotal,WordCountTotal
168,S09E17,17.0,9.0,0.097596,0.047979,0.071316,301,3074
169,S09E18,18.0,9.0,0.129953,0.08521,0.064849,299,2697
170,S09E19,19.0,9.0,0.095803,0.054141,0.071659,364,2740
171,S09E20,20.0,9.0,0.104708,0.074494,0.073826,316,2784
172,S09E23,23.0,9.0,0.11523,0.034926,0.084556,713,6986


In [15]:
# Grouping by Episode -- Jerry
jerry_grp = jerry_df.groupby(by = "SEID", as_index=False).sum()

# Drop the EpisodeNo, Season, and Sentiment columns since we don't need them anymore

# Re-create columns for EpisodeNo, Season and Sentiment
jerry_grp["EpisodeNo"] = jerry_grp["EpisodeNo"] / jerry_grp["LineCount"]
jerry_grp["Season"] = jerry_grp["Season"] / jerry_grp["LineCount"]
jerry_grp["PosSent"] = jerry_grp["WtdPos"] / jerry_grp["WordCount"]
jerry_grp["NegSent"] = jerry_grp["WtdNeg"] / jerry_grp["WordCount"]
jerry_grp["CompSent"] = jerry_grp["WtdComp"] / jerry_grp["WordCount"]

# Drop column for WtdSentiments since we no longer need it
jerry_grp = jerry_grp.drop("WtdPos", 1)
jerry_grp = jerry_grp.drop("WtdNeg", 1)
jerry_grp = jerry_grp.drop("WtdComp", 1)

# Rename Sentiment, LineCount, and WordCount variables to have "Total" at the end of them
jerry_grp = jerry_grp.rename(index=str, columns = {"PosSent": "PosJerry",
                                                       "NegSent": "NegJerry",
                                                       "CompSent": "CompJerry",
                                                       "LineCount": "LineCountJerry",
                                                       "WordCount": "WordCountJerry"
                                                      })
jerry_grp.tail()

Unnamed: 0,SEID,EpisodeNo,Season,PosJerry,NegJerry,CompJerry,LineCountJerry,WordCountJerry
168,S09E17,17.0,9.0,0.093678,-0.054254,0.101966,76,678
169,S09E18,18.0,9.0,0.131767,0.11163,0.05182,72,579
170,S09E19,19.0,9.0,0.124543,0.068158,0.072737,105,619
171,S09E20,20.0,9.0,0.068291,0.02409,0.054019,75,585
172,S09E23,23.0,9.0,0.122067,0.065924,0.076808,151,1334


In [16]:
# Grouping by Episode -- George
george_grp = george_df.groupby(by = "SEID", as_index=False).sum()

# Drop the EpisodeNo, Season, and Sentiment columns since we don't need them anymore

# Re-create columns for EpisodeNo, Season and Sentiment
george_grp["EpisodeNo"] = george_grp["EpisodeNo"] / george_grp["LineCount"]
george_grp["Season"] = george_grp["Season"] / george_grp["LineCount"]
george_grp["PosSent"] = george_grp["WtdPos"] / george_grp["WordCount"]
george_grp["NegSent"] = george_grp["WtdNeg"] / george_grp["WordCount"]
george_grp["CompSent"] = george_grp["WtdComp"] / george_grp["WordCount"]

# Drop column for WtdSentiments since we no longer need it
george_grp = george_grp.drop("WtdPos", 1)
george_grp = george_grp.drop("WtdNeg", 1)
george_grp = george_grp.drop("WtdComp", 1)

# Rename Sentiment, LineCount, and WordCount variables to have "Total" at the end of them
george_grp = george_grp.rename(index=str, columns = {"PosSent": "PosGeorge",
                                                       "NegSent": "NegGeorge",
                                                       "CompSent": "CompGeorge",
                                                       "LineCount": "LineCountGeorge",
                                                       "WordCount": "WordCountGeorge"
                                                      })
george_grp.tail()

Unnamed: 0,SEID,EpisodeNo,Season,PosGeorge,NegGeorge,CompGeorge,LineCountGeorge,WordCountGeorge
167,S09E17,17.0,9.0,0.083598,0.025762,0.045525,36,343
168,S09E18,18.0,9.0,0.115144,0.123639,0.0467,54,520
169,S09E19,19.0,9.0,0.090359,0.055882,0.075799,45,329
170,S09E20,20.0,9.0,0.109635,0.038388,0.084028,49,433
171,S09E23,23.0,9.0,0.111221,-0.022855,0.10644,126,975


In [17]:
# Grouping by Episode -- Kramer
kramer_grp = kramer_df.groupby(by = "SEID", as_index=False).sum()

# Drop the EpisodeNo, Season, and Sentiment columns since we don't need them anymore

# Re-create columns for EpisodeNo, Season and Sentiment
kramer_grp["EpisodeNo"] = kramer_grp["EpisodeNo"] / kramer_grp["LineCount"]
kramer_grp["Season"] = kramer_grp["Season"] / kramer_grp["LineCount"]
kramer_grp["PosSent"] = kramer_grp["WtdPos"] / kramer_grp["WordCount"]
kramer_grp["NegSent"] = kramer_grp["WtdNeg"] / kramer_grp["WordCount"]
kramer_grp["CompSent"] = kramer_grp["WtdComp"] / kramer_grp["WordCount"]

# Drop column for WtdSentiments since we no longer need it
kramer_grp = kramer_grp.drop("WtdPos", 1)
kramer_grp = kramer_grp.drop("WtdNeg", 1)
kramer_grp = kramer_grp.drop("WtdComp", 1)

# Rename Sentiment, LineCount, and WordCount variables to have "Total" at the end of them
kramer_grp = kramer_grp.rename(index=str, columns = {"PosSent": "PosKramer",
                                                       "NegSent": "NegKramer",
                                                       "CompSent": "CompKramer",
                                                       "LineCount": "LineCountKramer",
                                                       "WordCount": "WordCountKramer"
                                                      })
kramer_grp.tail()

Unnamed: 0,SEID,EpisodeNo,Season,PosKramer,NegKramer,CompKramer,LineCountKramer,WordCountKramer
166,S09E17,17.0,9.0,0.091637,0.230111,0.029161,37,479
167,S09E18,18.0,9.0,0.101228,-0.058961,0.102214,33,351
168,S09E19,19.0,9.0,0.119154,0.079262,0.072743,56,557
169,S09E20,20.0,9.0,0.126921,0.1289,0.072577,57,717
170,S09E23,23.0,9.0,0.144574,0.170964,0.057307,38,411


In [18]:
# Grouping by Episode -- Elaine
elaine_grp = elaine_df.groupby(by = "SEID", as_index=False).sum()

# Drop the EpisodeNo, Season, and Sentiment columns since we don't need them anymore

# Re-create columns for EpisodeNo, Season and Sentiment
elaine_grp["EpisodeNo"] = elaine_grp["EpisodeNo"] / elaine_grp["LineCount"]
elaine_grp["Season"] = elaine_grp["Season"] / elaine_grp["LineCount"]
elaine_grp["PosSent"] = elaine_grp["WtdPos"] / elaine_grp["WordCount"]
elaine_grp["NegSent"] = elaine_grp["WtdNeg"] / elaine_grp["WordCount"]
elaine_grp["CompSent"] = elaine_grp["WtdComp"] / elaine_grp["WordCount"]

# Drop column for WtdSentiments since we no longer need it
elaine_grp = elaine_grp.drop("WtdPos", 1)
elaine_grp = elaine_grp.drop("WtdNeg", 1)
elaine_grp = elaine_grp.drop("WtdComp", 1)

# Rename Sentiment, LineCount, and WordCount variables to have "Total" at the end of them
elaine_grp = elaine_grp.rename(index=str, columns = {"PosSent": "PosElaine",
                                                       "NegSent": "NegElaine",
                                                       "CompSent": "CompElaine",
                                                       "LineCount": "LineCountElaine",
                                                       "WordCount": "WordCountElaine"
                                                      })
elaine_grp.tail()

Unnamed: 0,SEID,EpisodeNo,Season,PosElaine,NegElaine,CompElaine,LineCountElaine,WordCountElaine
166,S09E17,17.0,9.0,0.138799,0.051517,0.096039,36,412
167,S09E18,18.0,9.0,0.094658,0.042037,0.074886,41,438
168,S09E19,19.0,9.0,0.070695,0.018077,0.099313,58,453
169,S09E20,20.0,9.0,0.109394,0.085975,0.068871,48,449
170,S09E23,23.0,9.0,0.126094,0.140492,0.065491,75,585


In [19]:
# Grouping by Episode -- Secondary characters
secondary_grp = secondary_df.groupby(by = "SEID", as_index=False).sum()

# Drop the EpisodeNo, Season, and Sentiment columns since we don't need them anymore

# Re-create columns for EpisodeNo, Season and Sentiment
secondary_grp["EpisodeNo"] = secondary_grp["EpisodeNo"] / secondary_grp["LineCount"]
secondary_grp["Season"] = secondary_grp["Season"] / secondary_grp["LineCount"]
secondary_grp["PosSent"] = secondary_grp["WtdPos"] / secondary_grp["WordCount"]
secondary_grp["NegSent"] = secondary_grp["WtdNeg"] / secondary_grp["WordCount"]
secondary_grp["CompSent"] = secondary_grp["WtdComp"] / secondary_grp["WordCount"]

# Drop column for WtdSentiments since we no longer need it
secondary_grp = secondary_grp.drop("WtdPos", 1)
secondary_grp = secondary_grp.drop("WtdNeg", 1)
secondary_grp = secondary_grp.drop("WtdComp", 1)

# Rename Sentiment, LineCount, and WordCount variables to have "Total" at the end of them
secondary_grp = secondary_grp.rename(index=str, columns = {"PosSent": "PosSecond",
                                                       "NegSent": "NegSecond",
                                                       "CompSent": "CompSecond",
                                                       "LineCount": "LineCountSecond",
                                                       "WordCount": "WordCountSecond"
                                                      })
secondary_grp.tail()

Unnamed: 0,SEID,EpisodeNo,Season,PosSecond,NegSecond,CompSecond,LineCountSecond,WordCountSecond
91,S09E13,13.0,9.0,0.2905,0.59765,0.041,2,34
92,S09E15,15.0,9.0,0.087198,0.112005,0.039247,41,373
93,S09E16,16.0,9.0,0.124392,-0.036034,0.123967,21,120
94,S09E17,17.0,9.0,0.123681,0.106206,0.074753,36,320
95,S09E23,23.0,9.0,0.13259,0.01455,0.109949,45,527


In [20]:
# Grouping by Episode -- Other characters
other_grp = other_df.groupby(by = "SEID", as_index=False).sum()

# Drop the EpisodeNo, Season, and Sentiment columns since we don't need them anymore

# Re-create columns for EpisodeNo, Season and Sentiment
other_grp["EpisodeNo"] = other_grp["EpisodeNo"] / other_grp["LineCount"]
other_grp["Season"] = other_grp["Season"] / other_grp["LineCount"]
other_grp["PosSent"] = other_grp["WtdPos"] / other_grp["WordCount"]
other_grp["NegSent"] = other_grp["WtdNeg"] / other_grp["WordCount"]
other_grp["CompSent"] = other_grp["WtdComp"] / other_grp["WordCount"]

# Drop column for WtdSentiments since we no longer need it
other_grp = other_grp.drop("WtdPos", 1)
other_grp = other_grp.drop("WtdNeg", 1)
other_grp = other_grp.drop("WtdComp", 1)

# Rename Sentiment, LineCount, and WordCount variables to have "Total" at the end of them
other_grp = other_grp.rename(index=str, columns = {"PosSent": "PosOther",
                                                       "NegSent": "NegOther",
                                                       "CompSent": "CompOther",
                                                       "LineCount": "LineCountOther",
                                                       "WordCount": "WordCountOther"
                                                      })
other_grp.tail()

Unnamed: 0,SEID,EpisodeNo,Season,PosOther,NegOther,CompOther,LineCountOther,WordCountOther
168,S09E17,17.0,9.0,0.079768,0.011877,0.067721,80,842
169,S09E18,18.0,9.0,0.169747,0.127525,0.064195,99,809
170,S09E19,19.0,9.0,0.073256,0.045311,0.052274,100,782
171,S09E20,20.0,9.0,0.106608,0.076088,0.090977,87,600
172,S09E23,23.0,9.0,0.104839,0.005773,0.083913,278,3154


In [21]:
##### Merge all grouped dataframes into one #####
# overall_grp
# jerry_grp
# george_grp
# kramer_grp
# elaine_grp
# secondary_grp
# other_grp

In [22]:
# Merging Overall and Jerry
merged_df = pd.merge(overall_grp, jerry_grp, on = "SEID", how = "left")

# Drop EpisodeNo_y and Season_y
merged_df = merged_df.drop("EpisodeNo_y", 1)
merged_df = merged_df.drop("Season_y", 1)

# Rename EpsodeNo_x and Season_x
merged_df = merged_df.rename(index=str, columns = {"EpisodeNo_x": "EpisodeNo", "Season_x": "Season"})

#print(merged_df.shape)
merged_df.head()

Unnamed: 0,SEID,EpisodeNo,Season,PosTotal,NegTotal,CompTotal,LineCountTotal,WordCountTotal,PosJerry,NegJerry,CompJerry,LineCountJerry,WordCountJerry
0,S01E01,1.0,1.0,0.107816,0.16043,0.063447,557,6582,0.11325,0.233425,0.063717,250,3902
1,S01E02,2.0,1.0,0.100301,-0.004003,0.068077,280,3268,0.079147,-0.13402,0.073063,110,1552
2,S01E03,3.0,1.0,0.095904,0.118192,0.069294,241,3376,0.100044,0.181343,0.070813,102,1698
3,S01E04,4.0,1.0,0.090593,-0.013447,0.067964,230,3277,0.114206,0.003125,0.064939,102,1730
4,S02E01,1.0,2.0,0.103584,0.078673,0.08396,245,3448,0.10655,0.146833,0.082965,107,1569


In [23]:
# Merging above with George
merged_df = pd.merge(merged_df, george_grp, on = "SEID", how = "left")

# Drop EpisodeNo_y and Season_y
merged_df = merged_df.drop("EpisodeNo_y", 1)
merged_df = merged_df.drop("Season_y", 1)

# Rename EpsodeNo_x and Season_x
merged_df = merged_df.rename(index=str, columns = {"EpisodeNo_x": "EpisodeNo", "Season_x": "Season"})

print(merged_df.shape)
print(merged_df.head())

(173, 18)
     SEID  EpisodeNo  Season  PosTotal  NegTotal  CompTotal  LineCountTotal  \
0  S01E01        1.0     1.0  0.107816  0.160430   0.063447             557   
1  S01E02        2.0     1.0  0.100301 -0.004003   0.068077             280   
2  S01E03        3.0     1.0  0.095904  0.118192   0.069294             241   
3  S01E04        4.0     1.0  0.090593 -0.013447   0.067964             230   
4  S02E01        1.0     2.0  0.103584  0.078673   0.083960             245   

   WordCountTotal  PosJerry  NegJerry  CompJerry  LineCountJerry  \
0            6582  0.113250  0.233425   0.063717             250   
1            3268  0.079147 -0.134020   0.073063             110   
2            3376  0.100044  0.181343   0.070813             102   
3            3277  0.114206  0.003125   0.064939             102   
4            3448  0.106550  0.146833   0.082965             107   

   WordCountJerry  PosGeorge  NegGeorge  CompGeorge  LineCountGeorge  \
0            3902   0.092752   0.0

In [24]:
# Merging above with Kramer
merged_df = pd.merge(merged_df, kramer_grp, on = "SEID", how = "left")

# Drop EpisodeNo_y and Season_y
merged_df = merged_df.drop("EpisodeNo_y", 1)
merged_df = merged_df.drop("Season_y", 1)

# Rename EpsodeNo_x and Season_x
merged_df = merged_df.rename(index=str, columns = {"EpisodeNo_x": "EpisodeNo", "Season_x": "Season"})

print(merged_df.shape)
print(merged_df.head())

(173, 23)
     SEID  EpisodeNo  Season  PosTotal  NegTotal  CompTotal  LineCountTotal  \
0  S01E01        1.0     1.0  0.107816  0.160430   0.063447             557   
1  S01E02        2.0     1.0  0.100301 -0.004003   0.068077             280   
2  S01E03        3.0     1.0  0.095904  0.118192   0.069294             241   
3  S01E04        4.0     1.0  0.090593 -0.013447   0.067964             230   
4  S02E01        1.0     2.0  0.103584  0.078673   0.083960             245   

   WordCountTotal  PosJerry  NegJerry       ...         PosGeorge  NegGeorge  \
0            6582  0.113250  0.233425       ...          0.092752   0.070334   
1            3268  0.079147 -0.134020       ...          0.118689  -0.013624   
2            3376  0.100044  0.181343       ...          0.061642   0.016521   
3            3277  0.114206  0.003125       ...          0.045892  -0.149243   
4            3448  0.106550  0.146833       ...          0.097092  -0.062400   

   CompGeorge  LineCountGeorge  Wo

In [25]:
# Merging above with Elaine
merged_df = pd.merge(merged_df, elaine_grp, on = "SEID", how = "left")

# Drop EpisodeNo_y and Season_y
merged_df = merged_df.drop("EpisodeNo_y", 1)
merged_df = merged_df.drop("Season_y", 1)

# Rename EpsodeNo_x and Season_x
merged_df = merged_df.rename(index=str, columns = {"EpisodeNo_x": "EpisodeNo", "Season_x": "Season"})

print(merged_df.shape)
print(merged_df.head())

(173, 28)
     SEID  EpisodeNo  Season  PosTotal  NegTotal  CompTotal  LineCountTotal  \
0  S01E01        1.0     1.0  0.107816  0.160430   0.063447             557   
1  S01E02        2.0     1.0  0.100301 -0.004003   0.068077             280   
2  S01E03        3.0     1.0  0.095904  0.118192   0.069294             241   
3  S01E04        4.0     1.0  0.090593 -0.013447   0.067964             230   
4  S02E01        1.0     2.0  0.103584  0.078673   0.083960             245   

   WordCountTotal  PosJerry  NegJerry       ...         PosKramer  NegKramer  \
0            6582  0.113250  0.233425       ...          0.039038  -0.154942   
1            3268  0.079147 -0.134020       ...          0.134622   0.234742   
2            3376  0.100044  0.181343       ...          0.118167   0.093931   
3            3277  0.114206  0.003125       ...          0.064161   0.107578   
4            3448  0.106550  0.146833       ...          0.151388   0.118010   

   CompKramer  LineCountKramer  Wo

In [26]:
# Merging above with Secondary
merged_df = pd.merge(merged_df, secondary_grp, on = "SEID", how = "left")

# Drop EpisodeNo_y and Season_y
merged_df = merged_df.drop("EpisodeNo_y", 1)
merged_df = merged_df.drop("Season_y", 1)

# Rename EpsodeNo_x and Season_x
merged_df = merged_df.rename(index=str, columns = {"EpisodeNo_x": "EpisodeNo", "Season_x": "Season"})

print(merged_df.shape)
print(merged_df.head())

(173, 33)
     SEID  EpisodeNo  Season  PosTotal  NegTotal  CompTotal  LineCountTotal  \
0  S01E01        1.0     1.0  0.107816  0.160430   0.063447             557   
1  S01E02        2.0     1.0  0.100301 -0.004003   0.068077             280   
2  S01E03        3.0     1.0  0.095904  0.118192   0.069294             241   
3  S01E04        4.0     1.0  0.090593 -0.013447   0.067964             230   
4  S02E01        1.0     2.0  0.103584  0.078673   0.083960             245   

   WordCountTotal  PosJerry  NegJerry       ...         PosElaine  NegElaine  \
0            6582  0.113250  0.233425       ...          0.095643  -0.012478   
1            3268  0.079147 -0.134020       ...          0.123047   0.162913   
2            3376  0.100044  0.181343       ...          0.076712  -0.126343   
3            3277  0.114206  0.003125       ...          0.071208   0.022277   
4            3448  0.106550  0.146833       ...          0.071866   0.066057   

   CompElaine  LineCountElaine  Wo

In [27]:
# Merging above with Other
merged_df = pd.merge(merged_df, other_grp, on = "SEID", how = "left")

# Drop EpisodeNo_y and Season_y
merged_df = merged_df.drop("EpisodeNo_y", 1)
merged_df = merged_df.drop("Season_y", 1)

# Rename EpsodeNo_x and Season_x
merged_df = merged_df.rename(index=str, columns = {"EpisodeNo_x": "EpisodeNo", "Season_x": "Season"})

print(merged_df.shape)
print(merged_df.head())

(173, 38)
     SEID  EpisodeNo  Season  PosTotal  NegTotal  CompTotal  LineCountTotal  \
0  S01E01        1.0     1.0  0.107816  0.160430   0.063447             557   
1  S01E02        2.0     1.0  0.100301 -0.004003   0.068077             280   
2  S01E03        3.0     1.0  0.095904  0.118192   0.069294             241   
3  S01E04        4.0     1.0  0.090593 -0.013447   0.067964             230   
4  S02E01        1.0     2.0  0.103584  0.078673   0.083960             245   

   WordCountTotal  PosJerry  NegJerry       ...        PosSecond  NegSecond  \
0            6582  0.113250  0.233425       ...         0.125156   0.107827   
1            3268  0.079147 -0.134020       ...              NaN        NaN   
2            3376  0.100044  0.181343       ...              NaN        NaN   
3            3277  0.114206  0.003125       ...              NaN        NaN   
4            3448  0.106550  0.146833       ...              NaN        NaN   

   CompSecond  LineCountSecond  WordCoun

In [28]:
# Export to CSV
merged_df.to_csv('../outputData/merged_scripts.csv')