In [1]:
# Import dependencies
import pandas as pd
import numpy as np

In [2]:
# Read in the CSV files: per-episode sentiment scores and episode metadata
merged_df = pd.read_csv('../outputData/merged_scripts.csv')
episode_df = pd.read_csv('../rawData/episode_info.csv')

# Drop the "Unnamed: 0" column from both frames (presumably a saved index
# column from an earlier to_csv call -- confirm against the source files).
# `columns=` is used because pandas 2.0 removed the positional `axis`
# argument of DataFrame.drop, so drop("Unnamed: 0", 1) raises a TypeError.
merged_df = merged_df.drop(columns="Unnamed: 0")
episode_df = episode_df.drop(columns="Unnamed: 0")

# Drop S01E00, since we don't have any sentiment for that.
# This removes the first row by position, so it assumes S01E00 is row 0 of
# episode_info.csv; the remaining rows keep their original labels (1, 2, ...).
episode_df = episode_df.iloc[1:]

In [3]:
# Preview the first five rows of the sentiment data
merged_df.head(n=5)

Unnamed: 0,SEID,EpisodeNo,Season,PosTotal,NegTotal,CompTotal,LineCountTotal,WordCountTotal,PosJerry,NegJerry,...,PosSecond,NegSecond,CompSecond,LineCountSecond,WordCountSecond,PosOther,NegOther,CompOther,LineCountOther,WordCountOther
0,S01E01,1.0,1.0,0.107816,0.16043,0.063447,557,6582,0.11325,0.233425,...,0.125156,0.107827,0.068993,59.0,449.0,0.100364,0.049903,0.068621,90,829
1,S01E02,2.0,1.0,0.100301,-0.004003,0.068077,280,3268,0.079147,-0.13402,...,,,,,,0.099803,0.190117,0.057431,27,274
2,S01E03,3.0,1.0,0.095904,0.118192,0.069294,241,3376,0.100044,0.181343,...,,,,,,0.119932,0.154017,0.041675,45,530
3,S01E04,4.0,1.0,0.090593,-0.013447,0.067964,230,3277,0.114206,0.003125,...,,,,,,0.098036,0.053918,0.052881,37,278
4,S02E01,1.0,2.0,0.103584,0.078673,0.08396,245,3448,0.10655,0.146833,...,,,,,,0.116604,0.131408,0.05233,23,273


In [4]:
# Preview the first five rows of the episode metadata
episode_df.head(n=5)

Unnamed: 0,Season,EpisodeNo,Title,AirDate,Writers,Director,SEID,IMDB Rating
1,1,1,The Stakeout,31-May-90,"Larry David, Jerry Seinfeld",Tom Cherones,S01E01,7.8
2,1,2,The Robbery,7-Jun-90,Matt Goldman,Tom Cherones,S01E02,7.7
3,1,3,Male Unbonding,14-Jun-90,"Larry David, Jerry Seinfeld",Tom Cherones,S01E03,7.5
4,1,4,The Stock Tip,21-Jun-90,"Larry David, Jerry Seinfeld",Tom Cherones,S01E04,7.7
5,2,1,The Ex-Girlfriend,16-Jan-91,"Larry David, Jerry Seinfeld",Tom Cherones,S02E01,7.8


In [5]:
### Merge both data frames on SEID
# Left join: every row of episode_df is kept and the sentiment columns from
# merged_df are attached where the SEID matches. Columns that exist in both
# frames (Season, EpisodeNo) come out with _x (episode_df) and _y (merged_df)
# suffixes.
seinfeld_df = episode_df.merge(merged_df, how="left", on="SEID")
seinfeld_df.head()

Unnamed: 0,Season_x,EpisodeNo_x,Title,AirDate,Writers,Director,SEID,IMDB Rating,EpisodeNo_y,Season_y,...,PosSecond,NegSecond,CompSecond,LineCountSecond,WordCountSecond,PosOther,NegOther,CompOther,LineCountOther,WordCountOther
0,1,1,The Stakeout,31-May-90,"Larry David, Jerry Seinfeld",Tom Cherones,S01E01,7.8,1.0,1.0,...,0.125156,0.107827,0.068993,59.0,449.0,0.100364,0.049903,0.068621,90,829
1,1,2,The Robbery,7-Jun-90,Matt Goldman,Tom Cherones,S01E02,7.7,2.0,1.0,...,,,,,,0.099803,0.190117,0.057431,27,274
2,1,3,Male Unbonding,14-Jun-90,"Larry David, Jerry Seinfeld",Tom Cherones,S01E03,7.5,3.0,1.0,...,,,,,,0.119932,0.154017,0.041675,45,530
3,1,4,The Stock Tip,21-Jun-90,"Larry David, Jerry Seinfeld",Tom Cherones,S01E04,7.7,4.0,1.0,...,,,,,,0.098036,0.053918,0.052881,37,278
4,2,1,The Ex-Girlfriend,16-Jan-91,"Larry David, Jerry Seinfeld",Tom Cherones,S02E01,7.8,1.0,2.0,...,,,,,,0.116604,0.131408,0.05233,23,273


In [6]:
# Drop EpisodeNo_y and Season_y (the copies that came from merged_df).
# `columns=` is used because pandas 2.0 removed the positional `axis`
# argument of DataFrame.drop, so drop("EpisodeNo_y", 1) raises a TypeError.
seinfeld_df = seinfeld_df.drop(columns=["EpisodeNo_y", "Season_y"])

# Rename EpisodeNo_x and Season_x back to their un-suffixed names.
# NOTE: index=str also maps every row label through str(); it is kept so the
# resulting index is the same as before this cell was revised.
seinfeld_df = seinfeld_df.rename(index=str, columns={"EpisodeNo_x": "EpisodeNo", "Season_x": "Season"})
seinfeld_df.head()

Unnamed: 0,Season,EpisodeNo,Title,AirDate,Writers,Director,SEID,IMDB Rating,PosTotal,NegTotal,...,PosSecond,NegSecond,CompSecond,LineCountSecond,WordCountSecond,PosOther,NegOther,CompOther,LineCountOther,WordCountOther
0,1,1,The Stakeout,31-May-90,"Larry David, Jerry Seinfeld",Tom Cherones,S01E01,7.8,0.107816,0.16043,...,0.125156,0.107827,0.068993,59.0,449.0,0.100364,0.049903,0.068621,90,829
1,1,2,The Robbery,7-Jun-90,Matt Goldman,Tom Cherones,S01E02,7.7,0.100301,-0.004003,...,,,,,,0.099803,0.190117,0.057431,27,274
2,1,3,Male Unbonding,14-Jun-90,"Larry David, Jerry Seinfeld",Tom Cherones,S01E03,7.5,0.095904,0.118192,...,,,,,,0.119932,0.154017,0.041675,45,530
3,1,4,The Stock Tip,21-Jun-90,"Larry David, Jerry Seinfeld",Tom Cherones,S01E04,7.7,0.090593,-0.013447,...,,,,,,0.098036,0.053918,0.052881,37,278
4,2,1,The Ex-Girlfriend,16-Jan-91,"Larry David, Jerry Seinfeld",Tom Cherones,S02E01,7.8,0.103584,0.078673,...,,,,,,0.116604,0.131408,0.05233,23,273


In [7]:
# Give the "IMDB Rating" column the space-free name "IMDBrating".
# index=str additionally maps the row labels through str(), as before.
rating_column_names = {"IMDB Rating": "IMDBrating"}
seinfeld_df = seinfeld_df.rename(index=str, columns=rating_column_names)

In [8]:
#### Add dummy variables for two-part episodes
# S01E01
# S05E18
# S09E23

In [9]:
# TwoParter is 1 for the three two-part episodes and 0 for every other row
two_part_ids = ["S01E01", "S05E18", "S09E23"]
seinfeld_df["TwoParter"] = seinfeld_df["SEID"].isin(two_part_ids).astype(int)

seinfeld_df.head()

Unnamed: 0,Season,EpisodeNo,Title,AirDate,Writers,Director,SEID,IMDBrating,PosTotal,NegTotal,...,NegSecond,CompSecond,LineCountSecond,WordCountSecond,PosOther,NegOther,CompOther,LineCountOther,WordCountOther,TwoParter
0,1,1,The Stakeout,31-May-90,"Larry David, Jerry Seinfeld",Tom Cherones,S01E01,7.8,0.107816,0.16043,...,0.107827,0.068993,59.0,449.0,0.100364,0.049903,0.068621,90,829,1
1,1,2,The Robbery,7-Jun-90,Matt Goldman,Tom Cherones,S01E02,7.7,0.100301,-0.004003,...,,,,,0.099803,0.190117,0.057431,27,274,0
2,1,3,Male Unbonding,14-Jun-90,"Larry David, Jerry Seinfeld",Tom Cherones,S01E03,7.5,0.095904,0.118192,...,,,,,0.119932,0.154017,0.041675,45,530,0
3,1,4,The Stock Tip,21-Jun-90,"Larry David, Jerry Seinfeld",Tom Cherones,S01E04,7.7,0.090593,-0.013447,...,,,,,0.098036,0.053918,0.052881,37,278,0
4,2,1,The Ex-Girlfriend,16-Jan-91,"Larry David, Jerry Seinfeld",Tom Cherones,S02E01,7.8,0.103584,0.078673,...,,,,,0.116604,0.131408,0.05233,23,273,0


In [10]:
# Add separate dummy variables for the pilot and the finale:
# Pilot is 1 only for S01E01, Finale is 1 only for S09E23, otherwise 0.
seinfeld_df["Pilot"] = (seinfeld_df["SEID"] == "S01E01").astype(int)
seinfeld_df["Finale"] = (seinfeld_df["SEID"] == "S09E23").astype(int)

seinfeld_df.tail()

Unnamed: 0,Season,EpisodeNo,Title,AirDate,Writers,Director,SEID,IMDBrating,PosTotal,NegTotal,...,LineCountSecond,WordCountSecond,PosOther,NegOther,CompOther,LineCountOther,WordCountOther,TwoParter,Pilot,Finale
168,9,17,The Bookstore,9-Apr-98,Spike Feresten,Andy Ackerman,S09E17,8.5,0.097596,0.047979,...,36.0,320.0,0.079768,0.011877,0.067721,80,842,0,0,0
169,9,18,The Frogger,23-Apr-98,"Gregg Kavet, Andy Robin",Andy Ackerman,S09E18,9.0,0.129953,0.08521,...,,,0.169747,0.127525,0.064195,99,809,0,0,0
170,9,19,The Maid,30-Apr-98,"Alec Berg, David Mandel, Jeff Schaffer",Andy Ackerman,S09E19,8.3,0.095803,0.054141,...,,,0.073256,0.045311,0.052274,100,782,0,0,0
171,9,20,The Puerto Rican Day,7-May-98,"Alec Berg, Jennifer Crittenden, Spike Feresten...",Andy Ackerman,S09E20,8.2,0.104708,0.074494,...,,,0.106608,0.076088,0.090977,87,600,0,0,0
172,9,23,The Finale,14-May-98,Larry David,Andy Ackerman,S09E23,7.7,0.11523,0.034926,...,45.0,527.0,0.104839,0.005773,0.083913,278,3154,1,0,1


In [11]:
# Export the combined episode + sentiment table to CSV.
# The row index is written too (to_csv default), so reading this file back
# with read_csv will show it as an extra unnamed first column.
output_path = '../rawData/seinfeld.csv'
seinfeld_df.to_csv(output_path)