In [296]:
! pip install pandas textblob matplotlib ipython



In [297]:
import os
import pandas as pd
import re
from textblob import TextBlob
import sqlite3
import matplotlib
matplotlib.use('Agg')  # Non-interactive backend for generating plots
print(matplotlib.get_backend())
# matplotlib.use('module://matplotlib_inline.backend_inline')
import matplotlib.pyplot as plt
from IPython.display import set_matplotlib_formats
import warnings

# Ask IPython to render inline figures in the high-resolution 'retina' format.
# The former `quality=100` argument was dropped: extra keyword arguments are
# relayed to the figure's print call, and `quality` is a JPEG-only option that
# has no meaning for this PNG-based format.
# NOTE(review): recent matplotlib releases appear to reject `quality`
# outright - confirm against the installed version.
set_matplotlib_formats('retina')

# Silence every warning for the rest of the session.
# NOTE(review): this also hides pandas chained-assignment warnings, so a
# silently ineffective in-place operation will not be reported.
warnings.filterwarnings('ignore')

agg


In [298]:
# Directory holding the transcript files. Each file name is expected to look
# like "<season>.<episode>.<extension>".
BASE_URL = 'script_dataset/'

# A new chunk starts at any whitespace character that is immediately followed
# by "<word>:". Compiled once here instead of once per file.
pattern = re.compile(r'\s(?=\w+(?=:))')

# Rows of [season, episode, character, speech] collected from every file.
master_array = []

for filename in os.listdir(BASE_URL):
    # Obtain the season and episode from the file name; skip entries that do
    # not follow the naming scheme instead of failing on them.
    split_name = filename.split('.')
    if len(split_name) < 2:
        continue
    season = split_name[0]
    episode = split_name[1]

    # The context manager closes the file handle even if reading fails.
    with open(os.path.join(BASE_URL, filename), 'r', encoding='utf-8') as f:
        script = f.read()

    # Split the transcript wherever the speaker pattern matched.
    result = pattern.split(script)

    for item in result:
        # Split on the FIRST colon only, so speech that itself contains a
        # colon is kept whole instead of being truncated.
        character, separator, speech = item.partition(':')
        if not separator:
            # No colon at all: this chunk is not a "<name>: <text>" pair.
            continue
        master_array.append([season, episode, character, speech])

In [299]:
# Turn the parsed [season, episode, character, speech] rows into a DataFrame.
df = pd.DataFrame(
    data=master_array,
    columns=['season', 'episode', 'char', 'line'],
)
df

Unnamed: 0,season,episode,char,line
0,6,16,by,Adam Chase\nStory
1,6,16,by,Zachary Rosenblatt\nTranscribed
2,6,16,by,Eric Aasen\n\n[Scene
3,6,16,Ross,"Hey, remember when I had a monkey?"
4,6,16,Chandler,Yeah.
...,...,...,...,...
61822,8,22,Monica,How did you know that?! (Runs to yell at Joey...
61823,8,22,Phoebe,Theyre at the coffeehouse.
61824,8,22,Monica,"You know everything!! Oh wait, double or noth..."
61825,8,22,Phoebe,We know its a girl! (Exits.)


In [300]:
# Rows whose parsed speaker is 'by' are credit lines (e.g. "by: Adam Chase"),
# not dialogue, so they are dropped together with any rows containing nulls.
df = df.loc[df['char'].ne('by')].dropna()
df


Unnamed: 0,season,episode,char,line
3,6,16,Ross,"Hey, remember when I had a monkey?"
4,6,16,Chandler,Yeah.
5,6,16,Ross,"Yeah, what, what was I thinking?"
6,6,16,Joey,"(hes just picked up their bill) Hey! So, wha..."
7,6,16,Chandler,Twenty percent is a pretty generous tip Joe.
...,...,...,...,...
61822,8,22,Monica,How did you know that?! (Runs to yell at Joey...
61823,8,22,Phoebe,Theyre at the coffeehouse.
61824,8,22,Monica,"You know everything!! Oh wait, double or noth..."
61825,8,22,Phoebe,We know its a girl! (Exits.)


In [301]:
# Look at a slice of the distinct speaker labels to spot parsing noise.
distinct_speakers = df['char'].unique()
distinct_speakers[20:30]

array(['Janitor', 'From', '2', 'Long', 'guy', 'like', 'Erica', 'Waiter',
       'Sarah', 'Girl'], dtype=object)

In [302]:
# Spelling / casing variants found in the transcripts, mapped to the canonical
# character name. Identity entries and the duplicated 'Pheebs' key were removed
# because they have no effect.
# NOTE(review): 'RUSS' is folded into 'Ross' as in the original mapping -
# confirm it is a misspelling and not a separate speaker.
char_aliases = {
    'CHANDLER': 'Chandler', 'Chandlers': 'Chandler', 'chandler': 'Chandler',
    'JOEY': 'Joey',
    'MONICA': 'Monica', 'MOnica': 'Monica', 'MNCA': 'Monica',
    'PHOEBE': 'Phoebe', 'Pheebs': 'Phoebe',
    'RACHEL': 'Rachel', 'RACH': 'Rachel', 'RAHCEL': 'Rachel', 'Racel': 'Rachel', 'Rache': 'Rachel',
    'RUSS': 'Ross', 'ROSS': 'Ross',
}

# Assign the result back instead of calling `.replace(..., inplace=True)` on
# `df['char']`: that form is chained in-place assignment, which acts on a
# temporary Series under pandas Copy-on-Write and leaves `df` unchanged.
df = df.assign(char=df['char'].replace(char_aliases))

In [303]:
# Keep only the rows spoken by one of these six characters.
char = ['Chandler', 'Joey', 'Monica', 'Phoebe', 'Rachel', 'Ross']
is_main_character = df['char'].isin(char)
df = df.loc[is_main_character]

In [304]:
# Display the frame (pandas truncates long frames in the rendered output).
df

Unnamed: 0,season,episode,char,line
3,6,16,Ross,"Hey, remember when I had a monkey?"
4,6,16,Chandler,Yeah.
5,6,16,Ross,"Yeah, what, what was I thinking?"
6,6,16,Joey,"(hes just picked up their bill) Hey! So, wha..."
7,6,16,Chandler,Twenty percent is a pretty generous tip Joe.
...,...,...,...,...
61822,8,22,Monica,How did you know that?! (Runs to yell at Joey...
61823,8,22,Phoebe,Theyre at the coffeehouse.
61824,8,22,Monica,"You know everything!! Oh wait, double or noth..."
61825,8,22,Phoebe,We know its a girl! (Exits.)


In [305]:
def _polarity(text):
    """Return the polarity component (the first field) of TextBlob's sentiment for `text`."""
    return TextBlob(text).sentiment.polarity


# Score every spoken line and store the result in a new 'sentiment' column.
df['sentiment'] = df['line'].apply(_polarity)

In [306]:
# Season and episode were parsed from file names as strings; convert both
# columns to integers, then print the frame summary.
for column in ('season', 'episode'):
    df[column] = df[column].map(int)
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 50925 entries, 3 to 61826
Data columns (total 5 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   season     50925 non-null  int64  
 1   episode    50925 non-null  int64  
 2   char       50925 non-null  object 
 3   line       50925 non-null  object 
 4   sentiment  50925 non-null  float64
dtypes: float64(1), int64(2), object(2)
memory usage: 2.3+ MB


In [307]:
# Open (or create) the SQLite database file and get a cursor for the queries
# issued later in the notebook.
conn = sqlite3.connect('friends_script.db')
cur = conn.cursor()

# Create the table only if it is missing. Column types now match the frame:
# season/episode are integers and sentiment is a float, so it is declared
# REAL instead of integer. The non-standard `number` type name was replaced
# with INTEGER.
cur.execute(
    'CREATE TABLE IF NOT EXISTS Friends('
    'season INTEGER, episode INTEGER, char TEXT, line TEXT, sentiment REAL)'
)
conn.commit()

In [308]:
# Write the frame to the 'Friends' table, replacing the table if it already
# exists; the DataFrame index is not stored.
df.to_sql(name='Friends', con=conn, if_exists='replace', index=False)

50925

In [309]:
# Six hex colours passed as `color=` to every bar chart in this notebook.
COLORS=['#40E0D0', '#4682B4', '#708090', '#90EE90', '#FFDEAD', '#E9967A']

## The Most Popular Friend

In [310]:
# Number of spoken lines per character, largest count first.
# Each row is a (char, spoken_lines) tuple.
most_lines = cur.execute(
    "SELECT char, COUNT(line) AS 'spoken_lines' FROM Friends GROUP BY char ORDER BY spoken_lines DESC"
).fetchall()

In [311]:

# Bar chart of total spoken lines per character, in the order returned by the query.
plt.rcParams['figure.figsize'] = (10, 6)
fig, ax = plt.subplots()

speaker_names = [name for name, _ in most_lines]
line_counts = [count for _, count in most_lines]
ax.bar(x=speaker_names, height=line_counts, color=COLORS)

ax.set_title('Number of Lines for the Entire Show')
ax.set_ylabel('No. of Lines')
fig.tight_layout()

# Saved with default options (no 'quality' argument).
fig.savefig('output.png')

## Number of lines per season

In [312]:
# Spoken-line count per (season, character) for the six main characters.
# An explicit ORDER BY is added because SQL does not guarantee row order
# without one, and the plotting code consumes these rows in arrival order.
cur.execute("""
SELECT char, season, count(line) AS total_lines FROM Friends 
WHERE char IN ('Rachel', 'Ross', 'Monica','Chandler','Joey', 'Phoebe')
GROUP BY season, char
ORDER BY season, char""")
# Each row is a (char, season, total_lines) tuple.
lines_per_season = cur.fetchall()

In [313]:
# Re-shape the (char, season, total_lines) rows into a nested mapping:
#     lines_dict[char][season] -> total_lines
lines_dict = {}
for char, season, total_lines in lines_per_season:
    # setdefault creates the inner per-season dict the first time a character is seen.
    lines_dict.setdefault(char, {})[season] = total_lines

In [314]:
# Set the figure size BEFORE creating the figure; rcParams changes made after
# plt.subplots() do not affect a figure that already exists.
plt.rcParams['figure.figsize'] = (10, 8)
fig, ax = plt.subplots()

# One line per character: season on the x axis, spoken-line count on the y axis.
for char, per_season in lines_dict.items():
    # Sort the seasons so each line runs left to right regardless of the order
    # in which rows were inserted, and pass plain lists rather than dict views.
    x1 = sorted(per_season)
    y1 = [per_season[season_number] for season_number in x1]
    ax.plot(x1, y1, label=char)

ax.set_xlabel('Season')
ax.set_ylabel('Number of Lines')
ax.set_title('Number of lines by season')
ax.legend(loc=1)
plt.show()

fig.savefig('output2.png')

### Most Spoken About


In [335]:
# One list per character. The first entry is the name used as the chart label;
# every entry (including the first) is searched for in the dialogue when
# counting mentions.
nicknames = [['Rachel', 'Rach'], 
             ['Ross', 'Ross-A-Tron', 'Professor Geller'], 
             ['Monica', 'Mon'],
             ['Chandler', 'Chan'],
             ['Joey', 'Joe'], 
             ['Phoebe', 'Phoebes', 'Pheebs']]

In [336]:
# Load every stored line of dialogue into a one-column DataFrame.
all_lines = pd.read_sql(
    sql="SELECT line FROM Friends",
    con=conn,
)

In [337]:
# [canonical name, number of mentions] for each character.
char_mention = []

for name_list in nicknames:    # loop for each character
    # Build ONE whole-word pattern per character. Counting each nickname as a
    # raw substring double-counted names that contain a nickname ('Rach' inside
    # 'Rachel', 'Joe' inside 'Joey', ...) and matched unrelated words ('Mon' in
    # 'Monday'). Longer alternatives go first so 'Ross-A-Tron' is consumed as a
    # single mention rather than as 'Ross'.
    alternatives = sorted(name_list, key=len, reverse=True)
    # re.escape keeps names containing regex metacharacters literal.
    names_pattern = '|'.join(re.escape(name) for name in alternatives)
    # Optional possessive suffix: the transcripts appear to drop apostrophes
    # (e.g. "Theyre"), so "Joey's" can show up as "Joeys".
    mention_pattern = r"\b(?:" + names_pattern + r")(?:'?s)?\b"

    mention_counter = int(all_lines['line'].str.count(mention_pattern).sum())
    char_mention.append([name_list[0], mention_counter])  # append the name and mention count

In [338]:
# Order the [name, count] pairs by mention count, most-mentioned first.
char_mention.sort(key=lambda pair: pair[1], reverse=True)

In [339]:
# Bar chart of how often each character is mentioned in the dialogue.
plt.rcParams['figure.figsize'] = (10, 6)
fig, ax = plt.subplots()

mentioned_names = [name for name, _ in char_mention]
mention_counts = [count for _, count in char_mention]
ax.bar(x=mentioned_names, height=mention_counts, color=COLORS)

ax.set_title('Number of mentions in the script')
ax.set_ylabel('No. of mentions')
fig.tight_layout()

fig.savefig('output3.png')

### Largest Vocabulary

In [320]:
# function to remove all non alphabetical characters keep spaces
def alphanumonly(text):
    """Return `text` with everything except ASCII letters and spaces removed.

    Despite the name, digits are stripped as well: only `a-z`, `A-Z` and the
    space character are kept.
    """
    # Arguments to re.sub are (pattern, replacement, input string).
    return re.sub('[^a-zA-Z ]', '', text)

In [321]:
# Fetch (char, line) pairs for the six main characters.
# Note: this rebinds `lines_per_season`, which now holds dialogue rows.
lines_per_season = cur.execute(
    "SELECT char, line FROM Friends WHERE char IN ('Rachel', 'Ross', 'Monica','Chandler','Joey', 'Phoebe')"
).fetchall()

In [322]:
# One set of distinct words per character. The key order is the bar order
# used by the chart that plots `char_vocal_length` / `vocab_lengths`.
vocab_by_char = {
    name: set()
    for name in ['Ross', 'Joey', 'Chandler', 'Rachel', 'Phoebe', 'Monica']
}

for speaker, spoken_text in lines_per_season:
    if speaker not in vocab_by_char:
        continue
    # alphanumonly keeps only ASCII letters and spaces. Two fixes over a plain
    # `split(' ')` of the stripped text:
    #   * `.split()` with no argument never yields the empty string, which
    #     was previously added as a "word" wherever two spaces met or a line
    #     had no letters;
    #   * `.lower()` makes "Hey" and "hey" count as one word.
    words = alphanumonly(spoken_text).lower().split()
    vocab_by_char[speaker].update(words)

# Keep the individual set names available for any code that refers to them.
monica_vocab = vocab_by_char['Monica']
chandler_vocab = vocab_by_char['Chandler']
ross_vocab = vocab_by_char['Ross']
phoebe_vocab = vocab_by_char['Phoebe']
rachel_vocab = vocab_by_char['Rachel']
joey_vocab = vocab_by_char['Joey']

# Parallel lists: character names and the size of each vocabulary.
char_vocal_length = list(vocab_by_char)
vocab_lengths = [len(vocab_by_char[name]) for name in char_vocal_length]

In [323]:
# Bar chart comparing the number of distinct words used by each character.
plt.rcParams['figure.figsize'] = (10, 6)
fig, ax = plt.subplots()

ax.bar(x=char_vocal_length, height=vocab_lengths, color=COLORS)

ax.set_title('Largest Vocabulary')
ax.set_ylabel('Number of unique words')
fig.tight_layout()

fig.savefig('output4.png')

In [324]:
# Every (season, episode, sentiment) row for Ross, in broadcast order.
ross_sentiments = cur.execute(
    "SELECT season, episode, sentiment FROM Friends WHERE char == 'Ross' ORDER BY season, episode"
).fetchall()

In [325]:
# Sum Ross's per-line sentiment scores into one total per episode.
# Keys look like 'S<season>E<episode>'; season 10 uses the prefix 'STEN'
# so that a later startswith('S1') filter does not also pick up season 10.
ross_sentiment = {}

for season_no, episode_no, score in ross_sentiments:
    if season_no == 10:
        key = f'STENE{episode_no}'
    else:
        key = f'S{season_no}E{episode_no}'

    if key not in ross_sentiment:
        ross_sentiment[key] = score
    else:
        ross_sentiment[key] += score

In [326]:
# Default size (width, height in inches) for figures created after this point.
plt.rcParams['figure.figsize'] = (14, 5)

In [327]:
# Ross's summed sentiment per episode for seasons 1 to 3, one line per season.
fig, ax = plt.subplots()

# Key prefixes used in `ross_sentiment`; 'STEN' stands for season 10.
seasons = ['S1', 'S2', 'S3', 'S4', 'S5', 'S6', 'S7', 'S8', 'S9', 'STEN']
for season in seasons[0:3]:
    # Scores of this season's episodes, in dictionary (insertion) order.
    y1 = [score for key, score in ross_sentiment.items() if key.startswith(season)]
    # The x position is the 1-based rank of the episode within the season's entries.
    x1 = list(range(1, len(y1) + 1))
    ax.plot(x1, y1, label=season)

ax.set_xlabel('Episode Number')
ax.set_ylabel('Sentiment Score')
ax.set_title('Ross Sentiment per Episode in Seasons 1 to Season 3')
ax.legend(loc=1)
plt.show()

fig.savefig('output5.png')

In [328]:
# Ross's summed sentiment per episode for seasons 4 to 6, one line per season.
fig, ax = plt.subplots()

for season in seasons[3:6]:
    # Scores of this season's episodes, in dictionary (insertion) order.
    y1 = [score for key, score in ross_sentiment.items() if key.startswith(season)]
    # The x position is the 1-based rank of the episode within the season's entries.
    x1 = list(range(1, len(y1) + 1))
    ax.plot(x1, y1, label=season)

ax.set_xlabel('Episode Number')
ax.set_ylabel('Sentiment Score')
ax.set_title('Ross Sentiment per Episode in Seasons 4 to Season 6')
ax.legend(loc=1)
plt.show()

fig.savefig('output6.png')

In [329]:
# Ross's summed sentiment per episode for seasons 7 to 10, one line per season.
fig, ax = plt.subplots()

for season in seasons[6:]:
    # Scores of this season's episodes, in dictionary (insertion) order.
    y1 = [score for key, score in ross_sentiment.items() if key.startswith(season)]
    # The x position is the 1-based rank of the episode within the season's entries.
    x1 = list(range(1, len(y1) + 1))

    # 'STEN' is only an internal key prefix; show season 10 as 'S10' in the legend.
    legend_label = 'S10' if season == 'STEN' else season
    ax.plot(x1, y1, label=legend_label)

ax.set_xlabel('Episode Number')
ax.set_ylabel('Sentiment Score')
ax.set_title('Ross Sentiment per Episode in Seasons 7 to Season 10')
ax.legend(loc=1)
plt.show()

fig.savefig('output7.png')