In [1]:
import pandas as pd
from convokit import Corpus, download
corpus = Corpus(filename=download("friends-corpus"))
from collections import Counter

  from pandas.core.computation.check import NUMEXPR_INSTALLED


Dataset already exists at C:\Users\ganes\.convokit\downloads\friends-corpus


In [2]:
# some stats

corpus.print_summary_stats()

Number of Speakers: 700
Number of Utterances: 67373
Number of Conversations: 3107


In [3]:
# getting the dataframe of every single utterance 
df_all_utterances = corpus.get_utterances_dataframe()


# will discard columns with too more than 10000 nulls
df_all_utterances.isnull().sum()

timestamp                    67373
text                             0
speaker                          0
reply_to                      3107
conversation_id                  0
meta.tokens                      0
meta.character_entities      40313
meta.emotion                 54767
meta.caption                 49330
meta.transcript_with_note    46706
meta.tokens_with_note        46706
vectors                          0
dtype: int64

In [4]:
friends6 = ['Chandler Bing',
 'Joey Tribbiani',
 'Monica Geller',
 'Phoebe Buffay',
 'Rachel Green',
 'Ross Geller']

def get_speaker(utter_id):
    return corpus.get_utterance(utter_id).speaker.id

def get_speakers(conv_id):
    speakers = corpus.get_conversation(conv_id).get_speaker_ids()
    speakers = [el for el in speakers if (el != 'TRANSCRIPT_NOTE'  and  el != '#ALL#')]
    speakers.sort()
    return speakers
def get_main_cast(conv_id):
    speakers = corpus.get_conversation(conv_id).get_speaker_ids()
    speakers = [el for el in speakers if (el != 'TRANSCRIPT_NOTE'  and  el != '#ALL#')]
    speakers = [el for el in speakers if el in friends6]
    speakers.sort()
    return speakers

def season_episode(txt):
    season = int(txt[1:3])
    episode = int(txt[5:7])
    return [season, episode]

season_episode("s01_e01_c01_u001")



# testing the function
corpus.has_speaker('Jill Goodacre')

True

In [5]:
# dropping useless columns

df_all_utterances['reply_to_speaker'] = df_all_utterances.reply_to.apply(lambda x: get_speaker(x) if x is not None else x)
df_all_utterances.drop(columns = ['timestamp', 'meta.tokens', 'meta.character_entities', 'meta.emotion',
       'meta.caption', 'meta.transcript_with_note', 'meta.tokens_with_note',
       'vectors'] , inplace = True)

# new column utt_id
df_all_utterances['utt_id'] = df_all_utterances.index


# changing order of the columns
new_order = ['conversation_id', 'utt_id', 'speaker', 'reply_to' , 'reply_to_speaker' , 'text' ]
df_all_utterances = df_all_utterances[new_order]


df_all_utterances['all_speakers_in_conversation'] = df_all_utterances['conversation_id'].apply(lambda x: get_speakers(x))

df_all_utterances['main_cast'] = df_all_utterances['conversation_id'].apply(lambda x: get_main_cast(x))

df_all_utterances[['season','episode']] =  df_all_utterances['conversation_id'].apply( lambda x:  pd.Series(season_episode(x)))

In [6]:
df_all_utterances.head()

Unnamed: 0_level_0,conversation_id,utt_id,speaker,reply_to,reply_to_speaker,text,all_speakers_in_conversation,main_cast,season,episode
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
s01_e01_c01_u001,s01_e01_c01_u001,s01_e01_c01_u001,Monica Geller,,,There's nothing to tell! He's just some guy I ...,"[Chandler Bing, Joey Tribbiani, Monica Geller,...","[Chandler Bing, Joey Tribbiani, Monica Geller,...",1,1
s01_e01_c01_u002,s01_e01_c01_u001,s01_e01_c01_u002,Joey Tribbiani,s01_e01_c01_u001,Monica Geller,"C'mon, you're going out with the guy! There's ...","[Chandler Bing, Joey Tribbiani, Monica Geller,...","[Chandler Bing, Joey Tribbiani, Monica Geller,...",1,1
s01_e01_c01_u003,s01_e01_c01_u001,s01_e01_c01_u003,Chandler Bing,s01_e01_c01_u002,Joey Tribbiani,"All right Joey, be nice. So does he have a hum...","[Chandler Bing, Joey Tribbiani, Monica Geller,...","[Chandler Bing, Joey Tribbiani, Monica Geller,...",1,1
s01_e01_c01_u004,s01_e01_c01_u001,s01_e01_c01_u004,Phoebe Buffay,s01_e01_c01_u003,Chandler Bing,"Wait, does he eat chalk?","[Chandler Bing, Joey Tribbiani, Monica Geller,...","[Chandler Bing, Joey Tribbiani, Monica Geller,...",1,1
s01_e01_c01_u005,s01_e01_c01_u001,s01_e01_c01_u005,TRANSCRIPT_NOTE,s01_e01_c01_u004,Phoebe Buffay,,"[Chandler Bing, Joey Tribbiani, Monica Geller,...","[Chandler Bing, Joey Tribbiani, Monica Geller,...",1,1


In [7]:
df_all_utterances.to_excel("FRIENDS_dialogues_data.xlsx")

In [8]:
list_of_convo_ids = df_all_utterances['conversation_id'].unique()  
list_of_convo_ids

array(['s01_e01_c01_u001', 's01_e01_c02_u001', 's01_e01_c03_u001', ...,
       's10_e18_c09_u000', 's10_e18_c10_u000', 's10_e18_c11_u000'],
      dtype=object)

In [9]:
list_scenes_cast = []

for convo_id in list_of_convo_ids:
    speakers = get_speakers(convo_id)
    speakers = [el for el in speakers if (el != 'TRANSCRIPT_NOTE'  and  el != '#ALL#')]
    speakers.sort()
    speakers2 = [el for el in speakers if el in friends6]
    
    list_scenes_cast.append( [ speakers, speakers2])
    
    
    

    
    
    
    

In [10]:
list_scenes_cast

[[['Chandler Bing',
   'Joey Tribbiani',
   'Monica Geller',
   'Phoebe Buffay',
   'Rachel Green',
   'Ross Geller',
   'Waitress'],
  ['Chandler Bing',
   'Joey Tribbiani',
   'Monica Geller',
   'Phoebe Buffay',
   'Rachel Green',
   'Ross Geller']],
 [['Chandler Bing',
   'Joey Tribbiani',
   'Monica Geller',
   'Paul the Wine Guy',
   'Phoebe Buffay',
   'Rachel Green',
   'Ross Geller'],
  ['Chandler Bing',
   'Joey Tribbiani',
   'Monica Geller',
   'Phoebe Buffay',
   'Rachel Green',
   'Ross Geller']],
 [['Phoebe Buffay'], ['Phoebe Buffay']],
 [['Chandler Bing', 'Joey Tribbiani', 'Ross Geller'],
  ['Chandler Bing', 'Joey Tribbiani', 'Ross Geller']],
 [['Monica Geller', 'Paul the Wine Guy'], ['Monica Geller']],
 [['Rachel Green'], ['Rachel Green']],
 [['Chandler Bing', 'Joey Tribbiani', 'Ross Geller'],
  ['Chandler Bing', 'Joey Tribbiani', 'Ross Geller']],
 [['Monica Geller', 'Paul the Wine Guy'], ['Monica Geller']],
 [['Priest On Tv', 'Rachel Green'], ['Rachel Green']],
 [['Ch

In [11]:
df_cast = pd.DataFrame(list_scenes_cast, columns = ['all_cast', 'main_cast'])
df_cast['count_cast'] = df_cast['main_cast'].apply(lambda x : len(x))
df_cast.head()

Unnamed: 0,all_cast,main_cast,count_cast
0,"[Chandler Bing, Joey Tribbiani, Monica Geller,...","[Chandler Bing, Joey Tribbiani, Monica Geller,...",6
1,"[Chandler Bing, Joey Tribbiani, Monica Geller,...","[Chandler Bing, Joey Tribbiani, Monica Geller,...",6
2,[Phoebe Buffay],[Phoebe Buffay],1
3,"[Chandler Bing, Joey Tribbiani, Ross Geller]","[Chandler Bing, Joey Tribbiani, Ross Geller]",3
4,"[Monica Geller, Paul the Wine Guy]",[Monica Geller],1


In [12]:
df_cast.to_excel("cast_scene.xlsx")

In [20]:
main_cast_duos =   df_cast[df_cast['count_cast'] == 2]['main_cast'].value_counts().reset_index()
main_cast_duos.to_excel("main_cast_duos.xlsx")

In [21]:
df_cast[df_cast['count_cast'] == 1]['main_cast'].value_counts().reset_index()

Unnamed: 0,main_cast,count
0,[Ross Geller],140
1,[Joey Tribbiani],134
2,[Phoebe Buffay],128
3,[Rachel Green],102
4,[Chandler Bing],91
5,[Monica Geller],62
