In [1]:
import pandas as pd
from itertools import combinations
from collections import defaultdict
import nltk
from nltk.corpus import stopwords
from collections import Counter
from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
# Tallies that default to zero for unseen keys: pair counts and per-character counts.
co_occurrence, character_counts = defaultdict(int), defaultdict(int)

In [3]:
# Load Friends.csv from the working directory; later cells read its
# Speaker, Text, Season and Episode columns.
df = pd.read_csv('Friends.csv')

In [4]:
# Keep only the rows that have a speaker label.
has_speaker = df['Speaker'].notna()
df = df[has_speaker]

In [5]:
# Normalise the casing of speaker labels (e.g. "ROSS" and "Ross" become one label).
df = df.assign(Speaker=df['Speaker'].str.title())

In [6]:
# Distinct speaker labels after title-casing.
# NOTE(review): not referenced again in the cells visible here.
unique_speakers = df['Speaker'].unique()

In [7]:
# Rank speaker labels by how many rows they have and keep the first 30.
speaker_line_counts = df['Speaker'].value_counts()
top_30_characters = speaker_line_counts.iloc[:30]

print(top_30_characters)

Ross              8869
Rachel            8774
Chandler          8058
Joey              8047
Monica            7965
Phoebe            7152
[Scene            2833
Mike               355
All                332
Rach               325
Mnca               259
Richard            254
Chan               233
Janice             208
Mr. Geller         204
Phoe               195
Carol              192
Charlie            190
Transcribed By     171
Mrs. Geller        167
Emily              167
Tag                146
Frank              133
Paul               130
Gunther            127
Written By         126
David              120
Amy                116
Mona               111
Woman              105
Name: Speaker, dtype: int64


In [8]:
# Speaker labels from the ranking above that are dropped before analysis.
entries_to_remove = [
    "[Scene",
    "All",
    "Transcribed By",
    "Written By",
    "Woman",
]

In [9]:
# Drop rows whose speaker label is in entries_to_remove.
# .copy() gives an independent frame, so the column assignments in later
# cells do not write to a slice of the previous frame (SettingWithCopyWarning).
df = df[~df['Speaker'].isin(entries_to_remove)].copy()

In [10]:
# Whole-word regex patterns for abbreviated speaker labels, paired with the
# full name each one is rewritten to.
replacements = dict([
    (r'\bMnca\b', 'Monica'),
    (r'\bRach\b', 'Rachel'),
    (r'\bPhoe\b', 'Phoebe'),
    (r'\bChan\b', 'Chandler'),
])

# Passing the mapping through `regex=` treats every key as a pattern.
df['Speaker'] = df['Speaker'].replace(regex=replacements)

In [11]:
# Re-rank after removing the unwanted labels and merging abbreviations.
top_30_characters = df['Speaker'].value_counts().iloc[:30]

print(top_30_characters)

Rachel         9099
Ross           8869
Chandler       8291
Monica         8224
Joey           8047
Phoebe         7347
Mike            355
Richard         254
Janice          208
Mr. Geller      204
Carol           192
Charlie         190
Emily           167
Mrs. Geller     167
Tag             146
Frank           133
Paul            130
Gunther         127
David           120
Amy             116
Mona            111
Pete            103
Susan           102
Joshua           98
Gary             96
Elizabeth        94
Janine           92
Kathy            91
Jill             83
Ben              73
Name: Speaker, dtype: int64


In [12]:
# Names only (the index of the ranking), as a plain Python list.
top_30_characters_list = list(top_30_characters.index)

In [13]:
# Restrict the data to lines spoken by the 30 most frequent characters.
# .copy() detaches the result from the previous frame so the new columns
# added in later cells (Season/Episode conversion, WordCount, ProcessedWords, ...)
# are not written to a slice (SettingWithCopyWarning).
df = df[df['Speaker'].isin(top_30_characters_list)].copy()

In [14]:
# Sanity check: list the speaker labels that remain after filtering.
print(df['Speaker'].unique())

['Monica' 'Joey' 'Chandler' 'Phoebe' 'Ross' 'Rachel' 'Paul' 'Carol'
 'Mrs. Geller' 'Mr. Geller' 'Susan' 'Jill' 'David' 'Janice' 'Gunther'
 'Richard' 'Ben' 'Frank' 'Mike' 'Pete' 'Kathy' 'Joshua' 'Emily'
 'Elizabeth' 'Gary' 'Janine' 'Tag' 'Mona' 'Amy' 'Charlie']


In [15]:
# Start from empty tallies before counting over the filtered data.
character_counts = defaultdict(int)   # character -> number of episodes they speak in
co_occurrence = defaultdict(int)      # frozenset({a, b}) -> number of shared episodes

In [16]:
# For each (Season, Episode) group, collect the distinct speakers once, then
# bump each speaker's episode count and the count of every unordered pair.
speakers_per_episode = df.groupby(['Season', 'Episode'])['Speaker'].unique()

for cast in speakers_per_episode:
    for name in cast:
        character_counts[name] += 1
    for pair in combinations(cast, 2):
        # frozenset makes (a, b) and (b, a) the same key
        co_occurrence[frozenset(pair)] += 1

In [17]:
# Nodes: one per character, weighted by the number of episodes they speak in.
nodes = [{"Name": character, "Weight": count} for character, count in character_counts.items()]

# Position of each character in `nodes`, computed once instead of rebuilding
# and scanning the key list twice for every edge.
node_index = {character: position for position, character in enumerate(character_counts)}

# Edges: one per character pair that shares at least one episode.
edges = []
for characters, count in co_occurrence.items():
    # A frozenset has no defined iteration order; sorting makes the
    # source/target assignment identical on every run.
    char_a, char_b = sorted(characters)
    edges.append({
        "source": char_a,
        "target": char_b,
        "weight": count,
        "sourceIndex": node_index[char_a],
        "targetIndex": node_index[char_b]
    })

In [18]:
# Bundle the graph pieces under the "nodes" and "edges" keys for export.
final_dataset = dict(nodes=nodes, edges=edges)

In [19]:
import json

# Serialise the graph with 4-space indentation, then write it out in one call.
serialised_graph = json.dumps(final_dataset, indent=4)
with open('friends-cooccur.json', 'w') as f:
    f.write(serialised_graph)

In [20]:
# Reduce the Season / Episode labels to their first run of digits and store
# them as integers.
#  - expand=False makes str.extract return a Series instead of a
#    one-column DataFrame.
#  - astype(str) first keeps the cell re-runnable: once the columns are
#    already integers, `.str` would otherwise raise.
for column in ('Season', 'Episode'):
    df[column] = df[column].astype(str).str.extract(r'(\d+)', expand=False).astype(int)

In [21]:
# Display the Text column for a quick look at the dialogue lines.
df["Text"]

3         There's nothing to tell! He's just some guy I...
4         C'mon, you're going out with the guy! There's...
5          So does he have a hump? A hump and a hairpiece?
6                                 Wait, does he eat chalk?
8         Just, 'cause, I don't want her to go through ...
                               ...                        
69966           Yeah! I'm going to Paris. Thank you, Ross!
69967                           Yeah, yeah, oh! (They hug)
69968                                Oh! Oh, I'm so happy.
69969     Then I'm happy too. (They're still hugging - ...
69972     Thank you all for coming. We're here today to...
Name: Text, Length: 53329, dtype: object

In [22]:
def _count_tokens(text):
    """Return the number of whitespace-separated tokens in str(text)."""
    return len(str(text).split())


# Per-line word count.
df['WordCount'] = df['Text'].map(_count_tokens)

In [23]:
# Total words per (Speaker, Season), with the group keys kept as ordinary columns.
word_count_by_season = df.groupby(['Speaker', 'Season'], as_index=False)['WordCount'].sum()

In [24]:
# Inspect the per-speaker, per-season totals.
print(word_count_by_season)

    Speaker  Season  WordCount
0       Amy       9        720
1       Amy      10        721
2       Ben       2          5
3       Ben       3          8
4       Ben       5          9
..      ...     ...        ...
150   Susan       3         35
151   Susan       4         70
152   Susan       6         36
153     Tag       7       1157
154     Tag       8        110

[155 rows x 3 columns]


In [25]:
# Distinct speakers and seasons, in order of first appearance.
unique_characters = pd.unique(df['Speaker'])
unique_seasons = pd.unique(df['Season'])

In [26]:
# Show the distinct speakers that feed the speaker/season grid.
print(unique_characters)

['Monica' 'Joey' 'Chandler' 'Phoebe' 'Ross' 'Rachel' 'Paul' 'Carol'
 'Mrs. Geller' 'Mr. Geller' 'Susan' 'Jill' 'David' 'Janice' 'Gunther'
 'Richard' 'Ben' 'Frank' 'Mike' 'Pete' 'Kathy' 'Joshua' 'Emily'
 'Elizabeth' 'Gary' 'Janine' 'Tag' 'Mona' 'Amy' 'Charlie']


In [27]:
import itertools

# Every (speaker, season) pairing, including seasons a speaker never appears in.
speaker_season_pairs = list(itertools.product(unique_characters, unique_seasons))
all_combinations = pd.DataFrame(speaker_season_pairs, columns=['Speaker', 'Season'])

In [28]:
# Total words per (Speaker, Season).
word_count_by_season = df.groupby(['Speaker', 'Season'])['WordCount'].sum().reset_index()

# Left-join onto the full grid so absent pairings are kept, fill the gaps the
# join leaves with 0, and restore an integer dtype for the counts.
merged_counts = pd.merge(all_combinations, word_count_by_season, how='left', on=['Speaker', 'Season'])
complete_data = merged_counts.fillna(0).astype({'WordCount': int})

In [29]:
# Display the completed speaker/season word-count table.
complete_data

Unnamed: 0,Speaker,Season,WordCount
0,Monica,1,8004
1,Monica,2,8207
2,Monica,3,8669
3,Monica,4,8217
4,Monica,5,9806
...,...,...,...
295,Charlie,6,0
296,Charlie,7,0
297,Charlie,8,0
298,Charlie,9,1428


In [30]:
# Export the table without the row index column.
complete_data.to_csv('word-count-per-season.csv', index=False)

In [31]:
# Make sure the NLTK stop-word corpus is available, then keep the English
# list as a set for membership tests in the filtering steps.
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

[nltk_data] Downloading package stopwords to /Users/emir/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [32]:
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet, stopwords
# word_tokenize (used by the text-processing cells) needs the Punkt tokenizer
# data; without it a fresh environment fails with LookupError.
# NOTE(review): newer NLTK releases look for the '*_tab' / '*_eng' resource
# names instead - requesting both spellings is harmless, since download()
# just reports an id it cannot fetch.
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')
nltk.download('averaged_perceptron_tagger_eng')
# Kept last so the cell's displayed result is unchanged.
nltk.download('stopwords')

[nltk_data] Downloading package wordnet to /Users/emir/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/emir/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package stopwords to /Users/emir/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

### Most Common Words

In [33]:
from nltk.tokenize import word_tokenize

In [34]:
# Single WordNet lemmatizer instance, reused by the text-processing cells.
lemmatizer = WordNetLemmatizer()

In [35]:
def preprocess_text(text):
    """Tokenise a line of dialogue into lemmatised content words.

    The text is lower-cased and tokenised; tokens that are not purely
    alphabetic or that are in ``stop_words`` are dropped, and the rest are
    lemmatised with the shared ``lemmatizer``.

    Parameters
    ----------
    text : str
        One line of dialogue. Anything that is not a string (for example the
        NaN pandas uses for a missing cell) is treated as having no words.

    Returns
    -------
    list of str
        The lemmatised tokens, in their original order.
    """
    # Missing cells arrive as float NaN / None; calling .lower() on them
    # would raise, so treat them as empty lines instead.
    if not isinstance(text, str):
        return []
    words = word_tokenize(text.lower())
    filtered_words = [lemmatizer.lemmatize(word) for word in words if word.isalpha() and word not in stop_words]
    return filtered_words

In [36]:
# Lemmatised content words for every line of dialogue.
df['ProcessedWords'] = df['Text'].map(preprocess_text)

In [37]:
from collections import Counter


def _count_words(word_lists):
    """Tally every word across an iterable of word lists."""
    tally = Counter()
    for words in word_lists:
        tally.update(words)
    return tally


# One Counter per speaker. Updating a Counter line by line avoids the
# repeated list concatenation that summing the per-line lists performs,
# while words are still seen in the same order (so ties in most_common()
# resolve the same way).
_speaker_counters = {
    speaker: _count_words(word_lists)
    for speaker, word_lists in df.groupby('Speaker')['ProcessedWords']
}

# Built explicitly as an object Series so the Counters are stored as values
# (a groupby.apply returning dict-likes would be expanded into rows).
word_counts_by_character = pd.Series(
    list(_speaker_counters.values()),
    index=pd.Index(list(_speaker_counters), name='Speaker'),
    dtype=object,
    name='ProcessedWords',
)

In [38]:
top_n = 10
data = []

# One record per speaker: Speaker, then Word_1..Word_k, then Count_1..Count_k
# for that speaker's k (at most top_n) most frequent words.
for speaker, counter in word_counts_by_character.items():
    ranked = counter.most_common(top_n)
    record = {"Speaker": speaker}
    for rank, (word, _) in enumerate(ranked, start=1):
        record[f"Word_{rank}"] = word
    for rank, (_, count) in enumerate(ranked, start=1):
        record[f"Count_{rank}"] = count
    data.append(record)

In [39]:
# Wide table: one row per speaker with Word_i / Count_i columns.
words_df = pd.DataFrame(data)

In [40]:
# Reshape the wide table into one record per character:
# {'character': ..., 'words': [{'word': ..., 'count': ...}, ...]}
most_common_words_data = []

for _, row in words_df.iterrows():
    speaker = row['Speaker']
    words_list = []
    for rank in range(1, top_n + 1):
        # .get tolerates Word_i / Count_i columns that do not exist at all.
        word = row.get(f'Word_{rank}')
        count = row.get(f'Count_{rank}')
        # A speaker with fewer than top_n distinct words has NaN in the
        # trailing slots; skip them so the JSON never contains NaN entries.
        if pd.isna(word) or pd.isna(count):
            continue
        # NaN padding turns the count columns into floats; restore ints.
        words_list.append({'word': word, 'count': int(count)})
    most_common_words_data.append({'character': speaker, 'words': words_list})

### Most Distinguishing Words

In [41]:
def get_wordnet_pos(word):
    """Return the WordNet part-of-speech constant to use when lemmatising ``word``.

    The word is tagged on its own with ``nltk.pos_tag``; the upper-cased first
    letter of the resulting tag selects adjective (J), noun (N), verb (V) or
    adverb (R). Any other tag falls back to noun.
    """
    _, tag = nltk.pos_tag([word])[0]
    initial = tag[0].upper()

    if initial == "J":
        return wordnet.ADJ
    if initial == "N":
        return wordnet.NOUN
    if initial == "V":
        return wordnet.VERB
    if initial == "R":
        return wordnet.ADV
    return wordnet.NOUN

In [None]:
# Tokenise each lower-cased line, keep alphabetic tokens that are not stop
# words, and lemmatise each one using the POS chosen by get_wordnet_pos.
# NOTE(review): a later cell reassigns df['Words'] straight from df['Text']
# without lemmatising, so this result is overwritten before the TF-IDF step -
# confirm which version is intended.
tokenised_lines = df['Text'].apply(lambda line: nltk.word_tokenize(line.lower()))
df['Words'] = tokenised_lines.apply(
    lambda tokens: [
        lemmatizer.lemmatize(token, get_wordnet_pos(token))
        for token in tokens
        if token.isalpha() and token not in stop_words
    ]
)

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
# Lower-case and tokenise every line of dialogue.
df['Words'] = df['Text'].map(lambda line: nltk.word_tokenize(line.lower()))

In [None]:
# Display the tokenised lines for inspection.
df['Words']

In [None]:
def _is_content_word(token):
    """True for purely alphabetic tokens that are not stop words."""
    return token.isalpha() and token not in stop_words


# Drop punctuation, numbers and stop words from every token list.
df['Words'] = df['Words'].map(lambda tokens: list(filter(_is_content_word, tokens)))

In [None]:
# Display the filtered token lists for inspection.
df["Words"]

In [None]:
# Re-join each token list into a single space-separated string.
df['ProcessedText'] = df['Words'].map(' '.join)

In [None]:
# One document per speaker: all of that speaker's processed lines joined together.
corpora = df.groupby('Speaker')['ProcessedText'].apply(' '.join)

# Unigrams and bigrams; min_df / max_df bound how many of the per-speaker
# documents a term may appear in.
tfidf_vectorizer = TfidfVectorizer(min_df=3, max_df=0.85, ngram_range=(1, 2))
tfidf_matrix = tfidf_vectorizer.fit_transform(corpora)

feature_names = tfidf_vectorizer.get_feature_names_out()
# toarray() yields a plain ndarray; todense() would hand pandas the legacy
# np.matrix type, which NumPy no longer recommends.
tfidf_df = pd.DataFrame(tfidf_matrix.toarray(), index=corpora.index, columns=feature_names)

In [None]:
# Print each speaker's ten highest-scoring TF-IDF terms.
for speaker in corpora.index:
    ranked_scores = tfidf_df.loc[speaker].sort_values(ascending=False)
    speaker_words = ranked_scores.iloc[:10]
    print(f"{speaker}:")
    print(speaker_words)

In [None]:
# One record per speaker holding their top_n TF-IDF terms; the 'count' field
# carries the TF-IDF score of the term.
distinguishing_words_data = [
    {
        'character': speaker,
        'words': [
            {'word': term, 'count': score}
            for term, score in tfidf_df.loc[speaker].sort_values(ascending=False).head(top_n).items()
        ],
    }
    for speaker in corpora.index
]

### JSON Conversion

In [None]:
# Serialise both word tables, then write each one to its own file.
most_common_words_json = json.dumps(most_common_words_data)
distinguishing_words_json = json.dumps(distinguishing_words_data)

json_outputs = (
    ('most_common_words.json', most_common_words_json),
    ('distinguishing_words.json', distinguishing_words_json),
)
for output_path, payload in json_outputs:
    with open(output_path, 'w') as f:
        f.write(payload)