In [2]:
import pandas as pd
import numpy as np
import re
import ast

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import SGDRegressor
from sklearn.multioutput import MultiOutputRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

import matplotlib.pyplot as plt
from sklearn.decomposition import PCA

from sentence_transformers import SentenceTransformer
from sklearn.linear_model import Ridge
from sklearn.metrics.pairwise import cosine_similarity


  from .autonotebook import tqdm as notebook_tqdm


In [3]:
df = pd.read_csv("friends_transcripts.tsv", sep="\t")
df.head()


Unnamed: 0,season_id,episode_id,scene_id,utterance_id,speaker,tokens,transcript
0,s01,e01,c01,u001,Monica Geller,"[['There', ""'s"", 'nothing', 'to', 'tell', '!']...",There's nothing to tell! He's just some guy I ...
1,s01,e01,c01,u002,Joey Tribbiani,"[[""C'mon"", ',', 'you', ""'re"", 'going', 'out', ...","C'mon, you're going out with the guy! There's ..."
2,s01,e01,c01,u003,Chandler Bing,"[['All', 'right', 'Joey', ',', 'be', 'nice', '...","All right Joey, be nice. So does he have a hum..."
3,s01,e01,c01,u004,Phoebe Buffay,"[['Wait', ',', 'does', 'he', 'eat', 'chalk', '...","Wait, does he eat chalk?"
4,s01,e01,c01,u005,unknown,[],


In [4]:
def clean_text(t):
    t = str(t).lower()
    t = re.sub(r"[^a-zA-Z\s]", " ", t)
    t = re.sub(r"\s+", " ", t)
    return t.strip()


In [5]:
df["clean_text"] = df["transcript"].apply(clean_text)

df = df.sort_values(["season_id", "episode_id", "scene_id", "utterance_id"])

df["prev_text"] = df["clean_text"].shift(1)
df["prev_speaker"] = df["speaker"].shift(1)

pairs = df.dropna(subset=["prev_text"])
pairs = pairs[["prev_speaker", "prev_text", "speaker", "clean_text"]]

pairs.head()


Unnamed: 0,prev_speaker,prev_text,speaker,clean_text
1,Monica Geller,there s nothing to tell he s just some guy i w...,Joey Tribbiani,c mon you re going out with the guy there s go...
2,Joey Tribbiani,c mon you re going out with the guy there s go...,Chandler Bing,all right joey be nice so does he have a hump ...
3,Chandler Bing,all right joey be nice so does he have a hump ...,Phoebe Buffay,wait does he eat chalk
4,Phoebe Buffay,wait does he eat chalk,unknown,
5,unknown,,Phoebe Buffay,just cause i don t want her to go through what...


In [6]:
main_chars = [
    "Ross Geller", "Rachel Green", "Monica Geller",
    "Phoebe Buffay", "Joey Tribbiani", "Chandler Bing"
]

pairs = pairs[pairs["prev_speaker"].isin(main_chars)]
pairs = pairs[pairs["speaker"].isin(main_chars)]
pairs.head()


Unnamed: 0,prev_speaker,prev_text,speaker,clean_text
1,Monica Geller,there s nothing to tell he s just some guy i w...,Joey Tribbiani,c mon you re going out with the guy there s go...
2,Joey Tribbiani,c mon you re going out with the guy there s go...,Chandler Bing,all right joey be nice so does he have a hump ...
3,Chandler Bing,all right joey be nice so does he have a hump ...,Phoebe Buffay,wait does he eat chalk
6,Phoebe Buffay,just cause i don t want her to go through what...,Monica Geller,okay everybody relax this is not even a date i...
7,Monica Geller,okay everybody relax this is not even a date i...,Chandler Bing,sounds like a date to me


In [7]:
pairs["model_input"] = pairs["prev_speaker"].str.lower() + " " + pairs["prev_text"]
pairs["model_output"] = pairs["clean_text"].astype(str)


In [8]:
vectorizer_input = TfidfVectorizer(stop_words="english", max_features=5000)
vectorizer_output = TfidfVectorizer(stop_words="english", max_features=5000)

X = vectorizer_input.fit_transform(pairs["model_input"])
Y = vectorizer_output.fit_transform(pairs["model_output"])

print("X:", X.shape)
print("Y:", Y.shape)


X: (38463, 5000)
Y: (38463, 5000)


In [9]:

embed_model = SentenceTransformer('all-MiniLM-L6-v2')

Y_emb = embed_model.encode(pairs["model_output"].tolist())


In [10]:
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y_emb, test_size=0.2, random_state=42
)


In [11]:

reg = MultiOutputRegressor(Ridge(alpha=1.0))
reg.fit(X_train, Y_train)

In [12]:


def predict_reply(prev_speaker, prev_line, next_speaker, top_n=1):
    inp = f"{prev_speaker.lower()} {clean_text(prev_line)}"
    x_vec = vectorizer_input.transform([inp])

    y_vec = reg.predict(x_vec)[0]

    sims = cosine_similarity([y_vec], Y_emb)[0]
    best_idx = np.argmax(sims)

    return pairs["model_output"].iloc[best_idx]


In [13]:
# Given input ("speaker 1", "quote from speaker 1", "speaker 2") outputs "quote from speaker 2"
# speakers must be out of list "Ross Geller", "Rachel Green", "Monica Geller", "Phoebe Buffay", "Joey Tribbiani", "Chandler Bing"
predict_reply("Ross Geller", "We were on a break!", "Rachel Green")


'oh my god if you say that one more time i m gonna break up with you'