In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split

In [2]:
# Location of the conversation dataset CSV, relative to the notebook's working directory.
file_path = "./Conversation.csv"
# Seed passed as random_state to train_test_split so the train/test split is reproducible.
GLOBAL_SEED=100

In [3]:
# Read the question/answer conversation pairs from the CSV at file_path.
df_data = pd.read_csv(file_path)

In [4]:
df_data.head()

Unnamed: 0.1,Unnamed: 0,question,answer
0,0,"hi, how are you doing?",i'm fine. how about yourself?
1,1,i'm fine. how about yourself?,i'm pretty good. thanks for asking.
2,2,i'm pretty good. thanks for asking.,no problem. so how have you been?
3,3,no problem. so how have you been?,i've been great. what about you?
4,4,i've been great. what about you?,i've been good. i'm in school right now.


In [5]:
# Remove the 'Unnamed: 0' column (it appears to be a leftover index column that
# mirrors the row number). errors='ignore' keeps this cell idempotent: running it
# again after the column is already gone no longer raises KeyError.
df_data = df_data.drop(columns=['Unnamed: 0'], errors='ignore')

In [6]:
df_data.head()

Unnamed: 0,question,answer
0,"hi, how are you doing?",i'm fine. how about yourself?
1,i'm fine. how about yourself?,i'm pretty good. thanks for asking.
2,i'm pretty good. thanks for asking.,no problem. so how have you been?
3,no problem. so how have you been?,i've been great. what about you?
4,i've been great. what about you?,i've been good. i'm in school right now.


In [7]:
# Hold out 20% of the rows for evaluation; the fixed random_state makes the split repeatable.
df_train, df_test = train_test_split(df_data,test_size = 0.2, random_state=GLOBAL_SEED)

In [8]:
df_test

Unnamed: 0,question,answer
2497,why is that?,because he likes to scuba dive.
1185,"yes, it's just a couple of screws.",that's nice.
1596,what is gravity?,it's the force that pulls everything down.
408,would you like to see a movie with me and my f...,do you know what movie you're going to watch?
2934,"yes, and they ask me what jobs are the best.",i tell my students to become a teacher.
...,...,...
2394,then stop eating the bread!,"okay, just one more piece. pass the butter, pl..."
1995,people will tell you if they have a good hand.,how do they do that?
709,we should hang out some time.,i think that would be nice.
343,that's exactly how i felt.,"i got the movie when it came out on dvd, do yo..."


In [19]:
from transformers import T5Tokenizer

# Maximum sequence length handed to both tokenizers at construction time.
model_max_length=512
# Length used by generate_question both to pad/truncate the encoded input
# and as the max_length of the generated output.
INPUT_MAX_LEN=128

# One tokenizer per model size; each is paired with the matching fine-tuned checkpoint.
tokenizer_base = T5Tokenizer.from_pretrained("t5-base", model_max_length=model_max_length)
tokenizer_small = T5Tokenizer.from_pretrained("t5-small", model_max_length=model_max_length)

def generate_question(question, tokenizer, model):
    """Run `question` through `model` and return the generated text.

    Despite the name, the text returned is whatever the model generates for
    the given input question (in this notebook it is compared against the
    reference answers).

    Args:
        question: Input text to encode.
        tokenizer: Callable tokenizer that also provides ``decode``.
        model: Wrapper object whose ``.model`` attribute exposes ``generate``.

    Returns:
        A single string: every decoded output sequence concatenated with no
        separator (one sequence is requested via ``num_return_sequences=1``).
    """
    # Pad/truncate the input to exactly INPUT_MAX_LEN tokens and get tensors back.
    encoded = tokenizer(
        question,
        add_special_tokens=True,
        max_length=INPUT_MAX_LEN,
        padding='max_length',
        truncation='only_first',
        return_attention_mask=True,
        return_tensors="pt",
    )

    # Beam search with 4 beams; repeated bigrams are disallowed in the output.
    output_ids = model.model.generate(
        input_ids=encoded["input_ids"],
        attention_mask=encoded["attention_mask"],
        max_length=INPUT_MAX_LEN,
        num_beams=4,
        num_return_sequences=1,
        no_repeat_ngram_size=2,
        early_stopping=True,
    )

    decoded = []
    for sequence in output_ids:
        text = tokenizer.decode(
            sequence,
            skip_special_tokens=True,
            clean_up_tokenization_spaces=True,
        )
        decoded.append(text)

    return "".join(decoded)

In [10]:
import sentence_transformers

from sentence_transformers import SentenceTransformer, util
# Sentence-embedding model used later to embed reference answers and model predictions
# before comparing them with cosine similarity.
sentence_model = SentenceTransformer('all-MiniLM-L6-v2')

2023-12-04 20:29:04.014112: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2023-12-04 20:29:04.340209: E tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:9342] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2023-12-04 20:29:04.340262: E tensorflow/compiler/xla/stream_executor/cuda/cuda_fft.cc:609] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2023-12-04 20:29:04.341807: E tensorflow/compiler/xla/stream_executor/cuda/cuda_blas.cc:1518] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2023-12-04 20:29:04.505628: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2023-12-04 20:29:04.507627: I tensorflow/core/platform/cpu_feature_guard.cc:182] This Tens

In [11]:
# Replace the shuffled row labels left over from the split with 0..n-1 (old labels are
# discarded), so test rows line up positionally with the prediction lists built later.
df_test.reset_index(drop=True, inplace=True)

In [12]:
df_test.head()

Unnamed: 0,question,answer
0,why is that?,because he likes to scuba dive.
1,"yes, it's just a couple of screws.",that's nice.
2,what is gravity?,it's the force that pulls everything down.
3,would you like to see a movie with me and my f...,do you know what movie you're going to watch?
4,"yes, and they ask me what jobs are the best.",i tell my students to become a teacher.


In [13]:
# Load base model
# NOTE(review): model_small also exports a class named T5Model; the later
# `from model_small import T5Model` rebinds this name, so re-run this whole cell
# (import included) whenever the base model has to be reloaded.
from model_base import T5Model
# Checkpoint of the fine-tuned base-size model.
checkpoint_base = "./output_base/best-model-v1.ckpt"
model_base = T5Model.load_from_checkpoint(checkpoint_base)

Lightning automatically upgraded your loaded checkpoint from v1.5.10 to v2.1.2. To apply the upgrade to your files permanently, run `python -m pytorch_lightning.utilities.upgrade_checkpoint output_base/best-model-v1.ckpt`


NameError: name 'model' is not defined

In [15]:
# Put the base model in inference-only state.
# NOTE(review): presumably LightningModule.freeze() (disables gradients and switches to
# eval mode) — confirm that T5Model subclasses LightningModule.
model_base.freeze()

In [71]:
# Load small model
# NOTE: this import rebinds the name T5Model, which was previously imported from model_base.
from model_small import T5Model
# Keep the small model's checkpoint path under its own name so that
# `checkpoint_base` keeps pointing at the base model's checkpoint.
checkpoint_small = "./output_small/best-model-v1.ckpt"
model_small = T5Model.load_from_checkpoint(checkpoint_small)
# Inference only from here on (presumably LightningModule.freeze(): no gradients, eval mode).
model_small.freeze()

Lightning automatically upgraded your loaded checkpoint from v1.5.10 to v2.1.2. To apply the upgrade to your files permanently, run `python -m pytorch_lightning.utilities.upgrade_checkpoint output_small/best-model-v1.ckpt`


In [17]:
def apply_small(x):
    """Return the small model's generated text for input `x`.

    Thin wrapper that calls generate_question with tokenizer_small and model_small.
    """
    return generate_question(x, tokenizer_small, model_small)
    
def apply_base(x):
    """Return the base model's generated text for input `x`.

    Thin wrapper that calls generate_question with tokenizer_base and model_base.
    """
    return generate_question(x, tokenizer_base, model_base)

In [27]:
# Generate one prediction per test question with the base model.
# Iterate the "question" column directly: the previous `enumerate(df_test.iloc())`
# only worked through Python's legacy __getitem__ iteration fallback, building a
# full row Series per step and stopping on the out-of-bounds IndexError.
base_predictions = []
for i, question in enumerate(df_test["question"]):
    prediction = generate_question(question, tokenizer_base, model_base)
    base_predictions.append(prediction)
    # Lightweight progress report every 50 samples.
    if i % 50 == 0:
        print("Processing {} sample...".format(i))

Processing 0 sample...
Processing 50 sample...
Processing 100 sample...
Processing 150 sample...
Processing 200 sample...
Processing 250 sample...
Processing 300 sample...
Processing 350 sample...
Processing 400 sample...
Processing 450 sample...
Processing 500 sample...
Processing 550 sample...
Processing 600 sample...
Processing 650 sample...
Processing 700 sample...


In [72]:
# Generate one prediction per test question with the small model.
# Iterate the "question" column directly: the previous `enumerate(df_test.iloc())`
# only worked through Python's legacy __getitem__ iteration fallback, building a
# full row Series per step and stopping on the out-of-bounds IndexError.
small_predictions = []
for i, question in enumerate(df_test["question"]):
    prediction = generate_question(question, tokenizer_small, model_small)
    small_predictions.append(prediction)
    # Lightweight progress report every 50 samples.
    if i % 50 == 0:
        print("Processing {} sample...".format(i))

Processing 0 sample...
Processing 50 sample...
Processing 100 sample...
Processing 150 sample...
Processing 200 sample...
Processing 250 sample...
Processing 300 sample...
Processing 350 sample...
Processing 400 sample...
Processing 450 sample...
Processing 500 sample...
Processing 550 sample...
Processing 600 sample...
Processing 650 sample...
Processing 700 sample...


In [31]:
df_test.head(20)

Unnamed: 0,question,answer
0,why is that?,because he likes to scuba dive.
1,"yes, it's just a couple of screws.",that's nice.
2,what is gravity?,it's the force that pulls everything down.
3,would you like to see a movie with me and my f...,do you know what movie you're going to watch?
4,"yes, and they ask me what jobs are the best.",i tell my students to become a teacher.
5,did you have a date friday night?,"yes, in fact, i did."
6,i'll squeeze the cloth so it's almost dry.,don't rub the numbers off the remote.
7,how good is your math?,i can add two and two.
8,what about the other homeless people?,they got $1 each.
9,what did they do in the old days?,they brushed with their fingers.


In [32]:
base_predictions[:20]

['why is that?',
 "yes, it's just a couple of screws.",
 'what is gravity?',
 'would you like to see a movie with me?',
 'they ask me what jobs are the best.',
 'did you have a date friday night?',
 "i'll squeeze the cloth so it's almost dry.",
 "i'm a mathematician.",
 'what about the other homeless people?',
 'what did they do in the old days?',
 "i'll vote for him next time.",
 "oh,that's a lot of work.",
 "i didn't love you at first.",
 'what are you getting for your mom?',
 'i guess you see lots of different dogs.',
 'oh, a laptop.',
 'i have a lot of friends.',
 "there's no food in the woods.",
 "why's that?",
 "i'm actually in school right now."]

In [56]:
# NOTE(review): compute_similarity is not defined in any cell visible in this notebook
# (likely left over from a deleted cell), so this call fails with NameError on a fresh
# Restart & Run All — restore the definition or remove this cell.
compute_similarity(df_test)

why is that?
because he likes to scuba dive.
yes, it's just a couple of screws.
that's nice.
what is gravity?
it's the force that pulls everything down.
would you like to see a movie with me and my friend?
do you know what movie you're going to watch?
yes, and they ask me what jobs are the best.
i tell my students to become a teacher.
did you have a date friday night?
yes, in fact, i did.
i'll squeeze the cloth so it's almost dry.
don't rub the numbers off the remote.
how good is your math?
i can add two and two.
what about the other homeless people?
they got $1 each.
what did they do in the old days?
they brushed with their fingers.
i'll vote for him next time, too.
i think everyone will.


In [34]:
# Embed the reference answers of the test set (one embedding row per test example).
answer_vector = sentence_model.encode(df_test["answer"])

In [35]:
# Embed the base model's predictions, in the same order as the test rows.
prediction_vector = sentence_model.encode(base_predictions)

In [73]:
# Embed the small model's predictions, in the same order as the test rows.
prediction_small_vector = sentence_model.encode(small_predictions)

In [38]:
answer_vector.shape

(745, 384)

In [39]:
prediction_vector.shape

(745, 384)

In [66]:
from numpy import dot
from numpy.linalg import norm

def cossim(a, b):
    """Cosine similarity of two vectors: dot(a, b) / (norm(a) * norm(b))."""
    magnitude = norm(a) * norm(b)
    return dot(a, b) / magnitude

def compute_avg_cosine(A, B):
    """Return the mean cosine similarity over row-aligned pairs of A and B.

    Row ``i`` of ``A`` is compared with row ``i`` of ``B`` using ``cossim`` and
    the per-pair scores are averaged.

    Args:
        A: Array of vectors, one per row (must expose ``.shape``).
        B: Sequence of vectors with the same number of rows as ``A``.

    Returns:
        The average of the per-row cosine similarities.

    Raises:
        ValueError: If ``A`` and ``B`` have different row counts, or are empty.
    """
    rows = A.shape[0]
    # Rows are paired by position; a longer B used to be silently truncated and
    # a shorter B failed midway with IndexError. Reject both up front.
    if rows != len(B):
        raise ValueError(
            "A and B must have the same number of rows, got {} and {}".format(rows, len(B))
        )
    # Averaging zero pairs is undefined; fail clearly rather than with ZeroDivisionError.
    if rows == 0:
        raise ValueError("cannot average cosine similarity over zero rows")

    total = 0
    for i, row in enumerate(A):
        total += cossim(row, B[i])
    return total / rows
        
# NOTE(review): cosine_similarity is never imported or defined in the visible cells
# (presumably sklearn.metrics.pairwise.cosine_similarity — confirm and import it explicitly).
# The displayed result is a 2-D matrix, unlike compute_avg_cosine above, which scores
# only row-aligned pairs and averages them.
cosine_similarity(answer_vector, prediction_vector)

array([[ 0.25152558, -0.03182962,  0.12607825, ...,  0.05353519,
        -0.05841985,  0.15446049],
       [ 0.27841437,  0.0684548 , -0.01234981, ...,  0.19492339,
         0.285271  ,  0.11401501],
       [ 0.10283864,  0.2692343 ,  0.4709503 , ..., -0.04800774,
         0.07018243,  0.09085122],
       ...,
       [ 0.15157326,  0.04200397, -0.00964552, ...,  0.30294895,
         0.07840977,  0.12722413],
       [ 0.05075088,  0.08448079, -0.00048497, ...,  0.25835937,
         0.06951929,  0.00096918],
       [ 0.05121026,  0.05309503, -0.02351586, ...,  0.25202045,
         0.05000186,  0.28681558]], dtype=float32)

In [67]:
answer_vector

array([[ 0.07738902, -0.01598553,  0.09356439, ...,  0.02156039,
        -0.01761158,  0.0055808 ],
       [-0.02764896,  0.01042169, -0.01288704, ...,  0.01883932,
        -0.03664861,  0.03056649],
       [ 0.01224554,  0.00328223,  0.02998877, ..., -0.02520691,
         0.06890911,  0.03179877],
       ...,
       [-0.13381524,  0.00215585, -0.05557879, ..., -0.01614939,
        -0.10556188, -0.0128924 ],
       [-0.0835367 , -0.04664869, -0.02752446, ..., -0.0638754 ,
         0.00492482, -0.01165556],
       [ 0.06787718, -0.00944726,  0.01494324, ..., -0.07249399,
        -0.02249984,  0.09651412]], dtype=float32)

In [68]:
answer_vector[0]

array([ 7.73890242e-02, -1.59855317e-02,  9.35643911e-02,  3.82473134e-02,
        4.37900797e-02, -5.53101078e-02,  1.06111273e-01,  1.71032399e-02,
        2.60586869e-02, -2.24619284e-02, -1.23359961e-03, -4.07568738e-02,
       -6.84753284e-02,  9.01143923e-02, -1.54969906e-02, -2.31417385e-03,
       -4.27377298e-02, -4.12318520e-02, -7.29267895e-02, -1.06356954e-02,
        1.40269883e-02,  1.16701029e-01,  4.98836376e-02, -2.91447863e-02,
       -1.02798045e-01,  1.79176256e-02,  5.92196500e-03, -1.34642171e-02,
        8.83340389e-02,  1.84182767e-02,  3.20716538e-02,  2.72572245e-02,
       -1.77700457e-03, -3.17084305e-02, -6.86899871e-02, -6.28071232e-03,
       -8.10158718e-03,  5.18133417e-02, -7.63672516e-02,  9.90013685e-03,
       -6.46607652e-02, -2.45913826e-02,  1.78806167e-02,  1.03679247e-01,
       -5.43231964e-02, -1.36497719e-02, -5.67643680e-02,  1.32261193e-03,
        1.44775510e-01,  6.53869957e-02, -4.22751531e-02, -2.08946671e-02,
       -4.36832989e-03, -

In [69]:
# Headline metric for the base model: mean cosine similarity between each prediction
# embedding and the embedding of its reference answer.
compute_avg_cosine(prediction_vector, answer_vector)

0.2733495775173865

In [74]:
# Same metric for the small model, computed against the same reference-answer embeddings.
compute_avg_cosine(prediction_small_vector, answer_vector)

0.2270179176717159