In [1]:
# Show the hostname of the machine this kernel is running on.
from socket import gethostname
gethostname()

'login-i16'

In [9]:
# Load variables from a .env file into the process environment.
# NOTE(review): presumably this is what supplies POSTGRE_DB_URL, which a later
# cell reads with os.getenv — confirm against the project's .env file.
from dotenv import load_dotenv
load_dotenv()

True

In [13]:
import os
from sqlalchemy import create_engine
import pandas as pd
from transformers import pipeline, AutoModelForCausalLM, AutoTokenizer
import torch
import time

In [4]:
# Establish the database connection. The URL comes from the environment so no
# credentials are stored in the notebook.
db_url = os.getenv('POSTGRE_DB_URL')
if not db_url:
    # Fail fast with a readable message instead of handing None to create_engine.
    raise RuntimeError("POSTGRE_DB_URL is not set; define it in the environment or .env file before running this cell.")
engine = create_engine(db_url)

# Read the training split of the dialogues table into a pandas DataFrame.
dialogues = pd.read_sql("SELECT * FROM dialogues WHERE dataset = 'training';", engine)
dialogues.head()

Unnamed: 0,dialogue_id,dataset,dialogue_text,actual_summary,actual_sentiment
0,1,training,"#Person1#: Hi, Mr. Smith. I'm Doctor Hawkins. ...","Mr. Smith's getting a check-up, and Doctor Haw...",positive
1,2,training,"#Person1#: Hello Mrs. Parker, how have you bee...",Mrs Parker takes Ricky for his vaccines. Dr. P...,positive
2,3,training,"#Person1#: Excuse me, did you see a set of key...",#Person1#'s looking for a set of keys and asks...,positive
3,4,training,#Person1#: Why didn't you tell me you had a gi...,#Person1#'s angry because #Person2# didn't tel...,negative
4,5,training,"#Person1#: Watsup, ladies! Y'll looking'fine t...",Malik invites Nikki to dance. Nikki agrees if ...,positive


In [26]:
# Add each dialogue's character count as a new column. This mutates `dialogues`
# in place; later cells sort on `text_length`.
dialogues["text_length"] = dialogues["dialogue_text"].str.len()
dialogues.head()

Unnamed: 0,dialogue_id,dataset,dialogue_text,actual_summary,actual_sentiment,text_length
0,1,training,"#Person1#: Hi, Mr. Smith. I'm Doctor Hawkins. ...","Mr. Smith's getting a check-up, and Doctor Haw...",positive,931
1,2,training,"#Person1#: Hello Mrs. Parker, how have you bee...",Mrs Parker takes Ricky for his vaccines. Dr. P...,positive,776
2,3,training,"#Person1#: Excuse me, did you see a set of key...",#Person1#'s looking for a set of keys and asks...,positive,500
3,4,training,#Person1#: Why didn't you tell me you had a gi...,#Person1#'s angry because #Person2# didn't tel...,negative,526
4,5,training,"#Person1#: Watsup, ladies! Y'll looking'fine t...",Malik invites Nikki to dance. Nikki agrees if ...,positive,480


In [14]:
# Load saved summarization results from CSV and list columns, dtypes and
# non-null counts.
# NOTE(review): the file name suggests these are Mistral-generated summaries — confirm.
summary_df = pd.read_csv("../Datasets/mistral_summary_results.csv")
summary_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12427 entries, 0 to 12426
Data columns (total 4 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   dialogue_id     12427 non-null  int64  
 1   Time Taken (s)  12427 non-null  float64
 2   GPU Usage (MB)  12427 non-null  float64
 3   Summary         12427 non-null  object 
dtypes: float64(2), int64(1), object(1)
memory usage: 388.5+ KB


In [28]:
# Character length of every string field in the row at position 7. The .str
# accessor yields NaN for the non-string fields (dialogue_id, text_length).
dialogues.iloc[7].str.len()

dialogue_id            NaN
dataset                8.0
dialogue_text       1403.0
actual_summary       264.0
actual_sentiment       8.0
text_length            NaN
Name: 7, dtype: float64

In [29]:
# Summary statistics of dialogue length in characters.
dialogues["dialogue_text"].str.len().describe()

count    12427.000000
mean       730.809689
std        368.378838
min        190.000000
25%        484.500000
50%        652.000000
75%        913.000000
max       3028.000000
Name: dialogue_text, dtype: float64

In [30]:
# dialogue_ids whose saved summary is missing.
missing_rows = summary_df.loc[summary_df["Summary"].isna(), "dialogue_id"]
# Length statistics (in characters) of the dialogues behind those missing summaries.
(
    dialogues.loc[dialogues["dialogue_id"].isin(missing_rows), "dialogue_text"]
    .str.len()
    .describe()
)

count    1687.000000
mean      921.636633
std       436.992774
min       214.000000
25%       603.000000
50%       835.000000
75%      1159.000000
max      2968.000000
Name: dialogue_text, dtype: float64

In [32]:
# The ten longest dialogues, in ascending order of character length.
dialogues.sort_values("text_length").iloc[-10:]

Unnamed: 0,dialogue_id,dataset,dialogue_text,actual_summary,actual_sentiment,text_length
12361,12357,training,"#Person1#: Come in, please.\n#Person2#: Good m...",Mrs. Smith is interviewing Mr. Sun who tells h...,positive,2741
7135,7093,training,#Person1#: Tell me something about yourself an...,#Person1# interviews #Person2#. #Person2# says...,positive,2745
4642,4589,training,"#Person1#: Come in, please. \n#Person2#: Good ...",Mrs. Smith is interviewing Mr.Sun who majored ...,positive,2766
3967,3912,training,"#Person1#: Guess what, Mom. I got it.\n#Person...",The daughter gets the driving license and her ...,positive,2784
3731,3679,training,"#Person1#: Sally, come downstairs, please. Loo...",Eric finds a job advertisement looking for a p...,positive,2788
1218,1186,training,#Person1#: Hey. What's this on Facebook? It sa...,#Person2# is going to marry Jasmine but #Perso...,positive,2802
6578,6531,training,"#Person1#: Can I help you?\n#Person2#: Yes, I ...",#Person2# wants to buy a computer with a lot o...,positive,2805
8062,8022,training,"#Person1#: Mrs. Miller, you are an old friend ...",#Person1# offers to cut the price by 5% and Mr...,positive,2949
9295,9260,training,"#Person1#: Mr. Jones, shall we now discuss the...",#Person1# and #Person2# both think an attracti...,negative,2968
2650,7300,training,#Person1#: Today we are talking to John Knox a...,John Knox tells #Person1# about the structure ...,positive,3028


In [8]:
# Use the GPU when one is visible to this kernel; otherwise fall back to CPU.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

model_name = "mistralai/Mistral-7B-Instruct-v0.2"
model = AutoModelForCausalLM.from_pretrained(model_name).to(device)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Padding a batch requires a pad token; reuse EOS when the tokenizer has none.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# A decoder-only model continues from the last position of each row, so batched
# prompts must be padded on the left. Set it explicitly instead of relying on
# whatever default the tokenizer ships with.
tokenizer.padding_side = "left"

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

In [49]:
# Smoke-test generation on the dialogue at row position 2650.
# The lookup is hoisted out of the f-string: nesting double quotes inside a
# double-quoted f-string is a SyntaxError before Python 3.12.
test_dialogue = dialogues.iloc[2650]["dialogue_text"]
test_prompt = f"Summarize the following dialogue concisely:\n{test_dialogue}\nSummary:"
test_input = tokenizer([test_prompt], return_tensors="pt", padding=True, truncation=True, max_length=1024).to(device)
# Greedy decoding (do_sample=False) keeps this check repeatable.
test_output = model.generate(**test_input, max_new_tokens=120, do_sample=False,
                             pad_token_id=tokenizer.eos_token_id)

In [52]:
# Shape of the tokenized prompt's input_ids tensor (one prompt was passed, so
# the first dimension is 1; the second is the number of token ids).
test_input["input_ids"].shape

torch.Size([1, 801])

In [38]:
# The decoded text starts with the prompt. Skip everything before the prompt's
# trailing "Summary:" by computing that offset from the prompt itself, rather
# than hard-coding the value (3073) that only fits this one dialogue.
test_dialogue = dialogues.iloc[2650]["dialogue_text"]
summary_offset = len(f"Summarize the following dialogue concisely:\n{test_dialogue}\n")
tokenizer.decode(test_output[0], skip_special_tokens=True)[summary_offset:]

"Summary: John Knox, General Manager of Biopaints Bath factory, discusses the company's organizational structure. Biopaints employs around 2000 people in two locations, with four main divisions: production, finance, personnel, and administration. The production division is responsible for operations at the Bath and Lille factories, with two General Managers. The finance division, headed by Financial Director Fred Rasmussen, manages the company's finances. The personnel division, led by Personnel Manager David Hopper, handles personnel matters. The production division includes research"

In [68]:
def split_dialogue(row, max_length=1024):
    """Split one dialogue into chunks of fewer than ``max_length`` characters.

    The text is cut only at newline boundaries: consecutive lines are packed
    into a chunk while the chunk stays under ``max_length`` characters. Each
    chunk is whitespace-stripped.

    Args:
        row: Mapping-like object (dict or pandas row) with the keys
            ``'dialogue_id'`` and ``'dialogue_text'``.
        max_length: Character budget per chunk. Text of at most this many
            characters is returned unchanged as a single entry.

    Returns:
        A list of ``(dialogue_id, text)`` tuples in original order. A single
        line that is itself ``max_length`` characters or longer is kept whole
        as its own chunk, because there is no newline to cut at. Empty chunks
        are never returned, so whitespace-only text longer than ``max_length``
        yields an empty list.
    """
    dialogue_id = row['dialogue_id']
    text = row['dialogue_text']
    if len(text) <= max_length:
        return [(dialogue_id, text)]

    chunks = []
    current = ''
    for line in text.split('\n'):
        if len(current) + len(line) < max_length:
            current += line + '\n'
        else:
            # Close the running chunk and start a new one with this line.
            chunks.append(current.strip())
            current = line + '\n'
    chunks.append(current.strip())

    # An oversized first line closes an empty running chunk, and a trailing
    # newline can leave a whitespace-only remainder; drop those empty chunks.
    return [(dialogue_id, chunk) for chunk in chunks if chunk]

def split_long_dialogues(df, max_length=1024):
    """Return a frame in which long dialogues are replaced by their chunks.

    Args:
        df: DataFrame with ``dialogue_id`` and ``dialogue_text`` columns.
        max_length: Character threshold. Rows whose text has at least this many
            characters are passed to ``split_dialogue``.

    Returns:
        A DataFrame with the columns ``dialogue_id`` and ``dialogue_text``:
        first the rows shorter than ``max_length`` in their original order,
        then the chunks of the longer rows. The index is renumbered from 0, so
        labels are unique. Rows whose text length is NaN match neither
        comparison and are left out.
    """
    columns = ['dialogue_id', 'dialogue_text']
    # Compute the lengths once and reuse them for both selections.
    lengths = df["dialogue_text"].str.len()
    short_rows = df.loc[lengths < max_length, columns]

    new_rows = []
    for _, row in df[lengths >= max_length].iterrows():
        new_rows.extend(split_dialogue(row, max_length=max_length))

    if not new_rows:
        # Nothing to append; skip concatenating an empty frame.
        return short_rows.reset_index(drop=True)

    # ignore_index avoids the chunk frame's 0..k labels colliding with df's.
    return pd.concat([short_rows, pd.DataFrame(new_rows, columns=columns)],
                     ignore_index=True)

In [15]:
def generate_summary(model, tokenizer, dialogues, batch_size=5, device="cpu"):
    """Summarize dialogues in batches and record the per-dialogue cost.

    Each dialogue is wrapped in a fixed prompt, tokenized (truncated to 1024
    tokens), and passed to ``model.generate`` for up to 120 new tokens.

    Args:
        model: Object exposing ``generate(**inputs, ...)``. This function does
            not move the model; only the tokenized inputs are sent to ``device``.
        tokenizer: Callable tokenizer that also provides ``decode`` and
            ``eos_token_id``.
        dialogues: Sliceable sequence of dialogue strings (list, pandas Series).
        batch_size: Number of dialogues per ``generate`` call; must be >= 1.
        device: Target device for the inputs, e.g. ``"cpu"``, ``"cuda"``,
            ``"cuda:0"``.

    Returns:
        A tuple ``(summaries, times, gpu_usages)``:
        ``summaries`` holds one stripped string per dialogue, containing only
        the newly generated tokens. ``times`` holds one entry per dialogue,
        equal to the batch's wall-clock seconds divided by the batch size.
        ``gpu_usages`` holds one entry per dialogue with the batch's peak
        allocated CUDA memory in MB; it is empty when ``device`` is not CUDA.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")

    # Accept "cuda" as well as indexed forms such as "cuda:0".
    on_cuda = str(device).startswith("cuda")

    summaries = []
    times = []
    gpu_usages = []

    for start in range(0, len(dialogues), batch_size):
        batch = dialogues[start:start + batch_size]
        prompts = [f"Summarize the following dialogue concisely:\n{dialogue}\nSummary:" for dialogue in batch]

        start_time = time.perf_counter()
        if on_cuda:
            torch.cuda.reset_peak_memory_stats(device)

        model_inputs = tokenizer(prompts, return_tensors="pt", padding=True, truncation=True,
                                 max_length=1024).to(device)

        # do_sample=True means summaries can differ between runs unless the
        # caller seeds the random number generator.
        with torch.no_grad():
            model_outputs = model.generate(**model_inputs, max_new_tokens=120, do_sample=True,
                                           pad_token_id=tokenizer.eos_token_id)

        elapsed = time.perf_counter() - start_time
        # Attribute the batch's wall time evenly to each of its dialogues.
        times.extend([elapsed / len(batch)] * len(batch))

        if on_cuda:
            peak_memory = torch.cuda.max_memory_allocated(device) / (1024 ** 2)  # Convert to MB
            # One entry per dialogue so the list lines up with summaries/times.
            # The batch peak is repeated, not divided: peak memory is not
            # additive across the dialogues of a batch.
            gpu_usages.extend([peak_memory] * len(batch))

        # Each output row is the (padded) prompt followed by the new tokens.
        # Slicing by prompt width isolates the generated part even when the
        # dialogue itself contains "Summary: " or the prompt was truncated.
        prompt_width = model_inputs["input_ids"].shape[1]
        for output in model_outputs:
            generated = tokenizer.decode(output[prompt_width:], skip_special_tokens=True)
            summaries.append(generated.strip())

    return summaries, times, gpu_usages

In [16]:
# Smoke-test on the first three dialogues. Pass the notebook's `device` so the
# tokenized inputs go to the same device the model was moved to; the function's
# default is "cpu".
summaries, times, gpu_usages = generate_summary(model, tokenizer, dialogues["dialogue_text"][:3], device=device)

In [17]:
# Inspect the generated summaries.
summaries

["Doctor Hawkins checks in with Mr. Smith about his long-overdue annual check-up. Mr. Smith explains that he feels fine and hasn't seen a doctor in years, but Doctor Hawkins stresses the importance of preventative care. Mr. Smith reveals that he smokes and struggles to quit. Doctor Hawkins advises him of the health risks and offers resources to help him quit.",
 "Mrs. Parker brings in her son, Ricky, for his routine check-up and vaccines. According to his records, Ricky needs Hepatitis A, Chickenpox, Measles, Rubella, and Mumps shots. Mrs. Parker also mentions needing a Tetanus booster for herself. The doctor confirms and plans to administer the shots. The conversation concludes with Ricky's vaccines being given and the doctor mentioning a nurse will give Mrs. Parker her Tetanus booster.",
 'Person1 lost a set of five keys with a small foot ornament and asked Person2 for help in finding them. Person2 agreed to help and they searched together. Person1 was relieved when they were found.'

In [18]:
# Per-dialogue time in seconds (the batch's wall time divided by the batch size).
times

[23.92201574643453, 23.92201574643453, 23.92201574643453]