In [1]:
from sentence_transformers import SentenceTransformer, models, losses, util, InputExample
import pandas as pd
from datasets import Dataset
import os

In [5]:
from textacy import preprocessing
from tqdm.auto import tqdm
import re

In [3]:
clean_pipeline = preprocessing.make_pipeline(
                                            preprocessing.normalize.bullet_points, 
                                            preprocessing.normalize.whitespace,
                                            preprocessing.normalize.quotation_marks)

In [4]:
def get_files(start_path):
    paths = []
    for root, dirs, files in os.walk(start_path):
        for file in files:
            if file != ".DS_Store":
                paths.append(os.path.join(root, file))

    return paths

files = get_files("/Users/vignesh/Documents/george brown pgdm /DL 1/GeorgeBrownGPT/qaPairs")
print(f"Total number of files - {len(files)}")

Total number of files - 1647


In [6]:

def clean_text(input_text):
    cleaned_text =  clean_pipeline(input_text.lower().strip())

    # Remove non-alphanumeric characters and extra spaces
    cleaned_text = re.sub(r'[^a-zA-Z0-9\s]', '', cleaned_text)
    cleaned_text = ' '.join(cleaned_text.split())

    # Ensure the cleaned text starts with a number or a letter
    cleaned_text = re.sub(r'^[^a-zA-Z0-9]+', '', cleaned_text)
    
    return cleaned_text

In [7]:
dfs = []
for file in tqdm(files):
    question_df = pd.read_csv(file)
    indexes = []
    for index, row in question_df.iterrows():
        question = row["question"]
        question_df.loc[index, "question"] = clean_text(question)
        if len(question) < 10:
            indexes.append(index)
        
    question_df.drop(indexes, inplace=True)
    question_df.drop(["Unnamed: 0"], axis=1, inplace=True)
    dfs.append(question_df)

  0%|          | 0/1647 [00:00<?, ?it/s]

In [8]:
merged_df = pd.concat(dfs)

In [9]:
merged_df.shape

(41447, 3)

In [10]:
merged_df.head()

Unnamed: 0,contextId,context,question
0,324bedfa-5934-4f6a-8df7-8269376ac767,* George Brown College gives approximately $7...,how much money does george brown college give ...
1,324bedfa-5934-4f6a-8df7-8269376ac767,* George Brown College gives approximately $7...,what number of students receive these awards
2,324bedfa-5934-4f6a-8df7-8269376ac767,* George Brown College gives approximately $7...,approximately how much funding did osap provid...
3,324bedfa-5934-4f6a-8df7-8269376ac767,* George Brown College gives approximately $7...,can students at george brown still apply for c...
4,324bedfa-5934-4f6a-8df7-8269376ac767,* George Brown College gives approximately $7...,which external awards are available to george ...


In [11]:
context_ids = list(merged_df["contextId"].value_counts().keys())
context_index_dict = {value: ind for ind, value in enumerate(context_ids)}

print(context_index_dict)

{'aafa9fc8-375e-4ec4-8b87-89e1d8509db3': 0, '477c5eb6-6cad-4c9a-aa0c-c6b6836b98b9': 1, '1c6af24e-097a-40e1-8889-22921754a05d': 2, 'eb165bd5-2f3f-425f-91e4-e016dbcd9940': 3, '3f4e414e-f52e-4f98-b0fb-a310bb955d63': 4, 'b2fd9155-2f77-4d7d-ad99-8a27d4ff1a24': 5, '732d44c2-74a5-4e35-9721-c2e86db55e7e': 6, '90862ac6-54c9-4dbe-bc68-27d18647086f': 7, '9349d66a-9f2f-4b09-be5b-76347c186152': 8, 'db88a8a9-a178-4613-bcac-a94f0fadd9ce': 9, 'fd8712b5-221c-4c2a-9b92-f36ac8be37f6': 10, '398b7fb9-7f95-49e3-8650-2b8c3e0101cc': 11, '3114d56e-8a72-4137-a98f-23e1bbb87f7e': 12, 'd0d0b483-d1e7-4612-8080-ae89e9f22179': 13, '054b3c08-dba5-46ea-8a23-6b1bf5d1b1cc': 14, 'c4096c30-545d-464f-8e94-662d995bd77f': 15, '7c269ec6-6901-427e-9a51-1efcd5085c1f': 16, 'f72b3350-68bf-4363-8616-5fbb94df5374': 17, '106ce067-65b4-45a7-a26c-c1657d3886bd': 18, '2cc2ec91-9d22-4585-8779-1b8c930042a1': 19, '33a27f97-07fb-4a1e-b9b2-bfeb286cdd0a': 20, 'bdee3389-6b02-4c9f-a20f-7dddb14e6802': 21, 'aeda4bc3-3924-4da0-867b-ea93cebddb70': 2

In [12]:
len(context_index_dict)

4713

In [13]:
dataset = Dataset.from_pandas(merged_df, split="train")

In [14]:
import random

def generate_random_with_exclusion(start, end, exclude):
    while True:
        random_number = random.randint(start, end)
        if random_number != exclude:
            return random_number


In [15]:

train_examples = []
train_data = dataset

n_examples = dataset.num_rows 

for i in tqdm(range(n_examples)):
    example = train_data[i]

    contextId = example["contextId"]
    random_number = generate_random_with_exclusion(0, len(context_index_dict)-1, context_index_dict[contextId] )
    negative_context_id = context_ids[random_number]
    negative_example = merged_df[merged_df["contextId"] == negative_context_id]

    train_examples.append(InputExample(texts=[example['question'], example['context'], negative_example.iloc[0]["context"]]))
    


  0%|          | 0/41447 [00:00<?, ?it/s]

In [16]:
print(f"We have a {type(train_examples)} of length {len(train_examples)} containing {type(train_examples[0])}'s.")

We have a <class 'list'> of length 41447 containing <class 'sentence_transformers.readers.InputExample.InputExample'>'s.


In [17]:
from torch.utils.data import DataLoader

train_dataloader = DataLoader(train_examples, shuffle=True, batch_size=32)

In [18]:
print(train_examples[1].__dict__)

{'guid': '', 'texts': ['what number of students receive these awards', " * George Brown College gives approximately $7.9 million in awards to around 6000 students every year.\n* OSAP provided nearly $135 million in funding in the 2017/18 school year.\n* Students can apply for George Brown awards even if they're receiving OSAP funds.\n* External awards available to George Brown students.\n* OSAP is a combination of federal and provincial funds in the form of loans and grants. The loan portion must be paid back, but not the grant.\n* Eligibility criteria exist for OSAP, and extra support may be available through OSAP's Student Access Guarantee.\n* Aeroplan, TD Bank, or CIBC loyalty points can be turned into funds for use on campus.\n* George Brown College offers high-quality and career-focused programs and services.\n* Sign up for email updates to be entered in a contest to win $5000 towards tuition. Contest closes August 31, 2023.\n* Contact information: 416-415-2000, TTY: 1-877-515-555

In [19]:
print(train_examples[6].texts[2])

 This three-year diploma program offers training in fabricating partial or full dentures, including repairs, relines, and removable oral devices. Students can also learn techniques for implant overdentures through simulation. The curriculum includes academic, clinical, and laboratory skills with an emphasis on interprofessional education. Graduates are eligible to apply for the CDO provincial qualifying examination to practice as a denturist in Ontario (additional fees apply).

The program provides field experience opportunities in both on-site and off-site settings. In the first year, students focus on lab skills development. In the second year, they work with patients under supervision at WAVE Dental Clinic. In the final year, they treat complex oral health-care clients at WAVE Dental Clinic and off-site with field partners. Clinical hours may include early mornings and evenings. Students are responsible for securing their own field experience opportunities with pre-approval from pro

In [20]:
from huggingface_hub import notebook_login
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [21]:
model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2', use_auth_token="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", device='mps')

In [22]:
train_lossB = losses.MultipleNegativesRankingLoss(model=model)
num_epochsB = 10
warmup_stepsB = int(len(train_dataloader) * num_epochsB * 0.1) #10% of train data

In [23]:
training_history = model.fit(train_objectives=[(train_dataloader, train_lossB)],
          epochs=num_epochsB,
          warmup_steps=warmup_stepsB)

Epoch:   0%|          | 0/10 [00:00<?, ?it/s]

Iteration:   0%|          | 0/1296 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [None]:
model

In [3]:
model = SentenceTransformer('sentence-transformer-finetuned/georgebrown-v2-embeddings')


In [4]:
model

SentenceTransformer(
  (0): Transformer({'max_seq_length': 256, 'do_lower_case': False}) with Transformer model: BertModel 
  (1): Pooling({'word_embedding_dimension': 384, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False})
  (2): Normalize()
)

In [24]:
## Evaluation

In [25]:
training_history

In [25]:
# raw_model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')

query_embedding = raw_model.encode('What is the name of the program offered at George Brown College? ')
passage_embedding = raw_model.encode(['Program Name: Applied A.I. Solutions Development','What is the name of the program offered at George Brown College?',
                                  'Identify, evaluate and manage relevant data sources to support data analytics and to meet organizational needs.'])

print("Similarity:", util.dot_score(query_embedding, passage_embedding))

Similarity: tensor([[ 0.2386,  1.0000, -0.0454]])


In [5]:
query_embedding = model.encode('What is a program learning outcome for Data Science students at George Brown College?')
passage_embedding = model.encode(['Program Name: Applied A.I. Solutions Development',
                                  'Identify, evaluate and manage relevant data sources to support data analytics and to meet organizational needs.'])

print("Similarity:", util.dot_score(query_embedding, passage_embedding))

Similarity: tensor([[0.1283, 0.4348]])


In [6]:
query_embedding = model.encode('What is the name of the program offered at George Brown College? ')
passage_embedding = model.encode(['Program Name: Applied A.I. Solutions Development',
                                  'Identify, evaluate and manage relevant data sources to support data analytics and to meet organizational needs.'])

print("Similarity:", util.dot_score(query_embedding, passage_embedding))

Similarity: tensor([[ 0.2386, -0.0454]])
