In [4]:
from transformers import AutoTokenizer, AutoModel
import torch
import torch.nn.functional as F
import numpy as np
import psycopg

In [5]:
import os
from dotenv import load_dotenv
    
load_dotenv()

user = os.getenv("user")
db_name = os.getenv("db_name")
db_pswd = os.getenv("db_pswd")

In [6]:
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /home/diego/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [7]:
tokenizer = AutoTokenizer.from_pretrained('sentence-transformers/all-MiniLM-L6-v2')
model = AutoModel.from_pretrained('sentence-transformers/all-MiniLM-L6-v2')
# Model details: https://huggingface.co/sentence-transformers/all-MiniLM-L6-v2
# - 256 token limit
# - 384 embedding dimension
# - 22M params

In [8]:

def loadText(filename):
  f = open(filename, 'r')
  content = f.read()
  f.close()
  return content

def simpleWordCounter(text): 
  return len([ word for word in text.split(" ") if word != "" ]) # Need to filter out double or triple spaces

In [4]:


example_lecture1 = "course-lectures/BIS 002A/BIS-002A: 2024-02-05 10:57.txt" 
example_lecture2 = "course-lectures/ECN 001A/ECN-001A: 2023-10-31 18:07.txt" 
example_lecture3 = "course-lectures/ECS 154A/ECS-154A: 2025-02-21 14:07.txt" 

lecture1_text = loadText(example_lecture1)
lecture2_text = loadText(example_lecture2)
lecture3_text = loadText(example_lecture3)


In [5]:
lecture_to_test = lecture1_text
print(simpleWordCounter(lecture_to_test))
print(lecture_to_test)

6247
Sorry  about  that.  It  on.  Hi,  everyone. Can  I  have  your  attention  for  a  quick  announcement?  Thank  you. My  name  is  Sammy.  I'm  the  field  studies Advisor  here  on  campus  with  Wildland  Studies. And  I'm  here  today  and  tomorrow  to  talk  to  you  guys  about field  study  opportunities  that  we  have offered  in  15  different  locations. Some  of  our  locations  abroad  include Iceland,  Thailand,  Nepal,  Australia. But  we  also  have  some  closer  to  home, like  Yellowstone,  Alaska,  or  Big  Sir,  California. All  of  our  programs  focus  on  core  concepts  in  ecology, wildlife  culture,  and  how  these  concepts  tie  together. And  if  you  were  to  join  us  on  a  program, you  would  get  to  spend  your  quarter  hiking,  camping, snorkeling,  conducting  field  research  while  earning up  to  15  quarter  credits  in  the  upper  division  level. I'm  here  today  because  I  got  to  join Wildlands  as  a  student,  where  in  Haw

In [9]:
def splitSentences(text):
    # Split into sentences with token counts
    sentences = nltk.sent_tokenize(text)
    sentence_token_counts = []
    
    for sent in sentences:
        # Tokenize once without special tokens
        tokens = tokenizer.tokenize(sent, add_special_tokens=False)
        sentence_token_counts.append((sent, len(tokens)))
    return sentence_token_counts
      

def combine_sentences(sentence_token_counts, max_seq_length=256):
  max_tokens = max_seq_length - 2

  chunks = []
  current_chunk = []
  current_token_count = 0

  for sentence, token_count in sentence_token_counts:
    if current_token_count + token_count > max_tokens:
      # Finalize current chunk
      chunks.append(" ".join(current_chunk))
      current_chunk = []
      current_token_count = 0
    
    # Handle giant sentences exceeding max_tokens
    if token_count > max_tokens:
      # Split into sub-chunks by tokens
      tokens = tokenizer.tokenize(sentence, add_special_tokens=False)
      for i in range(0, len(tokens), max_tokens):
        subchunk = tokens[i:i+max_tokens]
        chunks.append(tokenizer.convert_tokens_to_string(subchunk))
    else:
      current_chunk.append(sentence)
      current_token_count += token_count

  # Add remaining sentences
  if current_chunk:
    chunks.append(" ".join(current_chunk))
  
  return chunks

In [10]:
#Mean Pooling - Take attention mask into account for correct averaging
def mean_pooling(model_output, attention_mask):
    token_embeddings = model_output[0] #First element of model_output contains all token embeddings
    input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
    return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9)


def TextListToEmbedding(sentences):
  # Tokenize sentences
  encoded_input = tokenizer(sentences, padding=True, truncation=True, return_tensors='pt')

  # Compute token embeddings
  with torch.no_grad():
    model_output = model(**encoded_input)

  # Perform pooling
  sentence_embeddings = mean_pooling(model_output, encoded_input['attention_mask'])

  # Normalize embeddings
  sentence_embeddings = F.normalize(sentence_embeddings, p=2, dim=1)

  # print("Sentence embeddings:")
  # print(sentence_embeddings)
  return sentence_embeddings.numpy()

In [69]:

embeddings_list = []
lectures = [lecture1_text, lecture2_text, lecture3_text]
for corpus in lectures:
  split_sentences = splitSentences(corpus)
  chunks = combine_sentences(split_sentences)
  embeddings = TextListToEmbedding(chunks)
  embeddings_list.append(embeddings)
  

In [None]:
print(len(embeddings_list), " | ", len(embeddings_list[0]), len(embeddings_list[1]), len(embeddings_list[2]))

3  |  33 47 35


In [71]:

with psycopg.connect(f"dbname={db_name} user={user} password={db_pswd} host=localhost") as conn:
  with conn.cursor() as cur:
    cur.execute(
        "SELECT * from test"
    )
    for record in cur:
      print(record)
    conn.commit()    

(1, 'Jeffrey')


In [72]:

for i, (chunk, embedding) in enumerate(zip(chunks, embeddings_list[0])):
  print(i, chunk, embedding)

0  Which is weird, is that? Any better? Is this any better? Yeah, that's better. Okay. So today we are going to continue. I want to sort of review what we did last time and then finish up this area so we can move into memory systems next time. So any questions before we start? I will sort of apologize if things seem a little bit. The problem was that when I taught this last 154B was a required class, not last last, but I used to teach this regularly, and 154B was a required class. So there basically wasn't much of what we've been talking about in here because you had to take 154B. Now, the CS students don't have to take 154B, but they need as much of that information as I can. can shove in here as possible. So I'm still trying to smooth some of that out. We've sort of seen something and then done something. And then people said, I didn't understand that first part. And then we're kind of, so hopefully, hopefully it'll be more clear. All right. So on Wednesday, we were talking about per

In [None]:

lecturepaths = [
  "course-lectures/BIS 002A/BIS-002A: 2024-02-05 10:57.txt", 
  "course-lectures/ECN 001A/ECN-001A: 2023-10-31 18:07.txt",
  "course-lectures/ECS 154A/ECS-154A: 2025-02-21 14:07.txt" 
]

with psycopg.connect(f"dbname={db_name} user={user} password={db_pswd} host=localhost") as conn:
  with conn.cursor() as cur:
    for lecturepath in lecturepaths:
      lecturename = lecturepath.split("/")[-1] 
      coursename = lecturepath.split("/")[1]
      lecture_text = loadText(lecturepath)

      split_sentences = splitSentences(lecture_text)
      chunks = combine_sentences(split_sentences)
      embeddings_list = TextListToEmbedding(chunks)

      avg_embedding = np.mean(embeddings_list, axis=0).tolist()

      cur.execute(
        "INSERT INTO lectures (title, class, avg_embedding) VALUES (%s, %s, %s) RETURNING lecture_id", 
        (lecturename, coursename, avg_embedding)       
      )
      lecture_id = cur.fetchone()[0]
      print(lecture_id)

      for i, (chunk, embedding) in enumerate(zip(chunks, embeddings_list)):
        cur.execute(
          "INSERT INTO chunks (lecture_id, content, embedding, position) VALUES (%s, %s, %s, %s)",
          (lecture_id, chunk, embedding.tolist(), i)
        )

      conn.commit()

     

7
8
9


In [None]:

# lecturepaths = [
#   "course-lectures/BIS 002A/BIS-002A: 2024-02-05 10:57.txt", 
#   "course-lectures/ECN 001A/ECN-001A: 2023-10-31 18:07.txt",
#   "course-lectures/ECS 154A/ECS-154A: 2025-02-21 14:07.txt" 
# ]

with psycopg.connect(f"dbname={db_name} user={user} password={db_pswd} host=localhost") as conn:
  with conn.cursor() as cur:
    for root, dirs, files in os.walk('./course-lectures'):
      for file in files:
        if not file.endswith('.txt'):
          continue
        
        try:
          lecturename = file
          coursename = lecturename.split(":")[0]
          lecturepath = f"{root}/{file}"
          lecture_text = loadText(lecturepath)
          if not lecture_text:
            continue

          split_sentences = splitSentences(lecture_text)
          chunks = combine_sentences(split_sentences)
          embeddings_list = TextListToEmbedding(chunks)

          cur.execute(
            "INSERT INTO lectures (title, class) VALUES (%s, %s) RETURNING lecture_id", 
            (lecturename, coursename)     
          )
          lecture_id = cur.fetchone()[0]
          print(lecture_id)

          for i, (chunk, embedding) in enumerate(zip(chunks, embeddings_list)):
            cur.execute(
              "INSERT INTO chunks (lecture_id, content, embedding, position) VALUES (%s, %s, %s, %s)",
              (lecture_id, chunk, embedding.tolist(), i)
            )
          conn.commit()
        except Exception as e:
          print(f"Failed to process {file}: {str(e)}")
          conn.rollback()     


10
11
12
13


Token indices sequence length is longer than the specified maximum sequence length for this model (518 > 512). Running this sequence through the model will result in indexing errors


14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
28

In [12]:

def NearestChunk(query):
  if type(query) != str:
    print("Invalid input")
    return False
  
  query_list = [query]
  query_embedding = TextListToEmbedding(query_list)[0].tolist()
  
  result_chunks = None
  with psycopg.connect(f"dbname={db_name} user={user} password={db_pswd} host=localhost") as conn:
    with conn.cursor() as cur:

      cur.execute(
        'SELECT c.content, l.title, 1 - (embedding <=> %s::vector) AS similarity FROM chunks c JOIN lectures l on c.lecture_id = l.lecture_id ORDER BY embedding <=> %s::vector LIMIT 3',
        (query_embedding,query_embedding)
      )
      result_chunks = cur.fetchall()

      conn.commit()

  print("Top 3 matches:\n--------------")
  for row in result_chunks:
    content, title, similarity = row  # Unpack all 3 values
    print(f"""
    Lecture: {title}
    Similarity: {similarity:.4f}
    Content: {content}
    """)
  return result_chunks

In [13]:


def NearestLecture(query):
  if type(query) != str:
    print("Invalid input")
    return False
  
  query_list = [query]
  query_embedding = TextListToEmbedding(query_list)[0].tolist()

  result_lectures = None
  with psycopg.connect(f"dbname={db_name} user={user} password={db_pswd} host=localhost") as conn:
    with conn.cursor() as cur:
      
      cur.execute(
        'SELECT title, class, 1 - (avg_embedding <=> %s::vector) AS similarity FROM lectures ORDER BY avg_embedding <=> %s::vector LIMIT 3',
        (query_embedding,query_embedding)
      )
      result_lectures = cur.fetchall()
  
      conn.commit()
      
  print("Top 3 matches:\n--------------")
  for row in result_lectures:
    title, classname, similarity = row  # Unpack all 3 values
    print(f"""
    Class: {classname}
    Lecture: {title}
    Similarity: {similarity:.4f}
    """)
  return result_lectures

In [None]:
queries = ['Derivatives and deep learning', 'Brain injury treatments, recovering from lesions', 'Design of large scale software', 'How the brain works?']

for query in queries:
  print(f"\n\nRunning with query: {query}")
  NearestChunk(query)
  NearestLecture(query)



Running with query: Derivatives and deep learning
Top 3 matches:
--------------

    Lecture: ECS-174: 2024-10-24 13:37.txt
    Similarity: 0.637
    Content: I'm  going  to  talk  about  that  now. Calculation  of  derivative. If  you  know  how  to  calculate  derivative, then  this  is  simple. Do  you  agree? But  calculation  of  derivative  in deep  m...
    

    Lecture: ECS-174: 2024-10-24 13:37.txt
    Similarity: 0.607
    Content: Don't  worry  about  it  for  now. But  imagine  from  that  point  of  view,  that's  possible,  right? But  I'm  arguing  that  that  optima  is  non  stable. And  by  little  noise,  we're  gonna  ...
    

    Lecture: ECS-174: 2024-10-24 13:37.txt
    Similarity: 0.583
    Content: If  you  do  the  forward  pass,  you're  done,  you  get  the  answer. But  if  you  want  to  do  the  learning, there's  another  step  called  backward  pass. That  means  after  the  forward, you...
    
Top 3 matches:
--------------

    Class: ECS-174
    

In [153]:


queries = ['Philosophy of science', 'How do I write good literature?', 'Career advice', 'Biological insights of DNA']

for query in queries:
  print(f"\n\nRunning with query: {query}")
  NearestChunk(query)
  NearestLecture(query)



Running with query: Philosophy of science
Top 3 matches:
--------------

    Lecture: PHI-013: 2023-04-19 10:57.txt
    Similarity: 0.441
    Content: But  you  might  think thought  experiments  are  a  thing  for  philosophers. Philosophers  like  to  think. We  reason  about  stuff. We  sit  in  our  armchairs  and  if  we  have  beards,  we  str...
    

    Lecture: PHI-013: 2023-05-15 10:57.txt
    Similarity: 0.415
    Content: I  work  on  consciousness  and  I  want a  scientific  theory  of  consciousness. That's  maybe  a  strange  thing, perhaps  coming  out  of  the  philosophers  mouth. But  there  are  a  lot  of  ph...
    

    Lecture: PHI-013: 2023-05-15 10:57.txt
    Similarity: 0.404
    Content: So  when  you're  thinking  about  how  to  develop  a  philosophy  of  mind, some  people  might  fall  into  this  category. Sit  in  their  armchair,  think, stroke  their  chin  and  asked  themse...
    
Top 3 matches:
--------------

    Class: PHI-013
    Lecture:

In [14]:




queries = ['Memory', 'Difference between implicit and explicit memory']

for query in queries:
  print(f"\n\nRunning with query: {query}")
  NearestChunk(query)
  NearestLecture(query)



Running with query: Memory
Top 3 matches:
--------------

    Lecture: ECS-154A: 2025-03-14 14:07.txt
    Similarity: 0.5810
    Content: So I may have a bite-addressable machine, but generally speaking, when I go out to get something, I'll get a word back. And maybe I'll just ignore the other three. There have been subwords, which doesn't really matter. Blocks are what we're familiar with now. Because, right, if I'm going to have a line size of four words, I can see. set up my machine to when I do a request, go out and get four words. Because that's my line size of my cache. So you just have to. OK, so we got that. This is kind of dull and dry at this point. But now we get into some interesting stuff. So there's various access methods. How do I access my memory? So there's sequential. which means I scan through memory until I hit the thing that I want. This is, it is laid out. Anybody know what sort of storage used that? No, the cache does not lay it out sequentially. This is more e

In [15]:


testing_spacing_effects = [
  "And  so  I  can  vividly  remember my  experience  like  when  I  saw  that  dog  and  where  I  saw  it. And  so  that's  an  episodic  memory because  it's  about  a  piece  of  information, but  it's  tied  to  a  particular  episode  on  a  more  cheerful  example. If  you  came  to  week  one  discussion, you'll  remember  that  I  showed  you  this  picture and  you  might  have  forgotten  some  details  about  the  episode. But  if  I  showed  you  this  picture  like  I  am  now, it's  likely  you'd  remember  at  least  oh, that's  Dr.  boy's  cat  and  dog. He  showed  us  pictures  of  them. In  week  one  discussion  in  cognitive  psychology, I  was  sitting  in  Wellman  Hall,  et  cetera. You'd  have  all  this  information  about the  particular  episode  that  you  experienced. Okay? And  then  we  also  have  what  are  called  implicit  memories. And  so  these  are  memories  that  you're not  necessarily  aware  of  and  you wouldn't  be  able  to  explain  in words  like  how  you  know  what  you  know, you  just  kind  of,  you  just  have  that  knowledge. So  a  classic  example  of procedural  implicit  memory  is  knowing  how  to  write  a  bike. So  you  just  kind  of  know  how  to  do  it.",
  "And so I can vividly remember my experience like when I saw that dog and where I saw it. And so that's an episodic memory because it's about a piece of information, but it's tied to a particular episode on a more cheerful example. If you came to week one discussion, you'll remember that I showed you this picture and you might have forgotten some details about the episode. But if I showed you this picture like I am now, it's likely you'd remember at least oh, that's Dr. boy's cat and dog. He showed us pictures of them. In week one discussion in cognitive psychology, I was sitting in Wellman Hall, et cetera. You'd have all this information about the particular episode that you experienced. Okay? And then we also have what are called implicit memories. And so these are memories that you're not necessarily aware of and you wouldn't be able to explain in words like how you know what you know, you just kind of, you just have that knowledge. So a classic example of procedural implicit memory is knowing how to write a bike. So you just kind of know how to do it."
]


for query in testing_spacing_effects:
  print(f"\n\nRunning with query: {query}")
  NearestChunk(query)
  NearestLecture(query)



Running with query: And  so  I  can  vividly  remember my  experience  like  when  I  saw  that  dog  and  where  I  saw  it. And  so  that's  an  episodic  memory because  it's  about  a  piece  of  information, but  it's  tied  to  a  particular  episode  on  a  more  cheerful  example. If  you  came  to  week  one  discussion, you'll  remember  that  I  showed  you  this  picture and  you  might  have  forgotten  some  details  about  the  episode. But  if  I  showed  you  this  picture  like  I  am  now, it's  likely  you'd  remember  at  least  oh, that's  Dr.  boy's  cat  and  dog. He  showed  us  pictures  of  them. In  week  one  discussion  in  cognitive  psychology, I  was  sitting  in  Wellman  Hall,  et  cetera. You'd  have  all  this  information  about the  particular  episode  that  you  experienced. Okay? And  then  we  also  have  what  are  called  implicit  memories. And  so  these  are  memories  that  you're not  necessarily  aware  of  and  you wouldn't  be  abl

In [18]:


burning_questions = [
  "Animal testing", "Horses"
]


for query in burning_questions:
  print(f"\n\nRunning with query: {query}")
  NearestChunk(query)
  NearestLecture(query)



Running with query: Animal testing
Top 3 matches:
--------------

    Lecture: NPB-162: 2025-03-06 13:37.txt
    Similarity: 0.6229
    Content: One just kind of thing to highlight is, anytime you're testing an animal for a strategy, so when you test, when you train animals on something, you train them with rewards when you want to test the animal to see what they remember you usually test them without a reward because there's all these problems with there could be smell of the reward or other things that are going on but then if you're actually more trying to understand what they're doing instead of just being sure then you would actually provide rewards steps that they keep investigating. So for this to be a perfect, perfect, perfect, perfect thing, you would have to remove all the rewards and just test it. Because the animal is going to behave based on what it expects to be happening and not what they can smell or other things that are going on. So the evidence supports this hypot