In [None]:
import os
import trafilatura
import json
from scipy import spatial
import pandas as pd
import re
from urllib.parse import urlparse
import trafilatura.sitemaps
import trafilatura.spider
from langchain.text_splitter import SpacyTextSplitter
from dotenv import load_dotenv
import openai

In [99]:
load_dotenv()

# PARAMATERS
# The embedding model is defined in the .env file as it needs to be shared with the server
# List of embedding models: 
EMBEDDING_CSV_NAME = os.getenv('EMBEDDING_CSV_NAME')
EMBEDDING_MODEL = os.getenv('EMBEDDING_MODEL')

# SCRAPING
#INITIAL_URLS = ["https://ucla.edu", "https://admission.ucla.edu", "https://transfers.ucla.edu", "https://newstudents.ucla.edu", "https://portal.housing.ucla.edu", 
#              "https://community.ucla.edu", "https://etransfercenter.seas.ucla.edu", "https://tap.ucla.edu"]

INITIAL_URLS = ["https://ucla.edu", "https://admission.ucla.edu", "https://transfers.ucla.edu", "https://portal.housing.ucla.edu"]

# CHUNKING
CHUNK_SIZE = 1600



In [104]:
# Scrapes the given initial URLS. First checks if sitemap exists, if not, crawls the page. 

urlList = []

def urlsMatch(x, y):
    parse1 = urlparse(x).netloc
    parse2 = urlparse(y).netloc
    
    if parse1.startswith('www.'):
        parse1 = re.sub(r'www.', '', parse1)
    if parse2.startswith('www.'):
        parse2 = re.sub(r'www.', '', parse2)
    return parse1 == parse2

failedUrls = INITIAL_URLS.copy()
failedUrlsIndex = []

for index, url in enumerate(reversed(INITIAL_URLS)):
    print(f"Currently Mapping: {url}")
    sites = trafilatura.sitemaps.sitemap_search(url)
    if(sites):
       del failedUrls[len(INITIAL_URLS) - index - 1]
        
    sites = list(filter(lambda s: urlsMatch(s, url), sites)) 
    print(f"{url} mapped: {len(sites)} sites found")
    urlList.extend(sites)

for url in failedUrls:
    print(f"Currently Mapping: {url}")
    to_visit, known_links = trafilatura.spider.focused_crawler(url, max_seen_urls=10, max_known_urls=100000)
    print(f"{url} mapped: {len(known_links)} sites found")
    urlList.extend(known_links)


urlList = list(set(urlList))


ERROR:trafilatura.downloads:not a 200 response: 404 for URL https://portal.housing.ucla.edu/sitemap_news.xml


Currently Mapping: https://portal.housing.ucla.edu


ERROR:trafilatura.downloads:not a 200 response: 404 for URL https://portal.housing.ucla.edu/sitemap_index.xml
ERROR:trafilatura.downloads:not a 200 response: 404 for URL https://portal.housing.ucla.edu/sitemap.xml.gz
ERROR:trafilatura.downloads:not a 200 response: 404 for URL https://transfers.ucla.edu/sitemap_news.xml


https://portal.housing.ucla.edu mapped: 378 sites found
Currently Mapping: https://transfers.ucla.edu


ERROR:trafilatura.downloads:not a 200 response: 404 for URL https://transfers.ucla.edu/sitemap_index.xml
ERROR:trafilatura.downloads:not a 200 response: 404 for URL https://transfers.ucla.edu/sitemap
ERROR:trafilatura.downloads:not a 200 response: 404 for URL https://transfers.ucla.edu/sitemap.xml.gz
ERROR:trafilatura.downloads:not a 200 response: 404 for URL https://transfers.ucla.edu/sitemap.xml
ERROR:trafilatura.downloads:not a 200 response: 404 for URL https://admission.ucla.edu/sitemap_news.xml


https://transfers.ucla.edu mapped: 0 sites found
Currently Mapping: https://admission.ucla.edu


ERROR:trafilatura.downloads:not a 200 response: 404 for URL https://admission.ucla.edu/sitemap_index.xml
ERROR:trafilatura.downloads:not a 200 response: 404 for URL https://admission.ucla.edu/sitemap
ERROR:trafilatura.downloads:not a 200 response: 404 for URL https://admission.ucla.edu/sitemap.xml.gz


https://admission.ucla.edu mapped: 200 sites found
Currently Mapping: https://ucla.edu


ERROR:trafilatura.downloads:not a 200 response: 404 for URL https://ucla.edu/sitemap_news.xml
ERROR:trafilatura.downloads:not a 200 response: 404 for URL https://ucla.edu/sitemap_index.xml
ERROR:trafilatura.downloads:not a 200 response: 404 for URL https://ucla.edu/sitemap
ERROR:trafilatura.downloads:not a 200 response: 404 for URL https://ucla.edu/sitemap.xml.gz


https://ucla.edu mapped: 41 sites found
Currently Mapping: https://transfers.ucla.edu
https://transfers.ucla.edu mapped: 38 sites found
['https://transfers.ucla.edu']


In [107]:
# Grabs the content of each URL
textList = []

for url in urlList:
    dl = trafilatura.fetch_url(url)
    if not dl == None:
        meta = trafilatura.extract_metadata(dl)
    text = trafilatura.extract(dl, include_tables=True, include_formatting=True, favor_recall=True)
    #with open(f"./text2/{meta.title}.txt", 'w') as f:
    #    if not text == None:
    #        f.write(text)
    textList.append({"meta": meta, "text": text})


ERROR:trafilatura.downloads:download error: https://portal.housing.ucla.edu/2021-2022-deluxe-residence-hall-triple-rates HTTPSConnectionPool(host='portal.housing.ucla.edu', port=443): Max retries exceeded with url: https://portal.housing.ucla.edu/2021-2022-deluxe-residence-hall-triple-rates (Caused by ResponseError('too many redirects'))
ERROR:trafilatura.downloads:not a 200 response: 404 for URL https://transfers.ucla.edu/node/48
ERROR:trafilatura.downloads:not a 200 response: 403 for URL https://transfers.ucla.edu/transfer-timelines/
ERROR:trafilatura.downloads:not a 200 response: 403 for URL https://transfers.ucla.edu/resources/transfer-newsletter


In [108]:
# Chunks the text of each page

text_splitter = SpacyTextSplitter(chunk_size=CHUNK_SIZE)
chunkList = []

for content, url in zip(textList, urlList):
    if content["text"] == None: continue
        
    sections = text_splitter.split_text(content["text"])
    sections = list(map(lambda s: content["meta"].title + "\n" + s, sections))
    chunkList.extend(sections)

chunkDF = pd.DataFrame(data={"chunk": chunkList})
chunkDF.to_csv("chunk2.csv")




In [109]:
# Creating the embedding for each chunk

client = openai.OpenAI(api_key=os.getenv('OPENAI_KEY'))

embeddingsList = []

response = client.embeddings.create(model=EMBEDDING_MODEL, input=chunkList)
for i, be in enumerate(response.data):
    assert i == be.index
batch_embeddings = [e.embedding for e in response.data]
embeddingsList.extend(batch_embeddings)

embeddingDF = pd.DataFrame({'text': chunkList, 'embeddings': embeddingsList})
embeddingDF.to_csv("embeddings2.csv")

In [111]:
# Filters dataset based on similarity to given query

testEmbeddingDF = pd.read_csv("embeddings2.csv")

relatedness_queries = ["Transfer"]

relatededness_fn=lambda x,y: 1 - spatial.distance.cosine(x,y)
relatedness_query_embeds = client.embeddings.create(model=EMBEDDING_MODEL, input=relatedness_queries).data

# Value between 0 and 1, bigger value is less simlar.
DISTANCE_CUTOFF = 1


for queryEmbedding in relatedness_query_embeds:
    for i, row in testEmbeddingDF.iterrows():
        relatedness = relatededness_fn(queryEmbedding.embedding, json.loads(row["embeddings"]))
        print(relatedness)
        if(relatedness > DISTANCE_CUTOFF):
            print(f"dropped {row['text']}")
            testEmbeddingDF = testEmbeddingDF.drop([i])
            
testEmbeddingDF.to_csv(f"embeddings/{EMBEDDING_CSV_NAME}")



0.13715215483526977
0.1627168770105173
0.19744599413579378
0.11371921511677474
0.0867219236704816
0.1049391310032255
0.09714299617849453
0.16920151995946864
0.20205925328902918
0.17365953943402368
0.1581004546696141
0.15787492386375412
0.1457226456050006
0.16527215665793937
0.30983416736433633
0.2871960262283837
0.2136049479824147
0.29626359177583184
0.3791814562853695
0.25368890820378787
0.2014775273656868
0.1143771734712471
0.1086582920533582
0.0995202782164969
0.2352460482998422
0.3423298668873507
0.14480946585836751
0.23085282470014423
0.13906796876412697
0.12159508663185703
0.14225789814205114
0.12439560157995888
0.11765584110761174
0.08924101578099186
0.05554279812672658
0.04978322123955503
0.06311703475628327
0.044136224666463275
0.07464741614006376
0.0671904896985751
0.15328496606857633
0.146491844536909
0.09483739956726223
0.09742442922034145
0.06234946525109364
0.18239181332457932
0.20008969093275297
0.16967744962258102
0.1881171194464728
0.18587044210714398
0.191792930456987