In [6]:
import os
from dotenv import load_dotenv

from langchain.chat_models import ChatOpenAI
from langchain.prompts import ChatPromptTemplate

import requests
import dl_google_cl
import json

In [7]:
load_dotenv()
openai_api_key = os.getenv("OPENAI_API_KEY")
if openai_api_key is None:
    raise ValueError("OpenAI API key not found. Please check your .env file.")

llm = ChatOpenAI(model="gpt-3.5-turbo-1106", temperature=0, api_key=openai_api_key)
llm_4 = ChatOpenAI(model="gpt-4-1106-preview", temperature=0, api_key=openai_api_key)

In [8]:
session = requests.Session()
session.headers.update({'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'})

proxies = dl_google_cl.get_proxies('direct')

url = 'https://scholar.google.com/scholar_case?case=16629036189640273971&q=michael+a+cunningham+v+new+york&hl=en&as_sdt=2006'
case1, links, soup = dl_google_cl.get_text_links(url=url, proxies=proxies, session=session)
new_links = dl_google_cl.parse_links(links)
print("Case:", case1[:40], "...")
print("Num new links:", len(new_links))

Full API Response: {'count': 10, 'next': None, 'previous': None, 'results': [{'id': 'd-15375923031', 'username': 'zeobvnhf', 'password': '9djpunuob54a', 'proxy_address': '38.154.227.167', 'port': 5868, 'valid': True, 'last_verification': '2023-12-30T17:45:12.276734-08:00', 'country_code': 'US', 'city_name': 'Piscataway', 'asn_name': 'B2 Net Solutions Inc.', 'asn_number': 55286, 'high_country_confidence': True, 'created_at': '2023-09-25T14:16:00.284955-07:00'}, {'id': 'd-782270686', 'username': 'zeobvnhf', 'password': '9djpunuob54a', 'proxy_address': '185.199.229.156', 'port': 7492, 'valid': True, 'last_verification': '2023-12-30T17:43:36.852019-08:00', 'country_code': 'ES', 'city_name': 'Madrid', 'asn_name': 'Cafe Tecnologia Llc', 'asn_number': 205993, 'high_country_confidence': True, 'created_at': '2022-10-08T17:21:18.335739-07:00'}, {'id': 'd-715009785', 'username': 'zeobvnhf', 'password': '9djpunuob54a', 'proxy_address': '185.199.228.220', 'port': 7300, 'valid': True, 'last_verifica

In [9]:
with open("./json-files/prompts.json", 'r') as file:
    prompts_data = json.load(file)

with open("./json-files/variables.json", 'r') as file:
    variables_data = json.load(file)

summary_prompt = ChatPromptTemplate.from_template(str(prompts_data['summary']))
summary_llm = summary_prompt | llm_4

top_citations_prompt = ChatPromptTemplate.from_template(str(prompts_data['top_citations']))
top_citations_llm = top_citations_prompt | llm_4

best_next_case_prompt = ChatPromptTemplate.from_template(str(prompts_data['best_next_case']))
best_next_case_llm = best_next_case_prompt | llm

synthesis_prompt = ChatPromptTemplate.from_template(str(prompts_data['synthesis']))
synthesis_llm = synthesis_prompt | llm_4

In [10]:
summaries_list = []
citations_list = []

for i in range(5):
    print("Summarizing", i)
    print("Case length in tokens (char/4):", len(case1)/4)
    # Summarize case, add to summaries_list
    summary = summary_llm.invoke({
        "background": variables_data['background'],
        "issue": variables_data['issue'],
        "case": case1
    }).content
    summaries_list.append(summary)

    print("Finding citations")
    # Look at backwards citations, add them to list
    top_citations = top_citations_llm.invoke({
        "background": variables_data['background'],
        "issue": variables_data['issue'],
        "case": case1
    }).content
    citations_list.append(top_citations)

    # Decide best next citation
    best_next = best_next_case_llm.invoke({
        "background": variables_data['background'],
        "issue": variables_data['issue'],
        "case": top_citations
    }).content

    url = dl_google_cl.get_citation_url(best_next, soup)
    print(best_next, url)
    if len(url) <= len('https://scholar.google.com/'):
        break
    case1, _, soup = dl_google_cl.get_text_links(url=url, proxies=proxies, session=session)

synthesis = synthesis_llm.invoke({
        "background": variables_data['background'],
        "issue": variables_data['issue'],
        "summaries": summaries_list
}).content

Summarizing 0
Case length in tokens (char/4): 3664.5
Finding citations
<function process_citation at 0x000001E64921F760>
<BEST_CASE:> People v. Weaver, 12 NY3d 433 (2009) https://scholar.google.com/scholar_case?case=11458299455679828571&q=michael+a+cunningham+v+new+york&hl=en&as_sdt=2006
2
<!DOCTYPE html>
<html><head><title>People v. Weaver, 909 NE 2d 1195 - NY: Court of Appeals 2009 - Google Scholar</title><meta content="text/html;charset=utf-8" http-equiv="Content-Type"/><meta content="IE=Edge" http-equiv="X-UA-Compatible"/><meta content="origin-when-cross-origin" name="referrer"/><meta content="width=device-width,initial-scale=1,minimum-scale=1,maximum-scale=2" name="viewport"/><meta content="telephone=no" name="format-detection"/><link href="/favicon.ico" rel="shortcut icon"/><style>html,body,form,table,div,h1,h2,h3,h4,h5,h6,img,ol,ul,li,button{margin:0;padding:0;border:0;}table{border-collapse:collapse;border-width:0;empty-cells:show;}html,body{height:100%}#gs_top{position:relativ

TypeError: object of type 'NoneType' has no len()

In [11]:
synthesis = synthesis_llm.invoke({
        "background": variables_data['background'],
        "issue": variables_data['issue'],
        "summaries": summaries_list
}).content

In [None]:
print(synthesis)