In [1]:
import os
from dotenv import load_dotenv

from langchain.chat_models import ChatOpenAI
from langchain.prompts import ChatPromptTemplate

import requests
import dl_google_cl
import json

import time

In [2]:
# Load environment settings first; the API key is then read from the
# environment (the error below points the user at the .env file if it is absent).
load_dotenv()

openai_api_key = os.getenv("OPENAI_API_KEY")
if openai_api_key is None:
    raise ValueError("OpenAI API key not found. Please check your .env file.")

# Both chat models share the same key and temperature=0; they differ only in
# which model they target.
_shared_model_kwargs = dict(temperature=0, api_key=openai_api_key)

llm = ChatOpenAI(model="gpt-3.5-turbo-1106", **_shared_model_kwargs)
llm_4 = ChatOpenAI(model="gpt-4-1106-preview", **_shared_model_kwargs)

In [3]:
# A single HTTP session, reused by every request below, that identifies itself
# with a desktop-browser User-Agent string.
BROWSER_USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'

session = requests.Session()
session.headers.update({'User-Agent': BROWSER_USER_AGENT})

# NOTE(review): the saved output of this cell contains a full proxy API
# response, including usernames and passwords. Presumably it is printed from
# inside dl_google_cl.get_proxies -- confirm, then clear the output (and
# consider rotating those credentials) before sharing this notebook.
proxies = dl_google_cl.get_proxies('direct')

# Starting case for the citation walk.
url = 'https://scholar.google.com/scholar_case?case=16629036189640273971&q=michael+a+cunningham+v+new+york&hl=en&as_sdt=2006'
case1, links, soup = dl_google_cl.get_text_links(url=url, proxies=proxies, session=session)
new_links = dl_google_cl.parse_links(links)

# Quick sanity check: a short prefix of the case text and the parsed link count.
print("Case:", case1[:40], "...")
print("Num new links:", len(new_links))

Full API Response: {'count': 10, 'next': None, 'previous': None, 'results': [{'id': 'd-15375923031', 'username': 'zeobvnhf', 'password': '9djpunuob54a', 'proxy_address': '38.154.227.167', 'port': 5868, 'valid': True, 'last_verification': '2023-12-31T12:04:37.801007-08:00', 'country_code': 'US', 'city_name': 'Piscataway', 'asn_name': 'Server-Mania', 'asn_number': 55286, 'high_country_confidence': True, 'created_at': '2023-09-25T14:16:00.284955-07:00'}, {'id': 'd-782270686', 'username': 'zeobvnhf', 'password': '9djpunuob54a', 'proxy_address': '185.199.229.156', 'port': 7492, 'valid': True, 'last_verification': '2023-12-31T12:05:02.165795-08:00', 'country_code': 'ES', 'city_name': 'Madrid', 'asn_name': 'Cafe Tecnologia Llc', 'asn_number': 205993, 'high_country_confidence': True, 'created_at': '2022-10-08T17:21:18.335739-07:00'}, {'id': 'd-715009785', 'username': 'zeobvnhf', 'password': '9djpunuob54a', 'proxy_address': '185.199.228.220', 'port': 7300, 'valid': True, 'last_verification': '2

In [30]:
def _read_json(path):
    """Parse the JSON document at `path` and return the resulting object."""
    with open(path, 'r') as handle:
        return json.load(handle)


def _template_for(prompt_key):
    """Build a ChatPromptTemplate from the `prompt_key` entry of prompts_data."""
    return ChatPromptTemplate.from_template(str(prompts_data[prompt_key]))


# Prompt texts and the case-specific variables (background, issue) live in JSON
# files next to the notebook.
prompts_data = _read_json("./json-files/prompts.json")
variables_data = _read_json("./json-files/variables.json")

# Each chain pipes a prompt template into one of the two models defined earlier:
# `llm` for summary / best_next_case / decide_if_relevant, `llm_4` for
# top_citations / synthesis.
summary_prompt = _template_for('summary')
summary_llm = summary_prompt | llm

top_citations_prompt = _template_for('top_citations')
top_citations_llm = top_citations_prompt | llm_4

best_next_case_prompt = _template_for('best_next_case')
best_next_case_llm = best_next_case_prompt | llm

synthesis_prompt = _template_for('synthesis')
synthesis_llm = synthesis_prompt | llm_4

decision_prompt = _template_for('decide_if_relevant')
decision_llm = decision_prompt | llm

NAIVE APPROACH:

In [None]:
summaries_list = []
citations_list = []


def _prompt_inputs(**extra):
    """Return the background/issue inputs every prompt here needs, plus `extra`."""
    return {
        "background": variables_data['background'],
        "issue": variables_data['issue'],
        **extra,
    }


SCHOLAR_ROOT = 'https://scholar.google.com/'

# Walk at most five citation hops. The walk starts from whatever `case1` and
# `soup` currently hold and overwrites them on every hop, so re-running this
# cell continues from the last case fetched rather than from the first one.
for hop in range(5):
    print("Summarizing", hop)
    print("Case length in tokens (char/4):", len(case1)/4)

    # Summarize the current case and keep the summary for the final synthesis.
    summary = summary_llm.invoke(_prompt_inputs(case=case1)).content
    summaries_list.append(summary)

    print("Finding citations")

    # Ask for the most relevant citations found in the current case.
    top_citations = top_citations_llm.invoke(_prompt_inputs(case=case1)).content
    citations_list.append(top_citations)

    # Choose which of those citations to follow next.
    best_next = best_next_case_llm.invoke(_prompt_inputs(case=top_citations)).content

    url = dl_google_cl.get_citation_url(best_next, soup)
    print(best_next, url)

    # A result no longer than the bare site root is treated as "nothing to
    # follow", which ends the walk.
    if not len(url) > len(SCHOLAR_ROOT):
        break

    case1, _, soup = dl_google_cl.get_text_links(url=url, proxies=proxies, session=session)

# Combine every per-case summary into a single synthesis.
synthesis = synthesis_llm.invoke(_prompt_inputs(summaries=summaries_list)).content

In [5]:
def count_case_appearances(input_list):
    """Rank cases by how many times they appear across several result lists.

    Args:
        input_list: An iterable of result lists; every result is a
            (case, link) pair.

    Returns:
        A list of (case, link, count) tuples ordered by count, highest first.
        Cases with equal counts keep the order in which they were first seen.
        The link reported for a case is the one from its most recent
        appearance in the input.
    """
    tallies = {}
    latest_link = {}

    for results in input_list:
        for case, link in results:
            tallies[case] = tallies.get(case, 0) + 1
            # Each new sighting replaces the link stored for the case.
            latest_link[case] = link

    ranked = [(case, latest_link[case], hits) for case, hits in tallies.items()]

    # list.sort is stable, so ties stay in first-seen order.
    ranked.sort(key=lambda entry: entry[2], reverse=True)
    return ranked

BETTER APPROACH:

Step 1: Generate relevant QUERIES (not yet implemented; the queries are hard-coded in the next cell)

Step 2: Search + add all cases to a list of results

Step 3: Rank the cases by number of appearances in results for different QUERIES

Step 4: Decide whether or not they are worth truly researching

In [6]:
# Hand-written search queries; each one is searched separately further down.
queries = [
    'Case law on warrant expiration in police surveillance',
    'GPS tracking and admissibility of evidence',
    'Fourth Amendment and warrantless surveillance',
    'Exclusionary rule in cases of expired warrants',
    'Precedents on police tracking after warrant expiration',
    'Search and seizure cases involving expired warrants',
]

In [7]:
# Step 2: run every query and keep one list of results per query.
cases = []

for search_query in queries:
    search_url = dl_google_cl.create_link_query(search_query)
    time.sleep(2.5)  # pause before each request
    cases.append(
        dl_google_cl.parse_query_url(search_url, proxies=proxies, session=session)
    )

1
2
3
4
5
6


In [8]:
# Step 3: rank the cases by how many of the per-query result lists contain them.
sorted_cases = count_case_appearances(input_list=cases)

In [32]:
# Step 4: for the most frequently returned cases, ask the model whether each
# one is worth researching in depth, and record the verdicts.
MAX_CASES_TO_REVIEW = 11  # the previous loop guard (`t > 10`) admitted 11 cases; kept
MAX_DECISION_RETRIES = 3  # upper bound on re-asking after an unparseable answer
SCHOLAR_PREFIX = 'https://scholar.google.com'


def _parse_decision(raw_response):
    """Extract a 'YES'/'NO' verdict from a model reply.

    Args:
        raw_response: Text returned by the decision chain.

    Returns:
        'YES' or 'NO' when the final whitespace-separated token of the reply
        is that verdict (case-insensitive, surrounding punctuation ignored, so
        a reply such as '<RESPONSE:> YES' is accepted); otherwise None.
    """
    tokens = str(raw_response).upper().split()
    if not tokens:
        return None
    verdict = tokens[-1].strip('.,:;!?"\'<>()[]*')
    return verdict if verdict in ('YES', 'NO') else None


research_decisions = []

for case in sorted_cases[:MAX_CASES_TO_REVIEW]:
    case_name, case_path, appearances = case

    url = SCHOLAR_PREFIX + case_path
    text, _, _ = dl_google_cl.get_text_links(url=url, proxies=proxies, session=session)

    time.sleep(5)  # pause between requests

    # Only the first quarter of the opinion is sent to the model.
    initial_look = text[:len(text) // 4]

    # Use the reply's text (.content), not the message object itself;
    # otherwise the decision prompt receives the object's `content="..."` repr.
    case_summary = summary_llm.invoke({
        "background": variables_data['background'],
        "case": initial_look,
        "issue": variables_data['issue']
    }).content

    print(case_summary)

    raw_decision = decision_llm.invoke({
        "background": variables_data['background'],
        "case": case_summary,
        "issue": variables_data['issue'],
        "queries": queries
    }).content

    print(raw_decision)

    decision = _parse_decision(raw_decision)

    # The previous condition, `decision != ('YES' or 'NO')`, reduces to
    # `decision != 'YES'` because `or` returns its first truthy operand, so a
    # 'NO' answer could never end the loop. Accept either verdict, and bound
    # the number of retries so a persistently malformed reply cannot hang the
    # cell. Retries judge the raw excerpt instead of the summary, as before.
    retries = 0
    while decision is None and retries < MAX_DECISION_RETRIES:
        retries += 1
        raw_decision = decision_llm.invoke({
            "background": variables_data['background'],
            "case": initial_look,
            "issue": variables_data['issue'],
            "queries": queries
        }).content
        decision = _parse_decision(raw_decision)

    if decision is None:
        # Record the failure explicitly rather than guessing a verdict.
        decision = 'UNDECIDED'
        print("No YES/NO verdict for", case_name, "after", MAX_DECISION_RETRIES, "retries")

    research_decisions.append([decision, case_name, case_path, appearances])

9
content="The case of US v. Jones directly pertains to the issue of whether the police should be allowed to use evidence discovered after a warrant has expired. Here's how each side would use the case for their arguments:\n\n- The defense would argue that the installation of a GPS tracking device on the vehicle and its use to monitor the vehicle's movements constituted a search under the Fourth Amendment, and therefore, the evidence obtained after the warrant expired should be inadmissible.\n- The prosecution would argue that the use of the GPS device to monitor the vehicle's movements on public streets did not violate Jones's reasonable expectation of privacy, as the information obtained was voluntarily conveyed to the public.\n\nEvery key thought about the case in direct relation to the issue has a direct text citation from the case."
<RESPONSE:> YES
0
content="- The case of Carpenter v. US directly pertains to the issue at hand because it involves the acquisition of cell-site recor

In [34]:
# Show one [decision, case name, link, appearance count] row per line.
for decision_row in research_decisions:
    print(decision_row)

['YES', 'US v. Jones', '/scholar_case?case=3066032366235422373&q=Search+and+seizure+cases+involving+expired+warrants&hl=en&oe=ASCII&as_sdt=6,33', 5]
['YES', 'Carpenter v. US', '/scholar_case?case=14655974745807704559&q=Precedents+on+police+tracking+after+warrant+expiration&hl=en&oe=ASCII&as_sdt=6,33', 4]
['YES', 'Katz v. United States', '/scholar_case?case=9210492700696416594&q=Precedents+on+police+tracking+after+warrant+expiration&hl=en&oe=ASCII&as_sdt=6,33', 3]
['YES', 'Pennsylvania v. Mimms', '/scholar_case?case=16533225265380952768&q=Search+and+seizure+cases+involving+expired+warrants&hl=en&oe=ASCII&as_sdt=6,33', 3]
['YES', 'Sibron v. New York', '/scholar_case?case=16128185020980229395&q=Search+and+seizure+cases+involving+expired+warrants&hl=en&oe=ASCII&as_sdt=6,33', 3]
['YES', 'United States v. Karo', '/scholar_case?case=495897577064781112&q=Fourth+Amendment+and+warrantless+surveillance&hl=en&oe=ASCII&as_sdt=6,33', 3]
['YES', 'Herring v. US', '/scholar_case?case=105739868649083083