In [1]:
import sqlite3
import os
import pandas as pd

# Get relative path to the database file
db_path = "../amici/database/supreme_court_docs.db"

# Check if the file exists
if not os.path.exists(db_path):
    print(f"Database file not found at: {db_path}")
    print(f"Current working directory: {os.getcwd()}")
else:
    # Connect to SQLite database
    conn = sqlite3.connect(db_path)
    cursor = conn.cursor()
    print(f"Successfully connected to SQLite database at {db_path}")

Successfully connected to SQLite database at ../amici/database/supreme_court_docs.db


## Testing the Connection

Once connected, you can test the connection by executing a simple query:

In [2]:
# Use this after connecting with either method above
try:
    # Sample query - adjust table name as needed
    cursor.execute("SELECT name FROM sqlite_master WHERE type='table';")
    
    # Fetch and display results
    tables = cursor.fetchall()
    print("Tables in the database:")
    for table in tables:
        print(f"- {table[0]}")
except Exception as e:
    print(f"Error executing query: {e}")

Tables in the database:
- documents
- sqlite_sequence
- dockets
- amici
- lawyers


## Make excel sheets

In [7]:
# Make excel sheets from the database
try:
    # Sample query to fetch data from a specific table
    cursor.execute("SELECT * FROM amici WHERE document_id=524;")  # Replace with your actual table name
    rows = cursor.fetchall()

    column_names = [description[0] for description in cursor.description]
    print(column_names)
    
    # Print the first few rows
    print("Sample data from the table:")
    for row in rows:  # Display first 5 rows
        print(row)
except Exception as e:
    print(f"Error fetching data: {e}")

['amicus_id', 'document_id', 'name', 'category']
Sample data from the table:
(1998, 524, 'Center for Constitutional Rights', 'organization')
(1999, 524, 'Transgender Law Center', 'organization')
(2000, 524, 'National Center for Lesbian Rights', 'organization')
(2001, 524, 'Make the Road New York', 'organization')
(2002, 524, 'Bay Area Lawyers for Individual Freedom', 'organization')
(2003, 524, 'Black & Pink National', 'organization')
(2004, 524, 'Black Alliance for Just Immigration', 'organization')
(2005, 524, 'Black Trans Nation', 'organization')
(2006, 524, 'Center for Community Alternatives', 'organization')
(2007, 524, 'DC LGBTQ+ Community Center', 'organization')
(2008, 524, 'Desiree Alliance', 'organization')
(2009, 524, 'Drug Policy Alliance', 'organization')
(2010, 524, 'Equality Federation', 'organization')
(2011, 524, 'Equality New York', 'organization')
(2012, 524, 'Fountain House', 'organization')
(2013, 524, 'Free to Be Youth Project', 'organization')
(2014, 524, 'GLBTQ 

In [33]:
try:
    # Query to find documents with complete_amici_list=False
    cursor.execute("SELECT * FROM documents WHERE complete_amici_list=0")
    
    # Fetch and display results
    documents_with_appendix = cursor.fetchall()
    
    # Get column names from cursor description
    column_names = [description[0] for description in cursor.description]
    
    # Print count of documents
    print(f"Found {len(documents_with_appendix)} documents with amici list incomplete.")
    
    # Display column names and first few results
    if documents_with_appendix:
        print("\nColumn names:", column_names)
        print("\nFirst 5 documents with incomplete amici:")
        for doc in documents_with_appendix[:5]:
            print(doc)
except Exception as e:
    print(f"Error executing query: {e}")

# Create a file storing the blobs of all documents with complete_amici_list=False
try:
    cursor.execute("SELECT * FROM documents WHERE complete_amici_list=0")
    documents_with_appendix = cursor.fetchall()
    
    # Open a file to write the blobs
    with open("../amici/data/incomplete_amici_blobs.txt", "w") as f:
        for doc in documents_with_appendix:
            # Assuming the blob is in the first column (index 0)
            f.write(doc[7]+'\n')
    
    print("Blobs of incomplete amici documents have been written to incomplete_amici_blobs.txt")
except Exception as e:
    print(f"Error writing blobs to file: {e}")
# Close the database connection
finally:
    if conn:
        conn.close()
        print("Database connection closed.")

Found 1179 documents with amici list incomplete.

Column names: ['document_id', 'url', 'docket_url', 'date', 'date_formatted', 'label', 'doc_title', 'blob', 'transcribed', 'neededOCR', 'complete_amici_list', 'counsel_of_record']

First 5 documents with incomplete amici:
(4, 'http://www.supremecourt.gov/DocketPDF/22/22-535/252021/20230111151208528_22-506and22-535tsacLawyersCommitteeForCivilRightsUnderLaw.pdf', 'www.supremecourt.gov/search.aspx?filename=/docket/docketfiles/html/public/22-535.html', 'Jan 11 2023', '2023-01-11', 'Brief amici curiae of Lawyers’ Committee For Civil Rights Under Law and 21 Other Organizations filed (also in 22-506).  VIDED.  (Distributed)', 'Main Document', 'SUPREMECOURT/www.supremecourt.gov/DocketPDF/22/22-535/252021/20230111151208528_22-506and22-535tsacLawyersCommitteeForCivilRightsUnderLaw.pdf', 1, 0, 0, 'Damon Hewitt')
(29, 'http://www.supremecourt.gov/DocketPDF/19/19-1392/185243/20210729123007530_41063%20pdf%20Pierce.pdf', 'www.supremecourt.gov/search.as

In [17]:
df = pd.read_csv("../amici/data/features.csv")
df['leftrightsorted'] = df.apply(lambda row: '\t'.join(sorted([row['left_norm'], row['right_norm']])), axis=1)
df = df.drop_duplicates(subset=['leftrightsorted'])

In [22]:
df.sort_values('hbsbm_prob')[::-1][:30]

Unnamed: 0.1,Unnamed: 0,left_norm,right_norm,left_raw,right_raw,left_doc,right_doc,charsim,wordsim,ratio,...,tokensort,tokenset,levenstein,jaro_winkler,first_letter_jaccard,combined_len,len_ratio,sentence_cross_encoding,hbsbm_prob,leftrightsorted
114741,114741,natl. fed. of independent business small busin...,nfib small business legal ctr.,National Federation of Independent Business Sm...,NFIB Small Business Legal Center,30,725,0.627791,0.628154,0.621359,...,0.563107,0.915254,0.450704,0.577788,0.625,4.634729,0.450704,0.706659,0.999,natl. fed. of independent business small busin...
189412,189412,assn. of am. publishers,"assn. of am. publishers,.",Association of American Publishers,"Association of American Publishers, Inc.",1323,2232,0.923122,1.0,0.918919,...,0.918919,0.918919,0.85,0.97,0.75,4.304065,0.85,0.953626,0.999,assn. of am. publishers\tassn. of am. publishe...
131808,131808,california chamber of commerce,chamber of commerce of the united st.,California Chamber of Commerce,The Chamber of Commerce of the United States,30,3583,0.60897,0.676922,0.540541,...,0.513514,0.77551,0.295455,0.592929,0.333333,4.304065,0.681818,0.765045,0.999,california chamber of commerce\tchamber of com...
190104,190104,california st. assn. of counties,texas assn. of counties,California State Association of Counties,Texas Association of Counties,44,552,0.539644,0.660639,0.724638,...,0.724638,0.884615,0.625,0.707825,0.6,4.234107,0.725,0.658529,0.999,california st. assn. of counties\ttexas assn. ...
106111,106111,intellectual property law assn. of chicago,intellectual property owners assn.,Intellectual Property Law Association of Chicago,Intellectual Property Owners Association,272,1635,0.655769,0.65475,0.795455,...,0.795455,0.90411,0.645833,0.897976,0.428571,4.477337,0.833333,0.800238,0.999,intellectual property law assn. of chicago\tin...
190922,190922,natl. assn. of counties,texas assn. of counties,National Association of Counties,Texas Association of Counties,139,552,0.702262,0.768272,0.819672,...,0.819672,0.884615,0.78125,0.747773,0.6,4.110874,0.90625,0.691232,0.999,natl. assn. of counties\ttexas assn. of counties
211110,211110,natl. apartment assn.,san francisco apartment assn.,National Apartment Association,San Francisco Apartment Association,744,383,0.613361,0.60003,0.769231,...,0.8,0.823529,0.685714,0.742328,0.25,4.174387,0.857143,0.733272,0.999,natl. apartment assn.\tsan francisco apartment...
327103,327103,madison soc. fdn.,"madison soc. fdn.,.",Madison Society Foundation,"Madison Society Foundation, Inc.",106,3828,0.941351,1.0,0.896552,...,0.896552,0.896552,0.8125,0.9625,0.75,4.060443,0.8125,0.945344,0.999,"madison soc. fdn.\tmadison soc. fdn.,."
192130,192130,fed. of defense & corporate counsel,intl. assn. of defense counsel,Federation of Defense & Corporate Counsel,International Association of Defense Counsel,3142,2364,0.618573,0.557023,0.611765,...,0.705882,0.705882,0.386364,0.669677,0.428571,4.442651,0.931818,0.755434,0.999,fed. of defense & corporate counsel\tintl. ass...
103687,103687,am. intellectual property law assn.,intellectual property owners assn.,American Intellectual Property Law Association,Intellectual Property Owners Association,238,1635,0.729826,0.749119,0.813953,...,0.813953,0.90411,0.673913,0.793783,0.6,4.454347,0.869565,0.878092,0.999,am. intellectual property law assn.\tintellect...
