In [1]:
import sqlite3
import os
import pandas as pd

# Path to the SQLite database, relative to the notebook's working directory
db_path = "../amici/database/supreme_court_docs.db"

# Start from an explicit "not connected" state so that later cells can test
# `conn` / `cursor` instead of failing with a NameError when the file is missing.
conn = None
cursor = None

# sqlite3.connect() creates a new, empty database when the path does not exist,
# so only connect when the file is already there.
if os.path.exists(db_path):
    conn = sqlite3.connect(db_path)
    cursor = conn.cursor()
    print(f"Successfully connected to SQLite database at {db_path}")
else:
    print(f"Database file not found at: {db_path}")
    print(f"Current working directory: {os.getcwd()}")

Successfully connected to SQLite database at ../amici/database/supreme_court_docs.db


## Testing the Connection

Once connected, you can test the connection by executing a simple query:

In [2]:
# Smoke-test the connection opened in the first cell by listing every table
# recorded in SQLite's schema catalogue.
list_tables_sql = "SELECT name FROM sqlite_master WHERE type='table';"

try:
    cursor.execute(list_tables_sql)
    tables = cursor.fetchall()

    print("Tables in the database:")
    # Each result row is a 1-tuple holding the table name
    for (table_name,) in tables:
        print(f"- {table_name}")
except Exception as err:
    print(f"Error executing query: {err}")

Tables in the database:
- documents
- sqlite_sequence
- dockets
- amici
- lawyers


## Make Excel sheets

In [7]:
# Inspect the amici recorded for a single document.
# Change DOCUMENT_ID to look at a different filing.
DOCUMENT_ID = 524

try:
    # Bind the id as a query parameter instead of embedding it in the SQL text
    cursor.execute("SELECT * FROM amici WHERE document_id = ?;", (DOCUMENT_ID,))
    rows = cursor.fetchall()

    # The first item of every cursor.description entry is the column name
    column_names = [description[0] for description in cursor.description]
    print(column_names)

    # Print every matching row (the result is not truncated to a sample)
    print("Sample data from the table:")
    for row in rows:
        print(row)
except Exception as e:
    print(f"Error fetching data: {e}")

['amicus_id', 'document_id', 'name', 'category']
Sample data from the table:
(1998, 524, 'Center for Constitutional Rights', 'organization')
(1999, 524, 'Transgender Law Center', 'organization')
(2000, 524, 'National Center for Lesbian Rights', 'organization')
(2001, 524, 'Make the Road New York', 'organization')
(2002, 524, 'Bay Area Lawyers for Individual Freedom', 'organization')
(2003, 524, 'Black & Pink National', 'organization')
(2004, 524, 'Black Alliance for Just Immigration', 'organization')
(2005, 524, 'Black Trans Nation', 'organization')
(2006, 524, 'Center for Community Alternatives', 'organization')
(2007, 524, 'DC LGBTQ+ Community Center', 'organization')
(2008, 524, 'Desiree Alliance', 'organization')
(2009, 524, 'Drug Policy Alliance', 'organization')
(2010, 524, 'Equality Federation', 'organization')
(2011, 524, 'Equality New York', 'organization')
(2012, 524, 'Fountain House', 'organization')
(2013, 524, 'Free to Be Youth Project', 'organization')
(2014, 524, 'GLBTQ 

In [33]:
# --- Part 1: summarize the documents whose amici list is flagged incomplete ---
try:
    # complete_amici_list=0 selects the documents flagged as incomplete
    cursor.execute("SELECT * FROM documents WHERE complete_amici_list=0")
    documents_with_appendix = cursor.fetchall()

    # Get column names from cursor description
    column_names = [description[0] for description in cursor.description]

    print(f"Found {len(documents_with_appendix)} documents with amici list incomplete.")

    # Show the schema and a small sample only; the full result can be large
    if documents_with_appendix:
        print("\nColumn names:", column_names)
        print("\nFirst 5 documents with incomplete amici:")
        for doc in documents_with_appendix[:5]:
            print(doc)
except Exception as e:
    print(f"Error executing query: {e}")

# --- Part 2: write the blob value of each of those documents, one per line ---
try:
    # Select the column by name rather than by position in SELECT *, so the
    # export keeps working if the column order of `documents` ever changes.
    cursor.execute("SELECT blob FROM documents WHERE complete_amici_list=0")
    blob_values = [blob for (blob,) in cursor.fetchall()]

    skipped = 0
    # Explicit UTF-8 so the output does not depend on the platform default encoding
    with open("../amici/data/incomplete_amici_blobs.txt", "w", encoding="utf-8") as f:
        for blob in blob_values:
            if blob is None:
                # A NULL blob has nothing to write; skip it instead of aborting
                # the whole export halfway through the file.
                skipped += 1
                continue
            f.write(blob + '\n')

    print("Blobs of incomplete amici documents have been written to incomplete_amici_blobs.txt")
    if skipped:
        print(f"Skipped {skipped} documents with no blob value.")
except Exception as e:
    print(f"Error writing blobs to file: {e}")
# Close the database connection; cells that use `cursor` must not run after this
finally:
    if conn:
        conn.close()
        print("Database connection closed.")

Found 1179 documents with amici list incomplete.

Column names: ['document_id', 'url', 'docket_url', 'date', 'date_formatted', 'label', 'doc_title', 'blob', 'transcribed', 'neededOCR', 'complete_amici_list', 'counsel_of_record']

First 5 documents with incomplete amici:
(4, 'http://www.supremecourt.gov/DocketPDF/22/22-535/252021/20230111151208528_22-506and22-535tsacLawyersCommitteeForCivilRightsUnderLaw.pdf', 'www.supremecourt.gov/search.aspx?filename=/docket/docketfiles/html/public/22-535.html', 'Jan 11 2023', '2023-01-11', 'Brief amici curiae of Lawyers’ Committee For Civil Rights Under Law and 21 Other Organizations filed (also in 22-506).  VIDED.  (Distributed)', 'Main Document', 'SUPREMECOURT/www.supremecourt.gov/DocketPDF/22/22-535/252021/20230111151208528_22-506and22-535tsacLawyersCommitteeForCivilRightsUnderLaw.pdf', 1, 0, 0, 'Damon Hewitt')
(29, 'http://www.supremecourt.gov/DocketPDF/19/19-1392/185243/20210729123007530_41063%20pdf%20Pierce.pdf', 'www.supremecourt.gov/search.as

In [2]:
import pandas as pd
import numpy as np
def _unordered_pair_key(row):
    """Return the row's two normalized names, sorted and tab-joined.

    Sorting makes the key independent of which name is on the left, so the
    pairs (a, b) and (b, a) produce the same key.
    """
    names = [row['left_norm'], row['right_norm']]
    names.sort()
    return '\t'.join(names)


# Load the pairwise feature table, tag every row with its order-independent
# key, and keep only the first row seen for each unordered pair.
df = (
    pd.read_csv("../amici/deduplication/data/feature.csv")
    .assign(leftrightsorted=lambda frame: frame.apply(_unordered_pair_key, axis=1))
    .drop_duplicates(subset=['leftrightsorted'])
)
print(len(df))

4518


In [6]:
# Sort ascending by dc_reg_eq, flip the row order, and show the first 11 rows
df.sort_values('dc_reg_eq').iloc[::-1].head(11)

Unnamed: 0,left_norm,right_norm,charsim,wordsim,ratio,partialratio,tokensort,tokenset,levenstein,jaro_winkler,first_letter_jaccard,combined_len,len_ratio,dc_reg_eq,leftrightsorted
3927,jewish war veterans of the united states of am...,veterans of foreign wars,0.347456,0.21786,0.453333,0.666667,0.506667,0.628571,0.313725,0.608497,0.333333,4.317488,0.470588,6.159984,jewish war veterans of the united states of am...
3923,veterans of foreign wars,veterans of foreign wars of the united states,0.80241,0.843153,0.695652,1.0,0.695652,1.0,0.533333,0.906667,0.571429,4.234107,0.533333,4.551082,veterans of foreign wars\tveterans of foreign ...
1341,california financial services assn,financial services inst,0.70369,0.750308,0.701754,0.869565,0.631579,0.878049,0.588235,0.731497,0.4,4.043051,0.676471,4.257824,california financial services assn\tfinancial ...
3403,california state sheriffs assn,western states sheriffs’ assn,0.500685,0.471792,0.745763,0.830189,0.644068,0.644068,0.666667,0.694288,0.5,4.077537,0.966667,2.710935,california state sheriffs assn\twestern states...
46,informed consent action network,physicians for informed consent,0.515592,0.726392,0.516129,0.680851,0.580645,0.680851,0.096774,0.670088,0.333333,4.127134,1.0,2.380562,informed consent action network\tphysicians fo...
2,united south and eastern tribes sovereignty pr...,uset sovereignty protection fund,0.656144,0.54729,0.703297,0.95082,0.659341,0.915254,0.542373,0.731462,0.571429,4.51086,0.542373,1.260449,united south and eastern tribes sovereignty pr...
2053,advocates for human rights,human rights defense ctr,0.462446,0.576962,0.48,0.666667,0.68,0.68,0.115385,0.600641,0.333333,3.912023,0.923077,1.177976,advocates for human rights\thuman rights defen...
1376,americans for immigrant justice,dream corps justice,0.191354,0.163135,0.52,0.645161,0.56,0.56,0.419355,0.67712,0.166667,3.912023,0.612903,1.105257,americans for immigrant justice\tdream corps j...
2914,legacy fdn,legacy medical transport,0.459355,0.610497,0.529412,0.823529,0.352941,0.75,0.375,0.855,0.25,3.526361,0.416667,1.087605,legacy fdn\tlegacy medical transport
1436,apartment owners assn of california,california apartment assn,0.686949,0.814911,0.466667,0.608696,0.833333,1.0,0.171429,0.640476,0.666667,4.094345,0.714286,1.024919,apartment owners assn of california\tcaliforni...


In [64]:
# Scale factor applied to the pairwise score matrix before inversion
DAMPING = 0.9

# Every distinct normalized name, sorted so row/column order is deterministic
all_names = sorted(set(df.left_norm.tolist() + df.right_norm.tolist()))

# Square name-by-name matrix of pairwise scores; pairs with no row in df get 0.
# NOTE(review): 'combo' is not among the columns shown for feature.csv earlier
# in this notebook -- presumably it is added by a cell that is no longer
# present; confirm before running from a fresh kernel.
P = df.pivot_table(index='left_norm', columns='right_norm', values='combo', fill_value=0)
P = P.reindex(index=all_names, columns=all_names, fill_value=0)
P = np.nan_to_num(P.values.astype(float), nan=0.0)  # Replace NaN with 0

# Symmetrize. A pair's score may sit on either side of the diagonal depending
# on which name was stored as left_norm, so take it from the upper triangle
# when present and otherwise from the mirrored lower triangle, then copy it to
# both sides. The diagonal is kept unchanged.
upper = np.triu(P, k=1)
lower_mirrored = np.tril(P, k=-1).T
pair_scores = np.where(upper != 0, upper, lower_mirrored)
P = pair_scores + pair_scores.T + np.diag(np.diag(P))

# Q = (I - DAMPING * P)^-1, labelled with the names on both axes
Q = np.linalg.inv(np.identity(P.shape[0]) - DAMPING * P)
Q = pd.DataFrame(Q, index=all_names, columns=all_names)

In [65]:
# Entries of Q weaker than this (in absolute value) are zeroed out
WEIGHT_THRESHOLD = 0.2

# Work on an explicit NumPy copy: writing through `Q.values` only works when
# pandas hands back a writable view, which is not guaranteed (and is an error
# under Copy-on-Write).
weights = Q.to_numpy(copy=True)
weights[np.abs(weights) < WEIGHT_THRESHOLD] = 0
np.fill_diagonal(weights, 0)  # Set diagonal to 0 (no self-edges)
Q = pd.DataFrame(weights, index=Q.index, columns=Q.columns)

# Long format: one row per (source, target) cell, strongest weights first
W = Q.stack().reset_index()
W.columns = ['source', 'target', 'weight']
W = W[W['weight'] > 0].sort_values(by='weight', ascending=False)

# (a, b) and (b, a) describe the same pair; key each row by its sorted names
# and keep only the first (highest-weight) row per key.
pair_key = pd.Series(
    ['\t'.join(sorted(pair)) for pair in zip(W['source'], W['target'])],
    index=W.index,
    dtype=object,
)
W = W[~pair_key.duplicated()]
W

Unnamed: 0,source,target,weight
5470949,georgia conservation voters,georgia conservation voters education fund,36.576935
5475561,georgia conservation voters education fund,montana conservation voters education fund,34.972152
6586388,internet assn.,internet soc.,26.876799
6696411,intl. municipal lawyers assn..,intl. municipal lawyers’ assn.,26.333210
6703997,intl. municipal lawyer’s assn.,intl. municipal lawyers’ assn.,26.260665
...,...,...,...
4405383,emory intellectual property soc.,inst. for intellectual property and social jus...,0.200461
8429825,minnesota school boards assn.,iowa assn. of school boards,0.200404
11424703,public defender service for the district of co...,american civil liberties union fdn. of arizona,0.200394
9365149,natl. fed. for the blind,american council of the blind,0.200284


In [16]:
# Sum of P.
# NOTE(review): the recorded output below is a Series labelled `right_norm`
# (one total per column), which suggests P was still a DataFrame when this cell
# ran; after the cell that assigns `P = P.values`, P is a NumPy array and
# P.sum() returns a single number. Confirm which is intended.
P.sum()

right_norm
(new) orleans public defenders                     0.000000
10 members of the united states senate             0.000000
138 women hurt by planned parenthood abortions     0.000000
21 in right.                                       0.000000
2556 operation outcry women injured by abortion    0.000000
                                                     ...   
young ctr. for immigrant children’s rights         0.061428
youth oasis                                        0.150951
ywca kalamazoo                                     0.000000
zionist org. of america                            0.000000
zoological assn. of america                        0.222671
Length: 3793, dtype: float64