# RECAP Search queries containing "description"

The user search queries could help inform which descriptions the users often search for. So we extracted all search queries from RECAP containing "description". This notebook documents the extensive cleaning performed to select the frequent queries for review.

# Import Libaries

In [1]:
import numpy as np
import pandas as pd

import re

# Load dataset

In [2]:
with open("population/description-queries.txt", "r") as file:
    queries = file.readlines()

In [3]:
queries

['q=&type=r&order_by=dateFiled+desc&description=unclaimed&filed_after=01%2F01%2F2018&filed_before=10%2F15%2F2024&court=vaeb+vawb\n',
 'q=&type=r&order_by=score+desc&case_name=davis+v.+city+of+new+york&description=amended+complaint&court=nysd\n',
 'q=&type=r&order_by=score+desc&description=order&assigned_to=Blackwell&court=mnd&page=7\n',
 'q=&type=r&order_by=score+desc&description=summary+judgment&assigned_to=Blackwell&court=mnd\n',
 'q=&type=r&order_by=score+desc&description=summary+judgment&assigned_to=Blackwell&court=mnd\n',
 'q=&type=r&order_by=dateFiled+desc&description=unclaimed&filed_after=01%2F01%2F2018&filed_before=10%2F15%2F2024&court=vaeb+vawb&page=2\n',
 'q=&type=r&order_by=score+desc&description=summary+judgment&assigned_to=Blackwell&court=mnd\n',
 'q=&type=r&order_by=dateFiled+desc&description=unclaimed&filed_after=01%2F01%2F2018&filed_before=10%2F15%2F2024&court=vaeb+vawb&page=3\n',
 'q=&type=r&order_by=dateFiled+desc&description=unclaimed&filed_after=01%2F01%2F2018&filed

# Clean up the data

In [4]:
df = pd.DataFrame(queries)
df = df.rename(columns={0: "query"})
df

Unnamed: 0,query
0,q=&type=r&order_by=dateFiled+desc&description=...
1,q=&type=r&order_by=score+desc&case_name=davis+...
2,q=&type=r&order_by=score+desc&description=orde...
3,q=&type=r&order_by=score+desc&description=summ...
4,q=&type=r&order_by=score+desc&description=summ...
...,...
53302,q=chapter%3A11+AND+document_type%3A%22PACER+Do...
53303,q=chapter%3A11+AND+document_type%3A%22PACER+Do...
53304,q=chapter%3A11+AND+document_type%3A%22PACER+Do...
53305,q=chapter%3A11+AND+document_type%3A%22PACER+Do...


In [5]:
for i, row in df.iterrows():
    query = row["query"]
    pattern = r"description=([^&]+)|description%3A([^\+]+)|short_description%3A([^&]+)"
    match = re.search(pattern, query)
    if match:
        description = next(group for group in match.groups() if group is not None)
        replacements = {
        "%22": "", "%2C": "", "%26": "", "%27": "", "%28": "", 
        "%3A": "AND", "%2f": "AND", "%e2%80%9c": "", "%e2%80%9d": "", 
        "\n": "", "+": " ", ".": ""
        }
        # Replace each pattern in the description
        for old, new in replacements.items():
            description = description.replace(old, new)
        
        description = description.lower().strip()
        df.at[i, "description"] = description

In [6]:
df

Unnamed: 0,query,description
0,q=&type=r&order_by=dateFiled+desc&description=...,unclaimed
1,q=&type=r&order_by=score+desc&case_name=davis+...,amended complaint
2,q=&type=r&order_by=score+desc&description=orde...,order
3,q=&type=r&order_by=score+desc&description=summ...,summary judgment
4,q=&type=r&order_by=score+desc&description=summ...,summary judgment
...,...,...
53302,q=chapter%3A11+AND+document_type%3A%22PACER+Do...,amended
53303,q=chapter%3A11+AND+document_type%3A%22PACER+Do...,amended
53304,q=chapter%3A11+AND+document_type%3A%22PACER+Do...,amended
53305,q=chapter%3A11+AND+document_type%3A%22PACER+Do...,amended


In [7]:
len(df[df["description"].isna()])

438

In [8]:
df["description"] = df["description"].str.split("AND")
df = df.explode("description").reset_index(drop=True)
df["description"] = df["description"].str.strip()

In [9]:
df["description"] = df["description"].str.split("OR")
df = df.explode("description").reset_index(drop=True)
df["description"] = df["description"].str.strip()

In [10]:
df["description"] = df["description"].str.split(" or ")
df = df.explode("description").reset_index(drop=True)
df["description"] = df["description"].str.strip()

In [11]:
df["description"].nunique()

5303

In [12]:
df["description"].value_counts()

description
chapter 7                       4281
complaint                       2716
search warrant                  2704
sentencing memorandum by usa    2305
chapter 11                      2030
                                ... 
matthew melsen                     1
brett allen bedusek                1
impoundment                        1
selling awah                       1
plaintiff pro se                   1
Name: count, Length: 5303, dtype: int64

In [13]:
fillers = ["and", "or", "in", "on", "at", "for", "of", "to", "a", "by", "usa", "not", "the", "", "v", "no", "yes", "is", "inc", "llc", "llp", "this", "that", "an", "with", "versus", "as", "day", "en", "law", "new"]
df = df[~df["description"].isin(fillers)]

df["description"] = (df["description"]
                     .str.replace("%2a", "")
                     .str.replace("descriptionand", "")
                     .str.replace("-", "")
                     .str.replace("~", "")
                     .str.replace("attorneys", "attorney")
                     .str.replace("judgement", "judgment")
                     .str.replace("%e2%80%9corder%e2%80%9d", "order")
                     .str.replace("%e2%80%9corder%e2%80%9d %e2%80%9c", "order")
                     .str.replace("12%2f16%2f2024 ", "")
                     .str.replace("%c2%a7", "")
                     .str.replace("%e2%80%9c", "")
                     .str.replace("12%2f17%2f2024", "2024")
                     .str.replace("dismi", "dismiss")
                     .str.replace("descriptionanddesign-patent", "patent")
                     .str.replace("12%2f16%2f2024 ", "")
                     .str.replace("12%2f17%2f2024 ", "")
                     .str.replace("%2a", "")
                     .str.replace("2024 ", "")
                     .str.replace("26f%29", "")
                     .str.replace("341 meeting transcript", "transcript")
                    )
df = df[~df["description"].isna()]
df = df[df["description"] != ""]
df = df[~df["description"].apply(lambda x: isinstance(x, str) and x.isnumeric())]
df["description"].nunique()

5182

# Look at the frequency distribution & select only the top 95 percentile for manual review

In [14]:
df["description"].value_counts().describe()

count    5182.000000
mean       10.205519
std       104.591669
min         1.000000
25%         1.000000
50%         2.000000
75%         4.000000
max      4281.000000
Name: count, dtype: float64

In [15]:
reviews = df["description"].value_counts().reset_index()

In [16]:
threshold = reviews["count"].quantile(0.95)
threshold

17.0

In [17]:
reviews = reviews[reviews["count"] > threshold]
len(reviews)

258

In [18]:
reviews = reviews.sort_values(by=["description"])
reviews

Unnamed: 0,description,count
101,administrative,48
68,adversary,70
21,affidavit,268
97,affidavit iphone,49
150,agreement,31
...,...,...
28,warrant,183
42,withdraw,120
238,word count,19
235,word limit,19


# Save for manual review

In [19]:
reviews.to_csv("population/review_queries.csv", index=False)