In [2]:
import pandas as pd

In [5]:
doj = pd.read_json("combined.json", lines = True)

## due to json, topics are in a list so remove them and concatenate with ;
doj['topics_clean'] = ["; ".join(topic) 
                      if len(topic) > 0 else "No topic" 
                      for topic in doj.topics]

## similarly with components
doj['components_clean'] = ["; ".join(comp) 
                           if len(comp) > 0 else "No component" 
                           for comp in doj.components]

## drop older columns from data
doj = doj[['id', 'title', 'contents', 'date', 'topics_clean', 
           'components_clean']].copy()

doj.head()

Unnamed: 0,id,title,contents,date,topics_clean,components_clean
0,,Convicted Bomb Plotter Sentenced to 30 Years,"PORTLAND, Oregon. – Mohamed Osman Mohamud, 23,...",2014-10-01T00:00:00-04:00,No topic,National Security Division (NSD)
1,12-919,$1 Million in Restitution Payments Announced t...,WASHINGTON – North Carolina’s Waccamaw River...,2012-07-25T00:00:00-04:00,No topic,Environment and Natural Resources Division
2,11-1002,$1 Million Settlement Reached for Natural Reso...,BOSTON– A $1-million settlement has been...,2011-08-03T00:00:00-04:00,No topic,Environment and Natural Resources Division
3,10-015,10 Las Vegas Men Indicted \r\nfor Falsifying V...,WASHINGTON—A federal grand jury in Las Vegas...,2010-01-08T00:00:00-05:00,No topic,Environment and Natural Resources Division
4,18-898,$100 Million Settlement Will Speed Cleanup Wor...,"The U.S. Department of Justice, the U.S. Envir...",2018-07-09T00:00:00-04:00,Environment,Environment and Natural Resources Division


In [8]:
doj_subset = doj[(doj['topics_clean'] == 'Civil Rights') | 
                 (doj['topics_clean'] == 'Hate Crimes') | 
                 (doj['topics_clean'] == 'Project Safe Childhood')]

# 4. Optional extra credit (2 points)

You notice that the pharmaceutical kickbacks press release we analyzed in question 1 was for an indictment, and that in the original data, there's not a clear label for whether a press release outlines an indictment (charging someone with a crime), a conviction (convicting them after that charge either via a settlement or trial), or a sentencing (how many years of prison or supervised release a defendant is sentenced to after their conviction).

You want to see if you can identify pairs of press releases where one press release is from one stage (e.g., indictment) and another is from a different stage (e.g., a sentencing).

You decide that one way to approach is to find the pairwise string similarity between each of the processed press releases in `doj_subset`. There are many ways to do this, so Google for some approaches, focusing on ones that work well for entire documents rather than small strings.

Find the top two pairs (so four press releases total)-- do they seem like different stages of the same crime or just press releases covering similar crimes?

In [27]:
# Assuming doj_subset is your DataFrame and 'contents' is the name of the column containing the documents

# Preprocess documents to create inverted indices
word_to_docs = defaultdict(set)
for idx, doc in enumerate(doj_subset['contents']):
    words = set(doc.split())
    for word in words:
        word_to_docs[word].add(idx)

# Keep track of the top two pairs of documents with most overlap
top_pairs = [(0, 0), (0, 0)]  # Initialize with dummy pairs
top_overlaps = [0, 0]  # Initialize with zero overlaps

# Iterate through pairs of documents and calculate the intersection of their inverted indices
for i, doc1 in enumerate(doj_subset['contents'][:-1]):
    words_doc1 = set(doc1.split())
    for j, doc2 in enumerate(doj_subset['contents'][i+1:], start=i+1):
        words_doc2 = set(doc2.split())
        intersection = len(words_doc1.intersection(words_doc2))
        # Update top pairs if necessary
        if intersection > top_overlaps[0]:
            top_pairs[1] = top_pairs[0]
            top_overlaps[1] = top_overlaps[0]
            top_pairs[0] = (i, j)
            top_overlaps[0] = intersection
        elif intersection > top_overlaps[1]:
            top_pairs[1] = (i, j)
            top_overlaps[1] = intersection

# Print the top two pairs of documents with the most overlapping words
for pair, overlap in zip(top_pairs, top_overlaps):
   # print(f"The pair of documents {pair} have {overlap} overlapping words.")
    print(f'The pair of documents', doj_subset.loc[pair[0]], doj_subset.loc[pair[0]], 'have', doj_subset.loc[overlap[0]],doj_subset.loc[overlap[1]], 'overlapping words.')


KeyError: 107