## Process Citations

Requirements: raw data from CourtListener, U.S. Codes from LII
- parses all citations to the U.S. codes from every opinion document
- automatically generates prediction labels of the top n citations (default=100)
- splits data into train:dev:test with a ratio of 80:15:5

In [None]:
#!conda install -y pytorch torchvision torchaudio cudatoolkit=10.2 -c pytorch
#!conda install -y -c huggingface transformers datasets

In [None]:
import re, os

import datasets

# Root directory of the opinion JSON files; it is traversed with os.walk further down.
data_file = 'data/opinions'

In [None]:
# Matches citations such as " 18 U.S.C. § 1001(a)(1)" that are preceded by
# whitespace. Group 1 captures the leading number (with optional letters),
# group 2 the section expression after the "§"; the trailing groups capture
# optional "(a)" / "(1)" style suffixes.
# The dots of "U.S.C." are escaped so they match literal periods only; left
# unescaped they would accept any character in those positions.
uscode_reg = r'(?<=\s)([\d]+[a-z]*)\s+U\.S\.C\. §+ (([0-9]+[a-z]*[-–]*)+)(\([a-z]\))*(\([\d]\))*'

n = 100 # maximum label length

In [None]:
from scripts.utils import preprocess
    


In [None]:
import json
import pandas as pd

# citations = pd.read_json('../citations.json')
# Walk the data directory and record, for every file found, the name of the
# directory it lives in (stored as 'jurisdiction'), its file name and the
# content of its 'html_lawbox' field.
jurisdictions = []
filenames = []
text = []

for root, dirs, files in os.walk(data_file):
    jurisdiction = os.path.basename(root)
    for file in files:
        # os.path.join supplies the separator that `root+file` left out,
        # which made every open() point at a non-existent path.
        with open(os.path.join(root, file), 'r', encoding='utf-8') as f:
            temp = json.load(f)
        # Append only after a successful read so the three lists stay aligned.
        jurisdictions.append(jurisdiction)
        filenames.append(file)
        text.append(temp['html_lawbox'])

citations = pd.DataFrame({'jurisdiction': jurisdictions, 'file': filenames, 'text': text})


In [None]:
# For every entry of citations['file'], keep the third '/'-separated component
# up to (not including) its first '.'.
test = []
for path in citations['file']:
    component = path.split('/')[2]
    test.append(component.split('.')[0])

In [None]:
# Number of entries vs. number of distinct shortened names vs. number of
# distinct 'file' values (presumably a duplicate check).
len(test), len(set(test)), len(set(citations['file']))

In [None]:
# Parse the opinion referenced by the second row of `citations` and display it.
sample_path = 'opinions{0}'.format(citations['file'].iloc[1])
with open(sample_path, 'r', encoding='utf-8') as f:
    stripped_lines = [line.strip() for line in f.readlines()]
test = json.loads(''.join(stripped_lines))

test

In [None]:
# only predict top 100 most frequently cited US codes
top_counts = citations[['title','section', 'subsection']].value_counts()[:100]
to_predict = ['_'.join(key) for key in top_counts.index]

# The label id of a citation is its position in the frequency ranking.
label_dict = dict(zip(to_predict, range(len(to_predict))))
print(label_dict)

# '<title>_<section>_<subsection>' for every row.
title_and_section = citations['title'] +'_'+ citations['section']
citations['partial_citation'] = title_and_section + '_' + citations['subsection']

# '<label id>-<citation>' for rows whose citation is predicted, None otherwise.
file_names = []
for cite in citations['partial_citation']:
    if cite in label_dict:
        file_names.append('{0}-{1}'.format(label_dict[cite], cite))
    else:
        file_names.append(None)
citations['file_name'] = file_names


In [None]:

## reading in the relevant US codes

# For every predicted citation, load the matching code text from lii/text/.
# The first line of each file is skipped; the remaining lines are stripped
# and joined with single spaces.
lab_text = {}
for k in label_dict:
    # label_dict keys are built as '<title>_<section>_<subsection>' strings;
    # indexing such a string with k[0]/k[1]/k[2] would pick single characters,
    # so split it back into its parts (tuple keys are accepted unchanged).
    title, section, subsection = k.split('_', 2) if isinstance(k, str) else k
    paragraph = re.sub(r'[^a-z]', '', subsection)
    base = f'lii/text/_uscode_text_{title}_{section}'
    # Try the paragraph-specific file first, then the section-level file.
    for candidate in (f'{base}{paragraph}.txt', f'{base}.txt'):
        if os.path.exists(candidate):
            with open(candidate) as f:
                lab_text[k] = ' '.join([x.strip() for x in f.readlines()[1:]])
            break
    else:
        print('U.S. Code not found')

In [None]:
# Persist the citation -> label id mapping as JSON.
with open('citation_map.json', 'w') as out:
    json.dump(label_dict, out)

In [None]:
# Write every loaded code text to '<label id>-<citation>.json'.
for k, v in lab_text.items():
    # Keys that are already '_'-joined strings are used as they are: joining a
    # string again would insert '_' between all of its characters.
    # A dedicated variable is used so the global `n` (maximum label length)
    # is not overwritten.
    citation_name = k if isinstance(k, str) else '_'.join(k)
    with open(f'{label_dict[k]}-{citation_name}.json', 'w') as f:
        f.write(v)

In [None]:
# Save the current state of the citations table as JSON in the working directory.
citations.to_json('citations.json')

In [None]:


# Distinct values of the 'file' column, in order of first appearance.
unique_files = citations['file'].unique()
r_list = list(unique_files)
print(len(r_list))

# Rows whose file is contained in r_list.
in_r_list = citations['file'].isin(r_list)
citations_red = citations[in_r_list]
len(citations_red)


In [None]:
# NOTE: head() is not the last expression of the cell, so its result is discarded.
citations.head()
# Convert the citation strings to a categorical column so its categories can be renamed to label ids.
citations['partial_citation'] = citations['partial_citation'].astype('category')

In [None]:
# Rename every category that is a key of label_dict to its integer label id.
# Per the pandas documentation, categories missing from the mapping are left unchanged.
citations['partial_citation'] = citations['partial_citation'].cat.rename_categories(label_dict)

# errors='coerce' turns every value that cannot be parsed as a number into NaN.
citations['partial_citation'] = pd.to_numeric(citations['partial_citation'], errors='coerce')
# NOTE(review): dropna() without `subset` drops rows that have a missing value
# in ANY column, not only in 'partial_citation' -- confirm this is intended.
citations.dropna(inplace=True)

len(citations)

In [None]:
# Map every file to the distinct 'partial_citation' values found in its rows.
files = citations.groupby('file')

labels = {
    file_key: list(set(rows['partial_citation']))
    for file_key, rows in files
}


In [None]:
# NOTE: type() is not the last expression of the cell, so its result is discarded.
type(r_list)

# Make sure r_list is a plain list (it is indexed by position further down).
r_list = list(r_list)

In [None]:
import json, os
from tqdm.notebook import tqdm

# For every file in r_list: load the opinion JSON, preprocess its text and
# build a multi-hot label vector from `labels`.
text = {}
mlm_labels = {}
labels_fin = {}
# One slot per predicted citation. A fixed length of 20 raised an IndexError
# for every label id >= 20.
num_labels = len(label_dict)
for file_name in tqdm(r_list, desc='preprocessing'):
    with open('opinions{0}'.format(file_name), 'r', encoding='utf-8') as f:
        test = json.loads(''.join([x.strip() for x in f.readlines()]))

    # Fall back through the text fields in this order.
    if test['html_lawbox'] is not None:
        to_extract = test['html_lawbox']
    elif test['html_with_citations'] is not None:
        to_extract = test['html_with_citations']
    else:
        to_extract = test['plain_text']

    p = preprocess(to_extract)
    text[file_name] = p[0]
    #mlm_labels[file_name] = p[1]
    lab = [0] * num_labels
    # Files without an entry in `labels` keep an all-zero vector.
    for x in labels.get(file_name, ()):
        lab[int(x)] = 1
    labels_fin[file_name] = lab
        
        



In [None]:
# Show the preprocessed text of the first file as a sanity check.
print(text[r_list[0]])

In [None]:
from nltk.util import everygrams
from nltk.tokenize import word_tokenize
import re
from nltk.corpus import stopwords
import spacy
stopw = set(stopwords.words('english'))

# nlp = spacy.load('en_core_web_sm')

# For each of the 100 most frequent citations (least frequent first), collect
# the 100 most frequent stop-word-free 1- to 3-grams over all documents that
# cite it.
to_predict = citations[['title','section', 'subsection']].value_counts()[:100]

to_predict = [' '.join(x) for x in list(to_predict.index)]
citations['final_citation'] = citations['title'] +' '+ citations['section'] + ' ' + citations['subsection']

# Not referenced anywhere in the visible cells.
is_noun = lambda pos: pos[:2] == 'NN'

# Compiled once instead of on every document.
non_letters = re.compile(r'[^a-zA-Z\s]')

stats = []
for q in tqdm(reversed(to_predict)):
    docs = set(citations[citations['final_citation'] == q]['file'])
    ngrams = {}
    print(q)
    for doc in tqdm(docs):
        qu = word_tokenize(non_letters.sub('', str(text[doc])))

        for gram in everygrams(qu, max_len=3):
            # Check whole tokens: iterating over the joined string compared
            # single characters against the stop-word list.
            if any(token.lower() in stopw for token in gram):
                continue
            ngram = ' '.join(gram)
            ngrams[ngram] = ngrams.get(ngram, 0) + 1
    most_freq = sorted(ngrams.items(), key=lambda x: x[1], reverse=True)[:100]
    print(most_freq)
    stats.append(most_freq)

In [None]:
# Sizes of the three per-file dictionaries. mlm_labels is only filled by a
# line that is commented out in the preprocessing loop.
print(len(labels_fin), len(mlm_labels), len(text))

In [None]:
from collections import Counter

# Number of files carrying each label id. The label vectors are the dict
# VALUES; iterating the dict itself yields the file names, so the previous
# expression counted the characters of those names.
codes = Counter(
    idx
    for label_vector in labels_fin.values()
    for idx, flag in enumerate(label_vector)
    if flag
)

In [None]:
import random

# Shuffle the positions of all files and cut them into train / validation /
# test portions of roughly 80% / 15% / 5%.
# NOTE: no random seed is set in the visible cells, so the split differs
# between runs.
ind = random.sample(range(len(text)), k=len(text))

j = int(len(text)*0.8)
k = int(len(text)*0.15)
l = int(len(text)*0.05)  # kept for reference; the test split takes the remainder

#train_data = [data[i] for i in ind[:j]]
train_files = [r_list[i] for i in ind[:j]]

val_files = [r_list[i] for i in ind[j:j+k]]

# Everything that is left goes to the test split. Bounding the slice with
# j+k+l silently dropped the files lost to int() truncation.
test_files = [r_list[i] for i in ind[j+k:]]



In [None]:
def save(files, data, mlm, labels, ds, tokenizer, max_seq_len):
    """Tokenize one dataset split and write it to disk.

    Three kinds of output are written, all prefixed with ``ds``:

    * ``<ds>.files``  -- the entries of ``files``, one per line;
    * ``<ds>_<name>`` -- one JSON document per entry (``/`` in the entry is
      replaced by ``_``) holding the token ids under ``'text'`` and, when
      available, ``'mlm'``;
    * ``<ds>.label``  -- ``str(labels[x])`` for every entry, one per line, in
      the order of ``files``.

    Args:
        files: list of string keys into ``data``, ``labels`` and ``mlm``.
        data: mapping from key to the input passed to ``tokenizer``.
        mlm: mapping from key to a second tokenizer input. Keys missing from
            this mapping are skipped instead of raising ``KeyError``.
        labels: mapping from key to the label object of that entry.
        ds: path prefix of all output files.
        tokenizer: callable accepting the keyword arguments used below and
            returning a mapping whose ``'input_ids'`` supports ``.tolist()``.
        max_seq_len: value passed to the tokenizer as ``max_length``.

    Raises:
        KeyError: if an entry of ``files`` is missing from ``data`` or
            ``labels``.
    """
    def encode(raw):
        # Single place for the tokenizer settings shared by both inputs.
        encoded = tokenizer(
            raw,
            max_length=max_seq_len,
            padding=True,
            truncation=True,
            return_token_type_ids=False,
            return_tensors='pt',
        )
        return encoded['input_ids'].tolist()

    with open('{0}.files'.format(ds), 'w') as f:
        f.write('\n'.join(files))

    for x in tqdm(files):
        record = {'text': encode(data[x])}
        if x in mlm:
            record['mlm'] = encode(mlm[x])

        # Tokenize before opening the file so a failure does not leave an
        # empty output file behind.
        with open('{0}_{1}'.format(ds, x.replace('/','_')), 'w') as f:
            f.write(json.dumps(record))

    with open('{0}.label'.format(ds), 'w') as f:
        for x in tqdm(files):
            f.write(str(labels[x]) + '\n')
        
            

In [None]:
!mkdir -p roberta
from transformers import RobertaTokenizerFast, RobertaForMaskedLM

tokenizer = RobertaTokenizerFast.from_pretrained("roberta-base")


def _save_split(split_files, prefix):
    """Write one split with the shared texts, labels, tokenizer and length 512."""
    save(split_files, text, mlm_labels, labels_fin, prefix, tokenizer, 512)


_save_split(train_files, 'roberta/train')
_save_split(val_files, 'roberta/val')
_save_split(test_files, 'roberta/test')
    
    
    

In [None]:

# Export the labels and texts of every split as JSON (each wrapped in a
# DataFrame and transposed first).
# NOTE(review): train_label/train_text, val_label/val_text and
# test_label/test_text are not defined in any visible cell -- confirm where
# they come from before running this cell.
pd.DataFrame(train_label).T.to_json('train_label.json')
pd.DataFrame(train_text).T.to_json('train_data.json')

pd.DataFrame(val_text).T.to_json('val_data.json')
pd.DataFrame(val_label).T.to_json('val_label.json')

pd.DataFrame(test_text).T.to_json('test_data.json')
pd.DataFrame(test_label).T.to_json('test_label.json')

In [None]:
#!mkdir longformer
# The longformer/ directory is not created here (the mkdir above is commented
# out), so it has to exist before this cell runs.


def _write_labels(path, rows):
    """Write one line per row, its values converted with str() and comma-separated."""
    lines = [','.join([str(c) for c in row]) for row in rows]
    with open(path, 'w') as f:
        f.write('\n'.join(lines))


def _write_texts(path, rows):
    """Write every row wrapped in single quotes, rows separated by ';' + newline."""
    quoted = ['\'{0}\''.format(row) for row in rows]
    with open(path, 'w') as f:
        f.write(';\n'.join(quoted))


_write_labels('longformer/train.label', train_label)
_write_texts('train.txt', train_text)

_write_labels('longformer/val.label', val_label)
_write_texts('val.txt', val_text)

_write_labels('longformer/test.label', test_label)
_write_texts('test.txt', test_text)
    
    

In [None]:
import dill
# Serialize the current interpreter session to 'notebook_env.db' so it can be reloaded later.
dill.dump_session('notebook_env.db')