### Set up environment

In [None]:
!pip install oscn==0.0.0.25

import oscn

### Set up the OSCN information needed

In [None]:

# Filing years to load, kept as strings because they are interpolated into
# the data file names.
years = [str(year) for year in range(2016, 2019)]

# Counties whose case lists are loaded.
counties = [
    "mayes",
    "love",
    "rogers",
    "delaware",
]

# Case types to load.
types = ["CM", "CF"]

### Request the cases from OSCN and save as files
This takes a couple of hours, so the download loop below is left commented out; uncomment it to fetch the cases again.

In [None]:
import pickle


# for county in counties:
#   for year in years:
#     for type in types:      
#       cases=oscn.request.CaseList(type=type, year=year,county=county)
#       all_cases = [c for c in cases]
#       file_name = f'data/{type}.{county}.{year}.oscn'
#       f = open(file_name, 'wb')
#       pickle.dump(all_cases, f)
#       f.close()
#       print(f'{file_name}: {len(all_cases)}')


### Retrieve the saved files

In [None]:
import pickle

# Load every pickled case list from the data/ directory and concatenate them
# into a single list, keeping a running count as a cross-check.
case_count = 0
saved_cases = []

for county in counties:
    for year in years:
        # Named `case_type` rather than `type` so the builtin is not left
        # shadowed for the rest of the notebook session.
        for case_type in types:
            file_name = f'data/{case_type}.{county}.{year}.oscn'
            # NOTE: unpickling can execute arbitrary code; only load files
            # that this notebook wrote itself.
            # The context manager closes each file as soon as it is read
            # instead of leaving the handle open until garbage collection.
            with open(file_name, 'rb') as case_file:
                new_cases = pickle.load(case_file)
            new_case_count = len(new_cases)
            case_count += new_case_count
            print(f'{file_name} added {new_case_count}')
            saved_cases += new_cases

# The running total and the length of the combined list should agree.
print(f'counted case: {case_count} length saved {len(saved_cases)}')


## Create a pandas dataframe

In [None]:
import re
import numpy as np
import pandas as pd

# Column labels, in the same order as the values returned by case_data().
columns = ['Filed', 'County', 'Type', 'CaseNumber', 'Docket', 'Source']


def case_data(c):
    """Return one row of values for case *c*, ordered to match ``columns``."""
    return [c.filed, c.county, c.type, c.case_number, c.docket, c.source]
%time all_cases = pd.DataFrame([case_data(c) for c in saved_cases], columns = columns)

In [None]:
# Number of non-null values in each column, grouped by county.
all_cases.groupby(['County']).count()

In [None]:
def minute_list(docket):
    """Return the description of each minute in *docket*; [] if docket is falsy."""
    if not docket:
        return []
    return [entry.description for entry in docket]

# For every case, store the list of its docket minute descriptions.
all_cases['Minutes'] = list(map(minute_list, all_cases['Docket']))

## Expand minutes

In [None]:
# Parse the filing date strings into pandas datetime values.
all_cases['FiledDate'] = pd.to_datetime(all_cases['Filed'])

# Build positional row indexes that repeat each case once per minute, so the
# per-case columns can be copied next to every one of that case's minutes.

def safe_len(counts):
    """Return len(counts), treating any falsy value (None, empty) as 0."""
    return len(counts) if counts else 0

lens = all_cases['Minutes'].apply(safe_len)
vals = range(len(all_cases))
ilocations = np.repeat(vals, lens)

# Positions of every column except 'Minutes'.
cols = [
    position
    for position, name in enumerate(all_cases.columns)
    if name != 'Minutes'
]
# Expanded frame: the selected columns with rows repeated per `ilocations`.
# Cases with no minutes get zero repeats and therefore do not appear.
count_frame = all_cases.iloc[ilocations, cols].copy()

# The two helpers below are defined here but not called in this cell.
def description(count):
    """Return count['description'], or "" when count is falsy."""
    return count['description'] if count else ""

def safe_counts(counts):
    """Return counts unchanged, or an empty dict when it is falsy."""
    return counts if counts else {}

# Flatten the per-case minute lists in row order. This lines up with the
# repeated rows because both follow the order of all_cases['Minutes'].
count_frame['Minute'] = [
    minute for case_minutes in all_cases['Minutes'] for minute in case_minutes
]
# The index is not reset, so it still carries the (repeated) row labels of
# all_cases.
count_frame.count()

### Preview the expanded minutes

In [None]:
# Show the first rows of the expanded per-minute frame.
count_frame.head()



### Tokenize the minute descriptions

In [None]:
import spacy
nlp = spacy.load('en')
from nltk.corpus import stopwords
stopwords = stopwords.words('english')
import string
punctuations = string.punctuation

document = lambda count: nlp(count, disable=['parser', 'ner'])
tokens =  lambda count: [tok.lemma_.lower().strip() for tok in document(count) if tok.lemma_ != '-PRON-']
clean_tokens = lambda count: [tok for tok in tokens(count) if tok not in stopwords and tok not in punctuations]

%time count_frame['Tokens'] = count_frame['Minute'].apply(clean_tokens)

count_frame.head(10)

In [None]:
# Summary statistics for the expanded frame.
count_frame.describe()

In [None]:
import gensim
from gensim import corpora, models

# Token <-> integer id mapping built from every minute's token list.
dictionary = corpora.Dictionary(count_frame['Tokens'])
# Bag-of-words form of each minute, in the same order as count_frame.
corpus = list(map(dictionary.doc2bow, count_frame['Tokens']))
# Fit a 10-topic LDA model, making 10 passes over the corpus.
# NOTE(review): no random_state is given, so topics may differ between runs
# - confirm whether reproducibility matters here.
lda_model = gensim.models.ldamodel.LdaModel(
    corpus=corpus,
    num_topics=10,
    id2word=dictionary,
    passes=10,
)


In [None]:
# Show the topics (up to 20 requested) with their three top words each.
list(lda_model.print_topics(num_topics=20, num_words=3))

In [None]:
import pyLDAvis
import pyLDAvis.gensim  # don't skip this
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
# Enable pyLDAvis rendering inside the notebook.
pyLDAvis.enable_notebook()
# Prepare the visualisation data from the fitted model, corpus and dictionary.
vis = pyLDAvis.gensim.prepare(lda_model, corpus, dictionary)
# Last expression of the cell, so the notebook displays it.
vis