In [337]:
import pandas as pd
import numpy as np
import re, nltk

In [338]:
raw_data = pd.read_json('data/all_v1.json').T

In [339]:
raw_data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 446 entries, legalsum01 to tosdr421
Data columns (total 14 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   doc                446 non-null    object
 1   id                 85 non-null     object
 2   original_text      446 non-null    object
 3   reference_summary  446 non-null    object
 4   title              85 non-null     object
 5   uid                446 non-null    object
 6   case_code          117 non-null    object
 7   case_text          117 non-null    object
 8   note               361 non-null    object
 9   title_code         352 non-null    object
 10  title_text         352 non-null    object
 11  urls               361 non-null    object
 12  tldr_code          101 non-null    object
 13  tldr_text          101 non-null    object
dtypes: object(14)
memory usage: 68.4+ KB


In [340]:
raw_data.tail()

Unnamed: 0,doc,id,original_text,reference_summary,title,uid,case_code,case_text,note,title_code,title_text,urls,tldr_code,tldr_text
tosdr417,Privacy Policy,,third party vendors including google use cooki...,this service allows tracking via third party c...,,tosdr417,,,,"1,s",This service allows tracking via third-party c...,{'icepop.com'},,
tosdr418,Privacy Policy,,the third parties that display advertisements ...,this service employs third party cookies but w...,,tosdr418,,,,"1,s","This service employs third-party cookies, but ...",{'icepop.com'},,
tosdr419,Privacy Policy,,we may make use of third party service provide...,third parties may be involved in operating the...,,tosdr419,,,,"1,s",Third parties may be involved in operating the...,{'icepop.com'},,
tosdr420,Privacy Policy,,please be aware that we may transfer your info...,your data may be processed and stored anywhere...,,tosdr420,,,,"1,s",Your data may be processed and stored anywhere...,{'icepop.com'},,
tosdr421,Terms of Service,,we may update these terms from time to time. y...,terms may be changed any time at their discret...,,tosdr421,,,,"1,s",Terms may be changed any time at their discret...,{'epicgames.com'},,


# Retrieve document types and domain/company names from given columns:

In [341]:
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [342]:
# Canonical document-type labels; get_doctype() compares each `doc` title
# against these strings and returns the closest one(s).
doctypes = [
    'terms of service',
    'terms of use',
    'privacy policy',
    'legal documents',
    'license agreement'
]
# Work on a copy so the frame loaded into raw_data is left untouched.
df=raw_data.copy()

In [343]:
def get_doctype(dfdoccol, rownum, doctypelist=doctypes, strtovec=TfidfVectorizer(ngram_range=(1,1)), prep=None):
    """Guess the document type of one entry of a column of document titles.

    The vectorizer is fitted on the whole column, the candidate doctype labels
    are projected into the same space, and the entry at position ``rownum`` is
    compared with every label by cosine similarity.

    Parameters
    ----------
    dfdoccol : sequence of str (pandas Series or list)
        Document titles.
    rownum : int
        Position (not label) of the entry to classify.
    doctypelist : list of str
        Candidate doctype labels; these are also the values returned.
    strtovec : vectorizer with ``fit_transform`` / ``transform``
        Refitted on every call. The default instance is created once at
        definition time and therefore shared between calls.
    prep : callable or None
        Optional preprocessing applied to both the column and the labels
        before vectorizing.

    Returns
    -------
    str
        The best-matching label, several labels joined by ``"|"`` when they
        tie, or ``'inconclusive'`` when every label ties (for example when
        the title shares no vocabulary with any label).
    """
    if prep:
        dfdoccol = prep(dfdoccol)
        doctypelist_compare = prep(doctypelist)
    else:
        doctypelist_compare = doctypelist

    vectorizer = strtovec
    sparsecol = vectorizer.fit_transform(dfdoccol)
    sparsetypes = vectorizer.transform(doctypelist_compare)

    # Row `rownum` of the fitted matrix corresponds to the entry at that position,
    # so no label-based lookup into the column is needed.
    sims = cosine_similarity(sparsecol[rownum], sparsetypes)[0]
    series = pd.Series(dict(zip(doctypelist, sims))).sort_values(ascending=False)
    maxsim = series.max()
    # All labels sharing the top similarity, in descending-sorted order.
    guessed_doctypes = series.index[series == maxsim].tolist()

    if len(guessed_doctypes) == len(doctypelist):
        return 'inconclusive'
    if len(guessed_doctypes) == 1:
        return guessed_doctypes[0]
    return "|".join(guessed_doctypes)

# Show the first title and its guessed doctype. `.iloc` is used because the
# frame is indexed by string ids, so an integer key must be explicitly positional.
print(df.doc.iloc[0])
get_doctype(df.doc, 0)

Pokemon GO Terms of Service


'terms of service'

In [344]:
def get_doctypes(dfdoccol, doctypelist=doctypes, strtovec=TfidfVectorizer(ngram_range=(1,1)), prep=None):
    """Return a DataFrame with the original doc titles and their guessed doctypes.

    Parameters
    ----------
    dfdoccol : pandas.Series
        Document titles; its index is reused as the index of the result.
    doctypelist, strtovec, prep
        Passed through unchanged to ``get_doctype``.

    Returns
    -------
    pandas.DataFrame
        Columns ``'doc'`` (the original title) and ``'doctype'`` (the guess).
    """
    # NOTE(review): get_doctype refits the vectorizer on the whole column for
    # every row, so this is quadratic in the number of rows.
    records = [
        (
            # Positional access: the Series is indexed by labels, not integers.
            dfdoccol.iloc[rownum],
            get_doctype(dfdoccol, rownum, doctypelist=doctypelist, strtovec=strtovec, prep=prep),
        )
        for rownum in range(dfdoccol.shape[0])
    ]
    return pd.DataFrame(records, index=dfdoccol.index, columns=['doc', 'doctype'])

In [345]:
get_doctypes(df.doc).doctype.value_counts()

terms of service     181
privacy policy       129
terms of use          48
license agreement     34
inconclusive          27
legal documents       27
Name: doctype, dtype: int64

In [346]:
from nltk.stem import PorterStemmer

def parse_doc_col_entry(s):
    """Normalize one document title into a lower-cased, stemmed string.

    Steps, in order:
      1. drop each " (...)" group (non-greedy, so text between two separate
         parenthesised groups is kept);
      2. insert spaces into CamelCase runs ("WebPrivacyPolicy");
      3. replace "&" and "/" with the word "and" (space-padded so the
         neighbouring words are not glued together), then strip ' ` , ?;
      4. drop ".xxx" extensions and " - subtitle" tails;
      5. Porter-stem every remaining word and lower-case it.

    Parameters
    ----------
    s : str
        Raw title.

    Returns
    -------
    str
        Space-joined stemmed tokens.
    """
    s1 = re.sub(r" \([^)]*\)", '', s)  # removes each parenthesised group
    s2 = re.sub(r"\B(?=[A-Z][a-z])", ' ', s1)  # inserts spaces between words for strings WrittenWithoutSpacesLikeThis
    # Padding with spaces is safe: whitespace is collapsed by split() below.
    s3 = re.sub(r"['`,?]", '', re.sub(r"[/&]", ' and ', s2))
    s4 = re.sub(r"\.\w+|\s-\s.+", '', s3)  # eliminate ".XXXX" (extensions) and " - XXXX" (subtitles)

    stemr = PorterStemmer()
    return " ".join(stemr.stem(w).lower() for w in s4.split())

def parse_doc_col(doccol_or_list):
    """Apply ``parse_doc_col_entry`` to every title in a collection.

    Parameters
    ----------
    doccol_or_list : list, tuple, numpy.ndarray or pandas.Series
        Titles to normalize.

    Returns
    -------
    list or pandas.Series
        A list for list/tuple/ndarray input; otherwise whatever the object's
        ``.apply`` returns (a Series for a Series).
    """
    # isinstance also accepts tuples and list/ndarray subclasses, which the
    # exact type comparison sent to `.apply` (where they would fail).
    if isinstance(doccol_or_list, (list, tuple, np.ndarray)):
        return [parse_doc_col_entry(elem) for elem in doccol_or_list]
    return doccol_or_list.apply(parse_doc_col_entry)

#parse_doc_col(df.doc).unique(), parse_doc_col(doctypes)

In [347]:
def get_domains(urlset):
    """Return the first label of the first URL in a set-like string.

    Parameters
    ----------
    urlset : str
        Text such as ``"{'icepop.com'}"`` or ``"{'a.com', 'b.org'}"``.

    Returns
    -------
    str or float
        The part of the first non-empty URL before its first ``"."``
        (``'icepop'`` for the example above), or ``numpy.nan`` when the input
        is not a string or contains no usable URL.
    """
    # Missing values arrive as NaN/None; anything that is not text has no domain.
    if not isinstance(urlset, str):
        return np.nan

    cleaned = re.sub(r"[{}']", '', urlset)
    # Take the first URL in written order so the result is deterministic.
    for url in cleaned.split(","):
        label = url.strip().split(".")[0]
        if label:
            return label
    return np.nan


# Reorganizing data to clean up dataframe:

In [348]:
gdt = get_doctypes(raw_data.doc, 
        strtovec=TfidfVectorizer(ngram_range=(1,1)), 
        prep=parse_doc_col)

In [349]:
gdt['domain']=df.urls.apply(get_domains)
gdt.domain = gdt.domain.fillna(gdt.doc.apply(lambda s: s.split()[0].lower()))
gdt.groupby('doc', group_keys=False).apply(lambda dfg: dfg.sample(1))

Unnamed: 0,doc,doctype,domain
tosdr091,About our edit tool,inconclusive,tosdr
tosdr003,Additional Terms of Service,terms of service,flickr
legalsum57,Android SDK License Agreement (June 2014),license agreement,android
legalsum83,Apple Website Terms of Service,terms of service,apple
tosdr148,Application-Based Services Terms of Use,terms of use,apple
tosdr094,CoReadingWebPrivacyPolicy,privacy policy,readingpa
tosdr268,Conditions of Use,terms of use,amazon
tosdr394,Cookie Policy,privacy policy,facebook
tosdr390,Cookie Statement,inconclusive,amnesty
tosdr221,Cookie notice,inconclusive,ebay


In [350]:
raw_data.drop(['id', 'uid', 'note'], axis=1).columns

Index(['doc', 'original_text', 'reference_summary', 'title', 'case_code',
       'case_text', 'title_code', 'title_text', 'urls', 'tldr_code',
       'tldr_text'],
      dtype='object')

In [351]:
def get_rating(code):
    """Return the leading element of a code such as ``"1,s"`` as an int.

    Parameters
    ----------
    code : str or missing value
        Code whose first element (``code[0]``) is expected to be a digit.

    Returns
    -------
    int or pandas.NA
        ``int(code[0])``, or ``pd.NA`` when the code is missing, empty, or
        does not start with something convertible to int.
    """
    try:
        return int(code[0])
    # Only the failures this lookup/conversion can produce: non-indexable
    # values (NaN, None), empty sequences, missing keys, non-numeric text.
    except (TypeError, ValueError, IndexError, KeyError):
        return pd.NA

In [352]:
# Assemble the cleaned frame: start from the `doc` column, attach the derived
# domain/doctype, then merge each pair of alternative source columns into one
# column (first choice, falling back to the second where the first is missing)
# together with the numeric rating parsed from the matching *_code column.
df = pd.DataFrame(raw_data.doc).assign(
    domain=gdt.domain,
    doctype=gdt.doctype,
    title=raw_data.title_text.fillna(raw_data.title),
    titlerating=raw_data.title_code.apply(get_rating),
    fulltext=raw_data.original_text.fillna(raw_data.case_text),
    textrating=raw_data.case_code.apply(get_rating),
    tldr=raw_data.reference_summary.fillna(raw_data.tldr_text),
    tldrrating=raw_data.tldr_code.apply(get_rating),
)

# Use a single missing-value marker (pd.NA) throughout the frame.
df.replace([np.nan, pd.NA, None], pd.NA, inplace=True)

# Randomly selecting 5 rows to show, and comparing the plain-English TL;DRs with results from a summarizer function:

In [354]:
import spacy
from spacy.lang.en.stop_words import STOP_WORDS
from string import punctuation
from heapq import nlargest
# will need to install en_core_web_sm, via...
    # python: 
    #   python -m spacy download en_core_web_sm
    # conda:
    #   conda install -c conda-forge spacy-model-en_core_web_sm
nlp = spacy.load('en_core_web_sm')

# will need to install textblob and download its corpora via...
    # conda:
    #   conda install -c conda-forge textblob
    # pip:
    #   pip install -U textblob
# and then
    # pip and conda:
    #   python -m textblob.download_corpora
from textblob import TextBlob

In [355]:
def _summarizer_pipeline():
    """Load the spaCy English pipeline once and reuse it on later calls."""
    if not hasattr(_summarizer_pipeline, "_nlp"):
        _summarizer_pipeline._nlp = spacy.load('en_core_web_sm')
    return _summarizer_pipeline._nlp


def summarize(text, per):
    """Extractive summary: keep the highest-scoring sentences of ``text``.

    Each non-stop-word, non-punctuation token gets a weight equal to its
    (case-insensitive) relative frequency divided by the largest such
    frequency. A sentence's score is the sum of the weights of its tokens.

    Parameters
    ----------
    text : str
        Text to summarize.
    per : float
        Fraction of the sentences to keep; at least one sentence is kept.

    Returns
    -------
    str
        The selected sentences, highest score first, joined by single spaces.
        An empty string when the text has no scorable words.
    """
    doc = _summarizer_pipeline()(text)

    # Built once per call as a set instead of rebuilding a list for every token.
    ignored = set(STOP_WORDS) | set(punctuation)

    # Count on lower-cased tokens so the counts agree with the lower-cased lookups.
    lowered = [token.text.lower() for token in doc]
    fd = nltk.probability.FreqDist(lowered)
    word_frequencies = {w: fd.freq(w) for w in lowered if w not in ignored}
    if not word_frequencies:
        return ''

    max_frequency = max(word_frequencies.values())
    word_frequencies = {w: f / max_frequency for w, f in word_frequencies.items()}

    sentence_tokens = list(doc.sents)
    sentence_scores = {}
    for sent in sentence_tokens:
        for word in sent:
            weight = word_frequencies.get(word.text.lower())
            if weight is not None:
                sentence_scores[sent] = sentence_scores.get(sent, 0.0) + weight

    select_length = max(int(len(sentence_tokens) * per), 1)
    best_sentences = nlargest(select_length, sentence_scores, key=sentence_scores.get)
    # Sentence text carries no trailing whitespace, so joining with '' would
    # glue the last word of one sentence onto the first word of the next.
    return ' '.join(sent.text for sent in best_sentences)

In [356]:
# Draw 5 distinct rows (no replacement, so the same row cannot be shown twice)
# from a seeded generator so the cell prints the same rows on every run.
rng = np.random.default_rng(0)
for row_id in rng.choice(df.index, min(5, len(df)), replace=False):
    row=df.loc[row_id]
    print('=='*25)
    print(f'{row_id} | {row.domain} |', row.title,":")
    print('= '*25)
    print(row.fulltext)          # original text
    print('- '*25)
    print(row.tldr)              # human-written reference summary
    print('-+'*25)
    print(summarize(row.fulltext, .2))   # extractive summary keeping ~20% of sentences
    print('=='*25)

legalsum74 | facebook | Developers :
= = = = = = = = = = = = = = = = = = = = = = = = = 
if you are a developer or operator of a platform application or website the following additional terms apply to you you are responsible for your application and its content and all uses you make of platform. this includes ensuring your application or use of platform meets our facebook platform policies and our advertising guidelines. your access to and use of data you receive from facebook will be limited as follows you will only request data you need to operate your application. you will have a privacy policy that tells users what user data you are going to use and how you will use display share or transfer that data and you will include your privacy policy url in the developer application. you will not use display share or transfer a user s data in a manner inconsistent with your privacy policy. you will delete all data you receive from us concerning a user if the user asks you to do so and will p

# Now I will construct a function that returns a sentence whose words are (more) properly separated

In [357]:
# Count vectorizer used by the n-gram frequency helpers below.
# It counts 1- to 3-word n-grams; the token pattern accepts any run of one or
# more word characters, and accents are stripped via unicode normalization.
vect = CountVectorizer( # for testing
    ngram_range=(1,3),
    token_pattern=r'\b\w+\b',
    strip_accents='unicode')

In [358]:
def get_grams(sentence, ngram_range=(1,1)):
    """Return all n-grams of ``sentence`` for every n in an inclusive range.

    Parameters
    ----------
    sentence : str or iterable of str
        A string is split on whitespace; anything else is treated as an
        already-tokenized sequence.
    ngram_range : (int, int)
        Smallest and largest n (inclusive). Values of n below 1 produce no
        n-grams.

    Returns
    -------
    list of str
        Space-joined n-grams, ordered by n and then by position.
    """
    # isinstance (rather than an exact type match) so str subclasses are split
    # into words instead of being iterated character by character. list() lets
    # a one-shot iterator be reused for every n.
    tokens = sentence.split() if isinstance(sentence, str) else list(sentence)

    grams = []
    for n in range(max(ngram_range[0], 1), ngram_range[1] + 1):
        # Sliding window of width n over the tokens.
        for start in range(len(tokens) - n + 1):
            grams.append(" ".join(tokens[start:start + n]))
    return grams
# 

In [359]:
def prepstring(documentstring):
    """Strip stand-alone numbers and stand-alone single letters b-z from a string.

    The letter "a" is kept, and the surrounding whitespace is left as-is.
    """
    cleaned = documentstring
    # First whole-word digit runs, then whole-word single lower-case letters b..z.
    for pattern in (r"\b\d+\b", r"\b[b-z]\b"):
        cleaned = re.sub(pattern, "", cleaned)
    return cleaned

def gramcounts(wordstr, vct, corpus, vectcorpus=None, countmethod='ratio'):
    """Look up how often each n-gram of ``wordstr`` occurs in the corpus.

    Parameters
    ----------
    wordstr : str
        Text whose n-grams are looked up.
    vct : CountVectorizer
        Must be a count vectorizer. Its ``ngram_range`` decides which n-grams
        are generated. It is fitted here when ``vectcorpus`` is None;
        otherwise it must already be fitted (its ``vocabulary_`` is read).
    corpus : iterable of str
        Documents to count in. Only used when ``vectcorpus`` is None, and
        never modified.
    vectcorpus : sparse matrix or None
        A document-term matrix previously returned by this function, to avoid
        refitting.
    countmethod : str
        ``'ratio'`` divides each count by the sum of all n-gram counts in the
        matrix; any other value returns the raw count.

    Returns
    -------
    (matrix, list of (str, number))
        The document-term matrix that was used, and one ``(ngram, value)``
        pair per n-gram of ``wordstr`` (0.0 for n-grams not in the vocabulary).
    """
    gramlist = get_grams(wordstr, ngram_range=vct.get_params()['ngram_range'])

    if vectcorpus is None:
        # Clean into a new list: the caller's corpus is left untouched, and no
        # cleaning is done at all when a precomputed matrix is supplied.
        vectorized_corpus = vct.fit_transform([prepstring(docu) for docu in corpus])
    else:
        vectorized_corpus = vectcorpus

    # Column sums = total occurrences of each vocabulary n-gram across the corpus.
    column_totals = np.asarray(vectorized_corpus.sum(axis=0)).ravel()
    grand_total = column_totals.sum()

    res = []
    for gram in gramlist:
        gramcolid = vct.vocabulary_.get(gram)
        if gramcolid is None:
            res.append((gram, 0.0))
        elif countmethod == 'ratio':
            res.append((gram, column_totals[gramcolid] / grand_total))
        else:
            res.append((gram, column_totals[gramcolid]))
    return vectorized_corpus, res

gramcounts(
    "governing lawthese terms",
    vect,
    df.fulltext.to_list())[1]

[('governing', 2.25392746861406e-05),
 ('lawthese', 7.513091562046867e-06),
 ('terms', 0.0023215452926724817),
 ('governing lawthese', 7.513091562046867e-06),
 ('lawthese terms', 7.513091562046867e-06),
 ('governing lawthese terms', 7.513091562046867e-06)]

In [369]:
def validsplits(string, splits):
    """Return 1 if ``splits`` are usable split positions for ``string``, else 0.

    Positions are de-duplicated and converted to int. They are valid when there
    is at least one, fewer than ``len(string)`` of them, and all lie strictly
    between 0 and ``len(string)``. If the conversion fails, the offending value
    and the error are printed and 0 is returned.
    """
    try:
        positions = np.array(list(set(splits)), dtype=int)
    except Exception as e:
        print(splits, "-->", e)
        return 0

    limit = len(string)
    if len(positions) == 0 or len(positions) >= limit:
        return 0
    inside = min(positions) > 0 and max(positions) < limit
    return 1 if inside else 0

def convert_gramcounts_to_score(gramcount_list):
    """Collapse a list of ``(ngram, value)`` pairs into one score.

    The n of the last entry is taken as the largest n to include; the values of
    all entries whose n-gram has between 1 and that many space-separated words
    are added up, one n at a time.
    """
    def width(gram):
        # Number of words in an n-gram, splitting on single spaces.
        return len(gram.split(" "))

    widest = width(gramcount_list[-1][0])
    total = 0.0
    for size in range(1, widest + 1):
        total += sum(entry[1] for entry in gramcount_list if width(entry[0]) == size)
    return total

def get_splits_scores(wordstr, vectorizer, corpus, splitlocs=None, vectcorpus=None):
    """Score ``wordstr`` and every variant of it with one extra space inserted.

    Parameters
    ----------
    wordstr : str
        Text that may contain words run together.
    vectorizer, corpus, vectcorpus
        Passed to ``gramcounts``; the matrix returned by the first call is
        reused for all candidate splits.
    splitlocs : iterable of int or None
        Character positions at which to try inserting a space. None tries
        every position from 1 to ``len(wordstr) - 1``.

    Returns
    -------
    list of (int, str, float)
        ``(split position, resulting string, score)``. The first entry has
        position 0 and is the unsplit string. If ``splitlocs`` is rejected by
        ``validsplits``, a message is printed and only that entry is returned.
    """
    # gramcounts treats vectcorpus=None as "fit now", so one call covers both cases.
    vectcorpus0, gc0 = gramcounts(wordstr, vectorizer, corpus, vectcorpus=vectcorpus)
    res = [(0, wordstr, convert_gramcounts_to_score(gc0))]

    if splitlocs is None:
        splitlocs = range(1, len(wordstr))
    # validsplits needs the string as well as the positions.
    elif not validsplits(wordstr, splitlocs):
        print(f'splitlocs={splitlocs} invalid')
        return res

    for splitloc in splitlocs:
        # Skip positions adjacent to an existing space.
        if wordstr[splitloc] == " " or wordstr[splitloc-1] == " ":
            continue
        newwordstr = " ".join([wordstr[:splitloc], wordstr[splitloc:]])
        gc = gramcounts(newwordstr, vectorizer, corpus, vectcorpus=vectcorpus0)[1]
        res.append((splitloc, newwordstr, convert_gramcounts_to_score(gc)))
    return res

get_splits_scores(
    "governing lawthese terms",
    vect,
    df.fulltext.to_list())

[(0, 'governing lawthese terms', 0.0023741369336068097),
 (1, 'g overning lawthese terms', 0.0023365714757965753),
 (2, 'go verning lawthese terms', 0.00239667620829295),
 (3, 'gov erning lawthese terms', 0.002344084567358622),
 (4, 'gove rning lawthese terms', 0.0023365714757965753),
 (5, 'gover ning lawthese terms', 0.0023365714757965753),
 (6, 'govern ing lawthese terms', 0.0024041892998549973),
 (7, 'governi ng lawthese terms', 0.0023365714757965753),
 (8, 'governin g lawthese terms', 0.0023365714757965753),
 (11, 'governing l awthese terms', 0.002344084567358622),
 (12, 'governing la wthese terms', 0.0023816500251688565),
 (13, 'governing law these terms', 0.006100630348382055),
 (14, 'governing lawt hese terms', 0.002344084567358622),
 (15, 'governing lawth ese terms', 0.002344084567358622),
 (16, 'governing lawthe se terms', 0.002344084567358622),
 (17, 'governing lawthes e terms', 0.002344084567358622),
 (20, 'governing lawthese t erms', 3.756545781023433e-05),
 (21, 'governing

In [370]:
def splitwords(wordstr, vectorizer, corpus, splitlocs=None, vectcorpus=None, verbose=False, select_one=True):
    """Pick the best place to insert one space into ``wordstr``.

    Parameters
    ----------
    wordstr, vectorizer, corpus, splitlocs, vectcorpus
        Passed to ``get_splits_scores``.
    verbose : bool
        If True, return the full table of candidates instead of a choice.
    select_one : bool
        If True, return a single string; if False, return every string that
        shares the top score.

    Returns
    -------
    pandas.DataFrame
        When ``verbose`` is True: columns ``splitstring`` and ``score``,
        indexed by split position, sorted by descending score.
    (str or list of str, float)
        Otherwise, the chosen string(s) and the score.

        With ``select_one`` and several top-scoring candidates, the unsplit
        string is preferred if it is among them; otherwise the user is asked
        to type a split position. Answering ``x`` to the retry prompt aborts
        the question and returns the unsplit string with its own score.
    """
    scores = get_splits_scores(
        wordstr,
        vectorizer,
        corpus,
        splitlocs=splitlocs,
        vectcorpus=vectcorpus)
    splitloc, splitstr, splitscore = zip(*scores)
    scoresdf = pd.DataFrame(
        {'splitstring': list(splitstr), 'score': list(splitscore)},
        index=splitloc).sort_values('score', ascending=False)
    if verbose:
        return scoresdf

    maxscore = scoresdf.score.max()
    maxscorestrings = scoresdf.loc[scoresdf.score == maxscore, 'splitstring'].tolist()
    Nmaxscores = len(maxscorestrings)

    if not select_one:
        return maxscorestrings, maxscore
    if Nmaxscores == 1:
        return maxscorestrings[0], maxscore
    if wordstr in maxscorestrings:
        return wordstr, maxscore

    # Several different splits tie for the top score: let the user decide.
    valid_answers = [str(i) for i in scoresdf.index]
    display(scoresdf.head(Nmaxscores))
    splitloc_in = input("type the splitloc (index) of the best split shown:")
    while splitloc_in not in valid_answers:
        display(scoresdf.head(Nmaxscores))
        splitloc_in = input("Incorrect index. Please type in a valid index of the best split. (Type x to break):")
        if splitloc_in == 'x':
            # Aborted: 'x' is not an index, so fall back to the unsplit string
            # (stored at index 0) rather than trying to convert it to int.
            return wordstr, scoresdf.loc[0, 'score']
    return scoresdf.loc[int(splitloc_in), 'splitstring'], maxscore

splitwords(
    "governing lawthese terms",
    vect,
    df.fulltext.to_list())

('governing law these terms', 0.006100630348382055)

# Testing my function:

In [382]:
# Split one document's full text into sentences with TextBlob and take the
# first sentence as the test case.
testsentences = [str(sent) for sent in TextBlob(df.loc['legalsum17'].fulltext).sentences]
testsent = testsentences[0]

# Redefine the vectorizer for this test: tokens now need at least two word
# characters and English stop words are dropped.
vect = CountVectorizer(
    ngram_range=(1,3),
    token_pattern=r'\b\w\w+\b',
    strip_accents='unicode',
    stop_words='english')

print(testsent)
# First pass: find the single best split of the raw sentence.
testsent2=splitwords(
    testsent,
    vect,
    df.fulltext.to_list())
print(testsent2)
# Second pass: feed the result back in to see whether a further split scores higher.
testsent3=splitwords(
    testsent2[0],
    vect,
    df.fulltext.to_list())
print(testsent3)

governing lawthese terms and any action related thereto will be governed by the laws of the state of california without regard to its conflict of laws provisions.
('governing law these terms and any action related thereto will be governed by the laws of the state of california without regard to its conflict of laws provisions.', 0.011059562303763064)
('governing law these terms and any action related thereto will be governed by the law s of the state of california without regard to its conflict of laws provisions.', 0.011450083571551308)


It seems the function works, but it has trouble handling cases where popping off a letter (in the above example, 's') produces a more common word (i.e. 'law' is more common than 'laws'). This could likely be remedied by stemming or adding a penalty to splitting plural nouns.