In [2]:
# import our matcher class
import spacy
from spacy.matcher import Matcher

# load up the plain text from the bills
with open ('./out/117s_text_clean.txt', 'r') as f:
    text = f.read()

In [3]:
# load the language model from spacy

nlp = spacy.load("en_core_web_sm")

In [4]:
# check the pipe components

nlp.pipe_names

['tok2vec', 'tagger', 'parser', 'attribute_ruler', 'lemmatizer', 'ner']

In [5]:
len(text)

20706436

In [6]:
text[:1000]

'b"Congressional Bills 117th CongressFrom the U.S. Government Publishing OfficeS. 5242 Introduced in Senate (IS)&lt;DOC&gt;117th CONGRESS2d SessionS. 5242To prevent international violence against women, and for otherpurposes. IN THE SENATE OF THE UNITED STATES December 13, 2022Mrs. Shaheen (for herself and Ms. Collins) introduced the following bill; which was read twice and referred to the Committee on ForeignRelations A BILL To prevent international violence against women, and for otherpurposes.Be it enacted by the Senate and House of Representatives of the United States of America in Congress assembled,SECTION 1. SHORT TITLE; TABLE OF CONTENTS.(a) Short Title; Table of Contents.--This Act may be cited as the ``International Violence Against Women Act of 2022\'\'.(b) Table of Contents.--The table of contents for this Act is as follows:Sec. 1. Short title; table of contents.Sec. 2. Findings.Sec. 3. Statement of policy.TITLE I--UNITED STATES STRATEGY TO PREVENT AND RESPOND TO GENDER-BAS

In [7]:
# we want to split our bills into lists of individual bills, which
# will make it easier to process with spacy later

# split by text that is included in all the bills, for example
# "Congressional Bills 117th Congress"

texts = text.split('b"Congressional Bills 117th Congress')

In [8]:
len(texts)

376

In [9]:
# the amount of text is overwhelming for spacy to process at once, so
# make into lists for easier processing

one = texts[:100]
two = texts[200:300]
three = texts[300:400]

In [201]:
# check the first item
one[1]

'From the U.S. Government Publishing OfficeS. 5242 Introduced in Senate (IS)&lt;DOC&gt;117th CONGRESS2d SessionS. 5242To prevent international violence against women, and for otherpurposes. IN THE SENATE OF THE UNITED STATES December 13, 2022Mrs. Shaheen (for herself and Ms. Collins) introduced the following bill; which was read twice and referred to the Committee on ForeignRelations A BILL To prevent international violence against women, and for otherpurposes.Be it enacted by the Senate and House of Representatives of the United States of America in Congress assembled,SECTION 1. SHORT TITLE; TABLE OF CONTENTS.(a) Short Title; Table of Contents.--This Act may be cited as the ``International Violence Against Women Act of 2022\'\'.(b) Table of Contents.--The table of contents for this Act is as follows:Sec. 1. Short title; table of contents.Sec. 2. Findings.Sec. 3. Statement of policy.TITLE I--UNITED STATES STRATEGY TO PREVENT AND RESPOND TO GENDER-BASEDVIOLENCE GLOBALLYSec. 101. Global 

In [11]:
# join the texts into one long string again

text = ''.join(one)

In [13]:
# create an entity ruler to label gender terms as "gender", etc

ruler = nlp.add_pipe("entity_ruler", after="ner")

patterns = [
                {"label": "GENDER", "pattern": 'gender'},
                {"label": "GENDER", "pattern": 'trans'},
                {"label": "GENDER", "pattern": 'nonbinary'},
                {"label": "GENDER", "pattern": 'male'},
                {"label": "GENDER", "pattern": 'female'},
                {"label": "SEX", "pattern": 'sex'},
                {"label": "SEX", "pattern": 'biological'},
                {"label": "SEXUALITY", "pattern": 'sexuality'},
                {"label": "SEXUALITY", "pattern": 'orientation'},
                {"label": "SEXUALITY", "pattern": 'queer'},
            ]

ruler.add_patterns(patterns)

In [14]:
# check pipe names to make sure our entity ruler is added

nlp.pipe_names

['tok2vec',
 'tagger',
 'parser',
 'attribute_ruler',
 'lemmatizer',
 'ner',
 'entity_ruler']

In [15]:
# run the nlp() function on our texts to get the docs object
# may have to change max length depending if you get the error when 
# running nlp()

nlp.max_length = 9945910
docs = nlp(text)

In [16]:
docs[:1000]

From the U.S. Government Publishing OfficeS. 5242 Introduced in Senate (IS)&lt;DOC&gt;117th CONGRESS2d SessionS. 5242To prevent international violence against women, and for otherpurposes. IN THE SENATE OF THE UNITED STATES December 13, 2022Mrs. Shaheen (for herself and Ms. Collins) introduced the following bill; which was read twice and referred to the Committee on ForeignRelations A BILL To prevent international violence against women, and for otherpurposes.Be it enacted by the Senate and House of Representatives of the United States of America in Congress assembled,SECTION 1. SHORT TITLE; TABLE OF CONTENTS.(a) Short Title; Table of Contents.--This Act may be cited as the ``International Violence Against Women Act of 2022''.(b) Table of Contents.--The table of contents for this Act is as follows:Sec. 1. Short title; table of contents.Sec. 2. Findings.Sec. 3. Statement of policy.TITLE I--UNITED STATES STRATEGY TO PREVENT AND RESPOND TO GENDER-BASEDVIOLENCE GLOBALLYSec. 101. Global str

In [17]:
len(docs)

446207

In [18]:
# making sure we have all of the components correct in the docs object

for token in docs[:10]:
    print(token)
    print(token.pos_)
    print(token.dep_)

From
ADP
prep
the
DET
det
U.S.
PROPN
compound
Government
PROPN
compound
Publishing
PROPN
pobj
OfficeS.
PROPN
pobj
5242
NUM
nummod
Introduced
VERB
advcl
in
ADP
prep
Senate
PROPN
pobj


In [220]:
# making sure we have entities correct in docs object

labels = ['GENDER', 'SEX', 'SEXUALITY']
for ent in docs.ents[:100]:
    if ent.label_ in labels:
        print(ent.text, ent.label_)

sex SEX
gender GENDER
male GENDER
gender GENDER
gender GENDER
gender GENDER
gender GENDER
gender GENDER
gender GENDER
gender GENDER
sex SEX
gender GENDER
female GENDER
gender GENDER
gender GENDER
gender GENDER
gender GENDER
gender GENDER
gender GENDER
gender GENDER
gender GENDER
gender GENDER
gender GENDER
gender GENDER
gender GENDER
gender GENDER
gender GENDER
gender GENDER


In [20]:
# writing a pattern matcher to grab patterns of text where gender terms are being defined

pattern = [ 
      {
          # punctuation that appears zero or more
          'IS_PUNCT': True, 'OP': '*'
      },
      {
          # specifying the entity type, which can be one of our three
          # custom entities
          "ENT_TYPE": {
              'IN': [
                  'GENDER', 'SEX', 'SEXUALITY'
              ]
          }
      },
      {'OP': '?'}, # catches a "wild card" if it appears zero or one time
      {'OP': '?'}, # catches a "wild card" if it appears zero or one time.
      {
          'IS_PUNCT': True, 'OP': '*' # catches punctuation that appears zero or more times
      },
      {
          # getting the lowercase word of any of the following terms
          'LOWER': {
              'IN': [
                  'means', 'signifies', 'includes'
              ]
          }
      }
  ]

In [180]:
# use matcher class to create a matcher object
matcher = Matcher(nlp.vocab)

# add pattern to matcher
matcher.add('definition', [pattern])

# run matcher over doc
matches = matcher(docs)

In [181]:
len(matches)

40

In [182]:
# printing first ten matches

for match_id, start, end in matches[:10]:
    string_id = nlp.vocab.strings[match_id]  # Get string representation
    span = docs[start:end]  # The matched span
    print(string_id, start, end, span.text)
    print(docs[start].sent)
    print('\n')

definition 2285 2289 gender analysis''--(A) means
Gender analysis.--The term ``gender analysis''--(A) means a socioeconomic analysis of available or gathered quantitative and qualitative information to identify, understand, and explain gaps between men and women, which typically involves examining--(i) differences in the status of women and men and differential access to and control over assets, resources, education, opportunities, and services;(ii) the influence of gender roles, structural barriers, and norms on the division of time between paid, unpaid work (including the subsistence production and care for family members), and volunteer activities;(iii) the influence of gender roles, structural barriers, and norms on leadership roles and decision making; constraints, opportunities, and entry points for narrowing gender gaps and empowering women; and(iv) potential differential impacts of development policies and programs on men and women, including unintended or negative consequences

In [183]:
# writing results to text file

results = []
for match_id, start, end in matches:
    string_id = nlp.vocab.strings[match_id]  # Get string representation
    span = docs[start:end]  # The matched span
    results.append(str((' ')))
    results.append(str(span.text))
    results.append(str(('\n')))
    results.append(str(docs[start].sent))
    results.append(str('\n'))
    # defs.append(str(('\n ')))
    
with open('test/save_text.txt', 'w') as f:
    for line in results:
        f.writelines(line)

# from text to csv
Now, we will load up the text file we just saved, with the goal of re-formatting it into a CSV file.

In [184]:
# loading up the text file

with open('test/save_text.txt', 'r') as f:
    text = f.read()

In [185]:
text[:1000]

" gender analysis''--(A) means\nGender analysis.--The term ``gender analysis''--(A) means a socioeconomic analysis of available or gathered quantitative and qualitative information to identify, understand, and explain gaps between men and women, which typically involves examining--(i) differences in the status of women and men and differential access to and control over assets, resources, education, opportunities, and services;(ii) the influence of gender roles, structural barriers, and norms on the division of time between paid, unpaid work (including the subsistence production and care for family members), and volunteer activities;(iii) the influence of gender roles, structural barriers, and norms on leadership roles and decision making; constraints, opportunities, and entry points for narrowing gender gaps and empowering women; and(iv) potential differential impacts of development policies and programs on men and women, including unintended or negative consequences; and(B) includes 

In [186]:
# want separate out our text into individual strings, using new lines as delimiter
# for item in text
# split item by "/n"

splitted = text.split('\n')


In [187]:
splitted[:10]

[" gender analysis''--(A) means",
 "Gender analysis.--The term ``gender analysis''--(A) means a socioeconomic analysis of available or gathered quantitative and qualitative information to identify, understand, and explain gaps between men and women, which typically involves examining--(i) differences in the status of women and men and differential access to and control over assets, resources, education, opportunities, and services;(ii) the influence of gender roles, structural barriers, and norms on the division of time between paid, unpaid work (including the subsistence production and care for family members), and volunteer activities;(iii) the influence of gender roles, structural barriers, and norms on leadership roles and decision making; constraints, opportunities, and entry points for narrowing gender gaps and empowering women; and(iv) potential differential impacts of development policies and programs on men and women, including unintended or negative consequences; and(B) inclu

In [190]:
# creating lists of our matches and the definitions

defs = []
matches = []
for item in splitted:
    if item != '':
        space = ' '
        if space == item[0]:
            defs.append(item.strip())
        else:
            matches.append(item)

In [191]:
defs[:10]

["gender analysis''--(A) means",
 'orientation includes',
 "gender identity' means",
 "orientation' means",
 "gender transition procedure' means",
 "gender' means",
 "gender transition' means",
 "gender transition surgery' means",
 "-sex hormones' means",
 "sex hormones' means"]

In [192]:
matches[:10]

["Gender analysis.--The term ``gender analysis''--(A) means a socioeconomic analysis of available or gathered quantitative and qualitative information to identify, understand, and explain gaps between men and women, which typically involves examining--(i) differences in the status of women and men and differential access to and control over assets, resources, education, opportunities, and services;(ii) the influence of gender roles, structural barriers, and norms on the division of time between paid, unpaid work (including the subsistence production and care for family members), and volunteer activities;(iii) the influence of gender roles, structural barriers, and norms on leadership roles and decision making; constraints, opportunities, and entry points for narrowing gender gaps and empowering women; and(iv) potential differential impacts of development policies and programs on men and women, including unintended or negative consequences; and(B) includes conclusions and recommendation

In [193]:
# creat data frame, making columns of our lists

import pandas as pd

df = pd.DataFrame(
    {'Match': matches,
     'Definition': defs
    })

In [196]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 40 entries, 0 to 39
Data columns (total 2 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Match       40 non-null     object
 1   Definition  40 non-null     object
dtypes: object(2)
memory usage: 768.0+ bytes


In [197]:
df

Unnamed: 0,Match,Definition
0,Gender analysis.--The term ``gender analysis''...,gender analysis''--(A) means
1,An explicit and comprehensive national solutio...,orientation includes
2,Gender identity.--The term `gender identity' m...,gender identity' means
3,``(5) Sexual orientation.--The term `sexual or...,orientation' means
4,In general.--The term `gender transition proce...,gender transition procedure' means
5,Gender.--The term `gender' means the psycholog...,gender' means
6,Gender transition.--The term `gender transitio...,gender transition' means
7,In general.--The term `gender transition surge...,gender transition surgery' means
8,hormones.--The term `cross-sex hormones' means...,-sex hormones' means
9,hormones.--The term `cross-sex hormones' means...,sex hormones' means


In [198]:
# dropping any duplicates

df.drop_duplicates

<bound method DataFrame.drop_duplicates of                                                 Match  \
0   Gender analysis.--The term ``gender analysis''...   
1   An explicit and comprehensive national solutio...   
2   Gender identity.--The term `gender identity' m...   
3   ``(5) Sexual orientation.--The term `sexual or...   
4   In general.--The term `gender transition proce...   
5   Gender.--The term `gender' means the psycholog...   
6   Gender transition.--The term `gender transitio...   
7   In general.--The term `gender transition surge...   
8   hormones.--The term `cross-sex hormones' means...   
9   hormones.--The term `cross-sex hormones' means...   
10  Biological sex.--The term ``biological sex'' m...   
11  Biological sex.--The term ``biological sex'' m...   
12  Gender identity.--The term ``gender identity''...   
13  In general.--The term ``gender transition'' in...   
14  Gender identity.--The term ``gender identity''...   
15  Gender identity.--The term ``gender ident

In [199]:
# saving to a csv file

df.to_csv('test/defs.csv')