In [2]:
# import our matcher class
import spacy
from spacy.matcher import Matcher

In [59]:
# load the language model from spacy

nlp = spacy.load("en_core_web_sm")

In [4]:
# check the pipe components

nlp.pipe_names

['tok2vec', 'tagger', 'parser', 'attribute_ruler', 'lemmatizer', 'ner']

In [57]:
# load up the plain text from the bills
with open ('./data/118s_clean.txt', 'r') as f:
    text = f.read()

In [58]:
len(text)

8085198

In [60]:
text[:1000]

'b"Congressional Bills 118th CongressFrom the U.S. Government Publishing OfficeS. 1595 Introduced in Senate (IS)&lt;DOC&gt;118th CONGRESS1st SessionS. 1595To prohibit taxpayer-funded gender transition procedures, and for otherpurposes. IN THE SENATE OF THE UNITED STATESMay 15, 2023 Mr. Marshall (for himself, Mrs. Blackburn, Mr. Braun, Mr. Cramer, Mr. Daines, Mrs. Hyde-Smith, Mr. Lee, Mr. Mullin, Mr. Risch, Mr. Rubio, Mr.Wicker, and Mr. Hawley) introduced the following bill; which was readtwice and referred to the Committee on Finance A BILL To prohibit taxpayer-funded gender transition procedures, and for otherpurposes.Be it enacted by the Senate and House of Representatives of the United States of America in Congress assembled,SECTION 1. SHORT TITLE; TABLE OF CONTENTS.(a) Short Title.--This Act may be cited as the ``End Taxpayer Funding of Gender Experimentation Act of 2023\'\'.(b) Table of Contents.--The table of contents of this Act is as follows:Sec. 1. Short title; table of conten

In [61]:
type(text)

str

In [62]:
# we want to split our bills into lists of individual bills, which
# will make it easier to process with spacy later

# split by text that is included in all the bills, for example
# "Congressional Bills 117th Congress"

texts = text.split('b"Congressional Bills 118th Congress')

In [63]:
len(texts)

210

In [11]:
# the amount of text is overwhelming for spacy to process at once, so
# make into lists for easier processing

# text_one = texts[:200]
# text_two = texts[200:300]
# three = texts[300:400]

In [64]:
#####

#####

##### CHANGE THE VARIABLE TO MAKE SURE ITS THE CORRECT TEXT

#####

#####

type(texts)

list

In [65]:
#####

#####

##### CHANGE THE VARIABLE TO MAKE SURE ITS THE CORRECT TEXT

#####

#####


texts[3]

'From the U.S. Government Publishing OfficeS. 1597 Introduced in Senate (IS)&lt;DOC&gt;118th CONGRESS1st SessionS. 1597 To amend chapter 110 of title 18, United States Code, to prohibit gender transition procedures on minors, and for other purposes. IN THE SENATE OF THE UNITED STATESMay 15, 2023 Mr. Marshall (for himself, Mrs. Blackburn, Mr. Cramer, Mr. Braun, Mr. Daines, Mrs. Hyde-Smith, Mr. Mullin, and Mr. Wicker) introduced thefollowing bill; which was read twice and referred to the Committee onthe Judiciary A BILLTo amend chapter 110 of title 18, United States Code, to prohibit gender transition procedures on minors, and for other purposes.Be it enacted by the Senate and House of Representatives of the United States of America in Congress assembled,SECTION 1. SHORT TITLE.This Act may be cited as the ``Protecting Children From Experimentation Act of 2023\'\'.SEC. 2. GENDER TRANSITION PROCEDURES ON MINORS PROHIBITED.Chapter 110 of title 18, United States Code, is amended--(1) by addi

In [66]:
# create an entity ruler to label gender terms as "gender", etc

ruler = nlp.add_pipe("entity_ruler", after="ner")

patterns = [
                {"label": "SEX", "pattern": [{'LOWER': 'biological'}, {'LOWER': 'sex'}]},
                {"label": "SEX", "pattern": 'biological'},
                {"label": "SEX", "pattern": [{'LOWER': 'cross'}, {'IS_PUNCT': True}, {'LOWER': 'sex'}]},
                {"label": "SEX", "pattern": [{'LOWER': 'opposite'}, {'LOWER': 'sex'}]},
                {"label": "SEX", "pattern": 'sex'},
                {"label": "SEX", "pattern": [{'LOWER': 'trans'}, {'IS_PUNCT': True}, {'LOWER': 'sexual'}]},
                {"label": "SEX", "pattern": [{'LOWER': 'trans'}, {'LOWER': 'sexual'}]},
                {"label": "SEX", "pattern": 'transsexual'},
                {"label": "GENDER", "pattern": 'genderqueer'},
                {"label": "GENDER", "pattern": 'gender'},
                {"label": "GENDER", "pattern": 'transgender'},
                {"label": "GENDER", "pattern": [{'LOWER': 'trans'}, {'IS_PUNCT': True}, {'LOWER': 'gender'}]},
                {"label": "GENDER", "pattern": [{'LOWER': 'trans'}, {'LOWER': 'gender'}]},
                {"label": "GENDER", "pattern": 'trans'},
                {"label": "GENDER", "pattern": [{'LOWER': 'non'}, {'IS_PUNCT': True}, {'LOWER': 'binary'}]},
                {"label": "GENDER", "pattern": [{'LOWER': 'non'}, {'LOWER': 'binary'}]},
                {"label": "GENDER", "pattern": 'nonbinary'},
                {"label": "GENDER", "pattern": 'male'},
                {"label": "GENDER", "pattern": 'female'},
                {"label": "SEXUALITY", "pattern": 'sexuality'},
                {"label": "SEXUALITY", "pattern": 'orientation'},
                {"label": "SEXUALITY", "pattern": 'lgbt'},
                {"label": "SEXUALITY", "pattern": 'lgbt+'},
                {"label": "SEXUALITY", "pattern": 'lgbtqia+'},
                {"label": "SEXUALITY", "pattern": 'queer'}
            ]

ruler.add_patterns(patterns)

In [67]:
# check pipe names to make sure our entity ruler is added

nlp.pipe_names

['tok2vec',
 'tagger',
 'parser',
 'attribute_ruler',
 'lemmatizer',
 'ner',
 'entity_ruler']

In [68]:
#####

#####

##### CHANGE THE VARIABLE TO MAKE SURE ITS THE CORRECT TEXT

#####

#####

texts = ''.join(texts)

In [69]:
len(texts)

8077674

In [70]:
# run the nlp() function on our texts to get the docs object
# may have to change max length depending if you get the error when 
# running nlp()

nlp.max_length = 9945910
docs = nlp(texts)

In [71]:
docs[:1000]

From the U.S. Government Publishing OfficeS. 1595 Introduced in Senate (IS)&lt;DOC&gt;118th CONGRESS1st SessionS. 1595To prohibit taxpayer-funded gender transition procedures, and for otherpurposes. IN THE SENATE OF THE UNITED STATESMay 15, 2023 Mr. Marshall (for himself, Mrs. Blackburn, Mr. Braun, Mr. Cramer, Mr. Daines, Mrs. Hyde-Smith, Mr. Lee, Mr. Mullin, Mr. Risch, Mr. Rubio, Mr.Wicker, and Mr. Hawley) introduced the following bill; which was readtwice and referred to the Committee on Finance A BILL To prohibit taxpayer-funded gender transition procedures, and for otherpurposes.Be it enacted by the Senate and House of Representatives of the United States of America in Congress assembled,SECTION 1. SHORT TITLE; TABLE OF CONTENTS.(a) Short Title.--This Act may be cited as the ``End Taxpayer Funding of Gender Experimentation Act of 2023''.(b) Table of Contents.--The table of contents of this Act is as follows:Sec. 1. Short title; table of contents. TITLE I--PROHIBITING FEDERALLY FUND

In [72]:
len(docs)

1371174

In [73]:
# making sure we have all of the components correct in the docs object

for token in docs[:10]:
    print(token)
    print(token.pos_)
    print(token.dep_)

From
ADP
ROOT
the
DET
det
U.S.
PROPN
compound
Government
PROPN
compound
Publishing
PROPN
pobj
OfficeS.
PROPN
pobj
1595
NUM
nummod
Introduced
VERB
acl
in
ADP
prep
Senate
PROPN
pobj


In [74]:
# making sure we have entities correct in docs object

labels = ['GENDER', 'SEX', 'SEXUALITY']
for ent in docs.ents[:1000]:
    if ent.label_ in labels:
        print(ent.text, ent.label_)

gender GENDER
gender GENDER
gender GENDER
gender GENDER
gender GENDER
gender GENDER
gender GENDER
gender GENDER
gender GENDER
gender GENDER
gender GENDER
gender GENDER
gender GENDER
gender GENDER
gender GENDER
gender GENDER
biological sex SEX
biological SEX
male GENDER
female GENDER
sex SEX
sex SEX
gender GENDER
Cross-sex SEX
cross-sex SEX
biological SEX
biological SEX
biological SEX
biological SEX
gender GENDER
male GENDER
female GENDER
gender GENDER
gender GENDER
biological sex SEX
gender GENDER
biological sex SEX
gender GENDER
biological sex SEX
sex SEX
sex SEX
gender GENDER
cross-sex SEX
opposite sex SEX
gender GENDER
gender GENDER
sex SEX
biological sex SEX
sex SEX
sex SEX
sex SEX
biological SEX
male GENDER
biological SEX
female GENDER
gender GENDER
gender GENDER
gender GENDER
gender GENDER
biological sex SEX
sex SEX
sex SEX
gender GENDER
gender GENDER
gender GENDER
gender GENDER
gender GENDER
gender GENDER
gender GENDER
gender GENDER
male GENDER
female GENDER
gender GENDER
gender

In [75]:
# writing a pattern matcher to grab patterns of text where gender terms are being defined

pattern = [ 
      {
          # punctuation that appears zero or more
          'IS_PUNCT': True, 'OP': '*'
      },
      {
          # specifying the entity type, which can be one of our three
          # custom entities
          "ENT_TYPE": {
              'IN': [
                  'GENDER', 'SEX', 'SEXUALITY'
              ]
          }
      },
      {'OP': '?'}, # catches a "wild card" if it appears zero or one time
      {'OP': '?'}, # catches a "wild card" if it appears zero or one time.
      {'OP': '?'}, # catches a "wild card" if it appears zero or one time.
      {
          'IS_PUNCT': True, 'OP': '*' # catches punctuation that appears zero or more times
      },
      {
          # getting the lowercase word of any of the following terms
          'LOWER': {
              'IN': [
                  'means', 'signifies', 'includes'
              ]
          }
      }
  ]

In [76]:
# use matcher class to create a matcher object
matcher = Matcher(nlp.vocab)

# add pattern to matcher
matcher.add('definition', [pattern])

# run matcher over doc
matches = matcher(docs)

In [77]:
len(matches)

67

In [78]:
# printing first ten matches

for match_id, start, end in matches[:10]:
    string_id = nlp.vocab.strings[match_id]  # Get string representation
    span = docs[start:end]  # The matched span
    print(string_id, start, end, span.text)
    print(docs[start].sent)
    print('\n')

definition 801 804 sex' means
Biological sex.--The term `biological sex' means the biological indication of male or female in the context of reproductive potential or capacity, such as sex chromosomes, naturally occurring sex hormones, gonads, and non-ambiguous internal and external genitalia present at birth, without regard to an individual's psychological, chosen, or subjective experience of gender.


definition 800 804 biological sex' means
Biological sex.--The term `biological sex' means the biological indication of male or female in the context of reproductive potential or capacity, such as sex chromosomes, naturally occurring sex hormones, gonads, and non-ambiguous internal and external genitalia present at birth, without regard to an individual's psychological, chosen, or subjective experience of gender.


definition 933 936 gender' means
Gender.--The term `gender' means the psychological, behavioral, social, and cultural aspects of being male or female.


definition 958 962 gen

In [79]:
# writing results to text file

results = []
for match_id, start, end in matches:
    string_id = nlp.vocab.strings[match_id]  # Get string representation
    span = docs[start:end]  # The matched span
    results.append(str((' ')))
    results.append(str(span.text))
    results.append(str(('\n')))
    results.append(str(docs[start].sent))
    results.append(str('\n'))
    # defs.append(str(('\n ')))

#####

#####

##### CHANGE THE FILENAME TO MAKE SURE ITS THE CORRECT ONE

#####

#####

with open('new_out/118_s.txt', 'w') as f:
    for line in results:
        f.writelines(line)

# from text to csv
Now, we will load up the text file we just saved, with the goal of re-formatting it into a CSV file.

In [80]:
# loading up the text file

#####

#####

##### CHANGE THE FILENAME TO MAKE SURE ITS THE CORRECT TEXT

#####

#####

with open('new_out/118_s.txt', 'r') as f:
    text = f.read()

In [81]:
text[:1000]

" sex' means\nBiological sex.--The term `biological sex' means the biological indication of male or female in the context of reproductive potential or capacity, such as sex chromosomes, naturally occurring sex hormones, gonads, and non-ambiguous internal and external genitalia present at birth, without regard to an individual's psychological, chosen, or subjective experience of gender.\n biological sex' means\nBiological sex.--The term `biological sex' means the biological indication of male or female in the context of reproductive potential or capacity, such as sex chromosomes, naturally occurring sex hormones, gonads, and non-ambiguous internal and external genitalia present at birth, without regard to an individual's psychological, chosen, or subjective experience of gender.\n gender' means\nGender.--The term `gender' means the psychological, behavioral, social, and cultural aspects of being male or female.\n gender transition' means\nGender transition.--The term `gender transition'

In [82]:
# want separate out our text into individual strings, using new lines as delimiter
# for item in text
# split item by "/n"

splitted = text.split('\n')


In [83]:
splitted[:10]

[" sex' means",
 "Biological sex.--The term `biological sex' means the biological indication of male or female in the context of reproductive potential or capacity, such as sex chromosomes, naturally occurring sex hormones, gonads, and non-ambiguous internal and external genitalia present at birth, without regard to an individual's psychological, chosen, or subjective experience of gender.",
 " biological sex' means",
 "Biological sex.--The term `biological sex' means the biological indication of male or female in the context of reproductive potential or capacity, such as sex chromosomes, naturally occurring sex hormones, gonads, and non-ambiguous internal and external genitalia present at birth, without regard to an individual's psychological, chosen, or subjective experience of gender.",
 " gender' means",
 "Gender.--The term `gender' means the psychological, behavioral, social, and cultural aspects of being male or female.",
 " gender transition' means",
 "Gender transition.--The te

In [84]:
# creating lists of our matches and the definitions

defs = []
matches = []
for item in splitted:
    if item != '':
        space = ' '
        if space == item[0]:
            matches.append(item.strip())
        else:
            defs.append(item)

In [85]:
defs[:10]

["Biological sex.--The term `biological sex' means the biological indication of male or female in the context of reproductive potential or capacity, such as sex chromosomes, naturally occurring sex hormones, gonads, and non-ambiguous internal and external genitalia present at birth, without regard to an individual's psychological, chosen, or subjective experience of gender.",
 "Biological sex.--The term `biological sex' means the biological indication of male or female in the context of reproductive potential or capacity, such as sex chromosomes, naturally occurring sex hormones, gonads, and non-ambiguous internal and external genitalia present at birth, without regard to an individual's psychological, chosen, or subjective experience of gender.",
 "Gender.--The term `gender' means the psychological, behavioral, social, and cultural aspects of being male or female.",
 "Gender transition.--The term `gender transition' means the process in which an individual goes from identifying with a

In [86]:
matches[:10]

["sex' means",
 "biological sex' means",
 "gender' means",
 "gender transition' means",
 "gender transition procedure' means",
 "gender transition surgery' means",
 "gender transition surgery' includes",
 "gender transition surgery' means",
 "gender transition surgery' means",
 "-sex hormones'' means"]

In [87]:
# creat data frame, making columns of our lists

import pandas as pd

df = pd.DataFrame(
    {'Match': matches,
     'Definition': defs
    })

In [88]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 67 entries, 0 to 66
Data columns (total 2 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Match       67 non-null     object
 1   Definition  67 non-null     object
dtypes: object(2)
memory usage: 1.2+ KB


In [89]:
df

Unnamed: 0,Match,Definition
0,sex' means,Biological sex.--The term `biological sex' mea...
1,biological sex' means,Biological sex.--The term `biological sex' mea...
2,gender' means,Gender.--The term `gender' means the psycholog...
3,gender transition' means,Gender transition.--The term `gender transitio...
4,gender transition procedure' means,In general.--The term `gender transition proce...
...,...,...
62,sex (which includes,Nondiscrimination.--No person in the United St...
63,gender identity' means,the term `Form 10-K' means the form described ...
64,orientation' means,"the term `judgment' means, with respect to an ..."
65,sex'' means,Biological sex.--The term ``biological sex'' m...


In [90]:
# dropping any duplicates

df.drop_duplicates

<bound method DataFrame.drop_duplicates of                                  Match  \
0                           sex' means   
1                biological sex' means   
2                        gender' means   
3             gender transition' means   
4   gender transition procedure' means   
..                                 ...   
62                 sex (which includes   
63              gender identity' means   
64                  orientation' means   
65                         sex'' means   
66              biological sex'' means   

                                           Definition  
0   Biological sex.--The term `biological sex' mea...  
1   Biological sex.--The term `biological sex' mea...  
2   Gender.--The term `gender' means the psycholog...  
3   Gender transition.--The term `gender transitio...  
4   In general.--The term `gender transition proce...  
..                                                ...  
62  Nondiscrimination.--No person in the United St...  
63  

In [91]:
# saving to a csv file

#####

#####

##### CHANGE THE FILENAME TO MAKE SURE ITS THE CORRECT TEXT

#####

#####

df.to_csv('new_out/s_118_defs.csv')