In [None]:
#@title Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

In [None]:
#@title Setup
# Install the `tide_nlp` package straight from its subdirectory of the
# google-research repository, then fetch the small English spaCy pipeline
# (`en_core_web_sm`) that the configuration cell loads with `spacy.load`.
!pip install git+https://github.com/google-research/google-research.git#subdirectory=tide_nlp
!python -m spacy download en_core_web_sm

In [None]:
#@title Imports
import sys

import bs4
from importlib_resources import files
import pandas as pd
import requests
import spacy

import tide_nlp as tide_nlp
from tide_nlp import identity_annotator as ia
from tide_nlp.entity_annotator import identity_spacy_annotator as i_spacy_a
from tide_nlp.entity_annotator import non_ptc_annotator as non_ptc_a
from tide_nlp.entity_annotator import ptc_annotator as ptc_a
from tide_nlp.entity_annotator import ptc_helper as ptc
from tide_nlp.entity_annotator import spacy_annotator as spacy_a
from tide_nlp.lexicon import tidal_lexicon as lex
from tide_nlp.tokenizer import spacy_tokenizer as tok

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [None]:
#@title Initialize TIDAL lexicon

# File name of the lexicon CSV, resolved below relative to the `data/`
# directory of the installed `tide_nlp` package.
TIDAL_LEXICON_FILE = 'tidal_sample.csv'

# Only these columns are read from the lexicon CSV.
TIDAL_COLUMNS = [
    'Term',
    'IdentityGroup',
    'IdentitySubgroup',
    'Connotation',
    'HasNonIdentityMeaning',
    'IsRootTerm',
    'IsPTCTerm',
    'IdentityGroup_Connotation_ConvergenceScore',
]

tidal_lexicon_path = files(tide_nlp).joinpath(f'data/{TIDAL_LEXICON_FILE}')
tidal_lexicon_df = pd.read_csv(
    tidal_lexicon_path,
    usecols=TIDAL_COLUMNS,
    engine='python',
)

In [None]:
#@title Download person noun lexicon
# Wiktionary category listing of English terms of address. The listing is
# paginated, so the second URL is the continuation page (`pagefrom=...`).
PERSON_NOUN_LEXICON_URLS = ['https://en.wiktionary.org/w/index.php?title=Category:English_terms_of_address',
                            'https://en.wiktionary.org/w/index.php?title=Category:English_terms_of_address&pagefrom=SNOOKUMS%0Asnookums#mw-pages']

# Seconds to wait on each request. `requests` applies no timeout by default,
# so without this a stalled connection would hang the cell indefinitely.
REQUEST_TIMEOUT_SECONDS = 30

person_noun_terms = []

for url in PERSON_NOUN_LEXICON_URLS:
  response = requests.get(url, timeout=REQUEST_TIMEOUT_SECONDS)
  # Fail loudly on an HTTP error: parsing an error page would otherwise
  # produce an empty or partial lexicon without any visible warning.
  response.raise_for_status()
  soup = bs4.BeautifulSoup(response.content, 'html.parser')
  mw_category_divs = soup.find_all('div', {'class': 'mw-category-group'})

  # Every link inside a category group is treated as one candidate term.
  for div in mw_category_divs:
    for a in div.find_all('a'):
      noun = a.text.lower()

      # Remove terms that are less than 3 characters (eg Mt)
      if len(noun) < 3:
        continue
      # Remove terms that have a period (eg Mr. President)
      if '.' in noun:
        continue

      person_noun_terms.append(noun)

# Single-column frame ('noun') handed to `PersonMentionHelper` later on.
person_lexicon_df = pd.DataFrame(person_noun_terms, columns=['noun'])

In [None]:
#@title Download custom annotation model

# This cell is intentionally disabled: nothing below runs, so `custom_nlp` is
# never defined. To use the custom identity NER model, uncomment these lines
# together with the `custom_spacy_annotator` line in the configuration cell.
# NOTE(review): despite the cell title, the disabled code does not download
# anything; it loads the model from the installed package's `data/` directory.
# CUSTOM_MODEL_DIR = 'identity_ner_spacy_balanced'
# custom_ner_model_path = files(tide_nlp).joinpath("data/" + CUSTOM_MODEL_DIR)
# custom_nlp = spacy.load(custom_ner_model_path)

In [None]:
#@title Configure annotation options

# One spaCy pipeline is loaded and shared by the tokenizer, both person-mention
# helpers and the spaCy annotator below.
model_path = 'en_core_web_sm'
nlp = spacy.load(model_path)

# Lexicon and tokenizer passed to `IdentityAnnotator` in the test cell.
lexicon = lex.TidalLexicon(tidal_lexicon_df)
tokenizer = tok.SpacyTokenizer(nlp)

# Variant 1: person-mention helper built from the downloaded person-noun
# lexicon (`person_lexicon_df`). These are the annotators the test cell uses.
person_helper_lexicon = ptc.PersonMentionHelper(nlp, person_lexicon_df)
ptc_lexicon_annotator = ptc_a.PtcAnnotator(person_helper_lexicon)
non_ptc_lexicon_annotator = non_ptc_a.NonPtcAnnotator(person_helper_lexicon)

# Variant 2: helper built with `use_nltk_similarity=True` instead of a lexicon
# frame. These annotators are constructed here but are not passed to the
# `IdentityAnnotator` created in the test cell.
person_helper_similarity = ptc.PersonMentionHelper(nlp, use_nltk_similarity=True)
ptc_similarity_annotator = ptc_a.PtcAnnotator(person_helper_similarity)
non_ptc_similarity_annotator = non_ptc_a.NonPtcAnnotator(person_helper_similarity)

spacy_annotator = spacy_a.SpacyAnnotator(nlp)
# Disabled because `custom_nlp` is only defined by the commented-out custom
# model cell; enable both together.
# custom_spacy_annotator = i_spacy_a.IdentitySpacyAnnotator(custom_nlp)

In [None]:
#@title Test annotation

# This uses a simple token-based annotation logic to determine whether an
# identity term is modifying a known person noun based on the lexicon.
entity_annotators = [ptc_lexicon_annotator, spacy_annotator]
non_entity_annotators = [non_ptc_lexicon_annotator]

# `annotator` is reused by the bulk CivilComments annotation cell further down.
annotator = ia.IdentityAnnotator(lexicon=lexicon,
                                 tokenizer=tokenizer,
                                 entity_annotators=entity_annotators,
                                 non_entity_annotators=non_entity_annotators)

test_text = '''Black Americans have a hive mind mentality and automatically switch political party preferences just like that. Even to the parties who have white in there flags.'''

# The text is lowercased before annotation (presumably to match lowercase
# lexicon terms; TODO confirm). `annotate` is unpacked into four values: the
# identity groups, the identity terms, a group -> terms mapping, and a frame
# of annotation candidates.
groups, terms, group_term_dict, df = annotator.annotate(test_text.lower())

print('identity groups: ', groups)
print('identity terms: ', terms)
print('identity group-term dictionary:\n', group_term_dict)
# The candidate frame is printed as CSV text rather than displayed as a table.
print('annotation candidates:\n', df.to_csv())

identity groups:  ['Race_Nationality_Ethnicity']
identity terms:  ['black' 'white']
identity group-term dictionary:
 {'Race_Nationality_Ethnicity': ['black', 'white']}
annotation candidates:
 ,mention.tokens.limit,mention.tokens.start,mention.type,ptc.identity_term,ptc.identity_token,ptc.person_term,ptc.person_token,ptc.ptc_term,ptc.text,IdentityGroup,IdentitySubgroup,token.dependencyLabel,token.pos,IsPTCTerm,IdentityGroup_Connotation_ConvergenceScore,token.lemma,bytes.start,token.tag,token.dependencyHead.index,IsRootTerm,token.index,Term,text,HasNonIdentityMeaning,bytes.limit,Connotation,PossibleNonIdentity
0,2.0,1.0,NORP,black,0.0,americans,1.0,black americans,black,Race_Nationality_Ethnicity,Black,amod,ADJ,False,1.0,black,0,JJ,1,True,0,black,black,True,5,"('NEUTRAL',)",False
1,,,,,,,,,,Race_Nationality_Ethnicity,Black,amod,ADJ,False,1.0,black,0,JJ,1,True,0,black,black,True,5,"('NEUTRAL',)",
2,,,,,,,,,,Race_Nationality_Ethnicity,White,relcl,ADJ,False,1.0,white,141,JJ,20,True,23,white

In [None]:
#@title Utility functions for bulk annotation

import pandas as pd
from tqdm import tqdm
# Enables tqdm's pandas integration; the `progress_apply` call in the
# CivilComments annotation cell depends on this having been run first.
tqdm.pandas()

def annotate_example_row_lib(lib, row, text_column='comment_text'):
  """Annotates the text of a single row and stores the results on that row.

  Args:
    lib: Object exposing `annotate(text)` that returns the 4-tuple
      `(groups, terms, group_term_dict, df)`.
    row: Mapping-like row (for example the Series passed by
      `DataFrame.apply(..., axis=1)`) holding the text under `text_column`.
      It is modified in place.
    text_column: Key of the field that contains the text to annotate.

  Returns:
    The same `row` object. When at least one identity group is found it gains
    the keys 'identity_groups', 'identity_terms' and
    'annotation_group_term_dict'; when the candidate frame is non-empty it
    gains 'df' (a list of record dicts). Rows whose text is not a string are
    returned unchanged.
  """
  text = row[text_column]
  # A missing comment is read from CSV as NaN (a float), which has no
  # `.lower()`. Skip such rows instead of letting one bad value abort the
  # whole bulk run.
  if not isinstance(text, str):
    return row

  groups, terms, group_term_dict, df = lib.annotate(text.lower())

  # Result keys are only added when there is something to report, so rows
  # without matches keep their original set of keys.
  if len(groups):
    row['identity_groups'] = groups
    row['identity_terms'] = terms
    row['annotation_group_term_dict'] = group_term_dict

  if not df.empty:
    # Stored as plain records so the value fits in a single cell.
    row['df'] = df.to_dict('records')

  return row

In [None]:
#@title Fetch CivilComments data

# Downloads `validate_df_processed.csv` from the public civil_comments_dataset
# bucket to /tmp/civil_comments.csv, the path the annotation cell reads from.
!curl https://storage.googleapis.com/civil_comments_dataset/validate_df_processed.csv -o /tmp/civil_comments.csv

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100  219M  100  219M    0     0   153M      0  0:00:01  0:00:01 --:--:--  153M


In [None]:
#@title Annotate a sample of CivilComments
# Only the `comment_text` column is loaded; 20 rows are drawn with a fixed
# `random_state` so the same sample is selected on every run.
cc_df = pd.read_csv('/tmp/civil_comments.csv', usecols=['comment_text']).sample(20, random_state=789)

# Row-wise annotation with a progress bar. `progress_apply` is only available
# after `tqdm.pandas()` has been called (done in the utility cell).
annotated_cc_df = cc_df.progress_apply(lambda x: annotate_example_row_lib(annotator, x), axis=1)

100%|██████████| 20/20 [00:16<00:00,  1.24it/s]


In [None]:
# Display the annotated sample; rows without any identity match show empty
# values in the annotation columns.
annotated_cc_df

Unnamed: 0,annotation_group_term_dict,comment_text,df,identity_groups,identity_terms
705165,,Cut the BS. Parental leave is paid for through...,,,
523627,{'SOGIESC': ['woman']},Texting requires taking ones eyes off the road...,"[{'IdentityGroup': 'SOGIESC', 'IdentityFa...",[SOGIESC],[woman]
698888,,David Boyle is correct. To big a project with ...,,,
234859,"{'SOGIESC': ['woman', 'women', 'men']}","Someone said, ""Jesus picked males because in h...","[{'IdentityGroup': 'SOGIESC', 'IdentityFa...",[SOGIESC],"[men, women, woman]"
363171,,"""...Dave Nichol knew how to tell a story and h...",,,
698236,,okay folks here is the law in BC\n\nup until 2...,,,
707493,,Leave it to trump to cut out the middleman.,,,
560366,,My residency status doesn't mean that your ign...,,,
297069,,"Funny, how Ohio and Virginia are marching to t...",,,
465256,,Make her drink 1/4 of it. If she will OK.,,,
