In [4]:
import numpy
import pandas as pd
import Cython
import gensim
from gensim.models import Word2Vec
from gensim.models.keyedvectors import KeyedVectors
from collections import defaultdict
from gensim.models import Word2Vec, KeyedVectors
import Levenshtein, re
import sys

# Colab-only setup: authenticate the current user, then mount Google Drive at
# /content/gdrive/ so the notebook can read and write files stored there.
from google.colab import auth
auth.authenticate_user()
from google.colab import drive
drive.mount('/content/gdrive/')


Mounted at /content/gdrive/


In [6]:
# Work from the project's data-prep folder on the mounted Drive so that the
# relative file names used in later cells resolve there, then list its contents.
%cd /content/gdrive/MyDrive/Diss_Detecting_Provider_Bias/Aim 1/Doubt Markers/1_Data Prep
%ls

/content/gdrive/MyDrive/Diss_Detecting_Provider_Bias/Aim 1/Doubt Markers/1_Data Prep
1_doubt_makers_word_embeddings.py  trig-vectors-phrase.bin
doubt_markers_lexicon_dev.ipynb


# Loading word2vec model

In [8]:

# Load the word vectors stored in word2vec binary format in the working directory.
# NOTE(review): encoding='latin-1' is presumably required because the stored
# vocabulary is not valid UTF-8 - confirm against the source of the .bin file.
model = KeyedVectors.load_word2vec_format('trig-vectors-phrase.bin', binary=True, encoding='latin-1')




# Stigmatizing Labels and Negative Patient Descriptors 
* NIDA Words Matter list
* Extra words selected by substance use experts previously used for MOUD stigma/bias (Jenn Drew and Abeed)
* Negative patient descriptor list 

In [9]:

# Seed lexicon of doubt markers / evidentials, compiled from:
#
# Beach et al. 2021, Testimonial Injustice
#   Judgment words: adamant, apparently, claims, insists, states
#   Evidentials (listed for reference; not part of the seed list below):
#     complains, complained, complaining, denies, denied, denying,
#     endorses, endorsed, endorsing, notes, noting, reports, reporting,
#     says, saying, tells, told, telling
#
# Biber and Finegan 1989, Styles of stance in English
#   Doubt adverbs, doubt verbs and doubt adjectives (see the groups below)
#
# Papafragou et al. 2007, Evidentiality in language and cognition

doubt_stem_words = [
    # judgment words
    "adamant", "claims", "insists", "states",
    # doubt adverbs
    "allegedly", "apparently", "conceivably", "ostensibly", "perchance",
    "perhaps", "possibly", "presumably", "purportedly", "reportedly",
    "reputedly", "seemingly", "supposedly",
    # doubt verbs
    "disbelieve", "doubt", "suspected", "speculated",
    # doubt adjectives
    "alleged", "arguable", "conceivable", "disputable", "doubtful",
    "dubious", "imaginable", "improbable", "presumable", "questionable",
    "reputed", "supposed", "uncertain", "likely",
]

# One row per stem word; later cells add the embedding neighbours to this frame.
bias_words_df = pd.DataFrame(doubt_stem_words, columns=['stem_word'])

In [12]:


# Look up the nearest embedding neighbours of every stem word. Each lookup
# yields a list of (word, similarity score) tuples stored alongside the stem.
bias_words_df['most_similar_words'] = bias_words_df['stem_word'].map(model.most_similar)

# Unroll to one row per (stem, neighbour) pair and number the rows from 1 so
# the split-out tuple columns can be joined back on that id.
bias_words_df_2 = bias_words_df.explode("most_similar_words", ignore_index=True)
bias_words_df_2['new_word_id'] = range(1, len(bias_words_df_2) + 1)

# Split each (word, score) tuple into two positional columns, 0 and 1.
words_sep = pd.DataFrame(bias_words_df_2['most_similar_words'].tolist())
words_sep['new_word_id'] = range(1, len(bias_words_df_2) + 1)

# Join the split columns back, give them readable names, and add an empty
# column for the reviewers to fill in.
bias_words_3 = (
    bias_words_df_2
    .merge(words_sep, on='new_word_id')
    .rename(columns={0: "similar_word", 1: "score"})
    .assign(Relevant_to_study="")
)
bias_words_3.to_csv("doubt_words_lexicon_stem_and_similar_round1.csv")

bias_words_3

Unnamed: 0,stem_word,most_similar_words,new_word_id,similar_word,score,Relevant_to_study
0,adamant,"(adament, 0.7605476379394531)",1,adament,0.760548,
1,adamant,"(timid, 0.760417640209198)",2,timid,0.760418,
2,adamant,"(impish, 0.7475950717926025)",3,impish,0.747595,
3,adamant,"(jolly, 0.736566960811615)",4,jolly,0.736567,
4,adamant,"(dreamball, 0.7204171419143677)",5,dreamball,0.720417,
...,...,...,...,...,...,...
345,likely,"(probable, 0.5059902667999268)",346,probable,0.505990,
346,likely,"(unlikely, 0.4867241084575653)",347,unlikely,0.486724,
347,likely,"(eliminations_luke_harper, 0.4614644944667816)",348,eliminations_luke_harper,0.461464,
348,likely,"(eliminations_daniel_bryan, 0.45495110750198364)",349,eliminations_daniel_bryan,0.454951,


After identifying the 10 most similar words for each stem word, we'll use doubt_words_lexicon_stem_and_similar_round1.csv to filter out words deemed irrelevant to the study by JL and DW.

# Misspelling Generator
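* Expands each reviewed word into likely misspellings by repeatedly searching its embedding neighbours for terms whose Levenshtein ratio to the word exceeds a threshold.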

In [None]:

## Misspelling Generator



def generate_spelling_variants(seedwordlist, word_vectors, semantic_search_length=500, levenshtein_threshold = 0.85, setting = 1):
    """
    Collect likely spelling variants of each seed word from a word-vector model.

    For every seed word the search starts at the seed itself: the model is asked
    for the ``semantic_search_length`` most similar terms, and a term is kept as
    a variant when its Levenshtein ratio to the *seed* is strictly greater than
    ``levenshtein_threshold`` and it contains no underscore. Every kept term is
    then expanded in the same way, until no unseen term turns up.

        setting -> 0 = weighted levenshtein ratios
                -> 1 = standard levenshtein ratios

    :param seedwordlist:            list of words for which spelling variants are to be generated
    :param word_vectors:            the word vector model (must provide ``most_similar(term, topn=...)``)
    :param semantic_search_length:  the number of semantically similar terms to include in each iteration
    :param levenshtein_threshold:   the threshold for levenshtein ratio
    :param setting:                 which ratio function to use (see above)

    :return: dictionary containing the seedwords as key and all the variants as a list of values
             (unique and sorted; a seed with no variants maps to an empty list)

    :raises ValueError: if ``setting`` is neither 0 nor 1
    :raises NameError:  if ``setting`` is 0 and no ``weighted_levenshtein_ratio`` function is defined
    """
    # Resolve the ratio function once, up front, so that a bad configuration
    # fails loudly instead of silently yielding empty results.
    if setting == 1:
        ratio = Levenshtein.ratio
    elif setting == 0:
        try:
            ratio = weighted_levenshtein_ratio
        except NameError as exc:
            raise NameError(
                "setting=0 requires a weighted_levenshtein_ratio(a, b) function to be defined"
            ) from exc
    else:
        raise ValueError("setting must be 0 (weighted) or 1 (standard), got %r" % (setting,))

    # key: the seedword, value: the list of possible misspellings
    variants = defaultdict(list)
    for seedword in seedwordlist:
        found = set()
        # Non-string seeds (e.g. NaN from an empty CSV cell) cannot be looked up
        # or compared; they keep an empty entry rather than aborting the run.
        if isinstance(seedword, str):
            # Terms still to expand, in discovery order; `queued` mirrors the
            # list for constant-time "have we seen this term" checks.
            queue = [seedword]
            queued = {seedword}
            position = 0
            while position < len(queue):
                term = queue[position]
                position += 1
                try:
                    similars = word_vectors.most_similar(term, topn=semantic_search_length)
                except KeyError:
                    # The term is not in the model's vocabulary: nothing to expand.
                    continue
                for similar in similars:
                    similar_term = similar[0]
                    # Underscore-joined terms are never accepted as variants.
                    if "_" in similar_term:
                        continue
                    # Closeness is always measured against the original seed,
                    # not against the intermediate term being expanded.
                    if ratio(str(similar_term), seedword) > levenshtein_threshold:
                        found.add(similar_term)
                        if similar_term not in queued:
                            queued.add(similar_term)
                            queue.append(similar_term)
        variants[seedword] = sorted(found)
    return variants

# Seed the misspelling search with the round-2 word list.
# NOTE(review): presumably this CSV is the reviewed output of round 1 - confirm.
bias_stem_words_round_2 = pd.read_csv("word_list_round_2.csv")

# NOTE(review): the embedding vocabulary shown earlier stores phrases with
# underscores (e.g. eliminations_luke_harper), so seeds rewritten with spaces
# will presumably not be found by most_similar - confirm this is intended.
bias_stem_words_round_2["similar_word"] = bias_stem_words_round_2["similar_word"].replace("_", " ", regex = True)

bias_expanded_word_list = bias_stem_words_round_2["similar_word"]

# Use `model`, the vectors loaded at the top of the notebook; the previously
# referenced `model2` is not defined anywhere in this notebook.
expanded = generate_spelling_variants(bias_expanded_word_list, model, semantic_search_length=500, levenshtein_threshold = 0.85, setting = 1)

# One row per seed word, one column per variant position.
df = pd.DataFrame.from_dict(expanded, orient ='index')

df.to_csv("expanded_misspellings.csv")

In [None]:
# NOTE(review): "doubt_markers_le" matches no file in the directory listing
# shown earlier; presumably doubt_markers_lexicon_dev.ipynb was meant - confirm
# before running.
!git add "doubt_markers_le"