In [29]:
import pandas as pd
import numpy as np
import spacy

In [30]:
# Load the testimonial table from the project's CSV export.
narratives_csv = "conversion-narratives/data/website/walkaway_testimonials.csv"
narratives = pd.read_csv(narratives_csv)

In [31]:
# Preview the first five rows to check the columns that were read.
narratives.head(n=5)

Unnamed: 0,Title,Body,Views,Being,Becoming,postid
0,Trace & Gary's #WalkAway,We don't fall for identity politics. We are a ...,"8,996 views",1.0,0.0,1
1,Avialea's #WalkAway,With every rabid attack that the left makes on...,"5,850 views",1.0,0.0,2
2,Shane's #WalkAway,I am not your normal republican. I am gay and ...,"8,532 views",0.0,1.0,3
3,Sherry's #WalkAway,It is so encouraging to read the stories of pe...,"4,085 views",0.0,1.0,4
4,Terri's #WalkAway,I am 66 yrs old and have been a lifelong moder...,"4,760 views",0.0,1.0,5


In [32]:
# The spaCy pipeline has to be loaded before any text can be parsed.
spacy_model = 'en_core_web_lg'
nlp = spacy.load(spacy_model)

In [33]:
# Parse every narrative body with spaCy and keep the parsed result
# next to the raw text in a new 'parsed_text' column.
parsed_bodies = nlp.pipe(narratives['Body'])
narratives['parsed_text'] = list(parsed_bodies)

In [34]:
# Sanity check: print the dependency parse of one narrative, token by token
# (text, dependency label, head text, head part of speech, children).
example_doc = narratives['parsed_text'][2]
for token in example_doc:
    governor = token.head
    print(token.text, token.dep_, governor.text, governor.pos_, list(token.children))

I nsubj am AUX []
am ROOT am AUX [I, not, republican, .]
not neg am AUX []
your poss republican ADJ []
normal amod republican ADJ []
republican attr am AUX [your, normal]
. punct am AUX []
I nsubj am AUX []
am ROOT am AUX [I, gay, and, live]
gay acomp am AUX []
and cc am AUX []
I nsubj live VERB []
live conj am AUX [I, in, .]
in prep live VERB [Tennessee]
Tennessee pobj in ADP []
. punct live VERB []
Throughout prep supported VERB [most]
most pobj Throughout ADP [of]
of prep most ADJ [life]
my poss life NOUN []
life pobj of ADP [my]
I nsubj supported VERB []
supported ROOT supported VERB [Throughout, I, democrats, .]
democrats dobj supported VERB [and, left]
and cc democrats PROPN []
the det left NOUN []
left conj democrats PROPN [the]
. punct supported VERB []
I nsubj thought VERB []
thought ROOT thought VERB [I, was]
because mark was AUX []
I nsubj was AUX []
was advcl thought VERB [because, I, gay]
gay acomp was AUX []
I nsubj had AUX []
had ROOT had AUX [I, support, .]
to aux suppo

In [35]:
#VERBS IN NARRATIVES
# Collect every token whose part of speech is VERB, together with its
# fine-grained tag, the narrative type of its post and the post id.
verbs = list()
tense = list()
typ_narr = list()
post_id = list()

# Walk the rows with zip() instead of a hand-maintained counter:
# narratives['postid'][i] is a *label* lookup, so it only matches the i-th
# row while the frame still has its default 0..n-1 index. zip() pairs each
# parsed doc with the values from its own row regardless of the index.
for doc, is_being, is_becoming, pid in zip(
        narratives['parsed_text'],
        narratives['Being'],
        narratives['Becoming'],
        narratives['postid']):
    # The narrative type is the same for every token of a post, so it is
    # resolved once per document rather than once per verb.
    if is_being == 1.0:
        narr_label = "Being"
    elif is_becoming == 1.0:
        narr_label = "Becoming"
    else:
        narr_label = "NA"

    for poss_verb in doc:
        if poss_verb.pos == spacy.symbols.VERB:
            verbs.append(poss_verb)
            tense.append(poss_verb.tag_)
            post_id.append(pid)
            typ_narr.append(narr_label)

In [37]:
# Assemble the parallel per-verb lists into a single table (one row per verb).
verb_columns = {
    "verbs": verbs,
    "tense": tense,
    "type_of_narr": typ_narr,
    "postid": post_id,
}
verbs_in_texts = pd.DataFrame(verb_columns)

In [38]:
# Spot-check ten random rows. random_state pins the draw so the preview is
# the same on every Restart & Run All instead of changing with each run.
verbs_in_texts.sample(10, random_state=42)

Unnamed: 0,verbs,tense,type_of_narr,postid
694,make,VBP,Being,12
887,raise,VB,Becoming,17
615,involves,VBZ,Being,11
155,made,VBD,Becoming,5
1648,realize,VBP,Becoming,31
131,bought,VBN,Becoming,5
855,fit,VB,Becoming,16
1347,voted,VBD,Becoming,25
906,asked,VBD,Becoming,17
1284,'ll,MD,Becoming,23


In [39]:
# Write the verb table to disk (the index is written too, as in the default to_csv behaviour).
verbs_in_texts.to_csv(path_or_buf="verbs_in_website.csv")

In [42]:
#getting nouns
# Collect every noun chunk of every narrative, together with the post id and
# the narrative type of the post it came from.
noun_phrases = list()
post_ids = list()
typ_narr = list()

# zip() pairs each parsed doc with the values from its own row. The previous
# counter-based lookup (narratives['postid'][i]) is label-based and only
# lines up with row i while the frame keeps its default 0..n-1 index.
for doc, is_being, is_becoming, pid in zip(
        narratives['parsed_text'],
        narratives['Being'],
        narratives['Becoming'],
        narratives['postid']):
    # Same label for every chunk of a post, so resolve it once per document.
    if is_being == 1.0:
        narr_label = "Being"
    elif is_becoming == 1.0:
        narr_label = "Becoming"
    else:
        narr_label = "NA"

    for phrase in doc.noun_chunks:
        noun_phrases.append(phrase)
        post_ids.append(pid)
        typ_narr.append(narr_label)

In [44]:
# Turn the parallel lists into a table, one row per noun chunk.
# NOTE: this rebinds `noun_phrases` from the list to the DataFrame, so the
# collecting cell must be re-run before this cell is run a second time.
noun_phrase_columns = {
    "noun_phrase": noun_phrases,
    "postid": post_ids,
    "type_of_narr": typ_narr,
}
noun_phrases = pd.DataFrame(noun_phrase_columns)

In [45]:
# Preview the first five noun chunks.
noun_phrases.head(n=5)

Unnamed: 0,noun_phrase,postid,type_of_narr
0,(We),1,Being
1,"(identity, politics)",1,Being
2,(We),1,Being
3,"(a, gay, couple)",1,Being
4,(who),1,Being


In [46]:
def noun_verb_pairs(doc):
    """Return one (subject, root, direct object) triple per ROOT token in ``doc``.

    For every token whose dependency label is ``'ROOT'`` the triple holds the
    first child labelled ``'nsubj'``, the root token itself, and the first
    child labelled ``'dobj'``. A slot is ``None`` when no such child exists.
    """
    return [
        (child(root, 'nsubj'), root, child(root, 'dobj'))
        for root in doc
        if root.dep_ == 'ROOT'
    ]

def child(tok, dep): # helper function
    """Return the first child of ``tok`` whose ``dep_`` equals ``dep``, or ``None``."""
    matches = (kid for kid in tok.children if kid.dep_ == dep)
    return next(matches, None)

# Try the helper on one narrative. Select the doc explicitly rather than
# relying on a `doc` loop variable left over from an earlier cell, which only
# exists if that cell happened to run last.
# NOTE(review): the last narrative is used on the assumption that this is the
# doc the leftover variable pointed at -- confirm against the recorded output.
noun_verb_pairs(narratives['parsed_text'].iloc[-1])

[(I, voted, Dem),
 (I, explained, None),
 (He, told, me),
 (I, did, None),
 (her, say, None),
 (I, am, None),
 (one, tells, me),
 (I, am, None),
 (Liars, cheats, None)]

In [59]:
# Split the narratives by type using the 1.0 flags in the Being / Becoming columns.
is_being = narratives['Being'].eq(1.0)
is_becoming = narratives['Becoming'].eq(1.0)
being = narratives.loc[is_being]
becoming = narratives.loc[is_becoming]

In [60]:
# One list of (subject, root, object) triples per narrative, kept separately
# for the two narrative types.
being_nvpairs = [noun_verb_pairs(parsed) for parsed in being['parsed_text']]

becoming_nvpairs = [noun_verb_pairs(parsed) for parsed in becoming['parsed_text']]

In [61]:
# Wrap the per-narrative triple lists in a frame labelled "Being", then
# explode so that every triple gets its own row.
being_frame = pd.DataFrame({"nvpairs": being_nvpairs, "type": "Being"})
being_nvpairs = being_frame.explode('nvpairs')

In [62]:
# Wrap the per-narrative triple lists in a frame labelled "Becoming", then
# explode so that every triple gets its own row.
becoming_frame = pd.DataFrame({"nvpairs": becoming_nvpairs, "type": "Becoming"})
becoming_nvpairs = becoming_frame.explode('nvpairs')

In [63]:
# Stack the two tables. DataFrame.append was deprecated in pandas 1.4 and
# removed in 2.0; pd.concat gives the same result and, like append's default,
# keeps each frame's original index labels.
all_nvpairs = pd.concat([being_nvpairs, becoming_nvpairs])

In [64]:
# Write the combined noun-verb triples and the noun-chunk table to disk.
all_nvpairs.to_csv(path_or_buf="web_nvpairs.csv")
noun_phrases.to_csv(path_or_buf="web_nphrases.csv")