In [58]:
#https://github.com/CornellNLP/Cornell-Conversational-Analysis-Toolkit/blob/master/examples/text-processing/text_preprocessing_demo.ipynb 

import convokit
from convokit import Corpus, download
import os

In [60]:
# Directory where the processed corpus is written and later reloaded from.
# It is derived from the current user's home directory rather than a
# hard-coded /Users/<name> path, and it is deliberately NOT the
# ~/.convokit/downloads/subreddit-mentalhealth cache: dumping the truncated
# corpus into that directory would overwrite the dataset download() reuses.
OUT_DIR = os.path.join(os.path.expanduser('~'), '.convokit', 'saved-corpora',
                       'subreddit-mentalhealth')
# The dump cell passes the parent of OUT_DIR as base_path, so create the whole
# path up front instead of relying on it already existing.
os.makedirs(OUT_DIR, exist_ok=True)

In [27]:
# Fetch the dataset (download() reports when a local copy already exists) and
# build a corpus from it, passing utterance_end_index=199 to keep the demo small.
ROOT_DIR = download('subreddit-mentalhealth')
corpus = Corpus(ROOT_DIR, utterance_end_index=199)

Dataset already exists at /Users/Emilie/.convokit/downloads/subreddit-mentalhealth


In [28]:
# Print the corpus's summary statistics as a quick check on what was loaded.
corpus.print_summary_stats()

Number of Users: 120
Number of Utterances: 200
Number of Conversations: 200


In [45]:
# Use the utterance at position 3 of the ID list as the running example for
# the rest of the notebook, and display its metadata.
test_utt_id = corpus.get_utterance_ids()[3]
utt = corpus.get_utterance(test_utt_id)
utt.meta

{'score': 1,
 'top_level_comment': None,
 'retrieved_on': -1,
 'gilded': -1,
 'gildings': None,
 'subreddit': 'mentalhealth',
 'stickied': False,
 'permalink': '/r/mentalhealth/comments/o30h3/anyone_else_ever_experience_limerence/',
 'author_flair_text': ''}

In [46]:
from convokit.text_processing import TextProcessor

In [47]:
def preprocess_text(text):
    """Return `text` with every ' -- ' (space, two hyphens, space) collapsed to one space.

    Hyphen pairs that are not surrounded by spaces on both sides are left as-is.
    """
    pieces = text.split(' -- ')
    return ' '.join(pieces)

In [48]:
# Wrap preprocess_text in a TextProcessor whose output field is 'clean_text',
# then run it over the corpus.
prep = TextProcessor(
    output_field='clean_text',
    proc_fn=preprocess_text,
)
corpus = prep.transform(corpus)

In [49]:
# Show the 'clean_text' field of the example utterance.
utt.get_info('clean_text')

"I have been diagnosed with depression for the last 4 years of my life, recently also been diagnosed with an adjustment disorder. I have been searching and searching for answers and an explanation of what I am going through. \nMy days are filled with obsessive thoughts of my last boyfriend of 2 years. It has been over a year since our breakup and my dreams are nightmares of him and his new girlfriend EVERY NIGHT.\nIt is if I am living in a prison in my own body as the obsessive thoughts about him, what went wrong and things I could of said, are neverending and I would probably say I spend about 80% of my day thinking of him. \n\nI need help and a cure, no doctors have even heard of limerence where I am from. \nI am afraid without help I won't be able to take it any longer. "

In [50]:
# TextParser was used here without ever being imported, which raises NameError
# on a fresh kernel; import it alongside the cell that first needs it.
from convokit.text_processing import TextParser

# Parse the cleaned text rather than the raw utterance text. verbosity=50
# makes the transform report progress every 50 utterances.
parser = TextParser(input_field='clean_text', verbosity=50)

In [51]:
# Run the parser over the corpus and rebind `corpus` to the returned object.
corpus = parser.transform(corpus)


050/200 utterances processed
100/200 utterances processed
150/200 utterances processed
200/200 utterances processed


In [52]:
# Fetch the 'parsed' field of the example utterance for inspection.
test_parse = utt.get_info('parsed')

In [53]:
# Display the first element of the parse (presumably one element per
# sentence — confirm against the ConvoKit TextParser documentation).
test_parse[0]

{'rt': 3,
 'toks': [{'tok': 'I', 'tag': 'PRP', 'dep': 'nsubjpass', 'up': 3, 'dn': []},
  {'tok': 'have', 'tag': 'VBP', 'dep': 'aux', 'up': 3, 'dn': []},
  {'tok': 'been', 'tag': 'VBN', 'dep': 'auxpass', 'up': 3, 'dn': []},
  {'tok': 'diagnosed',
   'tag': 'VBN',
   'dep': 'ROOT',
   'dn': [0, 1, 2, 4, 14, 18, 23]},
  {'tok': 'with', 'tag': 'IN', 'dep': 'prep', 'up': 3, 'dn': [5]},
  {'tok': 'depression', 'tag': 'NN', 'dep': 'pobj', 'up': 4, 'dn': [6]},
  {'tok': 'for', 'tag': 'IN', 'dep': 'prep', 'up': 5, 'dn': [10]},
  {'tok': 'the', 'tag': 'DT', 'dep': 'det', 'up': 10, 'dn': []},
  {'tok': 'last', 'tag': 'JJ', 'dep': 'amod', 'up': 10, 'dn': []},
  {'tok': '4', 'tag': 'CD', 'dep': 'nummod', 'up': 10, 'dn': []},
  {'tok': 'years', 'tag': 'NNS', 'dep': 'pobj', 'up': 6, 'dn': [7, 8, 9, 11]},
  {'tok': 'of', 'tag': 'IN', 'dep': 'prep', 'up': 10, 'dn': [13]},
  {'tok': 'my', 'tag': 'PRP$', 'dep': 'poss', 'up': 13, 'dn': []},
  {'tok': 'life', 'tag': 'NN', 'dep': 'pobj', 'up': 11, 'dn': [12

In [54]:
# Second TextParser, this time with mode='tag', reading 'clean_text' and
# writing to 'tagged'; then show the first tagged entry of the example utterance.
texttagger = TextParser(
    input_field='clean_text',
    output_field='tagged',
    mode='tag',
)
corpus = texttagger.transform(corpus)
utt.get_info('tagged')[0]

{'toks': [{'tok': 'I', 'tag': 'PRP'},
  {'tok': 'have', 'tag': 'VBP'},
  {'tok': 'been', 'tag': 'VBN'},
  {'tok': 'diagnosed', 'tag': 'VBN'},
  {'tok': 'with', 'tag': 'IN'},
  {'tok': 'depression', 'tag': 'NN'},
  {'tok': 'for', 'tag': 'IN'},
  {'tok': 'the', 'tag': 'DT'},
  {'tok': 'last', 'tag': 'JJ'},
  {'tok': '4', 'tag': 'CD'},
  {'tok': 'years', 'tag': 'NNS'},
  {'tok': 'of', 'tag': 'IN'},
  {'tok': 'my', 'tag': 'PRP$'},
  {'tok': 'life', 'tag': 'NN'},
  {'tok': ',', 'tag': ','},
  {'tok': 'recently', 'tag': 'RB'},
  {'tok': 'also', 'tag': 'RB'},
  {'tok': 'been', 'tag': 'VBN'},
  {'tok': 'diagnosed', 'tag': 'VBN'},
  {'tok': 'with', 'tag': 'IN'},
  {'tok': 'an', 'tag': 'DT'},
  {'tok': 'adjustment', 'tag': 'NN'},
  {'tok': 'disorder', 'tag': 'NN'},
  {'tok': '.', 'tag': '.'}]}

In [55]:
# List the example utterance's metadata keys to see which fields the
# transformers above have added.
list(utt.meta.keys())

['score',
 'top_level_comment',
 'retrieved_on',
 'gilded',
 'gildings',
 'subreddit',
 'stickied',
 'permalink',
 'author_flair_text',
 'clean_text',
 'parsed',
 'tagged']

In [61]:
# Save the corpus under OUT_DIR (its basename is the corpus name, its parent
# the base path), leaving the derived utterance fields out of the main dump.
# NOTE(review): make sure OUT_DIR is not the directory download() returned —
# otherwise this presumably overwrites the cached dataset with the truncated corpus.
corpus.dump(os.path.basename(OUT_DIR), base_path=os.path.dirname(OUT_DIR), 
            fields_to_skip={'utterance': ['parsed','tagged','clean_text']})

In [62]:
# Write the 'parsed' and 'tagged' utterance fields to OUT_DIR separately from
# the main dump above. 'clean_text' is not written by either call.
corpus.dump_info('utterance',['parsed','tagged'], dir_name = OUT_DIR)

In [63]:
# Reload the corpus from the directory it was just dumped to.
new_corpus = convokit.Corpus(OUT_DIR)

In [64]:
# Look up the same example utterance in the reloaded corpus.
new_utt = new_corpus.get_utterance(test_utt_id)

In [65]:
# Inspect the reloaded utterance's metadata keys; the fields listed in
# fields_to_skip at dump time are not expected to be present here.
new_utt.meta.keys()

KeysView({'score': 1, 'top_level_comment': None, 'retrieved_on': -1, 'gilded': -1, 'gildings': None, 'subreddit': 'mentalhealth', 'stickied': False, 'permalink': '/r/mentalhealth/comments/o30h3/anyone_else_ever_experience_limerence/', 'author_flair_text': ''})

In [66]:
# Load the separately dumped 'parsed' utterance field into the reloaded corpus.
new_corpus.load_info('utterance',['parsed'])