In [1]:
# Reference notebook: https://github.com/CornellNLP/Cornell-Conversational-Analysis-Toolkit/blob/master/examples/text-processing/text_preprocessing_demo.ipynb

import convokit
from convokit import Corpus, download
import os
from convokit.text_processing import TextParser

In [2]:
# Directory the processed corpus is written to and later reloaded from.
# Built from the current user's home directory so the notebook is not tied to
# a single machine's absolute path.
# NOTE(review): this is the same location that download('subreddit-depressed')
# reports as already holding the dataset, so the later dump() call writes into
# the downloaded dataset's folder — confirm that overwriting it is intended.
OUT_DIR = os.path.join(os.path.expanduser('~'), '.convokit', 'downloads', 'subreddit-depressed')

In [3]:
# Disabled alternative (different subreddit), kept for reference:
# c = Corpus(filename=download("subreddit-mentalhealth"))

# Resolve the local path of the 'subreddit-depressed' dataset, then load it.
# utterance_end_index=199 keeps the demo small: the summary printed in the
# next cell reports 200 utterances.
ROOT_DIR = download('subreddit-depressed')
corpus = convokit.Corpus(ROOT_DIR, utterance_end_index=199)

Dataset already exists at /Users/Emilie/.convokit/downloads/subreddit-depressed


In [4]:
# Sanity check: print the user / utterance / conversation counts of the loaded corpus.
corpus.print_summary_stats()

Number of Users: 120
Number of Utterances: 200
Number of Conversations: 200


In [5]:
# Pick one utterance (the fourth ID the corpus returns) to follow through
# every processing step in the rest of the notebook.
test_utt_id = corpus.get_utterance_ids()[3]
utt = corpus.get_utterance(test_utt_id)
# Show its metadata before any derived fields have been added.
utt.meta

{'score': 1,
 'top_level_comment': None,
 'retrieved_on': 1412686878,
 'gilded': 0,
 'gildings': None,
 'subreddit': 'depressed',
 'stickied': False,
 'permalink': '/r/depressed/comments/1dgsab/i_have_6_months_to_live/',
 'author_flair_text': ''}

In [6]:
from convokit.text_processing import TextProcessor

In [7]:
def preprocess_text(text):
    """Collapse every ' -- ' (space, two hyphens, space) in `text` to one space.

    Hyphen pairs that are not surrounded by spaces on both sides are left
    untouched. Returns the resulting string; the input is not modified.
    """
    return text.replace(' -- ', ' ')

In [8]:
# Wrap preprocess_text in a TextProcessor and run it over the corpus; the
# cleaned string is stored under the 'clean_text' field (read back from `utt`
# in the next cell).
prep = TextProcessor(proc_fn=preprocess_text, output_field='clean_text')
corpus = prep.transform(corpus)

In [9]:
# Inspect the cleaned text stored on the sample utterance.
utt.get_info('clean_text')

"I've been given 6 months to live by my doctor?\nI've struggled with testicular cancer for about 2 years now, and I was told by my doctor I only have about 6 months to live. I'm a 17 year old high school junior that doesn't know what to do now, I've struggled with the thought but I have since, learned to accept it. Any advice on what to do?"

In [10]:
# Parser that reads from the 'clean_text' field. No output_field is given here;
# the result is read back from 'parsed' further down. With verbosity=50 the
# transform run reports progress every 50 utterances (see its output).
parser = TextParser(input_field='clean_text', verbosity=50)

In [11]:
# Run the parser over the corpus (progress lines are printed as it goes).
corpus = parser.transform(corpus)


050/200 utterances processed
100/200 utterances processed
150/200 utterances processed
200/200 utterances processed


In [12]:
# Fetch the parse that was stored on the sample utterance.
test_parse = utt.get_info('parsed')

In [13]:
# First element of the parse (here, the first sentence of the post): a dict
# with a root index ('rt') and one entry per token in 'toks', each carrying
# the token text, tag, dependency label, and 'up' / 'dn' index links
# (these look like parent / children token indices — confirm against the
# ConvoKit docs).
test_parse[0]

{'rt': 3,
 'toks': [{'tok': 'I', 'tag': 'PRP', 'dep': 'nsubjpass', 'up': 3, 'dn': []},
  {'tok': "'ve", 'tag': 'VB', 'dep': 'aux', 'up': 3, 'dn': []},
  {'tok': 'been', 'tag': 'VBN', 'dep': 'auxpass', 'up': 3, 'dn': []},
  {'tok': 'given', 'tag': 'VBN', 'dep': 'ROOT', 'dn': [0, 1, 2, 5, 7, 11]},
  {'tok': '6', 'tag': 'CD', 'dep': 'nummod', 'up': 5, 'dn': []},
  {'tok': 'months', 'tag': 'NNS', 'dep': 'dobj', 'up': 3, 'dn': [4]},
  {'tok': 'to', 'tag': 'TO', 'dep': 'aux', 'up': 7, 'dn': []},
  {'tok': 'live', 'tag': 'VB', 'dep': 'xcomp', 'up': 3, 'dn': [6, 8]},
  {'tok': 'by', 'tag': 'IN', 'dep': 'prep', 'up': 7, 'dn': [10]},
  {'tok': 'my', 'tag': 'PRP$', 'dep': 'poss', 'up': 10, 'dn': []},
  {'tok': 'doctor', 'tag': 'NN', 'dep': 'pobj', 'up': 8, 'dn': [9]},
  {'tok': '?', 'tag': '.', 'dep': 'punct', 'up': 3, 'dn': [12]},
  {'tok': '\n', 'tag': '_SP', 'dep': '', 'up': 11, 'dn': []}]}

In [14]:
# Second TextParser, this time in mode='tag' and writing to the 'tagged'
# field. As the displayed result shows, each token entry then holds only the
# token text and its tag, with no dependency information.
texttagger = TextParser(output_field='tagged', input_field='clean_text', mode='tag')
corpus = texttagger.transform(corpus)
utt.get_info('tagged')[0]

{'toks': [{'tok': 'I', 'tag': 'PRP'},
  {'tok': "'ve", 'tag': 'VB'},
  {'tok': 'been', 'tag': 'VBN'},
  {'tok': 'given', 'tag': 'VBN'},
  {'tok': '6', 'tag': 'CD'},
  {'tok': 'months', 'tag': 'NNS'},
  {'tok': 'to', 'tag': 'TO'},
  {'tok': 'live', 'tag': 'VB'},
  {'tok': 'by', 'tag': 'IN'},
  {'tok': 'my', 'tag': 'PRP$'},
  {'tok': 'doctor', 'tag': 'NN'},
  {'tok': '?', 'tag': '.'}]}

In [15]:
# The three derived fields ('clean_text', 'parsed', 'tagged') now sit
# alongside the original metadata keys of the sample utterance.
list(utt.meta.keys())

['score',
 'top_level_comment',
 'retrieved_on',
 'gilded',
 'gildings',
 'subreddit',
 'stickied',
 'permalink',
 'author_flair_text',
 'clean_text',
 'parsed',
 'tagged']

In [16]:
# Write the corpus to OUT_DIR (passed as folder name + parent directory),
# leaving the three derived utterance fields out of the main dump.
corpus.dump(os.path.basename(OUT_DIR), base_path=os.path.dirname(OUT_DIR), 
            fields_to_skip={'utterance': ['parsed','tagged','clean_text']})

In [17]:
# Save the 'parsed' and 'tagged' utterance fields separately into the same
# directory. Note that 'clean_text' is skipped by dump() and is not listed
# here either, so neither call writes it.
corpus.dump_info('utterance',['parsed','tagged'], dir_name = OUT_DIR)

In [18]:
# Reload the corpus from the directory just written. Unlike the initial load,
# no utterance_end_index is passed here.
new_corpus = convokit.Corpus(OUT_DIR)

In [19]:
# Look up the same sample utterance ID in the reloaded corpus.
new_utt = new_corpus.get_utterance(test_utt_id)

In [20]:
# Only the original metadata keys are present after reloading: the derived
# fields were excluded from the main dump via fields_to_skip.
new_utt.meta.keys()

KeysView({'score': 1, 'top_level_comment': None, 'retrieved_on': 1412686878, 'gilded': 0, 'gildings': None, 'subreddit': 'depressed', 'stickied': False, 'permalink': '/r/depressed/comments/1dgsab/i_have_6_months_to_live/', 'author_flair_text': ''})

In [21]:
# Load the separately saved 'parsed' utterance field into the reloaded corpus;
# 'tagged' is not requested here.
new_corpus.load_info('utterance',['parsed'])