In [2]:
from convokit import Corpus, download

In [3]:
corpus = Corpus(filename=download('subreddit-Cornell'))

Dataset already exists at /Users/calebchiam/.convokit/downloads/subreddit-Cornell


In [4]:
corpus.print_summary_stats()

Number of Users: 7568
Number of Utterances: 74467
Number of Conversations: 10744


## Bag-of-words prediction of whether an utterance has a positive score

In [5]:
from convokit import BoWClassifier, BoWTransformer

In [6]:
bow_transformer = BoWTransformer(obj_type="utterance")

Initializing default unigram CountVectorizer...


In [7]:
print(next(corpus.iter_utterances()))

Utterance('id': 'nyx4d', 'root': nyx4d, 'reply-to': None, 'user': User('id': reddmau5, 'meta': {'num_posts': 3, 'num_comments': 5}), 'timestamp': 1325452698, 'text': "I was just reading about the Princeton Mic-Check and it's getting [national press](http://www.bloomberg.com/news/2011-12-29/princeton-brews-trouble-for-us-1-percenters-commentary-by-michael-lewis.html).\n\nI want to get a sense of what people felt like around campus. Anything interesting happen? Anything interesting coming up?", 'meta': {'score': 2, 'top_level_comment': None, 'retrieved_on': -1, 'gilded': -1, 'gildings': None, 'subreddit': 'Cornell', 'stickied': False, 'permalink': '/r/Cornell/comments/nyx4d/so_i_was_away_this_past_semester_whats_going_on/', 'author_flair_text': ''})


In [8]:
# Fit the bag-of-words vectorizer on the corpus and annotate its utterances:
# after this call an utterance's meta carries a sparse 'bow_vector' entry
# (compare the utterance printed before and after this cell).
bow_transformer.fit_transform(corpus)

<convokit.model.corpus.Corpus at 0x122447090>

In [9]:
print(next(corpus.iter_utterances()))

Utterance('id': 'nyx4d', 'root': nyx4d, 'reply-to': None, 'user': User('id': reddmau5, 'meta': {'num_posts': 3, 'num_comments': 5}), 'timestamp': 1325452698, 'text': "I was just reading about the Princeton Mic-Check and it's getting [national press](http://www.bloomberg.com/news/2011-12-29/princeton-brews-trouble-for-us-1-percenters-commentary-by-michael-lewis.html).\n\nI want to get a sense of what people felt like around campus. Anything interesting happen? Anything interesting coming up?", 'meta': {'score': 2, 'top_level_comment': None, 'retrieved_on': -1, 'gilded': -1, 'gildings': None, 'subreddit': 'Cornell', 'stickied': False, 'permalink': '/r/Cornell/comments/nyx4d/so_i_was_away_this_past_semester_whats_going_on/', 'author_flair_text': '', 'bow_vector': <1x9340 sparse matrix of type '<class 'numpy.int64'>'
	with 42 stored elements in Compressed Sparse Row format>})


In [10]:
def has_positive_score(utt):
    """Return True when the utterance's 'score' metadata is strictly greater than zero."""
    return utt.meta['score'] > 0


# Utterance-level classifier whose target label is "score > 0".
bow_classifier = BoWClassifier(obj_type="utterance", labeller=has_positive_score)

Initializing default classification model (standard scaled logistic regression)


In [11]:
# Fit the classifier on the corpus and write its outputs back onto the utterances:
# the meta printed in the next cell gains 'prediction' and 'pred_score' entries.
bow_classifier.fit_transform(corpus)

<convokit.model.corpus.Corpus at 0x122447090>

In [12]:
next(corpus.iter_utterances()).meta

{'score': 2,
 'top_level_comment': None,
 'retrieved_on': -1,
 'gilded': -1,
 'gildings': None,
 'subreddit': 'Cornell',
 'stickied': False,
 'permalink': '/r/Cornell/comments/nyx4d/so_i_was_away_this_past_semester_whats_going_on/',
 'author_flair_text': '',
 'bow_vector': <1x9340 sparse matrix of type '<class 'numpy.int64'>'
 	with 42 stored elements in Compressed Sparse Row format>,
 'prediction': True,
 'pred_score': 0.9999998491105527}

In [13]:
next(corpus.iter_utterances()).meta['bow_vector'].toarray()

array([[0, 0, 0, ..., 0, 0, 0]])

In [14]:
bow_classifier.summarize(corpus).head()

Unnamed: 0_level_0,prediction,pred_score
id,Unnamed: 1_level_1,Unnamed: 2_level_1
dhhm9sa,True,1.0
dw553ml,True,1.0
dvzmhdx,True,1.0
dvzpp79,True,1.0
dw0imao,True,1.0


In [15]:
corpus.get_utterance('15enm8').text

'One, just to get this out of the way: I\'m only a sophomore in high school. In spite of this, my high school is one of the top public schools in New Jersey (and to put it bluntly it\'s a very affluent area... although I\'m not necessarily affluent like my classmates). The point of telling you guys that is kids start talking about all these amazing schools they want to go to in like eighth grade, so I know quite a bit about colleges. As stated in the title, I really want to go to Cornell, and I just was hoping that some of you guys and girls on here would be awesome enough to give out some SAT scores, ACT scores (if you took them), and extra curricular activities you guys got/did? My unweighted GPA is a 3.8 (weighted is a 4.2), and my first PSAT was an overall 1900, and from taking that I (not to sound cocky here) *know* that I\'m going to get that score up a *lot*. I\'m in all the highest level classes I can be in, and I\'m looking to take multiple AP courses next year (junior). Do yo

In [16]:
bow_classifier.get_coefs(feature_names=bow_transformer.get_vocabulary()).head()

Unnamed: 0_level_0,coef
feat_name,Unnamed: 1_level_1
hotels,1.270001
hbhs,1.11569
engine,1.109702
involves,1.081836
lincoln,1.071464


In [17]:
bow_classifier.get_coefs(feature_names=bow_transformer.get_vocabulary()).tail()

Unnamed: 0_level_0,coef
feat_name,Unnamed: 1_level_1
mahogany,-0.667785
ignoreme,-0.722992
hilton,-0.742234
binary,-0.764383
creation,-0.784593


In [18]:
y_true, y_pred = bow_classifier.get_y_true_pred(corpus)

In [19]:
bow_classifier.base_accuracy(corpus)

0.9279546644822538

In [20]:
bow_classifier.accuracy(corpus)

0.9491452589737737

In [21]:
print(bow_classifier.classification_report(corpus))

              precision    recall  f1-score   support

       False       0.88      0.34      0.49      5365
        True       0.95      1.00      0.97     69102

    accuracy                           0.95     74467
   macro avg       0.91      0.67      0.73     74467
weighted avg       0.95      0.95      0.94     74467



## Bag-of-words prediction of whether a comment thread doubles in length (10+ utterances) or stays at 5 utterances, based on its first 5 utterances

In [22]:
# Collect the ids of utterances that are themselves top-level comments, i.e. whose
# 'top_level_comment' metadata field equals their own id.
top_level_comment_ids = [
    utterance.id
    for utterance in corpus.iter_utterances()
    if utterance.id == utterance.meta['top_level_comment']
]

In [23]:
corpus.print_summary_stats()

Number of Users: 7568
Number of Utterances: 74467
Number of Conversations: 10744


In [24]:
len(top_level_comment_ids)

32893

In [25]:
threads_corpus = corpus.reindex_conversations(new_convo_roots=top_level_comment_ids)


['c3p1rn8', 'c3oyf4d', 'c3od15i', 'c3ocsyl', 'c3p8bze']


In [26]:
threads_corpus.print_summary_stats()

Number of Users: 6160
Number of Utterances: 63697
Number of Conversations: 32888


In [27]:
# Label every thread by its total number of utterances:
#   10 or more  -> True  (the thread at least doubled past five utterances)
#   exactly 5   -> False (the thread stayed at five utterances)
#   otherwise   -> None  (unlabelled; later cells filter on `is not None`)
for thread in threads_corpus.iter_conversations():
    thread_len = sum(1 for _ in thread.iter_utterances())
    if thread_len >= 10:
        doubles = True
    elif thread_len == 5:
        doubles = False
    else:
        doubles = None
    thread.meta['thread_doubles'] = doubles

In [28]:
def first_five_utterances_text(convo):
    """Join the text of the conversation's first five utterances with single spaces.

    Utterances are taken in the order returned by
    ``convo.get_chronological_utterance_list()``; conversations with fewer than
    five utterances contribute all of them.
    """
    ordered_texts = [utt.text for utt in convo.get_chronological_utterance_list()]
    return " ".join(ordered_texts[:5])


# Conversation-level bag-of-words built only from each thread's opening five utterances.
bow_transformer2 = BoWTransformer(obj_type="conversation", text_func=first_five_utterances_text)

Initializing default unigram CountVectorizer...


In [29]:
bow_transformer2.fit_transform(threads_corpus, selector=lambda convo: convo.meta['thread_doubles'] is not None)

<convokit.model.corpus.Corpus at 0x12b12ee90>

In [30]:
# Conversation-level classifier; the target label is the 'thread_doubles' flag
# stored in each thread's metadata.
bow_classifier2 = BoWClassifier(
    obj_type="conversation",
    labeller=lambda thread: thread.meta['thread_doubles'],
)

Initializing default classification model (standard scaled logistic regression)


In [31]:
bow_classifier2.fit_transform(threads_corpus, selector=lambda convo: convo.meta['thread_doubles'] is not None)

<convokit.model.corpus.Corpus at 0x12b12ee90>

In [32]:
summary = bow_classifier2.summarize(threads_corpus, selector=lambda convo: convo.meta['thread_doubles'] is not None)

In [33]:
summary.head()

Unnamed: 0_level_0,prediction,pred_score
id,Unnamed: 1_level_1,Unnamed: 2_level_1
dt05qyf,True,1.0
dandio0,True,1.0
dwa6k96,True,1.0
dsldpxg,True,1.0
e70wjy3,True,1.0


In [34]:
summary.tail()

Unnamed: 0_level_0,prediction,pred_score
id,Unnamed: 1_level_1,Unnamed: 2_level_1
drduxx1,False,2.501712e-12
dl7q7n2,False,8.554611e-14
dxfib8r,False,2.785982e-15
dwqaa06,False,2.784342e-16
d8y9akn,False,2.180319e-16


In [35]:
bow_classifier2.base_accuracy(threads_corpus, selector=lambda convo: convo.meta['thread_doubles'] is not None)

0.6761904761904762

In [36]:
# NOTE: this scores the classifier on the same threads it was fitted on (same corpus and
# selector as the bow_classifier2.fit_transform call above), so it is training accuracy.
# The cross-validated and train/test-split evaluations further down give held-out estimates.
bow_classifier2.accuracy(threads_corpus, selector=lambda convo: convo.meta['thread_doubles'] is not None)

0.9992063492063492

In [37]:
print(bow_classifier2.classification_report(threads_corpus, selector=lambda convo: convo.meta['thread_doubles'] is not None))

              precision    recall  f1-score   support

       False       1.00      1.00      1.00       852
        True       1.00      1.00      1.00       408

    accuracy                           1.00      1260
   macro avg       1.00      1.00      1.00      1260
weighted avg       1.00      1.00      1.00      1260



In [38]:
bow_classifier2.get_coefs(feature_names=bow_transformer2.get_vocabulary()).head(10)

Unnamed: 0_level_0,coef
feat_name,Unnamed: 1_level_1
nothing,0.611774
house,0.513271
common,0.508619
stop,0.481715
saw,0.463012
removed,0.456118
media,0.441348
profile,0.417414
500,0.414128
gonna,0.390282


In [39]:
bow_classifier2.get_coefs(feature_names=bow_transformer2.get_vocabulary()).tail(10)

Unnamed: 0_level_0,coef
feat_name,Unnamed: 1_level_1
hill,-0.371515
added,-0.383505
tried,-0.383836
nice,-0.387033
70,-0.403688
god,-0.416929
joke,-0.419514
goes,-0.427046
enjoy,-0.492197
thanks,-0.659167


In [40]:
bow_classifier2.confusion_matrix(threads_corpus, selector=lambda convo: convo.meta['thread_doubles'] is not None)

array([[852,   0],
       [  1, 407]])

In [41]:
import pandas as pd

In [42]:
bow_classifier2.evaluate_with_cv(threads_corpus, selector=lambda convo: convo.meta['thread_doubles'] is not None)

Using corpus objects...

Running a cross-validated evaluation...
Done.


array([0.60714286, 0.63888889, 0.61904762, 0.65079365, 0.6468254 ])

In [43]:
bow_classifier2.evaluate_with_train_test_split(threads_corpus, selector=lambda convo: convo.meta['thread_doubles'] is not None)

Using corpus objects...

Running a train-test-split evaluation...
Done.


(0.623015873015873, array([[120,  50],
        [ 45,  37]]))

In [44]:
objs = list(threads_corpus.iter_conversations(selector=lambda convo: convo.meta['thread_doubles'] is not None))

In [45]:
bow_classifier2.transform_objs(objs)

Using input list of corpus objects...



[Conversation({'obj_type': 'conversation', '_owner': <convokit.model.corpus.Corpus object at 0x12b12ee90>, 'meta': {'original_convo_meta': {'title': 'Oceanography Final?', 'num_comments': 5, 'domain': 'self.Cornell', 'timestamp': 1540858930, 'subreddit': 'Cornell', 'gilded': 0, 'gildings': {'gid_1': 0, 'gid_2': 0, 'gid_3': 0}, 'stickied': False, 'author_flair_text': ''}, 'original_convo_id': '9siwkw', 'thread_doubles': False, 'bow_vector': <1x2052 sparse matrix of type '<class 'numpy.int64'>'
 	with 57 stored elements in Compressed Sparse Row format>, 'prediction': False, 'pred_score': 0.004799615842620171}, '_id': 'e8p8j7u', '_utterance_ids': ['e8p8j7u', 'e8pb34f', 'e8pc4zs', 'e8pcsap', 'e8pei12'], '_user_ids': None, 'tree': None}),
 Conversation({'obj_type': 'conversation', '_owner': <convokit.model.corpus.Corpus object at 0x12b12ee90>, 'meta': {'original_convo_meta': {'title': 'CS 3rd sem scheduling advice', 'num_comments': 13, 'domain': 'self.Cornell', 'timestamp': 1492398699, 'sub

In [49]:
bow_classifier2.summarize_objs(objs)

Unnamed: 0_level_0,prediction,pred_score
id,Unnamed: 1_level_1,Unnamed: 2_level_1
d8y9akn,False,1.639134e-17
dwqaa06,False,1.755115e-14
dl7q7n2,False,2.518259e-13
e6m7j9z,False,1.613894e-12
dxfib8r,False,1.766981e-12
c4drajn,False,1.830577e-12
drduxx1,False,7.892001e-11
djxk078,False,1.663868e-10
e23mztk,False,4.224788e-10
cyeq0e8,False,4.840265e-10


## Paired bag-of-words prediction of whether a comment thread doubles in length (10+ utterances) or stays at 5 utterances, based on its first 5 utterances

In [None]:
from convokit import Pairer, PairedBoW

In [None]:
corpus = Corpus(filename=download('subreddit-Cornell'))

In [None]:
# Depends on `top_level_comment_ids`, which is computed in the previous section from the
# first load of this same dataset; that cell must have been run before this one.
# Unlike the earlier reindexing call, this one passes preserve_convo_meta=True.
threads_corpus = corpus.reindex_conversations(new_convo_roots=top_level_comment_ids, preserve_convo_meta=True)

In [None]:
next(threads_corpus.iter_conversations())

In [None]:
# Flag each thread by its total utterance count: exactly 5 -> False (stayed the same),
# 10 or more -> True (doubled), any other length -> None (left unlabelled).
for thread in threads_corpus.iter_conversations():
    thread_len = sum(1 for _ in thread.iter_utterances())
    thread.meta['thread_doubles'] = (
        False if thread_len == 5
        else True if thread_len >= 10
        else None
    )

In [None]:
next(threads_corpus.iter_conversations()).meta

In [None]:
# Configure the pairing of threads:
#   - positive objects are threads flagged thread_doubles=True,
#   - negative objects are threads flagged thread_doubles=False,
#   - pairing_func keys each thread by 'original_convo_id', the id of the conversation it
#     was reindexed from (presumably Pairer matches positives and negatives that share
#     this key -- confirm against the ConvoKit Pairer documentation).
# Identity checks (`is True` / `is False`) replace the raw value and `== False`, so both
# label functions always return a real bool and unlabelled threads (None) are in neither class.
pairer = Pairer(obj_type="conversation",
                pos_label_func=lambda convo: convo.meta['thread_doubles'] is True,
                neg_label_func=lambda convo: convo.meta['thread_doubles'] is False,
                pairing_func=lambda convo: convo.meta['original_convo_id']
               )

In [None]:
pairer.transform(threads_corpus, selector=lambda convo: convo.meta['thread_doubles'] is not None)

In [None]:
print(next(threads_corpus.iter_conversations()))

In [None]:
# Show the first conversation whose 'pair_id' metadata is not None; print nothing
# if no conversation qualifies.
first_paired = next(
    (thread for thread in threads_corpus.iter_conversations()
     if thread.meta['pair_id'] is not None),
    None,
)
if first_paired is not None:
    print(first_paired)

In [None]:
from convokit import BoWTransformer

In [None]:
# NOTE: this rebinds `bow_transformer`, which earlier in the notebook referred to the
# utterance-level transformer, to a new conversation-level transformer.
# text_func builds each conversation's text by space-joining the text of the first five
# utterances returned by get_chronological_utterance_list().
bow_transformer = BoWTransformer(obj_type="conversation", 
                                 text_func=lambda convo: " ".join([utt.text for utt in convo.get_chronological_utterance_list()][:5]),
                    )

In [None]:
bow_transformer.fit_transform(threads_corpus, selector=lambda convo: convo.meta['pair_id'] is not None)

In [None]:
paired_bow = PairedBoW(obj_type="conversation")

In [None]:
paired_bow.fit(threads_corpus)

In [None]:
paired_bow.summarize(threads_corpus)

In [None]:
paired_bow.get_coefs(feature_names=bow_transformer.get_vocabulary())