In [1]:
# Path of the NDJSON corpus file that this notebook opens and parses.
test_file = "Reddit_MensRights.ndjson"

In [2]:
import ndjson
from collections import defaultdict

In [3]:
# Decode explicitly as UTF-8: without `encoding`, open() falls back to the
# platform's locale encoding, which can mis-decode or fail on non-ASCII text.
with open(test_file, encoding="utf-8") as ndjson_file:
    # One parsed record per line of the file.
    data = ndjson.load(ndjson_file)

In [4]:
# Number of records parsed from the corpus file.
len(data)

719983

In [5]:
# Peek at the first record; the saved output shows a list of token strings.
data[0]

["it's", 'also', 'the', 'perfect', 'defense', '.']

In [37]:
# Inverted index: token -> list of every sentence in `data` that contains it.
# A sentence is listed at most once per token, even if the token repeats.
vocab = defaultdict(list)

for sentence in data:
    # dict.fromkeys() drops repeated tokens while keeping first-occurrence
    # order, so index entries are created in the same order as a manual
    # "seen" set would produce.
    for word in dict.fromkeys(sentence):
        vocab[word].append(sentence)

In [25]:
def pretty_print(l):
    ''' Print a tagged sentence as two column-aligned rows.

        l is an iterable of (word, tag) tuples; a tag of None is shown as
        blank. The first printed line holds the words, the second the tags.
        Each column is as wide as the longer of its word and tag, and is
        followed by a single space.
    '''
    word_cells = []
    tag_cells = []
    for token, label in l:
        label = '' if label is None else label
        column_width = max(len(token), len(label))

        # left-justify both cells to the shared width, then add the separator
        word_cells.append(token.ljust(column_width) + ' ')
        tag_cells.append(label.ljust(column_width) + ' ')

    print(''.join(word_cells))
    print(''.join(tag_cells))

In [38]:
def get_examples(focus_word, other_words=None, focus_bonus=2, example_length = 11, num_examples=2, vocabulary=None):
    ''' Return up to num_examples example sentences for focus_word.

        Candidate sentences are those indexed under focus_word. Each one is
        scored as

            focus_bonus * (occurrences of focus_word)
            + (occurrences of tokens found in other_words)

        and the highest-scoring sentences are returned, best first. Ties keep
        the order in which the sentences appear in the index.

        Parameters
        ----------
        focus_word : the token the examples must contain.
        other_words : optional container of additional tokens of interest
            (anything supporting `in`); None means "no other words".
        focus_bonus : score awarded per occurrence of focus_word.
        example_length : maximum number of tokens per example. Even values
            are bumped up by one so the focus word can sit in the middle.
        num_examples : maximum number of sentences to return.
        vocabulary : optional mapping token -> list of sentences (each a list
            of tokens). Defaults to the notebook-level `vocab` index.

        Returns
        -------
        A list of sentences, each a list of (token, tag) tuples where tag is
        'focus', 'other' or None. If focus_word is not in the vocabulary an
        empty list is returned (no exception is raised).

        A sentence longer than example_length is clipped to a window of
        exactly example_length tokens around the first occurrence of
        focus_word. The window is centred on it where possible and shifted
        inward when the focus word is too close to either end of the sentence.
    '''
    if other_words is None:
        other_words = frozenset()
    if vocabulary is None:
        # fall back to the index built earlier in the notebook
        vocabulary = vocab

    if focus_word not in vocabulary:
        return []

    # make sure example_length is odd so the window has a true centre
    if example_length % 2 == 0:
        example_length += 1
    half_window = example_length // 2

    def score(sentence):
        total = 0
        for word in sentence:
            if word == focus_word:
                total += focus_bonus
            elif word in other_words:
                total += 1
        return total

    def trim(sentence):
        if len(sentence) <= example_length:
            return sentence
        focus_pos = sentence.index(focus_word)
        # Clamp the window start to [0, len - example_length] so the window
        # never runs off either end and always holds example_length tokens.
        start = min(max(focus_pos - half_window, 0), len(sentence) - example_length)
        return sentence[start:start + example_length]

    def tag(word):
        if word == focus_word:
            return (word, 'focus')
        if word in other_words:
            return (word, 'other')
        return (word, None)

    # sorted() is stable, including with reverse=True, so equal scores keep
    # their original relative order.
    ranked = sorted(vocabulary[focus_word], key=score, reverse=True)

    return [[tag(word) for word in trim(sentence)]
            for sentence in ranked[:num_examples]]
            
    

In [39]:
# Fetch up to three example sentences containing 'taco'; sentences longer
# than the requested example length are clipped by get_examples.
s = get_examples('taco', example_length=50, num_examples=3)

# Show each example as aligned word/tag rows, with a blank line between them.
for sentence in s:
    pretty_print(sentence)
    print()


is it racist for him to sell me a taco  or racist for him to refuse to sell me a taco  ? 
                                  focus                                          focus   

it won't , one knotted condom is nothing compared to a taco  bell meal , but even if it did - no problem , that can be your calling card 
                                                       focus                                                                             

wow , now i'm wondering , if you eat a taco  and you're not a mexican national , are you guilty of cultural food appropriation ? 
                                       focus                                                                                     



In [36]:
# NOTE(review): looks like a scratch list for manual experiments. It rebinds
# `s`, the same name used for the get_examples() results — confirm whether
# this cell is still needed.
s = ['a','b','c','d','e','f','g','h']

In [23]:
# Sanity check: a membership test against an empty set evaluates to False.
"b" in set()

False