In [28]:
from __future__ import division
from collections import defaultdict
import random

In [2]:
# (label, number, number) triples.
# NOTE(review): the meaning of the two numeric columns is not stated anywhere
# in the visible cells -- confirm with the author and name them here.
data = [
    ("big data", 100, 15),
    ("Hadoop", 95, 25),
    ("Python", 75, 50),
    ("R", 50, 40),
    ("machine learning", 80, 20),
    ("statistics", 20, 60),
    ("data science", 60, 70),
    ("analytics", 90, 3),
    ("team player", 85, 85),
    ("dynamic", 2, 90),
    ("synergies", 70, 0),
    ("actionable insights", 40, 30),
    ("think out of the box", 45, 10),
    ("self-starter", 30, 50),
    ("customer focus", 65, 15),
    ("thought leadership", 35, 35),
]

In [3]:
# get data

def fix_unicode(text):
    """Return `text` with every U+2019 (right single quotation mark) replaced by "'".

    All other characters are left untouched.
    """
    curly_apostrophe = u"\u2019"
    plain_apostrophe = "'"
    return text.replace(curly_apostrophe, plain_apostrophe)

In [19]:
from bs4 import BeautifulSoup
import requests
import re

# Page whose paragraphs are tokenized into `document` further down.
url = "http://radar.oreilly.com/2010/06/what-is-data-science.html"

# requests applies no timeout by default, so without one a stalled connection
# would hang this cell indefinitely.
response = requests.get(url, timeout=30)
# Stop here on an HTTP 4xx/5xx rather than parsing an error page as if it were
# the article.
response.raise_for_status()

html = response.text
soup = BeautifulSoup(html, 'html5lib')



In [36]:
# The article text is expected inside <div class="a-body">.
content = soup.find("div", "a-body")
if content is None:
    # soup.find returns None when nothing matches; without this check the loop
    # below fails with an opaque "'NoneType' object is not callable".
    raise ValueError('no <div class="a-body"> element found in the downloaded page')

# A token is either a run of word characters/apostrophes or a single period.
regex = r"[\w']+|[\.]"

# Flat list of tokens from every <p> under `content`, in document order.
document = []

for paragraph in content('p'):
    words = re.findall(regex, fix_unicode(paragraph.text))
    document.extend(words)

In [37]:
# Pair each token with the token that immediately follows it.
bigrams = zip(document[:-1], document[1:])

# word -> list of every token observed right after it. Duplicates are kept, so
# a follower seen more often is proportionally more likely to be drawn later.
transitions = defaultdict(list)
for prev, current in bigrams:
    transitions[prev] += [current]

In [39]:
def generate_using_bigrams():
    """Generate one random sentence by walking the module-level `transitions` table.

    The walk starts at "." (so the first word is one that followed a period)
    and repeatedly draws a random follower of the current word until a "." is
    drawn.

    Returns:
        The drawn tokens joined by single spaces. Normally this ends with
        " ."; if the walk reaches a word with no recorded followers the
        sentence is cut off there, and if "." itself has no followers the
        result is "".
    """
    current = "."
    result = []
    while True:
        # .get() instead of [] so that probing an unseen word does not insert
        # an empty list into `transitions` when it is a defaultdict.
        next_word_candidates = transitions.get(current)
        if not next_word_candidates:
            # Dead end (e.g. a word that only occurs as the very last token):
            # random.choice would raise IndexError on an empty sequence.
            break
        current = random.choice(next_word_candidates)  # pick one follower
        result.append(current)
        if current == '.':
            break
    return " ".join(result)

In [43]:
# Draw one sentence from the bigram model. No random seed is set in the visible
# cells, so the text differs from run to run.
generate_using_bigrams()

u'Try using data scientists are data MySpace activity streams online .'

In [44]:
# Every run of three consecutive tokens in the document.
trigrams = zip(document, document[1:], document[2:])

# (word, following word) -> list of every token observed right after that pair.
trigram_transitions = defaultdict(list)

# Tokens that directly follow a "." -- used as sentence-opening words.
starts = []

# The third name is `next_word`, not `next`: a module-level loop variable stays
# bound after the loop, and `next` would shadow the builtin for every later cell.
for prev, current, next_word in trigrams:
    if prev == ".":
        starts.append(current)

    trigram_transitions[(prev, current)].append(next_word)

In [45]:
def generate_using_trigrams():
    """Generate one random sentence from the module-level trigram tables.

    The first word is drawn from `starts`; each later word is drawn from
    `trigram_transitions[(prev, current)]`, i.e. conditioned on the two
    preceding tokens. The walk stops once a "." is drawn.

    Returns:
        The drawn tokens joined by single spaces. Normally this ends with
        " ."; if the walk reaches a pair with no recorded followers the
        sentence is cut off there, and if `starts` is empty the result is "".
    """
    if not starts:
        # Nothing to start from; random.choice would raise IndexError.
        return ""

    current = random.choice(starts)
    prev = "."
    result = [current]

    while True:
        # .get() instead of [] so that probing an unseen pair does not insert
        # an empty list into `trigram_transitions` when it is a defaultdict.
        next_word_candidates = trigram_transitions.get((prev, current))
        if not next_word_candidates:
            # Dead end (e.g. the final pair of the corpus): stop here rather
            # than let random.choice fail on an empty sequence.
            break
        next_word = random.choice(next_word_candidates)

        prev, current = current, next_word
        result.append(current)

        if current == ".":
            break

    return " ".join(result)

In [49]:
# Draw one sentence from the trigram model. No random seed is set in the visible
# cells, so the text differs from run to run.
generate_using_trigrams()

u'Relational databases though neither term is very useful .'

### Grammar

In [50]:
# Production rules: each key is a non-terminal (marked by a leading "_") and
# each value lists its alternative expansions. An expansion is a
# space-separated string of symbols; strings that do not start with "_" are
# terminals.
grammar = {
    "_S": ["_NP _VP"],
    "_NP": ["_N", "_A _NP _P _A _N"],  # second alternative is recursive
    "_VP": ["_V", "_V _NP"],
    "_N": ["data science", "Python", "regression"],
    "_A": ["big", "linear", "logistic"],
    "_P": ["about", "near"],
    "_V": ["learns", "trains", "tests", "is"],
}

In [54]:
def is_terminal(token):
    """Return True when `token` is a terminal, i.e. does not start with "_".

    An empty string counts as a terminal; indexing `token[0]` would raise
    IndexError for it.
    """
    return not token.startswith("_")

def expand(grammar, tokens):
    """Expand every non-terminal in `tokens` until only terminals remain.

    The leftmost non-terminal is always expanded first, using a production
    drawn with random.choice from `grammar[token]`.

    Args:
        grammar: mapping from a non-terminal to a list of production strings.
        tokens: list of symbols to expand. It is not modified; the work is
            done on a copy.

    Returns:
        A new list containing only tokens for which is_terminal() is true.
        A production that is_terminal() accepts is stored as one token even
        if it contains spaces (e.g. "data science"); any other production is
        split on whitespace into separate symbols.

    The loop is iterative, so a long derivation cannot exhaust the recursion
    limit, and symbols already known to be terminal are not rescanned.
    """
    tokens = list(tokens)  # never mutate the caller's list
    i = 0
    while i < len(tokens):
        token = tokens[i]

        # skip over terminals
        if is_terminal(token):
            i += 1
            continue

        # non-terminal: choose a replacement at random
        replacement = random.choice(grammar[token])

        if is_terminal(replacement):
            tokens[i] = replacement
        else:
            tokens[i:i + 1] = replacement.split()

        # `i` is deliberately not advanced: whatever now sits at position i
        # is examined on the next pass, which keeps expansion leftmost-first.

    return tokens

def generate_sentence(grammar):
    """Expand the start symbol "_S" with `grammar` and return expand()'s result."""
    start_symbols = ["_S"]
    return expand(grammar, start_symbols)

In [64]:
# Draw one sentence from the grammar; the result is a list of terminal tokens.
# No random seed is set in the visible cells, so the output differs from run to run.
generate_sentence(grammar)

['Python',
 'trains',
 'logistic',
 'regression',
 'about',
 'logistic',
 'data science']