In [1]:
import re
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# the Naive Bayes model
from sklearn.naive_bayes import MultinomialNB
# function to split the data for cross-validation
from sklearn.model_selection import train_test_split
# function for transforming documents into counts
from sklearn.feature_extraction.text import CountVectorizer
# function for encoding categories
from sklearn.preprocessing import LabelEncoder

In [2]:
from urllib.parse import urlsplit

In [3]:
# grab the data
news = pd.read_csv("./data/hn.csv")

In [4]:
news.head()

Unnamed: 0,id,title,url,author,score
0,7530721,"David ""Debt"" Graeber evicted, claims revenge-h...",http://boingboing.net/2014/04/03/david-debt-gr...,reirob,52
1,7295114,Joint Statement Regarding MtGox,http://blog.coinbase.com/post/77766809700/join...,jordhy,53
2,7799471,HackerChat – A private chat for the HN community,http://hackerchat.co,gianluka,53
3,8067945,Children Exposed To Religion Have Difficulty D...,http://www.huffingtonpost.com/2014/07/21/child...,givan,54
4,6828620,"""Sheep Marketplace was founded and run by Tomá...",http://pastebin.com/raw.php?i=9spTATw6,moyix,55


In [5]:
def normalize_text(s):
    s = s.lower()
    
    # remove punctuation that is not word-internal (e.g., hyphens, apostrophes)
    s = re.sub('\s\W',' ',s)
    s = re.sub('\W\s',' ',s)
    
    # make sure we didn't introduce any double spaces
    s = re.sub('\s+',' ',s)
    
    return s

In [6]:
news['text'] = [normalize_text(s) for s in news['title']]

In [7]:
news['baseurl'] = [urlsplit(s).netloc if type(s) == str else '' for s in news['url']]

In [8]:
from collections import Counter

In [9]:
counturls = Counter(news['baseurl'])
counturls = [(v, k) for k, v in counturls.items()]
counturls.sort(reverse=True)
counturls[0:50]

[(2779, ''),
 (2597, 'github.com'),
 (1894, 'techcrunch.com'),
 (1674, 'www.nytimes.com'),
 (989, 'arstechnica.com'),
 (920, 'www.wired.com'),
 (655, 'medium.com'),
 (610, 'www.washingtonpost.com'),
 (596, 'www.bbc.co.uk'),
 (485, 'www.theatlantic.com'),
 (409, 'www.theguardian.com'),
 (389, 'en.wikipedia.org'),
 (353, 'www.youtube.com'),
 (343, 'www.economist.com'),
 (340, 'www.eff.org'),
 (334, 'thenextweb.com'),
 (331, 'www.bloomberg.com'),
 (318, 'online.wsj.com'),
 (311, 'www.google.com'),
 (311, 'plus.google.com'),
 (309, 'www.forbes.com'),
 (298, 'www.slate.com'),
 (288, 'www.npr.org'),
 (281, 'www.bbc.com'),
 (264, 'www.newyorker.com'),
 (255, 'groups.google.com'),
 (235, 'code.google.com'),
 (228, 'stackoverflow.com'),
 (226, 'venturebeat.com'),
 (216, 'www.reddit.com'),
 (213, 'www.kickstarter.com'),
 (213, 'gigaom.com'),
 (210, 'www.techcrunch.com'),
 (210, 'torrentfreak.com'),
 (208, 'www.reuters.com'),
 (184, 'news.cnet.com'),
 (180, '37signals.com'),
 (177, 'blogs.wsj.com

In [10]:
def categorize(url, title):
    techy = ["github.com",
             'code.google.com',
             'stackoverflow.com',
             'groups.google.com',
             "googleblog.blogspot.com",
             "lwn.net"]
    nottechy = ["www.nytimes.com", 
                "www.washingtonpost.com", 
                "www.youtube.com", 
                "www.bbc.co.uk", 
                "www.bloomberg.com", 
                "www.forbes.com",
                'www.slate.com',
                'www.npr.org',
                'www.bbc.com',
                'www.newyorker.com']
    category = 'unknown'
    if url in techy:
        category = 'tech'
    elif url in nottechy:
        category = 'not_tech'
    if str.startswith(title, "Show HN:") or str.startswith(title, "Ask HN:"):
        category = 'tech'
    return category
news['category'] = [categorize(url, title) for url, title in zip(news['baseurl'], news['title'])]

In [11]:
news[news['category'] == 'unknown'].head()

Unnamed: 0,id,title,url,author,score,text,baseurl,category
0,7530721,"David ""Debt"" Graeber evicted, claims revenge-h...",http://boingboing.net/2014/04/03/david-debt-gr...,reirob,52,david debt graeber evicted claims revenge-hara...,boingboing.net,unknown
1,7295114,Joint Statement Regarding MtGox,http://blog.coinbase.com/post/77766809700/join...,jordhy,53,joint statement regarding mtgox,blog.coinbase.com,unknown
2,7799471,HackerChat – A private chat for the HN community,http://hackerchat.co,gianluka,53,hackerchat a private chat for the hn community,hackerchat.co,unknown
3,8067945,Children Exposed To Religion Have Difficulty D...,http://www.huffingtonpost.com/2014/07/21/child...,givan,54,children exposed to religion have difficulty d...,www.huffingtonpost.com,unknown
4,6828620,"""Sheep Marketplace was founded and run by Tomá...",http://pastebin.com/raw.php?i=9spTATw6,moyix,55,"""sheep marketplace was founded and run by tomá...",pastebin.com,unknown


In [12]:
print(news.size)
tnews= news[news['category'] != 'unknown']
print(tnews.size)

712464
98520


In [13]:
# pull the data into vectors
vectorizer = CountVectorizer()
x = vectorizer.fit_transform(tnews['text'])

encoder = LabelEncoder()
y = encoder.fit_transform(tnews['category'])

# split into train and test sets
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2)

# take a look at the shape of each of these
print(x_train.shape)
print(y_train.shape)
print(x_test.shape)
print(y_test.shape)

(9852, 14329)
(9852,)
(2463, 14329)
(2463,)


In [14]:
nb = MultinomialNB()
nb.fit(x_train, y_train)

MultinomialNB()

In [15]:
nb.score(x_test, y_test)

0.9326025172553796

In [16]:
import pickle
pickle.dump({'vocabulary': vectorizer.vocabulary_, 'model': nb},open("model.pkl","wb"))

In [17]:
def predict(titles):
    mdata = pickle.load(open("model.pkl","rb"))
    vectorizer = CountVectorizer(decode_error="replace",vocabulary=mdata['vocabulary'])
    x = vectorizer.fit_transform(titles)
    return mdata['model'].predict(x)
print(predict(np.array(["Show HN"])))
print(predict(np.array(["Ask HN"])))
print(predict(np.array(["Is Going to the Office a Broken Way of Working?"])))


[1]
[1]
[0]


In [18]:
data = list(zip(news['title'], predict(news['title'])))
for x in data[0:50]:
    print(x)


('David "Debt" Graeber evicted, claims revenge-harassment for OWS participation', 0)
('Joint Statement Regarding MtGox', 0)
('HackerChat – A private chat for the HN community', 1)
('Children Exposed To Religion Have Difficulty Distinguishing Fact From Fiction', 0)
('"Sheep Marketplace was founded and run by Tomáš Jiřikovský"', 0)
('“Happy Birthday” is not in copyright', 0)
('Fleep wants to replace email like mobile phones replaced landlines', 1)
('Chocolatey Kickstarter – Help get Chocolatey to the next level', 1)
('GMail now allows you to send money', 1)
('Did you invent Bitcoin?  Take our simple quiz to find out.', 1)
('OH HAI SEXISM', 1)
('I am not who you think I am', 1)
('Why I Ditched The Cushy VC World To Start Kohort', 0)
('Coolant Crisis – From Agile Teamwork to Lone-Wolf Game Development', 0)
('Intro to Bitcoin [video]', 0)
('The Datacenter as a Computer, Second edition', 0)
('Matchure: pattern matching library for Clojure', 1)
('Anatomy of the Linux virtual file system switc