In [1]:
import pandas as pd
import json

data = pd.read_csv("../../assets/dataset/stumbleupon.tsv", sep='\t')
data['title'] = data.boilerplate.map(lambda x: json.loads(x).get('title', ''))
data['body'] = data.boilerplate.map(lambda x: json.loads(x).get('body', ''))
data.head()

Unnamed: 0,url,urlid,boilerplate,alchemy_category,alchemy_category_score,avglinksize,commonlinkratio_1,commonlinkratio_2,commonlinkratio_3,commonlinkratio_4,...,linkwordscore,news_front_page,non_markup_alphanum_characters,numberOfLinks,numwords_in_url,parametrizedLinkRatio,spelling_errors_ratio,label,title,body
0,http://www.bloomberg.com/news/2010-12-23/ibm-p...,4042,"{""title"":""IBM Sees Holographic Calls Air Breat...",business,0.789131,2.055556,0.676471,0.205882,0.047059,0.023529,...,24,0,5424,170,8,0.152941,0.07913,0,IBM Sees Holographic Calls Air Breathing Batte...,A sign stands outside the International Busine...
1,http://www.popsci.com/technology/article/2012-...,8471,"{""title"":""The Fully Electronic Futuristic Star...",recreation,0.574147,3.677966,0.508021,0.28877,0.213904,0.144385,...,40,0,4973,187,9,0.181818,0.125448,1,The Fully Electronic Futuristic Starting Gun T...,And that can be carried on a plane without the...
2,http://www.menshealth.com/health/flu-fighting-...,1164,"{""title"":""Fruits that Fight the Flu fruits tha...",health,0.996526,2.382883,0.562016,0.321705,0.120155,0.042636,...,55,0,2240,258,11,0.166667,0.057613,1,Fruits that Fight the Flu fruits that fight th...,Apples The most popular source of antioxidants...
3,http://www.dumblittleman.com/2007/12/10-foolpr...,6684,"{""title"":""10 Foolproof Tips for Better Sleep ""...",health,0.801248,1.543103,0.4,0.1,0.016667,0.0,...,24,0,2737,120,5,0.041667,0.100858,1,10 Foolproof Tips for Better Sleep,There was a period in my life when I had a lot...
4,http://bleacherreport.com/articles/1205138-the...,9006,"{""title"":""The 50 Coolest Jerseys You Didn t Kn...",sports,0.719157,2.676471,0.5,0.222222,0.123457,0.04321,...,14,0,12032,162,10,0.098765,0.082569,0,The 50 Coolest Jerseys You Didn t Know Existed...,Jersey sales is a curious business Whether you...


## Predicting "Greenness" Of Content

This dataset comes from [stumbleupon](https://www.stumbleupon.com/), a web page recommender.  

A description of the columns is below

FieldName|Type|Description
---------|----|-----------
url|string|Url of the webpage to be classified
title|string|Title of the article
body|string|Body text of article
urlid|integer| StumbleUpon's unique identifier for each url
boilerplate|json|Boilerplate text
alchemy_category|string|Alchemy category (per the publicly available Alchemy API found at www.alchemyapi.com)
alchemy_category_score|double|Alchemy category score (per the publicly available Alchemy API found at www.alchemyapi.com)
avglinksize| double|Average number of words in each link
commonlinkratio_1|double|# of links sharing at least 1 word with 1 other links / # of links
commonlinkratio_2|double|# of links sharing at least 1 word with 2 other links / # of links
commonlinkratio_3|double|# of links sharing at least 1 word with 3 other links / # of links
commonlinkratio_4|double|# of links sharing at least 1 word with 4 other links / # of links
compression_ratio|double|Compression achieved on this page via gzip (measure of redundancy)
embed_ratio|double|Count of number of <embed> usage
frameBased|integer (0 or 1)|A page is frame-based (1) if it has no body markup but have a frameset markup
frameTagRatio|double|Ratio of iframe markups over total number of markups
hasDomainLink|integer (0 or 1)|True (1) if it contains an <a> with an url with domain
html_ratio|double|Ratio of tags vs text in the page
image_ratio|double|Ratio of <img> tags vs text in the page
is_news|integer (0 or 1) | True (1) if StumbleUpon's news classifier determines that this webpage is news
lengthyLinkDomain| integer (0 or 1)|True (1) if at least 3 <a> 's text contains more than 30 alphanumeric characters
linkwordscore|double|Percentage of words on the page that are in hyperlink's text
news_front_page| integer (0 or 1)|True (1) if StumbleUpon's news classifier determines that this webpage is front-page news
non_markup_alphanum_characters|integer| Page's text's number of alphanumeric characters
numberOfLinks|integer Number of <a>|markups
numwords_in_url| double|Number of words in url
parametrizedLinkRatio|double|A link is parametrized if it's url contains parameters or has an attached onClick event
spelling_errors_ratio|double|Ratio of words not found in wiki (considered to be a spelling mistake)
label|integer (0 or 1)|User-determined label. Either evergreen (1) or non-evergreen (0); available for train.tsv only

> ### Let's try extracting some of the text content.
> ### Create a feature for the title containing 'recipe'. Is the % of evegreen websites higher or lower on pages that have recipe in the the title?

In [2]:
# Option 1: Create a function to check for this

def has_recipe(text_in):
    try:
        if 'recipe' in str(text_in).lower():
            return 1
        else:
            return 0
    except: 
        return 0
        
data['recipe'] = data['title'].map(has_recipe)

# Option 2: lambda functions

#data['recipe'] = data['title'].map(lambda t: 1 if 'recipe' in str(t).lower() else 0)


# Option 3: string functions
data['recipe'] = data['title'].str.contains('recipe')

 ### Demo: Use of the Count Vectorizer

In [3]:
titles = data['title'].fillna('')

from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer(max_features = 1000, 
                             ngram_range=(1, 2), 
                             stop_words='english',
                             binary=True)

# Use `fit` to learn the vocabulary of the titles
vectorizer.fit(titles)

# Use `tranform` to generate the sample X word matrix - one column per feature (word or n-grams)
X = vectorizer.transform(titles)

In [4]:
print X.toarray()

[[0 0 0 ..., 0 0 0]
 [0 0 0 ..., 0 0 0]
 [0 0 0 ..., 0 0 0]
 ..., 
 [0 0 0 ..., 0 0 0]
 [0 0 0 ..., 0 0 0]
 [0 0 0 ..., 0 0 0]]


 ### Demo: Build a random forest model to predict evergreeness of a website using the title features

In [5]:
from sklearn.ensemble import RandomForestClassifier

model = RandomForestClassifier(n_estimators = 20)
    
# Use `fit` to learn the vocabulary of the titles
vectorizer.fit(titles)

# Use `tranform` to generate the sample X word matrix - one column per feature (word or n-grams)
X = vectorizer.transform(titles).toarray()
y = data['label']

model.fit(X,y)

from sklearn.cross_validation import cross_val_score

scores = cross_val_score(model, X, y, scoring='roc_auc')
print('CV AUC {}, Average AUC {}'.format(scores, scores.mean()))

CV AUC [ 0.7867134   0.79997266  0.80390543], Average AUC 0.796863830541


### Exercise: Build a random forest model to predict evergreeness of a website using the title features and quantitative features

In [6]:
model = RandomForestClassifier(n_estimators = 20)

# Use `tranform` to generate the sample X word matrix - one column per feature (word or n-grams)
X_text_features = vectorizer.transform(titles)

# Identify the features you want from the original dataset
other_features_columns = ['html_ratio', 'image_ratio']
other_features = data[other_features_columns]

# Stack them horizontally together
# This takes all of the word/n-gram columns and appends on two more columns for `html_ratio` and `image_ratio`
from scipy.sparse import hstack
X = hstack((X_text_features, other_features)).toarray()

scores = cross_val_score(model, X, y, scoring='roc_auc')
print('CV AUC {}, Average AUC {}'.format(scores, scores.mean()))

# What features of these are most important?
model.fit(X, y)

all_feature_names = vectorizer.get_feature_names() + other_features_columns
feature_importances = pd.DataFrame({'Features' : all_feature_names, 'Importance Score': model.feature_importances_})
feature_importances.sort_values('Importance Score', ascending=False)

CV AUC [ 0.78648006  0.80075988  0.79369929], Average AUC 0.793646410527


Unnamed: 0,Features,Importance Score
1000,html_ratio,1.551930e-01
1001,image_ratio,9.498387e-02
712,recipe,3.640567e-02
721,recipes,1.745519e-02
190,chocolate,1.360429e-02
180,chicken,1.282708e-02
342,fashion,9.631961e-03
150,cake,9.234694e-03
828,sports,9.021626e-03
496,kitchen,8.205088e-03


 ### Exercise: Build a random forest model to predict evergreeness of a website using the body features

In [7]:
body_text = data['body'].fillna('')

# Use `fit` to learn the vocabulary
vectorizer.fit(body_text)

# Use `tranform` to generate the sample X word matrix - one column per feature (word or n-grams)
X = vectorizer.transform(body_text).toarray()

scores = cross_val_score(model, X, y, scoring='roc_auc')
print('CV AUC {}, Average AUC {}'.format(scores, scores.mean()))

CV AUC [ 0.83461756  0.85441041  0.83987381], Average AUC 0.842967259397


 ### Exercise: Use `TfIdfVectorizer` instead of `CountVectorizer` - is this an improvement?

In [8]:
titles = data['title'].fillna('')

from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(max_features = 1000, 
                             ngram_range=(1, 2), 
                             stop_words='english')


# Use `fit` to learn the vocabulary
vectorizer.fit(body_text)

# Use `tranform` to generate the sample X word matrix - one column per feature (word or n-grams)
X = vectorizer.transform(body_text).toarray()

scores = cross_val_score(model, X, y, scoring='roc_auc')
print('CV AUC {}, Average AUC {}'.format(scores, scores.mean()))

CV AUC [ 0.84203495  0.85279249  0.84437064], Average AUC 0.846399360324


In [11]:
vectorizer.get_feature_names()

[u'00',
 u'000',
 u'08',
 u'10',
 u'10 minutes',
 u'100',
 u'11',
 u'12',
 u'13',
 u'14',
 u'15',
 u'15 minutes',
 u'16',
 u'17',
 u'18',
 u'19',
 u'20',
 u'20 minutes',
 u'2005',
 u'2006',
 u'2007',
 u'2008',
 u'2009',
 u'2010',
 u'2011',
 u'2012',
 u'21',
 u'22',
 u'23',
 u'24',
 u'25',
 u'26',
 u'27',
 u'28',
 u'29',
 u'30',
 u'30 minutes',
 u'31',
 u'35',
 u'350',
 u'40',
 u'45',
 u'50',
 u'60',
 u'ability',
 u'able',
 u'according',
 u'actually',
 u'ad',
 u'add',
 u'added',
 u'adding',
 u'addition',
 u'additional',
 u'age',
 u'ago',
 u'air',
 u'alcohol',
 u'allow',
 u'amazing',
 u'america',
 u'american',
 u'app',
 u'apple',
 u'apples',
 u'april',
 u'area',
 u'aren',
 u'art',
 u'article',
 u'aside',
 u'ask',
 u'asked',
 u'athletes',
 u'attention',
 u'august',
 u'author',
 u'available',
 u'average',
 u'avocado',
 u'avoid',
 u'away',
 u'awesome',
 u'baby',
 u'background',
 u'bacon',
 u'bad',
 u'bag',
 u'bake',
 u'baked',
 u'baking',
 u'baking powder',
 u'baking sheet',
 u'baking soda'

In [9]:
len(vectorizer.get_feature_names())

1000

In [12]:
type(vectorizer.fit(body_text))

sklearn.feature_extraction.text.TfidfVectorizer

In [13]:
vectorizer.vocabulary_

{u'fit': 342,
 u'return': 739,
 u'august': 75,
 u'hour': 428,
 u'today': 899,
 u'attention': 74,
 u'yellow': 995,
 u'new': 583,
 u'drug': 288,
 u'onion': 608,
 u'seeds': 770,
 u'prepare': 683,
 u'food': 352,
 u'rolls': 746,
 u'cupcakes': 241,
 u'addition': 52,
 u'candy': 149,
 u'mushrooms': 575,
 u'starting': 835,
 u'center': 157,
 u'extract': 313,
 u'care': 152,
 u'based': 99,
 u'thing': 889,
 u'think': 891,
 u'potatoes': 674,
 u'dress': 283,
 u'crust': 237,
 u'style': 858,
 u'problems': 692,
 u'note': 591,
 u'ago': 55,
 u'age': 54,
 u'tried': 914,
 u'actually': 47,
 u'finally': 338,
 u'girl': 380,
 u'living': 505,
 u'different': 266,
 u'lime': 498,
 u'phone': 642,
 u'syrup': 867,
 u'women': 978,
 u'working': 982,
 u'internet': 457,
 u'process': 693,
 u'pumpkin': 704,
 u'filled': 336,
 u'knife': 474,
 u'http www': 433,
 u'shape': 780,
 u'tsp': 918,
 u'holiday': 422,
 u'golden brown': 388,
 u'cancer': 148,
 u'corn': 222,
 u'city': 180,
 u'follow': 350,
 u'prepared': 684,
 u'vegetables'