In [1]:
import json

import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sb
%matplotlib inline

from sklearn.cross_validation import train_test_split
from sklearn.cross_validation import cross_val_score
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.grid_search import GridSearchCV
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.metrics import auc, roc_curve


# Raise pandas' display limits: show up to 50 columns and up to 300 characters per cell.
pd.set_option('display.max_columns', 50)
pd.set_option('display.max_colwidth', 300)

In [2]:
# generic curve plotting function
def auc_plotting_function(rate1, rate2, rate1_name, rate2_name, curve_name):
    """Plot ``rate2`` against ``rate1`` and report the area under the curve.

    Parameters
    ----------
    rate1 : array-like
        x-axis values of the curve (passed as the first argument to ``auc``).
    rate2 : array-like
        y-axis values of the curve (passed as the second argument to ``auc``).
    rate1_name : str
        Label for the x axis.
    rate2_name : str
        Label for the y axis.
    curve_name : str
        Name of the curve; used in the legend entry and in the figure title.

    Returns
    -------
    None
        The figure is rendered with ``plt.show()``.
    """
    area = auc(rate1, rate2)

    plt.figure(figsize=[11, 9])
    # The curve itself, with its area shown in the legend entry.
    plt.plot(rate1, rate2, label=curve_name + ' (area = %0.2f)' % area, linewidth=4)
    # Dashed diagonal from (0, 0) to (1, 1) drawn as a visual reference line.
    plt.plot([0, 1], [0, 1], 'k--', linewidth=4)
    plt.xlim([0.0, 1.0])
    # Slight headroom above 1.0 so a curve touching the top is not clipped.
    plt.ylim([0.0, 1.05])
    plt.xlabel(rate1_name, fontsize=18)
    plt.ylabel(rate2_name, fontsize=18)
    plt.title(curve_name + ' for greenness', fontsize=18)
    plt.legend(loc="lower right")
    plt.show()

# plot receiver operating characteristic (ROC) curve
def plot_roc(y_true, y_score):
    """Compute the ROC curve for the given labels and scores, then plot it.

    Parameters
    ----------
    y_true : array-like
        True labels, forwarded unchanged to ``roc_curve``.
    y_score : array-like
        Predicted scores, forwarded unchanged to ``roc_curve``.

    Returns
    -------
    None
        Plotting is delegated to ``auc_plotting_function``.
    """
    # roc_curve also returns the thresholds, which are not needed for the plot.
    false_positive_rate, true_positive_rate, _ = roc_curve(y_true, y_score)
    auc_plotting_function(
        false_positive_rate,
        true_positive_rate,
        'False Positive Rate',
        'True Positive Rate',
        'ROC',
    )

## Predicting "Greenness" Of Content

This dataset comes from [StumbleUpon](https://www.stumbleupon.com/), a web page recommender, and was made available [here](https://www.kaggle.com/c/stumbleupon/download/train.tsv).

A description of the columns is below

FieldName|Type|Description
---------|----|-----------
url|string|Url of the webpage to be classified
urlid|integer| StumbleUpon's unique identifier for each url
boilerplate|json|Boilerplate text
alchemy_category|string|Alchemy category (per the publicly available Alchemy API found at www.alchemyapi.com)
alchemy_category_score|double|Alchemy category score (per the publicly available Alchemy API found at www.alchemyapi.com)
avglinksize| double|Average number of words in each link
commonLinkRatio_1|double|# of links sharing at least 1 word with 1 other links / # of links
commonLinkRatio_2|double|# of links sharing at least 1 word with 2 other links / # of links
commonLinkRatio_3|double|# of links sharing at least 1 word with 3 other links / # of links
commonLinkRatio_4|double|# of links sharing at least 1 word with 4 other links / # of links
compression_ratio|double|Compression achieved on this page via gzip (measure of redundancy)
embed_ratio|double|Count of number of <embed> usage
frameBased|integer (0 or 1)|A page is frame-based (1) if it has no body markup but has a frameset markup
frameTagRatio|double|Ratio of iframe markups over total number of markups
hasDomainLink|integer (0 or 1)|True (1) if it contains an <a> with an url with domain
html_ratio|double|Ratio of tags vs text in the page
image_ratio|double|Ratio of <img> tags vs text in the page
is_news|integer (0 or 1) | True (1) if StumbleUpon's news classifier determines that this webpage is news
lengthyLinkDomain| integer (0 or 1)|True (1) if at least 3 <a> 's text contains more than 30 alphanumeric characters
linkwordscore|double|Percentage of words on the page that are in hyperlink's text
news_front_page| integer (0 or 1)|True (1) if StumbleUpon's news classifier determines that this webpage is front-page news
non_markup_alphanum_characters|integer| Page's text's number of alphanumeric characters
numberOfLinks|integer|Number of <a> markups
numwords_in_url| double|Number of words in url
parametrizedLinkRatio|double|A link is parametrized if its url contains parameters or has an attached onClick event
spelling_errors_ratio|double|Ratio of words not found in wiki (considered to be a spelling mistake)
label|integer (0 or 1)|User-determined label. Either evergreen (1) or non-evergreen (0); available for train.tsv only

### What are 'evergreen' sites?
- These are websites that are always relevant, like recipes or reviews (as opposed to current events)
- Look at some examples

In [2]:
# Read the tab-separated training file; '?' marks missing values in this dataset.
data = pd.read_csv('../../assets/datasets/train.tsv', sep='\t', na_values='?')

# Decode each boilerplate JSON document once, then pull the title and body out of it.
# A missing key falls back to '' and any remaining null is replaced by '' as well.
boilerplate_docs = data['boilerplate'].map(json.loads)
data['title'] = boilerplate_docs.map(lambda doc: doc.get('title', '')).fillna('')
data['body'] = boilerplate_docs.map(lambda doc: doc.get('body', '')).fillna('')

In [3]:
# Preview the first rows of the extracted title next to the evergreen label.
data.loc[:, ['title', 'label']].head()

Unnamed: 0,title,label
0,"IBM Sees Holographic Calls Air Breathing Batteries ibm sees holographic calls, air-breathing batteries",0
1,"The Fully Electronic Futuristic Starting Gun That Eliminates Advantages in Races the fully electronic, futuristic starting gun that eliminates advantages in races the fully electronic, futuristic starting gun that eliminates advantages in races",1
2,Fruits that Fight the Flu fruits that fight the flu | cold & flu | men's health,1
3,10 Foolproof Tips for Better Sleep,1
4,The 50 Coolest Jerseys You Didn t Know Existed coolest jerseys you haven't seen,0


#### In previous lessons, we added text features manually as below 

In [4]:
# One boolean column per keyword: True when the lower-cased title contains that keyword.
for keyword in ('recipe', 'electronic', 'tips'):
    data[keyword] = data['title'].str.lower().str.contains(keyword)

#### We can build a Logistic Regression model using scikit-learn and examine the coefficients
- Examine the coefficients using the `examine_coefficients` function provided

In [5]:
def examine_coefficients(model, df):
    """Tabulate the non-zero coefficients of a fitted model, smallest first.

    Parameters
    ----------
    model : object
        Fitted estimator exposing ``coef_``; only its first row is used.
    df : pandas.DataFrame
        Feature frame whose column names label the coefficients.

    Returns
    -------
    pandas.DataFrame
        Columns ``Coefficient`` and ``Feature``, sorted ascending by
        coefficient, with rows whose coefficient is exactly 0 removed.
    """
    coef_table = pd.DataFrame({
        'Coefficient': model.coef_[0],
        'Feature': df.columns,
    })
    coef_table = coef_table.sort_values(by='Coefficient')
    is_nonzero = coef_table['Coefficient'] != 0
    return coef_table[is_nonzero]

In [6]:
# Design matrix: the three hand-built keyword indicator columns; target: the evergreen label.
keyword_columns = ['recipe', 'electronic', 'tips']
X = data[keyword_columns]
y = data['label']

# Fitting learns one coefficient per keyword feature.
model = LogisticRegression()
model.fit(X, y)

examine_coefficients(model, X)

Unnamed: 0,Coefficient,Feature
1,-0.441721,electronic
2,0.62025,tips
0,2.482849,recipe


#### We can build text features in bulk as well using built-in preprocessing tools
- `CountVectorizer` builds a feature per word automatically as we did manually for `recipe`, `electronic` above.

In [8]:
# Build binary bag-of-words features from the page titles.
v = CountVectorizer(
    binary=True,           # 1 if the word occurs in the title at all, otherwise 0
    stop_words='english',  # ignore common words such as 'the', 'and'
    max_features=50,       # keep only the 50 most frequent words
)

# One row per website (data point) and one column per retained word. Because of
# max_features=50 this is the 50 most frequent words, not every word in the dataset.
# .toarray() gives a plain ndarray instead of the numpy.matrix returned by .todense();
# the resulting DataFrame is the same.
X = pd.DataFrame(v.fit_transform(data.title).toarray(), columns=v.get_feature_names())
X.head()

Unnamed: 0,10,2011,allrecipes,apple,baked,best,blog,butter,cake,cheese,chicken,chocolate,com,cookies,cooking,cream,cupcakes,day,easy,fashion,food,foods,free,health,healthy,home,homemade,illustrated,insidershealth,kitchen,life,make,new,news,peanut,photos,pie,recipe,recipes,si,sports,style,sweet,swimsuit,technology,time,tips,video,ways,world
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0
4,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


#### Using the input matrix above, fit a logistic regression model using L1 regularization
- Change the `C` parameter
    - how do the coefficients change? (use `examine_coefficients`)
    - how does the model performance change (using AUC)?

In [11]:
# TODO
# L1-penalised logistic regression with C = 10.
# C is the inverse of the regularization strength, so a large C means little regularization.
# The solver is named explicitly because liblinear supports the L1 penalty.
model = LogisticRegression(penalty='l1', C=10.0, solver='liblinear')
model.fit(X, y)

examine_coefficients(model, X)

Unnamed: 0,Coefficient,Feature
44,-1.887736,technology
19,-1.785701,fashion
33,-1.235435,news
47,-1.140028,video
40,-1.026233,sports
32,-0.737459,new
34,-0.585839,peanut
41,-0.49972,style
35,-0.48889,photos
45,-0.469302,time


In [12]:
# TODO
# L1-penalised logistic regression with C = 15 (even less regularization than C = 10).
# The solver is named explicitly because liblinear supports the L1 penalty.
model = LogisticRegression(penalty='l1', C=15.0, solver='liblinear')
model.fit(X, y)

examine_coefficients(model, X)

# Observation: a larger C pushes each coefficient further in its existing direction,
# so negative coefficients become more negative and positive ones more positive.

Unnamed: 0,Coefficient,Feature
44,-1.891828,technology
19,-1.787065,fashion
33,-1.236469,news
47,-1.140558,video
40,-1.025648,sports
32,-0.738573,new
34,-0.612614,peanut
41,-0.501494,style
35,-0.491518,photos
45,-0.471624,time


In [16]:
# TODO
# L1-penalised logistic regression with C = 5 (more regularization than C = 10 or C = 15).
# The solver is named explicitly because liblinear supports the L1 penalty.
model = LogisticRegression(penalty='l1', C=5.0, solver='liblinear')
model.fit(X, y)

examine_coefficients(model, X)

# Observation: a smaller C moves the coefficients towards 0,
# so positive coefficients go down and negative ones go up.

Unnamed: 0,Coefficient,Feature
44,-1.875476,technology
19,-1.781461,fashion
33,-1.232344,news
47,-1.138243,video
40,-1.024626,sports
32,-0.734059,new
34,-0.50591,peanut
41,-0.494458,style
35,-0.482757,photos
45,-0.46256,time


In [19]:
# Cross-validate the current `model` once per scoring metric and print the mean and per-fold scores.
for metric in ('accuracy', 'precision', 'recall', 'roc_auc'):
    scores = cross_val_score(estimator=model, X=X, y=y, scoring=metric)
    summary = "mean {}: {}, all: {}".format(metric, scores.mean(), scores)
    print(summary)

mean accuracy: 0.70710062067, all: [ 0.69870235  0.7148073   0.70779221]
mean precision: 0.836242815103, all: [ 0.81468111  0.84691358  0.84713376]
mean recall: 0.534246441335, all: [ 0.53475513  0.54229249  0.5256917 ]
mean roc_auc: 0.766261216891, all: [ 0.7640732   0.77030764  0.76440281]


#### Using the input matrix above, fit a logistic regression model using L2 regularization
- Change the `C` parameter - how do the coefficients change? (use `examine_coefficients`)

In [13]:
# TODO
# l2 - ridge regression
# L2-penalised (ridge-style) logistic regression with C = 10.
# C is the inverse of the regularization strength, so a large C means little regularization.
model = LogisticRegression(C=10.0, penalty='l2')
model.fit(X, y)

examine_coefficients(model, X)

Unnamed: 0,Coefficient,Feature
44,-1.876833,technology
19,-1.780735,fashion
33,-1.23561,news
47,-1.139417,video
40,-1.025387,sports
32,-0.73846,new
34,-0.542309,peanut
41,-0.502143,style
35,-0.492131,photos
45,-0.473177,time


In [14]:
# TODO
# l2 - ridge regression
# L2-penalised (ridge-style) logistic regression with C = 15 (less regularization than C = 10).
model = LogisticRegression(C=15.0, penalty='l2')
model.fit(X, y)

examine_coefficients(model, X)

# Observation: as with L1 (lasso), the coefficients grow slightly in their existing direction;
# negative ones get more negative and positive ones get more positive.

Unnamed: 0,Coefficient,Feature
44,-1.884403,technology
19,-1.78388,fashion
33,-1.236595,news
47,-1.140081,video
40,-1.026023,sports
32,-0.739248,new
34,-0.581288,peanut
41,-0.503189,style
35,-0.493357,photos
45,-0.474226,time


In [15]:
# TODO
# l2 - ridge regression
# L2-penalised (ridge-style) logistic regression with C = 5 (more regularization than C = 10 or C = 15).
model = LogisticRegression(C=5.0, penalty='l2')
model.fit(X, y)

examine_coefficients(model, X)

# Observation: decreasing C shrinks the coefficients towards 0;
# positive ones go down and negative ones go up.

Unnamed: 0,Coefficient,Feature
44,-1.854061,technology
19,-1.771559,fashion
33,-1.232353,news
47,-1.136993,video
40,-1.023179,sports
32,-0.736291,new
41,-0.499346,style
35,-0.489025,photos
45,-0.47049,time
1,-0.466904,2011


In [20]:
# Cross-validate the current `model` once per scoring metric and print the mean and per-fold scores.
for metric in ('accuracy', 'precision', 'recall', 'roc_auc'):
    scores = cross_val_score(estimator=model, X=X, y=y, scoring=metric)
    summary = "mean {}: {}, all: {}".format(metric, scores.mean(), scores)
    print(summary)

mean accuracy: 0.70710062067, all: [ 0.69870235  0.7148073   0.70779221]
mean precision: 0.836242815103, all: [ 0.81468111  0.84691358  0.84713376]
mean recall: 0.534246441335, all: [ 0.53475513  0.54229249  0.5256917 ]
mean roc_auc: 0.7662585841, all: [ 0.76406464  0.7703083   0.76440281]
