In [2]:
# libraries
import csv
import pandas as pd
import numpy as np
from random import shuffle
from matplotlib import pyplot as plt
from sklearn.model_selection import train_test_split


## 1. The Dataset: Online News Popularity

#### The Online News Popularity dataset is a classification dataset: it is used for a prediction task where the goal is to determine whether an article posted on Mashable.com was popular or not, based on how many times it was shared online. 
#### The list of attributes is as follows:
- output variable: **shares**: Number of times the article was shared, converted to a binary label (0 if < 1400, 1 if >= 1400)
- input features:      
    - **n_tokens_title**: Number of words in the title; continuous
    - **n_tokens_content**: Number of words in the content; continuous
    - **n_unique_tokens**: Rate of unique words in the content; continuous
    - **n_non_stop_words**: Rate of non-stop words in the content; continuous
    - **n_non_stop_unique_tokens**: Rate of unique non-stop words in the content; continuous                        
    - **num_hrefs**: Number of links; continuous
    - **num_self_hrefs**: Number of links to other articles published by Mashable; continuous
    - **num_imgs**: Number of images; continuous
    - **num_videos**: Number of videos; continuous
    - **average_token_length**: Average length of the words in the content; continuous
    - **num_keywords**: Number of keywords in the metadata; continuous
    - **data_channel**: Which data channel the article belongs to: 'Lifestyle', 'Entertainment', 'Business', 'Social Media', 'Tech', or 'World'; categorical
    - **kw_min_min**: Worst keyword (min. shares); continuous
    - **kw_max_min**: Worst keyword (max. shares); continuous
    - **kw_avg_min**: Worst keyword (avg. shares); continuous
    - **kw_min_max**: Best keyword (min. shares); continuous
    - **kw_max_max**: Best keyword (max. shares); continuous
    - **kw_avg_max**: Best keyword (avg. shares); continuous
    - **kw_min_avg**: Average keyword (min. shares); continuous
    - **kw_max_avg**: Average keyword (max. shares); continuous
    - **kw_avg_avg**: Average keyword (avg. shares); continuous
    - **self_reference_min_shares**: Min. shares of referenced articles on Mashable; continuous
    - **self_reference_max_shares**: Max. shares of referenced articles on Mashable; continuous
    - **self_reference_avg_sharess**: Avg. shares of referenced articles on Mashable; continuous (note: the column name ends in a double "s")
    - **day_of_week**: Which day of the week was the article published on? Mon-Sun; categorical
    - **is_weekend**: Was the article published on the weekend?; binary
    - **LDA_00**: Closeness to LDA topic 0; continuous
    - **LDA_01**: Closeness to LDA topic 1; continuous
    - **LDA_02**: Closeness to LDA topic 2; continuous
    - **LDA_03**: Closeness to LDA topic 3; continuous
    - **LDA_04**: Closeness to LDA topic 4; continuous
    - **global_subjectivity**: Text subjectivity; continuous
    - **global_sentiment_polarity**: Text sentiment polarity; continuous
    - **global_rate_positive_words**: Rate of positive words in the content; continuous
    - **global_rate_negative_words**: Rate of negative words in the content; continuous
    - **rate_positive_words**: Rate of positive words among non-neutral tokens; continuous
    - **rate_negative_words**: Rate of negative words among non-neutral tokens; continuous
    - **avg_positive_polarity**: Avg. polarity of positive words; continuous
    - **min_positive_polarity**: Min. polarity of positive words; continuous
    - **max_positive_polarity**: Max. polarity of positive words; continuous
    - **avg_negative_polarity**: Avg. polarity of negative words; continuous
    - **min_negative_polarity**: Min. polarity of negative words; continuous
    - **max_negative_polarity**: Max. polarity of negative words; continuous
    - **title_subjectivity**: Title subjectivity; continuous
    - **title_sentiment_polarity**: Title polarity; continuous
    - **abs_title_subjectivity**: Absolute subjectivity level; continuous
    - **abs_title_sentiment_polarity**: Absolute polarity level; continuous

In [18]:
# Load the raw table from a CSV file in the current working directory.
# NOTE(review): most column names used below start with a space (e.g. ' shares'),
# presumably mirroring the CSV header as-is — confirm before renaming anything.
df = pd.read_csv("online_news_popularity.csv")

# Columns kept for modelling, in model order. The last entry (' shares') is the
# label; everything before it is a feature. 'data_channel' and 'day_of_week' are
# the two derived categorical columns and are the only names without a leading
# space.
cols = [
    # --- text statistics ---
    ' n_tokens_title',
    ' n_tokens_content',
    ' n_unique_tokens',
    ' n_non_stop_words',
    ' n_non_stop_unique_tokens',
    # --- links and media ---
    ' num_hrefs',
    ' num_self_hrefs',
    ' num_imgs',
    ' num_videos',
    ' average_token_length',
    ' num_keywords',
    # --- channel (derived categorical) ---
    'data_channel',
    # --- keyword share statistics ---
    ' kw_min_min',
    ' kw_max_min',
    ' kw_avg_min',
    ' kw_min_max',
    ' kw_max_max',
    ' kw_avg_max',
    ' kw_min_avg',
    ' kw_max_avg',
    ' kw_avg_avg',
    # --- shares of self-referenced articles ---
    ' self_reference_min_shares',
    ' self_reference_max_shares',
    ' self_reference_avg_sharess',  # double "s" is intentional: do not "fix" without checking the CSV header
    # --- publication day ---
    'day_of_week',
    ' is_weekend',
    # --- LDA topic closeness ---
    ' LDA_00',
    ' LDA_01',
    ' LDA_02',
    ' LDA_03',
    ' LDA_04',
    # --- sentiment of the content ---
    ' global_subjectivity',
    ' global_sentiment_polarity',
    ' global_rate_positive_words',
    ' global_rate_negative_words',
    ' rate_positive_words',
    ' rate_negative_words',
    ' avg_positive_polarity',
    ' min_positive_polarity',
    ' max_positive_polarity',
    ' avg_negative_polarity',
    ' min_negative_polarity',
    ' max_negative_polarity',
    # --- sentiment of the title ---
    ' title_subjectivity',
    ' title_sentiment_polarity',
    ' abs_title_subjectivity',
    ' abs_title_sentiment_polarity',
    # --- label ---
    ' shares',
]

def get_day_of_week(row):
    """Return the weekday name encoded by a row's one-hot weekday flags.

    The Monday-to-Saturday flags are read in calendar order and the first one
    equal to 1 decides the result; later flags are not read once a match is
    found. A row in which none of those six flags equals 1 yields "Sunday".

    Args:
        row: mapping-like object (e.g. a DataFrame row) indexable by the
            ' weekday_is_*' column names.

    Returns:
        str: one of "Monday" ... "Sunday".
    """
    weekday_flags = (
        (' weekday_is_monday', "Monday"),
        (' weekday_is_tuesday', "Tuesday"),
        (' weekday_is_wednesday', "Wednesday"),
        (' weekday_is_thursday', "Thursday"),
        (' weekday_is_friday', "Friday"),
        (' weekday_is_saturday', "Saturday"),
    )
    for flag_column, weekday in weekday_flags:
        if row[flag_column] == 1:
            return weekday
    # No Sunday flag is consulted: Sunday is simply the fallback.
    return "Sunday"
    
def get_channel(row):
    """Return the data-channel name encoded by a row's one-hot channel flags.

    The five flags below are read in order and the first one equal to 1 decides
    the result; later flags are not read once a match is found. Any row in
    which none of the five equals 1 yields "World".

    Args:
        row: mapping-like object (e.g. a DataFrame row) indexable by the
            ' data_channel_is_*' column names.

    Returns:
        str: "Lifestyle", "Entertainment", "Business", "Social Media", "Tech"
        or "World".
    """
    channel_flags = (
        (' data_channel_is_lifestyle', "Lifestyle"),
        (' data_channel_is_entertainment', "Entertainment"),
        (' data_channel_is_bus', "Business"),
        (' data_channel_is_socmed', "Social Media"),
        (' data_channel_is_tech', "Tech"),
    )
    for flag_column, channel in channel_flags:
        if row[flag_column] == 1:
            return channel
    # NOTE(review): "World" is a fallback, not a checked flag, so a row with no
    # channel flag set at all is also labelled "World" — confirm this is intended.
    return "World"

# Collapse the one-hot weekday / channel flags into two categorical columns.
df['day_of_week'] = df.apply(get_day_of_week, axis=1)
df['data_channel'] = df.apply(get_channel, axis=1)

# Keep only the modelling columns, in the order given by `cols`.
df = df[cols]

# Every column except the last is a feature; the last column is the label.
train_cols = df.columns[:-1]
label = df.columns[-1]
X_df = df[train_cols]

# Binarise the label: 0 when the share count is below 1400, otherwise 1.
y_df = df[label].lt(1400).map({True: 0, False: 1})

dataset = dict(X=X_df, y=y_df)

# First five rows of the feature table
X_df.head()

Unnamed: 0,n_tokens_title,n_tokens_content,n_unique_tokens,n_non_stop_words,n_non_stop_unique_tokens,num_hrefs,num_self_hrefs,num_imgs,num_videos,average_token_length,...,avg_positive_polarity,min_positive_polarity,max_positive_polarity,avg_negative_polarity,min_negative_polarity,max_negative_polarity,title_subjectivity,title_sentiment_polarity,abs_title_subjectivity,abs_title_sentiment_polarity
0,12.0,219.0,0.663594,1.0,0.815385,4.0,2.0,1.0,0.0,4.680365,...,0.378636,0.1,0.7,-0.35,-0.6,-0.2,0.5,-0.1875,0.0,0.1875
1,9.0,255.0,0.604743,1.0,0.791946,3.0,1.0,1.0,0.0,4.913725,...,0.286915,0.033333,0.7,-0.11875,-0.125,-0.1,0.0,0.0,0.5,0.0
2,9.0,211.0,0.57513,1.0,0.663866,3.0,1.0,1.0,0.0,4.393365,...,0.495833,0.1,1.0,-0.466667,-0.8,-0.133333,0.0,0.0,0.5,0.0
3,9.0,531.0,0.503788,1.0,0.665635,9.0,0.0,1.0,0.0,4.404896,...,0.385965,0.136364,0.8,-0.369697,-0.6,-0.166667,0.0,0.0,0.5,0.0
4,13.0,1072.0,0.415646,1.0,0.54089,19.0,19.0,20.0,0.0,4.682836,...,0.411127,0.033333,1.0,-0.220192,-0.5,-0.05,0.454545,0.136364,0.045455,0.136364


In [58]:
# Use this cell for any data-related exploration

## 2. The Model: GAMs

In [19]:
from interpret.glassbox import ExplainableBoostingClassifier
from sklearn.model_selection import train_test_split

# Hold out 25% of the rows as a test set; the split is driven by a fixed seed.
seed = 1
X_train, X_test, y_train, y_test = train_test_split(
    dataset['X'],
    dataset['y'],
    test_size=0.25,
    random_state=seed,
)

# Fit the GAM (an Explainable Boosting classifier with default settings) on the
# training split. The fit call is the cell's last expression, so the fitted
# estimator is displayed as the cell output.
ebm = ExplainableBoostingClassifier()
ebm.fit(X_train, y_train)

ExplainableBoostingClassifier(feature_names=[' n_tokens_title',
                                             ' n_tokens_content',
                                             ' n_unique_tokens',
                                             ' n_non_stop_words',
                                             ' n_non_stop_unique_tokens',
                                             ' num_hrefs', ' num_self_hrefs',
                                             ' num_imgs', ' num_videos',
                                             ' average_token_length',
                                             ' num_keywords', 'data_channel',
                                             ' kw_min_min', ' kw_max_min',
                                             ' kw_avg_min', ' kw_min_max',
                                             ' kw_max_max', ' kw_avg_max',
                                             ' kw_min_avg', ' kw_max_...
                                             'continuous', 'continuo

In [23]:
def _accuracy(y_pred, y_true, ndigits=5):
    """Return the fraction of positions at which ``y_pred`` equals ``y_true``.

    Both inputs are converted to NumPy arrays and compared position by
    position, so the index of a pandas ``y_true`` plays no part in the result.

    Args:
        y_pred: sequence of predicted labels.
        y_true: sequence of true labels, same length as ``y_pred``.
        ndigits: number of decimal places the result is rounded to.

    Returns:
        float: accuracy in [0, 1], rounded to ``ndigits`` places.

    Raises:
        ValueError: if the inputs differ in shape or are empty.
    """
    y_pred = np.asarray(y_pred)
    y_true = np.asarray(y_true)
    if y_pred.shape != y_true.shape:
        raise ValueError("y_pred and y_true must have the same length")
    if y_pred.size == 0:
        raise ValueError("cannot compute the accuracy of an empty prediction set")
    return round(float(np.mean(y_pred == y_true)), ndigits)


# Training accuracy
train_pred = ebm.predict(X_train).tolist()
accuracy_train = _accuracy(train_pred, y_train)

# Test set accuracy
predictions = ebm.predict(X_test).tolist()
accuracy_test = _accuracy(predictions, y_test)

print("The accuracy of the model on the training set is: ", accuracy_train)
print("The accuracy of the model on the test set is: ", accuracy_test)

The accuracy of the model on the training set is:  0.6855
The accuracy of the model on the test set is:  0.66613


### Visualize Global Explanations: What the model learned overall from training data

In [21]:
from interpret import show

# Ask the fitted model for its global (whole-model) explanation and hand it to
# `show` for display in the notebook.
ebm_global = ebm.explain_global()
show(ebm_global)

### Visualize Local Explanations: How an individual prediction was made

In [22]:
# Per-prediction explanations for the first 15 rows of the test split
# (rows selected by position).
ebm_local = ebm.explain_local(X_test.iloc[:15], y_test.iloc[:15], name='EBM')
show(ebm_local)

## 3. Questions
Please answer these to the best of your abilities. Make sure to also answer the follow-up questions in each of the code cells.

If you need to write code to answer a question, please use the code cell provided.

If you don't know how to answer a question, please let the researcher in the room know, explain in the cell why you cannot answer it, and move on to the next one.

#### Part 1: Familiarity with the Data

In [63]:
## How familiar are you with online news articles shared on Mashable.com? 

# If you had to give it a number, on a scale of 1-7 (where 1 = Not at all and 7 = Extremely),
# how would you rate your familiarity?
    # Response: 

# ---------------------------------------------------------------------------------------------------

## Have you previously interacted with datasets about online news popularity? 

# On a scale of 1-7, how would you rate your familiarity with datasets about online news popularity? 
    # Response: 

# ---------------------------------------------------------------------------------------------------

## Specifically, have you worked with UCI repository's Online News Popularity dataset based on Mashable articles?  

# On a scale of 1-7, how would you rate your familiarity with the above UCI repository dataset on Online News Popularity?
    # Response: 


#### Part 2: Global feature importance

In [64]:
## Is this feature importance order reasonable for online news popularity data? Why or why not?


# Please answer the following on a scale of 1-7 (where 1 = Not at all and 7 = Extremely):
# How reasonable is the feature importance order?
    # Response: 
# How confident are you that you have understood the explanation correctly?
    # Response: 

#### Part 3: Individual feature importance

In [65]:
## How would you describe the relationship between the feature "data_channel" and predicted popularity?


## How would you describe the relationship between the feature "data_channel" and predicted popularity?


## How would you describe the relationship between the feature "data_channel" and predicted popularity?


# Please answer the following on a scale of 1-7 (where 1 = Not at all and 7 = Extremely):
# How confident are you that these relationships are reasonable? 
    # Response: 
# How confident are you that you have understood these explanations correctly?
    # Response: 

#### Part 4: Local predictions

![online_news_popularity_hidden_result.png](online_news_popularity_hidden_result.png)

In [66]:
## Given the above input feature values, do you think this article was popular (shared at least 1400 times) or not? Why?


# Please answer the following on a scale of 1-7 (where 1 = Not at all and 7 = Extremely):
# How confident are you that your predicted answer is reasonable? 
    # Response: 
# How confident are you that you have understood the explanation correctly?
    # Response: 

#### Part 5: Local misclassification

In [31]:
# Explanation for the single test row at position 229 (a one-row slice, so the
# inputs keep their DataFrame / Series form).
ebm_local = ebm.explain_local(X_test.iloc[229:230], y_test.iloc[229:230], name='EBM')
show(ebm_local)

In [50]:
## The model misclassified this particular datapoint (above). Why do you think that happened?


# Please answer the following on a scale of 1-7 (where 1 = Not at all and 7 = Extremely):
# How confident are you that your response for the question above is reasonable? 
    # Response: 
# How confident are you that you have understood the explanation correctly?
    # Response: 

#### Part 6: Reflecting

In [51]:
## On a scale of 1-7, how easy was it to answer these questions about online news popularity?

## What made it easy/hard to answer questions about online news popularity?

## When answering the questions, were you relying on the visuals alone or was there any prior knowledge you used in your answers?