In [1]:
# libraries
import csv
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
from sklearn.model_selection import train_test_split

## Online news popularity

In [2]:
# 0. url: URL of the article (non-predictive)
# 1. timedelta: Days between the article publication and the dataset acquisition (non-predictive)
# 2. n_tokens_title: Number of words in the title
# 3. n_tokens_content: Number of words in the content
# 4. n_unique_tokens: Rate of unique words in the content
# 5. n_non_stop_words: Rate of non-stop words in the content
# 6. n_non_stop_unique_tokens: Rate of unique non-stop words in the content
# 7. num_hrefs: Number of links
# 8. num_self_hrefs: Number of links to other articles published by Mashable
# 9. num_imgs: Number of images
# 10. num_videos: Number of videos
# 11. average_token_length: Average length of the words in the content
# 12. num_keywords: Number of keywords in the metadata
# 13. data_channel_is_lifestyle: Is data channel 'Lifestyle'?
# 14. data_channel_is_entertainment: Is data channel 'Entertainment'?
# 15. data_channel_is_bus: Is data channel 'Business'?
# 16. data_channel_is_socmed: Is data channel 'Social Media'?
# 17. data_channel_is_tech: Is data channel 'Tech'?
# 18. data_channel_is_world: Is data channel 'World'?
# 19. kw_min_min: Worst keyword (min. shares)
# 20. kw_max_min: Worst keyword (max. shares)
# 21. kw_avg_min: Worst keyword (avg. shares)
# 22. kw_min_max: Best keyword (min. shares)
# 23. kw_max_max: Best keyword (max. shares)
# 24. kw_avg_max: Best keyword (avg. shares)
# 25. kw_min_avg: Avg. keyword (min. shares)
# 26. kw_max_avg: Avg. keyword (max. shares)
# 27. kw_avg_avg: Avg. keyword (avg. shares)
# 28. self_reference_min_shares: Min. shares of referenced articles in Mashable
# 29. self_reference_max_shares: Max. shares of referenced articles in Mashable
# 30. self_reference_avg_sharess: Avg. shares of referenced articles in Mashable
# 31. weekday_is_monday: Was the article published on a Monday?
# 32. weekday_is_tuesday: Was the article published on a Tuesday?
# 33. weekday_is_wednesday: Was the article published on a Wednesday?
# 34. weekday_is_thursday: Was the article published on a Thursday?
# 35. weekday_is_friday: Was the article published on a Friday?
# 36. weekday_is_saturday: Was the article published on a Saturday?
# 37. weekday_is_sunday: Was the article published on a Sunday?
# 38. is_weekend: Was the article published on the weekend?
# 39. LDA_00: Closeness to LDA topic 0
# 40. LDA_01: Closeness to LDA topic 1
# 41. LDA_02: Closeness to LDA topic 2
# 42. LDA_03: Closeness to LDA topic 3
# 43. LDA_04: Closeness to LDA topic 4
# 44. global_subjectivity: Text subjectivity
# 45. global_sentiment_polarity: Text sentiment polarity
# 46. global_rate_positive_words: Rate of positive words in the content
# 47. global_rate_negative_words: Rate of negative words in the content
# 48. rate_positive_words: Rate of positive words among non-neutral tokens
# 49. rate_negative_words: Rate of negative words among non-neutral tokens
# 50. avg_positive_polarity: Avg. polarity of positive words
# 51. min_positive_polarity: Min. polarity of positive words
# 52. max_positive_polarity: Max. polarity of positive words
# 53. avg_negative_polarity: Avg. polarity of negative words
# 54. min_negative_polarity: Min. polarity of negative words
# 55. max_negative_polarity: Max. polarity of negative words
# 56. title_subjectivity: Title subjectivity
# 57. title_sentiment_polarity: Title polarity
# 58. abs_title_subjectivity: Absolute subjectivity level
# 59. abs_title_sentiment_polarity: Absolute polarity level
# 60. shares: Number of shares (target)

In [3]:
# Load the Online News Popularity data set into a single frame.
# NOTE: apart from `url`, the header names in this CSV start with a space
# (e.g. ' timedelta', ' shares'); the names are kept verbatim, so any lookup
# by column name has to include that leading space.
df = pd.read_csv('online-news.csv', sep=',')

# Schema overview (column names, non-null counts, dtypes), then a preview of
# the first 20 rows as the cell's rich output.
df.info()
df.head(20)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 39644 entries, 0 to 39643
Data columns (total 61 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   url                             39644 non-null  object 
 1    timedelta                      39644 non-null  float64
 2    n_tokens_title                 39644 non-null  float64
 3    n_tokens_content               39644 non-null  float64
 4    n_unique_tokens                39644 non-null  float64
 5    n_non_stop_words               39644 non-null  float64
 6    n_non_stop_unique_tokens       39644 non-null  float64
 7    num_hrefs                      39644 non-null  float64
 8    num_self_hrefs                 39644 non-null  float64
 9    num_imgs                       39644 non-null  float64
 10   num_videos                     39644 non-null  float64
 11   average_token_length           39644 non-null  float64
 12   num_keywords                   

Unnamed: 0,url,timedelta,n_tokens_title,n_tokens_content,n_unique_tokens,n_non_stop_words,n_non_stop_unique_tokens,num_hrefs,num_self_hrefs,num_imgs,...,min_positive_polarity,max_positive_polarity,avg_negative_polarity,min_negative_polarity,max_negative_polarity,title_subjectivity,title_sentiment_polarity,abs_title_subjectivity,abs_title_sentiment_polarity,shares
0,http://mashable.com/2013/01/07/amazon-instant-...,731.0,12.0,219.0,0.663594,1.0,0.815385,4.0,2.0,1.0,...,0.1,0.7,-0.35,-0.6,-0.2,0.5,-0.1875,0.0,0.1875,593
1,http://mashable.com/2013/01/07/ap-samsung-spon...,731.0,9.0,255.0,0.604743,1.0,0.791946,3.0,1.0,1.0,...,0.033333,0.7,-0.11875,-0.125,-0.1,0.0,0.0,0.5,0.0,711
2,http://mashable.com/2013/01/07/apple-40-billio...,731.0,9.0,211.0,0.57513,1.0,0.663866,3.0,1.0,1.0,...,0.1,1.0,-0.466667,-0.8,-0.133333,0.0,0.0,0.5,0.0,1500
3,http://mashable.com/2013/01/07/astronaut-notre...,731.0,9.0,531.0,0.503788,1.0,0.665635,9.0,0.0,1.0,...,0.136364,0.8,-0.369697,-0.6,-0.166667,0.0,0.0,0.5,0.0,1200
4,http://mashable.com/2013/01/07/att-u-verse-apps/,731.0,13.0,1072.0,0.415646,1.0,0.54089,19.0,19.0,20.0,...,0.033333,1.0,-0.220192,-0.5,-0.05,0.454545,0.136364,0.045455,0.136364,505
5,http://mashable.com/2013/01/07/beewi-smart-toys/,731.0,10.0,370.0,0.559889,1.0,0.698198,2.0,2.0,0.0,...,0.136364,0.6,-0.195,-0.4,-0.1,0.642857,0.214286,0.142857,0.214286,855
6,http://mashable.com/2013/01/07/bodymedia-armba...,731.0,8.0,960.0,0.418163,1.0,0.549834,21.0,20.0,20.0,...,0.1,1.0,-0.224479,-0.5,-0.05,0.0,0.0,0.5,0.0,556
7,http://mashable.com/2013/01/07/canon-poweshot-n/,731.0,12.0,989.0,0.433574,1.0,0.572108,20.0,20.0,20.0,...,0.1,1.0,-0.242778,-0.5,-0.05,1.0,0.5,0.5,0.5,891
8,http://mashable.com/2013/01/07/car-of-the-futu...,731.0,11.0,97.0,0.670103,1.0,0.836735,2.0,0.0,0.0,...,0.4,0.8,-0.125,-0.125,-0.125,0.125,0.0,0.375,0.0,3600
9,http://mashable.com/2013/01/07/chuck-hagel-web...,731.0,10.0,231.0,0.636364,1.0,0.797101,4.0,1.0,1.0,...,0.1,0.5,-0.238095,-0.5,-0.1,0.0,0.0,0.5,0.0,710


In [4]:
# Separate the frame into a feature matrix (X_df) and the target column (y_df).
# This cell does not create the train/test partition; it only decides which
# columns are model inputs and which one is the label.

share_index = list(df.columns).index(" shares")
label = df.columns[share_index]

# `url` and `timedelta` are flagged "(non-predictive)" in the attribute list,
# so neither may be fed to the model. The comparison uses the stripped name
# because the CSV headers carry a leading space.
non_predictive = {"url", "timedelta"}
train_cols = [
    col for col in df.columns
    if col != label and col.strip() not in non_predictive
]

X_df = df[train_cols]
y_df = df[label]

In [5]:
# Keep the feature matrix and the target together under one name.
dataset = dict(X=X_df, y=y_df)

# Hold out 25% of the rows for testing; the fixed seed makes the partition
# reproducible from run to run.
seed = 1
X_train, X_test, y_train, y_test = train_test_split(
    dataset['X'],
    dataset['y'],
    test_size=0.25,
    random_state=seed,
)

## SVM

In [None]:
from sklearn import svm

# Fit a support-vector classifier with a linear kernel on the training split.
# NOTE(review): y_train is the raw ' shares' column, so the classifier treats
# every distinct share count as its own class, and the features are not
# scaled. Presumably a thresholded or binned target was intended; confirm.
svc_options = {'kernel': 'linear'}
clf = svm.SVC(**svc_options)
clf.fit(X_train, y_train)

In [None]:
def _exact_match_rate(predicted, actual):
    """Compare two equally ordered label sequences position by position.

    Returns a tuple ``(hits, rate)``: the number of positions where both
    sequences hold the same value, and that number divided by
    ``len(predicted)``, rounded to 5 decimal places.
    """
    hits = sum(1 for guess, truth in zip(predicted, actual) if guess == truth)
    return hits, round(hits / len(predicted), 5)


# Accuracy on the data the model was fitted on
train_pred = clf.predict(X_train).tolist()
same_train, accuracy_train = _exact_match_rate(train_pred, y_train.tolist())

# Accuracy on the held-out data
predictions = clf.predict(X_test).tolist()
same_test, accuracy_test = _exact_match_rate(predictions, y_test.tolist())

print("The accuracy of the model on the training set is: ", accuracy_train)
print("The accuracy of the model on the test set is: ", accuracy_test)

In [None]:
# Number of features shown in the ranking (the plot has always shown 20; the
# old comment said 10).
TOP_N = 20

# Rank features by the magnitude of their weight in the fitted linear SVM.
# NOTE(review): only the first row of clf.coef_ is used. With more than two
# classes that row belongs to a single pairwise classifier; confirm this is
# the row that should be ranked.
weights = pd.Series(clf.coef_[0], index=dataset['X'].columns).abs()
top_weights = weights.nlargest(TOP_N)

fig, ax = plt.subplots(figsize=(12, 8))
top_weights.plot(kind='barh', ax=ax)
ax.invert_yaxis()  # barh draws the first entry at the bottom; put the largest on top
ax.set_title(f"Top {TOP_N} features by absolute linear-SVM coefficient")
ax.set_xlabel("|coefficient|")
ax.set_ylabel("feature")
plt.show()