# Importing the necessary Libraries

In [154]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix

# Creating Dataframe from the available data

In [155]:
df_news = pd.read_csv('OnlineNewsPopularity.csv',sep=r'\s*,\s*',
                           header=0, encoding='ascii', engine='python')
df_news

Unnamed: 0,url,timedelta,n_tokens_title,n_tokens_content,n_unique_tokens,n_non_stop_words,n_non_stop_unique_tokens,num_hrefs,num_self_hrefs,num_imgs,...,min_positive_polarity,max_positive_polarity,avg_negative_polarity,min_negative_polarity,max_negative_polarity,title_subjectivity,title_sentiment_polarity,abs_title_subjectivity,abs_title_sentiment_polarity,shares
0,http://mashable.com/2013/01/07/amazon-instant-...,731,12,219,0.663594,1.0,0.815385,4,2,1,...,0.100000,0.70,-0.350000,-0.600,-0.200000,0.500000,-0.187500,0.000000,0.187500,593
1,http://mashable.com/2013/01/07/ap-samsung-spon...,731,9,255,0.604743,1.0,0.791946,3,1,1,...,0.033333,0.70,-0.118750,-0.125,-0.100000,0.000000,0.000000,0.500000,0.000000,711
2,http://mashable.com/2013/01/07/apple-40-billio...,731,9,211,0.575130,1.0,0.663866,3,1,1,...,0.100000,1.00,-0.466667,-0.800,-0.133333,0.000000,0.000000,0.500000,0.000000,1500
3,http://mashable.com/2013/01/07/astronaut-notre...,731,9,531,0.503788,1.0,0.665635,9,0,1,...,0.136364,0.80,-0.369697,-0.600,-0.166667,0.000000,0.000000,0.500000,0.000000,1200
4,http://mashable.com/2013/01/07/att-u-verse-apps/,731,13,1072,0.415646,1.0,0.540890,19,19,20,...,0.033333,1.00,-0.220192,-0.500,-0.050000,0.454545,0.136364,0.045455,0.136364,505
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
39639,http://mashable.com/2014/12/27/samsung-app-aut...,8,11,346,0.529052,1.0,0.684783,9,7,1,...,0.100000,0.75,-0.260000,-0.500,-0.125000,0.100000,0.000000,0.400000,0.000000,1800
39640,http://mashable.com/2014/12/27/seth-rogen-jame...,8,12,328,0.696296,1.0,0.885057,9,7,3,...,0.136364,0.70,-0.211111,-0.400,-0.100000,0.300000,1.000000,0.200000,1.000000,1900
39641,http://mashable.com/2014/12/27/son-pays-off-mo...,8,10,442,0.516355,1.0,0.644128,24,1,12,...,0.136364,0.50,-0.356439,-0.800,-0.166667,0.454545,0.136364,0.045455,0.136364,1900
39642,http://mashable.com/2014/12/27/ukraine-blasts/,8,6,682,0.539493,1.0,0.692661,10,1,1,...,0.062500,0.50,-0.205246,-0.500,-0.012500,0.000000,0.000000,0.500000,0.000000,1100


In [156]:
df_news.columns

Index(['url', 'timedelta', 'n_tokens_title', 'n_tokens_content',
       'n_unique_tokens', 'n_non_stop_words', 'n_non_stop_unique_tokens',
       'num_hrefs', 'num_self_hrefs', 'num_imgs', 'num_videos',
       'average_token_length', 'num_keywords', 'data_channel_is_lifestyle',
       'data_channel_is_entertainment', 'data_channel_is_bus',
       'data_channel_is_socmed', 'data_channel_is_tech',
       'data_channel_is_world', 'kw_min_min', 'kw_max_min', 'kw_avg_min',
       'kw_min_max', 'kw_max_max', 'kw_avg_max', 'kw_min_avg', 'kw_max_avg',
       'kw_avg_avg', 'self_reference_min_shares', 'self_reference_max_shares',
       'self_reference_avg_sharess', 'weekday_is_monday', 'weekday_is_tuesday',
       'weekday_is_wednesday', 'weekday_is_thursday', 'weekday_is_friday',
       'weekday_is_saturday', 'weekday_is_sunday', 'is_weekend', 'LDA_00',
       'LDA_01', 'LDA_02', 'LDA_03', 'LDA_04', 'global_subjectivity',
       'global_sentiment_polarity', 'global_rate_positive_words',
     

# Classifying the popularity of the news article based on how many people shared the news

In [157]:
df_news['shares']=df_news['shares'].apply(lambda x: 1 if x>=1400 else 0)

In [158]:
df_news['shares']

0        0
1        0
2        1
3        0
4        0
        ..
39639    1
39640    1
39641    1
39642    0
39643    0
Name: shares, Length: 39644, dtype: int64

# Applying normalisation to all fields except url

In [188]:
scaler = StandardScaler()
scaler.fit(df_news.iloc[:,1:59])
scaled_data = scaler.transform(df_news.iloc[:,1:59])

# Dimensionality reduction using PCA (60 input features to 20.)

In [176]:
pca = PCA(n_components=20)

pca.fit(scaled_data)
x_pca = pca.transform(scaled_data)
scaled_data.shape
print(x_pca)

[[-1.38545146 -1.84358082 -2.13422959 ...  2.29331649  0.02735957
   0.63415506]
 [ 0.02505296 -4.1985407  -1.20416423 ...  2.02743848 -0.40993902
   0.88209153]
 [-3.90046747 -3.26709773 -1.69454684 ...  1.85758402 -1.13266427
   1.34435395]
 ...
 [ 0.55023219  2.6075477   0.11473249 ...  0.20209373  1.6081026
  -0.06307983]
 [ 3.4796714   0.34679345 -3.1097019  ... -0.37111266  0.65974287
   0.04991   ]
 [ 0.87309496  0.45498855  2.23388102 ... -0.18153497  0.80752157
  -0.57216642]]


In [177]:
x_pca.shape

(39644, 20)

# Convert the numpy features array into Dataframe

In [178]:
df = pd.DataFrame(data=x_pca,columns=["feature1", "feature2","feature3","feature4","feature5","feature6","feature7","feature8","feature9","feature10","feature11", "feature12","feature13","feature14","feature15","feature16","feature17","feature18","feature19","feature20"])
df

Unnamed: 0,feature1,feature2,feature3,feature4,feature5,feature6,feature7,feature8,feature9,feature10,feature11,feature12,feature13,feature14,feature15,feature16,feature17,feature18,feature19,feature20
0,-1.385451,-1.843581,-2.134230,0.028105,0.385960,-3.668055,-0.478926,-1.472795,0.427488,0.149486,1.467492,1.749782,0.485774,0.006992,0.925227,-0.510352,-0.082032,2.293316,0.027360,0.634155
1,0.025053,-4.198541,-1.204164,0.080175,-0.344112,-3.239322,1.755171,0.488672,2.188011,0.810946,-0.324203,0.036338,0.971936,0.594666,-0.855378,-0.382247,0.192382,2.027438,-0.409939,0.882092
2,-3.900467,-3.267098,-1.694547,-0.085987,-0.372869,-2.199260,1.329335,-0.251669,-0.017476,-0.091078,-0.563986,-0.052219,0.913940,-0.500541,0.001388,-0.011942,0.146557,1.857584,-1.132664,1.344354
3,-0.801885,-1.506166,-3.437179,-0.094657,1.172562,-2.614227,-0.317064,-1.295557,0.005212,0.099870,1.433617,-0.236881,1.890375,0.509182,0.405244,-0.362923,-0.093246,1.886315,-0.521457,0.791912
4,-4.251478,-3.606443,-1.588494,-0.111583,0.348966,1.948094,-1.660772,-1.600043,0.866066,2.903571,-0.154046,0.543178,-1.212585,0.578850,0.879564,-0.604261,0.003587,2.832578,0.641525,0.751200
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
39639,0.003677,-0.138243,0.919141,0.044648,-0.772277,2.293216,1.102858,-1.664281,-1.404640,0.505255,-1.643155,0.308357,0.306359,0.428927,1.066401,2.005398,-0.128207,-0.048736,0.841166,0.266119
39640,-1.068003,1.375971,2.006141,0.061394,-0.986342,-0.757008,-1.733592,-0.689608,-0.106490,0.941353,1.848068,-1.972963,-3.156995,0.199572,-0.194465,2.877905,1.274704,-1.359304,-2.086509,3.049531
39641,0.550232,2.607548,0.114732,0.008465,-0.089848,-0.309106,-0.513953,-0.201322,-0.755492,0.178490,-0.728701,0.142689,-1.423242,-1.582979,-0.440872,1.862567,-0.204545,0.202094,1.608103,-0.063080
39642,3.479671,0.346793,-3.109702,0.028222,-0.513249,0.832365,1.465661,1.067954,-0.151759,-0.049882,0.473358,-0.887909,0.222299,0.880036,-0.054531,1.959619,-0.215704,-0.371113,0.659743,0.049910


# Adding the output feature to the Dataframe

In [189]:
df['output']=df_news['shares']

In [180]:
df

Unnamed: 0,feature1,feature2,feature3,feature4,feature5,feature6,feature7,feature8,feature9,feature10,...,feature12,feature13,feature14,feature15,feature16,feature17,feature18,feature19,feature20,output
0,-1.385451,-1.843581,-2.134230,0.028105,0.385960,-3.668055,-0.478926,-1.472795,0.427488,0.149486,...,1.749782,0.485774,0.006992,0.925227,-0.510352,-0.082032,2.293316,0.027360,0.634155,0
1,0.025053,-4.198541,-1.204164,0.080175,-0.344112,-3.239322,1.755171,0.488672,2.188011,0.810946,...,0.036338,0.971936,0.594666,-0.855378,-0.382247,0.192382,2.027438,-0.409939,0.882092,0
2,-3.900467,-3.267098,-1.694547,-0.085987,-0.372869,-2.199260,1.329335,-0.251669,-0.017476,-0.091078,...,-0.052219,0.913940,-0.500541,0.001388,-0.011942,0.146557,1.857584,-1.132664,1.344354,1
3,-0.801885,-1.506166,-3.437179,-0.094657,1.172562,-2.614227,-0.317064,-1.295557,0.005212,0.099870,...,-0.236881,1.890375,0.509182,0.405244,-0.362923,-0.093246,1.886315,-0.521457,0.791912,0
4,-4.251478,-3.606443,-1.588494,-0.111583,0.348966,1.948094,-1.660772,-1.600043,0.866066,2.903571,...,0.543178,-1.212585,0.578850,0.879564,-0.604261,0.003587,2.832578,0.641525,0.751200,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
39639,0.003677,-0.138243,0.919141,0.044648,-0.772277,2.293216,1.102858,-1.664281,-1.404640,0.505255,...,0.308357,0.306359,0.428927,1.066401,2.005398,-0.128207,-0.048736,0.841166,0.266119,1
39640,-1.068003,1.375971,2.006141,0.061394,-0.986342,-0.757008,-1.733592,-0.689608,-0.106490,0.941353,...,-1.972963,-3.156995,0.199572,-0.194465,2.877905,1.274704,-1.359304,-2.086509,3.049531,1
39641,0.550232,2.607548,0.114732,0.008465,-0.089848,-0.309106,-0.513953,-0.201322,-0.755492,0.178490,...,0.142689,-1.423242,-1.582979,-0.440872,1.862567,-0.204545,0.202094,1.608103,-0.063080,1
39642,3.479671,0.346793,-3.109702,0.028222,-0.513249,0.832365,1.465661,1.067954,-0.151759,-0.049882,...,-0.887909,0.222299,0.880036,-0.054531,1.959619,-0.215704,-0.371113,0.659743,0.049910,0


# Spliting the data into input(X) and output(Y) features.

In [181]:
X = df.iloc[:,0:20].values
Y = df.iloc[:,-1].values

In [182]:
X

array([[-1.38545146, -1.84358082, -2.13422959, ...,  2.29331649,
         0.02735957,  0.63415506],
       [ 0.02505296, -4.1985407 , -1.20416423, ...,  2.02743848,
        -0.40993902,  0.88209153],
       [-3.90046747, -3.26709773, -1.69454684, ...,  1.85758402,
        -1.13266427,  1.34435395],
       ...,
       [ 0.55023219,  2.6075477 ,  0.11473249, ...,  0.20209373,
         1.6081026 , -0.06307983],
       [ 3.4796714 ,  0.34679345, -3.1097019 , ..., -0.37111266,
         0.65974287,  0.04991   ],
       [ 0.87309496,  0.45498855,  2.23388102, ..., -0.18153497,
         0.80752157, -0.57216642]])

In [183]:
Y

array([0, 0, 1, ..., 1, 0, 0], dtype=int64)

# Splitting the Data into train and test data

In [184]:
X_train,X_test,Y_train,Y_test = train_test_split(X,Y,test_size=0.25,random_state=0)

In [185]:
X_train.shape

(29733, 20)

In [186]:
X_test.shape

(9911, 20)

# Training the model against Logistic Regression and Random Forest Classifier

In [187]:
def models(X_train,Y_train):
    #Logistic Regression 
    log = LogisticRegression(random_state=0,penalty='l2')
    log.fit(X_train,Y_train)
    
    #Random Forest Classifier
    forest = RandomForestClassifier(criterion='entropy',random_state=0,bootstrap= True,max_depth = 70,max_features= 'auto',
                                    min_samples_leaf = 4,min_samples_split = 10,n_estimators= 400)
    forest.fit(X_train,Y_train)
    
    print("Regression model result " , log.score(X_train,Y_train))
    print("Random Forest cassifier result " , log.score(X_train,Y_train))
    
    return log , forest

# Generating the model accuracy

In [190]:
model = models(X_train,Y_train)

Regression model result  0.6407695153533112
Random Forest cassifier result  0.6407695153533112


# Logistic Model acuracy for test data

In [173]:
cm = confusion_matrix(Y_test,model[0].predict(X_test))

TP = cm[0][0]
TN = cm[1][1]
FP = cm[0][1]
FN = cm[1][0]

print(cm)

print('testing accuracy of logistic regression model is equal to : ', (TP+TN)/(TP+TN+FP+FN))

[[2527 2136]
 [1485 3763]]
testing accuracy of logistic regression model is equal to :  0.6346483704974271


# Random Forest Model accuracy for train data

In [174]:

cm = confusion_matrix(Y_test,model[1].predict(X_test))

TP = cm[0][0]
TN = cm[1][1]
FP = cm[0][1]
FN = cm[1][0]

print(cm)

print('testing accuracy of Random Forest model is equal to : ', (TP+TN)/(TP+TN+FP+FN))

[[2630 2033]
 [1523 3725]]
testing accuracy of Random Forest model is equal to :  0.6412067399858743


# It can be observed from the above work that the Random Forest Classifier perform better compared to Logistic Model during testing of model.