In [1]:
import numpy as np
import pandas as pd

## ニュース記事のデータによる検証
ニュース記事のデータを用いて交互作用特徴量の効果の検証を行う. 記事に関する特徴量からニュースのシェア数を予測するタスクを考える. 効果検証のためにオリジナルのデータの特徴量と,ペアワイズ特徴量を加えたインスタンスについて線形回帰モデルを作成する. 評価はホールドアウト法で行い,性能指標は決定係数$R^2$とする.

In [9]:
# Load the Online News Popularity dataset.
# Fields in this CSV are separated by a comma followed by a space. Passing the
# two-character string ", " as the separator is treated as a regex, which makes
# pandas fall back to the slower Python parser and emit a ParserWarning.
# Splitting on "," and letting skipinitialspace drop the blank after each
# delimiter keeps the fast C parser and yields the same column names.
df = pd.read_csv("./Data/OnlineNewsPopularity.csv", sep=",", skipinitialspace=True)
df.head()

  df = pd.read_csv("./Data/OnlineNewsPopularity.csv",delimiter=", ")


Unnamed: 0,url,timedelta,n_tokens_title,n_tokens_content,n_unique_tokens,n_non_stop_words,n_non_stop_unique_tokens,num_hrefs,num_self_hrefs,num_imgs,...,min_positive_polarity,max_positive_polarity,avg_negative_polarity,min_negative_polarity,max_negative_polarity,title_subjectivity,title_sentiment_polarity,abs_title_subjectivity,abs_title_sentiment_polarity,shares
0,http://mashable.com/2013/01/07/amazon-instant-...,731.0,12.0,219.0,0.663594,1.0,0.815385,4.0,2.0,1.0,...,0.1,0.7,-0.35,-0.6,-0.2,0.5,-0.1875,0.0,0.1875,593
1,http://mashable.com/2013/01/07/ap-samsung-spon...,731.0,9.0,255.0,0.604743,1.0,0.791946,3.0,1.0,1.0,...,0.033333,0.7,-0.11875,-0.125,-0.1,0.0,0.0,0.5,0.0,711
2,http://mashable.com/2013/01/07/apple-40-billio...,731.0,9.0,211.0,0.57513,1.0,0.663866,3.0,1.0,1.0,...,0.1,1.0,-0.466667,-0.8,-0.133333,0.0,0.0,0.5,0.0,1500
3,http://mashable.com/2013/01/07/astronaut-notre...,731.0,9.0,531.0,0.503788,1.0,0.665635,9.0,0.0,1.0,...,0.136364,0.8,-0.369697,-0.6,-0.166667,0.0,0.0,0.5,0.0,1200
4,http://mashable.com/2013/01/07/att-u-verse-apps/,731.0,13.0,1072.0,0.415646,1.0,0.54089,19.0,19.0,20.0,...,0.033333,1.0,-0.220192,-0.5,-0.05,0.454545,0.136364,0.045455,0.136364,505


In [19]:
import sklearn.preprocessing as preproc

# Columns that feed the pairwise interaction features
features = [
    'n_tokens_title',
    'n_tokens_content',
    'n_unique_tokens',
    'n_non_stop_words',
    'n_non_stop_unique_tokens',
    'num_hrefs',
    'num_self_hrefs',
    'num_imgs',
    'num_videos',
    'average_token_length',
    'num_keywords',
    'data_channel_is_lifestyle',
    'data_channel_is_entertainment',
    'data_channel_is_bus',
    'data_channel_is_socmed',
    'data_channel_is_tech',
    'data_channel_is_world',
]

# Predictors, and the regression target kept as a one-column DataFrame
X = df[features]
y = df[["shares"]]

# Polynomial expansion of the selected columns, without the constant bias column
poly_expander = preproc.PolynomialFeatures(include_bias=False)
Xpoly = poly_expander.fit_transform(X)
Xpoly.shape

(39644, 170)

In [27]:
from sklearn.model_selection import train_test_split
# Hold-out split, train : test = 7 : 3 (test_size=0.3).
# X, Xpoly and y are split in one call so all three share the same row partition;
# random_state is fixed so the split is reproducible.
X_train,X_test,Xpoly_train,Xpoly_test,y_train,y_test = train_test_split(X,Xpoly,y,test_size=0.3,random_state=123)

In [28]:
from sklearn.linear_model import LinearRegression

def evaluate_feature(X_train,X_test,y_train,y_test):
    """
    Fit a linear regression model on the training data and score it on the test data.

    Args:
        X_train: features of the training set
        X_test: features of the evaluation set
        y_train: targets of the training set
        y_test: targets of the evaluation set

    Returns:
        tuple: (fitted model, score of the model on the evaluation set as
        returned by ``model.score``, i.e. the coefficient of determination)
    """
    regressor = LinearRegression()
    regressor.fit(X_train, y_train)
    test_score = regressor.score(X_test, y_test)
    return regressor, test_score

In [29]:
# Fit and score one model per feature set, using the same split and targets
m, r2 = evaluate_feature(X_train, X_test, y_train, y_test)
mp, r2p = evaluate_feature(Xpoly_train, Xpoly_test, y_train, y_test)

# Report the score on the held-out data for each feature set
print("R-squared score with singleton features : %0.5f" % r2)
print("R-squared score with pairwise features : %0.10f" % r2p)

R-squared score with singleton features : 0.00924
R-squared score with pairwise features : 0.0113177226


実行結果から, 交互作用特徴量を加えたほうが決定係数がわずかに良くなっていることがわかる. しかしペアワイズ交互作用特徴量は元の特徴量が$n$個のとき$O(n^2)$個作成され(今回は17個から170個に増加), 訓練や検証の時間も$O(n)$から$O(n^2)$に増加する.