In [4]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, confusion_matrix, accuracy_score, classification_report,mean_absolute_error, mean_squared_error

pd.set_option('display.max_columns', 20)
pd.set_option('display.max_colwidth', None)

## Baseline model

In [5]:
df = pd.read_csv('/Users/clara/Desktop/neuefische/d-drivers/data/data_nlp.csv')

In [6]:
df.dropna(axis=0, how='any', inplace=True)
df.drop(['page_id','url','video_play', 'page_impressions', 'clickouts','last_author','date_scraped','scraped_word_count','meta_title', 'meta_description',
         'abstract','meta_image_url','page_img_size', 'merged_url','last_publish_date', 'page_name', 'title','h1','confidence_abstract','confidence_meta_title'], axis=1, inplace= True)
df['publish_date_min'] = pd.to_datetime(df['publish_date_min'])

In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 5060 entries, 0 to 6813
Data columns (total 22 columns):
 #   Column                  Non-Null Count  Dtype         
---  ------                  --------------  -----         
 0   n_days                  5060 non-null   int64         
 1   n_urls                  5060 non-null   int64         
 2   no_versions             5060 non-null   int64         
 3   age                     5060 non-null   int64         
 4   publish_date_min        5060 non-null   datetime64[ns]
 5   word_count              5060 non-null   float64       
 6   classification_product  5060 non-null   object        
 7   classification_type     5060 non-null   object        
 8   external_clicks         5060 non-null   float64       
 9   external_impressions    5060 non-null   float64       
 10  likes_n_days            5060 non-null   float64       
 11  dislikes_n_days         5060 non-null   float64       
 12  ctr                     5060 non-null   float64      

In [8]:
# Filter columns of a specific data type (e.g., numerical columns)
numeric_columns = df.select_dtypes(include=['number']).columns.tolist()

# Filter columns of a specific data type (e.g., categorical columns)
categorical_columns = df.select_dtypes(include=['object', 'category']).columns.tolist()

# Filter columns of a specific data type (e.g., datetime columns)
datetime_columns = df.select_dtypes(include=['datetime']).columns.tolist()

# Combine numerical and categorical features
X_all = df[numeric_columns + categorical_columns]
X_all.drop(['external_clicks','external_impressions','ctr'],axis=1,inplace=True)

# One-hot encode all categorical features
X_all_encoded = pd.get_dummies(X_all, columns=categorical_columns, drop_first=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_all.drop(['external_clicks','external_impressions','ctr'],axis=1,inplace=True)


In [10]:
X_all_encoded.columns

Index(['n_days', 'n_urls', 'no_versions', 'age', 'word_count', 'likes_n_days',
       'dislikes_n_days', 'meta_title_len', 'meta_desc_len', 'h1_len',
       ...
       'scraped_author_press-inform/Wolfgang Gomoll',
       'scraped_author_pv magazine', 'scraped_author_pv-magazine',
       'scraped_author_smarter fahren', 'scraped_author_t3n',
       'scraped_author_zolar', 'sentiment_abstract_neutral',
       'sentiment_abstract_positive', 'sentiment_meta_title_neutral',
       'sentiment_meta_title_positive'],
      dtype='object', length=215)

In [11]:
X_all_encoded

Unnamed: 0,n_days,n_urls,no_versions,age,word_count,likes_n_days,dislikes_n_days,meta_title_len,meta_desc_len,h1_len,...,scraped_author_press-inform/Wolfgang Gomoll,scraped_author_pv magazine,scraped_author_pv-magazine,scraped_author_smarter fahren,scraped_author_t3n,scraped_author_zolar,sentiment_abstract_neutral,sentiment_abstract_positive,sentiment_meta_title_neutral,sentiment_meta_title_positive
0,6,2,0,22,827.000000,2.0,5.0,55,64,61,...,False,False,False,False,False,False,False,True,True,False
1,1,2,0,697,1066.000000,0.0,0.0,46,141,75,...,False,False,False,False,False,False,True,False,True,False
2,10,1,0,40,466.000000,4.0,0.0,53,155,53,...,False,False,False,False,False,False,False,False,True,False
3,3,1,0,2282,0.000000,0.0,0.0,71,156,71,...,False,False,False,False,False,False,True,False,True,False
5,11,2,1,1869,477.454545,3.0,13.0,63,153,63,...,False,False,False,False,False,False,True,False,True,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6808,2,1,0,2282,0.000000,0.0,0.0,62,78,62,...,False,False,False,False,False,False,True,False,True,False
6809,2,1,0,2282,0.000000,0.0,0.0,62,71,62,...,False,False,False,False,False,False,True,False,True,False
6811,1,1,0,2282,0.000000,0.0,0.0,77,81,77,...,False,False,False,False,False,False,True,False,True,False
6812,1,1,0,2282,0.000000,0.0,0.0,76,153,76,...,False,False,False,False,False,False,True,False,True,False


In [12]:
def lin_reg_evaluation(X, y, test_size=0.3, random_state=25):
    # Split the data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=random_state)

    # Fit the linear regression model
    lin_reg = LinearRegression()
    lin_reg.fit(X_train, y_train)

    # Predictions on training and testing sets
    y_pred_train = lin_reg.predict(X_train)
    y_pred_test = lin_reg.predict(X_test)


    # Evaluation metrics
    print("R-squared (Train):", r2_score(y_train, y_pred_train).round(3))
    print("R-squared (Test):", r2_score(y_test, y_pred_test).round(3))
    print("Mean Absolute Error (MAE):", mean_absolute_error(y_test, y_pred_test).round(3))
    print("Mean Squared Error (MSE):", mean_squared_error(y_test, y_pred_test).round(3))
    print("Root Mean Squared Error (RMSE):", np.sqrt(mean_squared_error(y_test, y_pred_test)).round(3))


In [13]:
author_col = X_all_encoded.filter(regex='author', axis=1)
media_col = X_all_encoded.filter(regex='media', axis=1)
product_col = X_all_encoded.filter(regex='classification_product', axis=1)
type_col = X_all_encoded.filter(regex='classification', axis=1)

In [14]:
target = ['external_impressions', 'external_clicks', 'ctr']

columns = ['no_versions','n_days','classification_product','classification_type','scraped_author',
           'likes_n_days','dislikes_n_days','scraped_word_count','media_type','meta_title_len','meta_desc_len','h1_len','abstract_len']

X = {"Length related features" : "df[['word_count','meta_title_len', 'meta_desc_len', 'h1_len', 'abstract_len']]",
    "All features": "X_all_encoded",
    "Performance metrics" : "X_all_encoded[['likes_n_days','dislikes_n_days']]",
    "EDA identified features": "pd.concat([media_col, author_col, product_col, X_all_encoded['n_days']], axis=1)"
    }

for key, value in X.items():
    print(f"================ {key} ================")
    features = eval(value) 
    for item in target:
        y = df[item]
        print(f"Evaluating for target: {item}")
        lin_reg_evaluation(features, y)
        print()

Evaluating for target: external_impressions
R-squared (Train): 0.02
R-squared (Test): 0.031
Mean Absolute Error (MAE): 175121.075
Mean Squared Error (MSE): 137918101329.924
Root Mean Squared Error (RMSE): 371373.264

Evaluating for target: external_clicks
R-squared (Train): 0.017
R-squared (Test): 0.024
Mean Absolute Error (MAE): 13895.248
Mean Squared Error (MSE): 832356819.72
Root Mean Squared Error (RMSE): 28850.595

Evaluating for target: ctr
R-squared (Train): 0.036
R-squared (Test): 0.033
Mean Absolute Error (MAE): 2.845
Mean Squared Error (MSE): 16.062
Root Mean Squared Error (RMSE): 4.008

Evaluating for target: external_impressions
R-squared (Train): 0.33
R-squared (Test): 0.303
Mean Absolute Error (MAE): 161625.438
Mean Squared Error (MSE): 99231221403.146
Root Mean Squared Error (RMSE): 315009.875

Evaluating for target: external_clicks
R-squared (Train): 0.275
R-squared (Test): 0.271
Mean Absolute Error (MAE): 13057.711
Mean Squared Error (MSE): 621514351.989
Root Mean Squa

In [68]:
#df = pd.read_csv('/Users/clara/Desktop/neuefische/d-drivers/data/data_features.csv')
df["classification_product"].unique().shape

(17,)