In [1]:
import json
import gzip
from collections import Counter

import pandas as pd
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import Lasso
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
from sklearn.preprocessing import FunctionTransformer
from sklearn.compose import make_column_transformer
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import KFold
from sklearn.base import BaseEstimator, TransformerMixin
import numpy as np


def read_json(data_path: str) -> list:
    with gzip.open(data_path, 'rt', encoding='utf-8') as f:
        return json.load(f)
raw = read_json('rtvslo_train_new.json.gzip')
print(len(raw))
head = raw

20981


In [2]:
df = pd.json_normalize(head, meta=['date', 'title', 'author', 'url'])
to_drop = ['authors', 'id', 'lead', 'category', 'topics']
df.drop(columns=to_drop, inplace=True)

In [3]:
# df.columns

In [4]:
def extract_topic(url):
    result = []
    for url in url.values:
        topic_pair = url[0].split('/')[3:5]
        topic = ""
        if 'sport' in topic_pair:
            topic = f"{topic_pair[0]}-{topic_pair[1]}"
        else:
            topic = f"{topic_pair[0]}"
        result.append(topic)
    return pd.DataFrame(result, columns=['topic'])

def hour_weekend(timestamps):
    timestamps = pd.to_datetime(timestamps)
    timestampsdf = pd.DataFrame(columns=['hours', 'weekend'])
    timestampsdf['hours'] = timestamps.dt.hour
    timestampsdf['weekend'] = timestamps.dt.weekday > 4
    return timestampsdf

def token_number(titles):
    lengths = [len(title.split()) for title in titles]
    title_length = pd.DataFrame(lengths, columns=['title_length'])
    return title_length

def article_length(paragraphs):
    article_lengths = [len(paragraph) for paragraph in paragraphs]
    return pd.DataFrame(article_lengths, columns=['article_length'])

def count_images(figures):
    return pd.DataFrame([len(fig) for fig in figures], columns=['images'])

def calculate_important_keywords(combined, discard):
    combined_as_list = [x 
                        for xs in combined
                        for x in xs]
    keyword_counter = Counter(combined_as_list)
    keyword_df = pd.DataFrame(list(keyword_counter.values()), columns=['keywords'])
    threshold = keyword_df.quantile(discard).values[0]
    important_keywords = [key for key, value in keyword_counter.items() if value > threshold]
    return important_keywords

def keywords_filter(keywords, important_keywords):
    return [word for word in keywords if word in important_keywords]

def keyword_encoding(X, important_keywords):
    mlb = MultiLabelBinarizer(classes=important_keywords)
    result = mlb.fit_transform(X['words'])
    return pd.DataFrame(result, columns=important_keywords)

In [5]:
class KeywordEncodingTransformer(BaseEstimator, TransformerMixin):
    def __init__(self, survival=0.01):
        self.survival = survival
        self.important_keywords = None

    def fit(self, X, y=None):
        self.important_keywords = calculate_important_keywords(X['keywords'] + X['gpt_keywords'], 1.0 - self.survival)
        return self

    def transform(self, X):
        print(X.columns)
        X['words'] = X['keywords'] + X['gpt_keywords']
        X['words'] = X['words'].apply(keywords_filter, args=(self.important_keywords,))
        X.drop(columns=['keywords', 'gpt_keywords'], inplace=True)
        return keyword_encoding(X, self.important_keywords)
    
    def get_feature_names_out(self, *args, **params):
        return self.important_keywords

In [6]:
url_to_topic = FunctionTransformer(extract_topic, validate=False)
time_transform = FunctionTransformer(hour_weekend, validate=False)
number_of_words = FunctionTransformer(token_number, validate=False)
length_of_paragraphs = FunctionTransformer(article_length, validate=False)
number_of_images = FunctionTransformer(count_images, validate=False)

In [7]:
ct = make_column_transformer((url_to_topic, ['url']), 
                             (time_transform, 'date'), 
                             (number_of_words, 'title'),
                             (length_of_paragraphs, 'paragraphs'), 
                             (number_of_images, 'figures'),
                             remainder='passthrough',
                             verbose_feature_names_out=False
                             )
ct.set_output(transform='pandas')
df1 = ct.fit_transform(df)
df1.columns

Index(['topic', 'hours', 'weekend', 'title_length', 'article_length', 'images',
       'keywords', 'gpt_keywords', 'n_comments'],
      dtype='object')

In [8]:
numeric_attributes = ['title_length', 'article_length', 'images']
categorical_attributes = ['topic', 'hours', 'weekend']
multi_label_attributes = ['keywords', 'gpt_keywords']

In [9]:
scaler = StandardScaler()
ohe = OneHotEncoder(handle_unknown='ignore', drop='if_binary', sparse_output=False, feature_name_combiner='concat')

In [10]:
ke = KeywordEncodingTransformer()

In [11]:
ppct = make_column_transformer(
    (ohe, categorical_attributes),
    (ke, multi_label_attributes),
    remainder='passthrough',
    verbose=False,
    verbose_feature_names_out=False
)
ppct.set_output(transform='pandas')

In [12]:
# df1['n_comments'].describe()

In [13]:
df2 = ppct.fit_transform(df1)
# df2.columns

Index(['keywords', 'gpt_keywords'], dtype='object')


Index(['topic_crna-kronika', 'topic_gospodarstvo', 'topic_kolumne',
       'topic_kultura', 'topic_okolje', 'topic_slovenija',
       'topic_sport-atletika', 'topic_sport-citat-za-prebrat',
       'topic_sport-dokovic-petic-izbran-za-najboljsega-v-evropi-doncic-na-39-mestu',
       'topic_sport-formula-1',
       ...
       'teroristični napad', 'Wimbledon', 'javno mnenje', 'preobrat',
       'resolucija', 'dolgotrajna oskrba', 'title_length', 'article_length',
       'images', 'n_comments'],
      dtype='object', length=1329)

In [14]:
feature_names = [*df2.columns][:-1]
X = df2[feature_names]
y = df2['n_comments']

In [15]:
# X.shape

In [16]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [17]:
pipe = make_column_transformer(
    (scaler, numeric_attributes),
    remainder='passthrough'
)
X_train = pipe.fit_transform(X_train)
X_test = pipe.transform(X_test)

In [35]:
# al = np.logspace(-4, 0, 10)
reg = Lasso(alpha=0.1, random_state=42, copy_X=False, max_iter=1000)
reg.fit(X_train, y_train)
y_pred = reg.predict(X_test)
print(f"{mean_absolute_error(y_test, y_pred):.5f} is the r2_score for Lasso")
y_dumb = np.ones_like(y_pred) * df2['n_comments'].mean()
print(f"{mean_absolute_error(y_test, y_dumb):.5f} is the r2_score for dumb regressor (average)")

# print(f"{r2_score(y_test, y_pred):.5f} is the r2_score for Lasso")
# y_dumb = np.ones_like(y_pred) * df2['n_comments'].mean()
# print(f"{r2_score(y_test, y_dumb):.5f} is the r2_score for dumb regressor (average)")

# print(reg.sparse_coef_)

40.54884 is the r2_score for Lasso
41.04641 is the r2_score for dumb regressor (average)


In [34]:
kf = KFold(n_splits=10, shuffle=True, random_state=42)

i = 0
for train_index, test_index in kf.split(X, y):
    X_tr = X.loc[train_index]
    y_tr = y.loc[train_index]
    
    X_te = X.loc[test_index]
    y_te = y.loc[test_index]
    
    X_tr = pipe.transform(X_tr)
    X_te = pipe.transform(X_te)
    reg = Lasso(alpha=0.1, random_state=42, copy_X=False, max_iter=1000)
    reg.fit(X_tr, y_tr)
    y_pred = reg.predict(X_te)
    print(f"{mean_absolute_error(y_te, y_pred):.5f} is the MAE for Lasso in {i}th iteration")
    i += 1
    

32.11347 is the MAE for Lasso in 0th iteration
30.17968 is the MAE for Lasso in 1th iteration
29.54871 is the MAE for Lasso in 2th iteration
31.95168 is the MAE for Lasso in 3th iteration
30.21897 is the MAE for Lasso in 4th iteration
31.62223 is the MAE for Lasso in 5th iteration
29.64901 is the MAE for Lasso in 6th iteration
28.67687 is the MAE for Lasso in 7th iteration
30.73166 is the MAE for Lasso in 8th iteration
30.01065 is the MAE for Lasso in 9th iteration


In [37]:
razlaga = dict()
for importance, feature in sorted(zip(reg.coef_, feature_names), reverse=True):
    razlaga[feature] = importance

razlaga = dict(sorted(razlaga.items(), key=lambda x:abs(x[1]), reverse=False))

In [38]:
razlaga

{'župan': 0.0,
 'žrtve': -0.0,
 'žrtev': -0.0,
 'žreb': -0.0,
 'življenje': -0.0,
 'živali': -0.0,
 'žirija': 0.0,
 'ženske': 0.0,
 'ženska': 0.0,
 'študenti': 0.0,
 'športniki': -0.0,
 'šport': 0.0,
 'šole': -0.0,
 'šola': 0.0,
 'škoda': 0.0,
 'Švica': 0.0,
 'Švedska': -0.0,
 'Španija': 0.0,
 'Škoda': -0.0,
 'čustva': -0.0,
 'človekove pravice': -0.0,
 'človek': 0.0,
 'članstvo': 0.0,
 'četrtina': -0.0,
 'Človekove pravice': -0.0,
 'Češka': 0.0,
 'zvezdniki': 0.0,
 'zvezdnik': -0.0,
 'zveza': -0.0,
 'zunanja politika': 0.0,
 'zračna obramba': -0.0,
 'znanost': -0.0,
 'znanje': -0.0,
 'zmagovalec': -0.0,
 'zmage': 0.0,
 'zloraba': -0.0,
 'zlato': -0.0,
 'zgodovina': -0.0,
 'zgodba': -0.0,
 'zdravstvo': 0.0,
 'zdravstvene težave': -0.0,
 'zdravniki': -0.0,
 'zdravnik': -0.0,
 'zdravljenje': -0.0,
 'zbirka': 0.0,
 'zaščita': -0.0,
 'zavezništvo': -0.0,
 'zaupanje': 0.0,
 'zasedba': 0.0,
 'zasebnost': -0.0,
 'zaprtje': 0.0,
 'zaposlovanje': 0.0,
 'zaposlitev': -0.0,
 'zaposleni': -0.0,
 '

In [None]:
for k, v in razlaga.items():
    print(k, v)

In [None]:
razlaga = dict()
for importance, feature in sorted(zip(reg.coef_, feature_names), reverse=True):
    razlaga[feature] = importance

razlaga = dict(sorted(razlaga.items(), key=lambda x:abs(x[1])))

In [None]:
#dummie_model 
y_pred = reg.predict(X_test)
y_dumm = np.ones_like(y_pred) * df1['n_comments'].mean()
mean_absolute_error(y_test, y_dumm)