If an author tends to write "good" articles, he/she might receive more comments.
Calculate entropy of each keyword and try to grep all articles with 20-ish words. 
Number of images should be one of the features. Also try a binary variable: has image (1/0).
One feature should be connected with the hour of the posting.
Number of paragraphs should be a feature. 
Length of the title can be a feature. 

In [None]:
import json
import gzip
from collections import Counter

import pandas as pd
from datetime import datetime
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
from sklearn.preprocessing import FunctionTransformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.compose import make_column_transformer
from sklearn.pipeline import make_pipeline
import os
import numpy as np


def read_json(data_path: str) -> list:
    """Load a gzip-compressed JSON document.

    Args:
        data_path: Path to a gzip file whose decompressed content is
            UTF-8 encoded JSON.

    Returns:
        The decoded JSON value (the signature annotates it as a list).
    """
    with gzip.open(data_path, mode='rt', encoding='utf-8') as stream:
        decoded = json.load(stream)
    return decoded

Define all constants

In [None]:
# NOTE(review): presumably a keyword-frequency cut-off, but nothing in the
# visible code reads this constant; the keyword-counting cell derives its own
# lowercase `minimum_repeat` from the 0.99 quantile — confirm it is still needed.
MINIMUM_REPEAT = 41

In [None]:
def gpt_imputer(paragraphs: list, all_gpt_words: set) -> list:
    """Collect the words of an article that are known GPT keywords.

    Args:
        paragraphs: The article's paragraphs. Each paragraph is either a
            plain string (split on whitespace into words) or an
            already-tokenised iterable of words.
        all_gpt_words: Keywords to look for.

    Returns:
        Every matching word, in order of appearance, duplicates included.
        Each match is also printed, as before.

    Note:
        Splitting is whitespace-only, so punctuation stays attached to a
        word and multi-word keywords cannot match.
    """
    surrogate_words = []
    for paragraph in paragraphs:
        # Iterating a ``str`` yields single characters, which can never equal
        # a multi-character keyword, so text paragraphs are split into words.
        words = paragraph.split() if isinstance(paragraph, str) else paragraph
        for word in words:
            if word in all_gpt_words:
                surrogate_words.append(word)
                print(word)
    return surrogate_words

In [None]:
# Load the whole data set from the gzip-compressed JSON dump and report how
# many entries it contains.
raw = read_json('rtvslo_train.json.gzip')
print(len(raw))
# head = raw[:50]

In [None]:
# Work on the first 300 entries only; the exploration cells and the
# DataFrame construction below iterate over `head`, not over `raw`.
head = raw[:300]

In [None]:
# Number of entries under 'figures' for every article in the sample.
# A guard that appended 0 for articles without a 'figures' key used to sit
# here commented out; it is not active, so a missing key raises KeyError.
no_figures = [len(article['figures']) for article in head]
print(no_figures)

In [None]:
# Hour component of each sampled article's ISO-formatted 'date'.
# Idea carried over from the original note: .weekday() could feed a
# weekend flag.
times = [datetime.fromisoformat(article['date']).hour for article in head]
print(times)

In [None]:
# Union of the 'gpt_keywords' of every sampled article that has that key.
all_gpt_keywords = set().union(
    *(article['gpt_keywords'] for article in head if 'gpt_keywords' in article)
)
print(all_gpt_keywords)

In [None]:
# Pool the keywords of every article in the full data set, taking
# 'gpt_keywords' when the article has that key and 'keywords' otherwise.
absolute_all_keywords = []
for article in raw:
    source_key = 'gpt_keywords' if 'gpt_keywords' in article else 'keywords'
    absolute_all_keywords += article[source_key]
keywords_counter = Counter(absolute_all_keywords)

# Summary statistics of how often each distinct keyword occurs.
df = pd.DataFrame(list(keywords_counter.values()), columns=['values'])
print(df.describe())

# A keyword is "important" when it occurs more often than the 0.99 quantile
# of the per-keyword occurrence counts.
minimum_repeat = df.quantile(0.99).values[0]
important_keywords = [
    keyword
    for keyword, occurrences in keywords_counter.items()
    if occurrences > minimum_repeat
]
chosen = len(important_keywords)
print(important_keywords)


In [None]:
# Every key of every sampled article (with repeats), then the sorted
# distinct key names.
unique_keys = [key for article in head for key in article.keys()]
print(sorted(set(unique_keys)))

In [None]:
# for article in head:
#     topic_pair = article['url'].split('/')[3:5]
#     if 'sport' in topic_pair:
#         article['topic'] = f"{topic_pair[0]}-{topic_pair[1]}"
#     elif 'topic' not in article.keys():
#         article['topic'] = f"{topic_pair[0]}"
    # else:
    #     if article['topic'] != topic_pair[0]:
    #         print(article['topic'], "suggested:", topic_pair[0])

In [None]:
# Flatten the sampled article dicts into a DataFrame, one row per article.
# NOTE(review): `meta` is documented for use together with `record_path`,
# which is not passed here, and 'author' differs from the 'authors' column
# dropped in the following cell — confirm the `meta` argument has any effect.
df = pd.json_normalize(head, meta=['date', 'title', 'author', 'url'])

In [None]:
# Columns removed from the frame before feature engineering.
to_drop = ['authors', 'id', 'lead', 'category']
# NOTE(review): `stay` is never read in the visible code, and it lists
# 'lead' although 'lead' is also in `to_drop`.
stay = [
    'url', 'date', 'figures', 'gpt_keywords', 'keywords',
    'paragraphs', 'lead', 'title', 'topics', 'n_comments',
]
df = df.drop(to_drop, axis=1)
print(df)
print(df.columns)

In [None]:
def extract_topic(url):
    """Derive a topic label from the path of an article URL.

    The label is built from the 4th and 5th '/'-separated pieces of the URL
    (for ``scheme://host/a/b/...`` these are ``a`` and ``b``).

    Args:
        url: Article URL as a string.

    Returns:
        ``"a-b"`` when either piece equals ``'sport'``, otherwise ``"a"``.
        URLs too short to contain the pieces no longer raise ``IndexError``:
        a lone ``'sport'`` piece gives ``"sport"`` and a URL without any
        path piece gives ``""``.
    """
    topic_pair = url.split('/')[3:5]
    if 'sport' in topic_pair:
        # Keep both pieces; joining also copes with a single-piece slice.
        return '-'.join(topic_pair)
    return topic_pair[0] if topic_pair else ''

In [None]:
def keywords_filter(keywords):
    """Keep only the keywords listed in the module-level ``important_keywords``.

    Args:
        keywords: Iterable of keywords of one article. ``None`` and float
            values (a missing DataFrame cell shows up as float NaN) are
            treated as "no keywords" instead of raising ``TypeError``.

    Returns:
        list: The important keywords in their original order, duplicates
        preserved.
    """
    if keywords is None or isinstance(keywords, float):
        return []
    # Build the lookup set once per call; testing membership against the
    # list would rescan it for every single word.
    important = set(important_keywords)
    return [word for word in keywords if word in important]

In [None]:
def _is_missing_keywords(value):
    """Return True when a keywords cell holds no list: None, NaN or ''."""
    return (
        value is None
        or isinstance(value, float)  # a missing DataFrame cell is float NaN
        or (isinstance(value, str) and not value)
    )


def add_extra_features(X):
    """Derive the model features from the raw article columns.

    Reads the columns 'date', 'title', 'url', 'gpt_keywords', 'keywords',
    'figures' and 'paragraphs' and produces:

    * ``hours`` - hour of the parsed 'date'
    * ``weekend`` - True when the weekday index is above 4 (Saturday/Sunday)
    * ``title_length`` - ``len`` of 'title'
    * ``topics`` - ``extract_topic`` applied to 'url'
    * ``gpt_keywords`` - the row's keywords reduced by ``keywords_filter``;
      rows without 'gpt_keywords' fall back to 'keywords'
    * ``images`` - ``len`` of 'figures'
    * ``article_length`` - ``len`` of 'paragraphs' (number of entries)

    Args:
        X: DataFrame holding the columns listed above.

    Returns:
        A new DataFrame with the derived columns and without 'datetime',
        'date', 'title', 'url', 'keywords', 'figures' and 'paragraphs'.
        The input frame is left unmodified.
    """
    # Work on a copy so the caller's frame does not gain helper columns.
    X = X.copy()

    X['datetime'] = pd.to_datetime(X['date'])
    X['hours'] = X['datetime'].dt.hour
    X['weekend'] = X['datetime'].dt.weekday > 4
    X['title_length'] = X['title'].apply(len)
    X['topics'] = X['url'].apply(extract_topic)

    # The previous ``X[X['gpt_keywords'] == ''] = X['keywords']`` addressed
    # whole rows rather than the one column, and ``== ''`` never matches a
    # NaN cell, so the fallback never took effect. Resolve it row by row.
    selected = []
    for gpt_words, plain_words in zip(X['gpt_keywords'], X['keywords']):
        words = plain_words if _is_missing_keywords(gpt_words) else gpt_words
        selected.append(
            [] if _is_missing_keywords(words) else keywords_filter(words)
        )
    # dtype=object keeps every cell a list, even when all lists share a length.
    X['gpt_keywords'] = pd.Series(selected, index=X.index, dtype=object)

    X['images'] = X['figures'].apply(len)
    X['article_length'] = X['paragraphs'].apply(len)

    to_drop_after_processing = [
        'datetime', 'date', 'title', 'url', 'keywords', 'figures', 'paragraphs',
    ]
    return X.drop(columns=to_drop_after_processing)

# Apply the feature builder, wrapped as a scikit-learn transformer, to a
# re-indexed version of the article frame.
dr_reset = df.reset_index(drop=True)
attr_adder = FunctionTransformer(func=add_extra_features, validate=False)
# NOTE(review): `important_keywords` is passed in fit_transform's `y`
# position; add_extra_features takes only X, so this argument does not
# appear to reach it — confirm it can be removed.
articles = attr_adder.fit_transform(dr_reset, important_keywords)
print(articles)
print(list(articles.columns))

In [None]:
# Separate the target column 'n_comments' from the predictors, then hold
# out 20% of the rows as a test set using a fixed random seed.
y = articles['n_comments'].copy()
X = articles.drop(columns=['n_comments']).copy()
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)
print("Learning set size: {:d}\nTest set size: {:d}".format(len(X_train), len(X_test)))


In [None]:
# Quick look at how the first five training rows' keyword lists encode as
# indicator columns, one column per important keyword.
mlb = MultiLabelBinarizer(classes=np.array(important_keywords))
out = mlb.fit_transform(X_train['gpt_keywords'].head(5))
print(mlb.get_params())
print(out)

In [None]:
# Feature groups, each handled by its own transformer below.
numerical_attributes = ['title_length', 'article_length', 'images']
categorical_attributes = ['weekend', 'topics', 'hours']
multi_label_attributes = ['gpt_keywords']


def _keywords_as_tokens(keywords):
    """Analyzer for CountVectorizer: a row's keyword list is its token list."""
    return keywords


# The former ``ct = make_column_transformer()`` was removed: called without
# transformers it raises ValueError, and `ct` was never used.
column_transformer = ColumnTransformer([
    ('scaler', StandardScaler(), numerical_attributes),
    ('one_hot_encoder', OneHotEncoder(handle_unknown='ignore'), categorical_attributes),
    # MultiLabelBinarizer does not work inside a ColumnTransformer: its
    # fit/transform accept a single argument, while ColumnTransformer calls
    # them with (X, y). A binary CountVectorizer whose analyzer returns the
    # keyword list unchanged yields the same 0/1 indicator columns. The
    # column is given as a scalar name so the vectorizer receives a 1-D
    # sequence of keyword lists rather than a one-column frame.
    (
        'multi_label_attributes',
        CountVectorizer(analyzer=_keywords_as_tokens, binary=True),
        multi_label_attributes[0],
    ),
])

full_pipeline = Pipeline([
    ('transformer', column_transformer),
    ('estimatior', LinearRegression())
])

# Fit on the training split and report R^2 on the held-out split.
full_pipeline.fit(X_train, y_train)
y_pred = full_pipeline.predict(X_test)
score = r2_score(y_test, y_pred)
print(score)