In [1]:
from simpletransformers.classification import ClassificationModel
import numpy as np
import pandas as pd
from sklearn import preprocessing
from scipy import stats
import wandb


In [2]:
# Subscriber counts per news source, ten values each. create_fields looks a
# value up with (calendar month - 1), so position 0 is used for January dates.
# NOTE(review): presumably one snapshot per month, January-October -- confirm.
daryo_n = [
    597660, 635395, 876932, 944683, 970976,
    974959, 1001620, 987818, 955630, 930500,
]
kun_n = [
    866120, 897421, 1234502, 1327874, 1400304,
    1419048, 1468400, 1467269, 1487835, 1533702,
]
qalampir_n = [
    84189, 91990, 132252, 141007, 162540,
    167763, 182913, 186740, 185759, 183327,
]

# Number of days a view count is divided by to obtain a daily rate.
view_period = 7

In [3]:
def create_fields(data, source, period=None):
    """Derive the regression label and the weekday for every row of ``data``.

    The label is the row's average daily views (``num_views`` divided by the
    viewing window) divided by one hundredth of the subscriber count looked up
    for the row's calendar month.

    Args:
        data: DataFrame with a ``date`` column (anything ``pd.DatetimeIndex``
            accepts) and a numeric ``num_views`` column.
        source: Sequence of subscriber counts, indexed by ``month - 1``.
        period: Length of the viewing window in days. When omitted, the
            notebook-level ``view_period`` is used, as before.

    Returns:
        ``(labels, day_of_week)``: two lists aligned with the rows of ``data``.
        Weekdays are integers following pandas' convention (Monday is 0).

    Raises:
        IndexError: if a row's month has no corresponding entry in ``source``.
    """
    if period is None:
        period = view_period

    dates = pd.DatetimeIndex(data['date'])
    if len(dates) == 0:
        return [], []

    month_idx = np.asarray(dates.month) - 1
    subscribers = np.asarray(source)
    if month_idx.max() >= len(subscribers):
        raise IndexError(
            f"source has {len(subscribers)} monthly entries but data contains "
            f"dates in month {int(month_idx.max()) + 1}"
        )

    # Scale the subscriber count down by 100 and truncate to an integer,
    # exactly like int(count / 100) did per row.
    average_daily_rate = (subscribers[month_idx] / 100).astype(int)
    daily_rate = data['num_views'].values / period

    labels = (daily_rate / average_daily_rate).tolist()
    day_of_week = dates.dayofweek.tolist()
    return labels, day_of_week

In [22]:
def _load_articles(path, subscribers, source_id, lines=True):
    """Read one source's article file and attach the derived columns.

    Args:
        path: File passed to ``pd.read_json``.
        subscribers: Subscriber counts handed to ``create_fields``.
        source_id: Integer written into the ``source`` column of every row.
        lines: Forwarded to ``pd.read_json`` (one JSON object per line or not).

    Returns:
        The loaded DataFrame with ``source`` inserted at position 2 and
        ``day_of_week`` / ``labels`` as the last two columns.
    """
    frame = pd.read_json(path, orient='columns', lines=lines)
    labels, day_of_week = create_fields(frame, subscribers)

    # Same layout as before: "labels" goes last, "day_of_week" directly in
    # front of it, and "source" lands at position 2.
    frame.insert(len(frame.columns), "labels", labels, allow_duplicates=True)
    frame.insert(len(frame.columns) - 1, "day_of_week", day_of_week, allow_duplicates=True)
    frame.insert(2, "source", [source_id] * len(frame), allow_duplicates=True)
    return frame


# NOTE(review): daryo is read with lines=False while the other two files use
# lines=True, although all three carry a .jsonl extension. Kept as-is because
# the file layout is not visible here -- confirm which one is intended.
daryo = _load_articles("daryo_articles.jsonl", daryo_n, 1, lines=False)
kun = _load_articles("kun_articles.jsonl", kun_n, 2)
qalampir = _load_articles("qalampir_articles.jsonl", qalampir_n, 3)


In [23]:
# output[0:100].to_json('output.jsonl', orient='records', lines=True)

In [24]:
def _format(data):
    return pd.DataFrame({
        'text_a': '[CLS] ' + data['content'],
        'text_b': data['title'],
        'text_c': data['num_links'],
        'text_d': data['num_images'],
        'text_e': data['day_of_week'],
        'text_f': data['num_quotes'],
        'labels': data['labels']
    })

In [25]:
def scale(train_df, column):
    """Min-max scale one column of ``train_df`` into the range [0, 1].

    Args:
        train_df: DataFrame holding the column.
        column: Name of the column to scale.

    Returns:
        A new Series ``(x - min) / (max - min)``; ``train_df`` is not modified.
        A column whose values are all equal has a zero range; it is returned
        as all 0.0 instead of the NaN that 0/0 would produce.
    """
    values = train_df[column]
    lowest = values.min()
    span = values.max() - lowest

    if span == 0:
        # Constant column: every value sits at the minimum.
        return (values - lowest).astype(float)
    return (values - lowest) / span


In [26]:
# scale it
def scale_fields(train_df):
    """Replace the numeric feature columns and ``labels`` with their scaled values.

    Each of ``text_c``, ``text_d``, ``text_e``, ``text_f`` and ``labels`` is
    overwritten with the result of ``scale`` for that column. The frame is
    modified in place and also returned.
    """
    for field in ('text_c', 'text_d', 'text_e', 'text_f', 'labels'):
        train_df[field] = scale(train_df, field)
    return train_df



In [27]:
# Format and scale every source on its own (min/max are taken per source),
# then stack the three frames into a single training frame.
daryo, kun, qalampir = [
    scale_fields(_format(frame)) for frame in (daryo, kun, qalampir)
]

train_df = pd.concat([daryo, kun, qalampir])

In [28]:
# train_df.sort_values(by=['labels'], ascending=False)
# train_df['labels'].mean()

In [29]:
from sklearn.model_selection import train_test_split

# Fixed seed so the shuffle and the 80/20 split are the same on every run.
SEED = 42

train_df = train_df.sample(frac=1, random_state=SEED)
train, test = train_test_split(train_df, test_size=0.2, random_state=SEED)

In [30]:
# Default hyperparameters; they are handed to wandb.init as the run config and
# read back through `config` below.
hyperparameter_defaults = {
    "bs": 32,
    "lr": 4e-5,
    "epochs": 1,
    "sequence_length": 1024,
}

wandb.init(config=hyperparameter_defaults, project="duvduvgap")
config = wandb.config
wandb.config.experiment = "longformer"

# Options passed to ClassificationModel via `args`.
train_args = dict(
    reprocess_input_data=True,
    overwrite_output_dir=True,
    num_train_epochs=config.epochs,
    regression=True,
    max_seq_length=config.sequence_length,
    n_gpu=3,  # NOTE(review): hard-coded GPU count -- confirm it matches the machine
    learning_rate=config.lr,
    train_batch_size=config.bs,
    eval_batch_size=config.bs,
)

# Longformer set up as a regressor with a single output.
model = ClassificationModel(
    "longformer",
    "allenai/longformer-base-4096",
    num_labels=1,
    use_cuda=True,
    args=train_args,
)

# Fit on the training split.
model.train_model(train)

[34m[1mwandb[0m: Wandb version 0.10.7 is available!  To upgrade, please run:
[34m[1mwandb[0m:  $ pip install wandb --upgrade
Some weights of the model checkpoint at allenai/longformer-base-4096 were not used when initializing LongformerForSequenceClassification: ['lm_head.bias', 'lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.decoder.weight']
- This IS expected if you are initializing LongformerForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model).
- This IS NOT expected if you are initializing LongformerForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of LongformerForSequenceClassification were not initialized from the

HBox(children=(FloatProgress(value=0.0, max=19765.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, description='Epoch', max=1.0, style=ProgressStyle(description_width='i…

HBox(children=(FloatProgress(value=0.0, description='Running Epoch 0 of 1', max=618.0, style=ProgressStyle(des…





In [31]:
# Evaluate on the held-out split and rank the rows by predicted value.
result, model_outputs, wrong_predictions = model.eval_model(test)

# assign() overwrites an existing "prediction" column, so re-running this cell
# no longer appends a duplicate column the way insert(..., True) did.
test = (
    test
    .assign(prediction=model_outputs)
    .sort_values(by=['prediction'], ascending=False)
)

HBox(children=(FloatProgress(value=0.0, max=4942.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, description='Running Evaluation', max=155.0, style=ProgressStyle(descr…




In [32]:
# Show the metrics returned by eval_model, then the smallest predicted value.
print(result)
print(min(model_outputs))

{'eval_loss': 0.0047967177834512004}
0.05860306


In [33]:
def analysis(data):
    """Print how strongly each field correlates with the model's predictions.

    For ``labels`` and each of ``text_c`` .. ``text_f`` the result of
    ``stats.pearsonr`` against the ``prediction`` column is printed, one line
    per field followed by a blank line. Nothing is returned.
    """
    reports = (
        ("Correlation between labels and predictions", 'labels'),
        ("Number of links and predictions", 'text_c'),
        ("Number of images and predictions", 'text_d'),
        ("Day of the week and predictions", 'text_e'),
        ("Number of quotes and predictions", 'text_f'),
    )
    for caption, column in reports:
        print(f"{caption}: {stats.pearsonr(data[column], data['prediction'])}\n")


In [34]:
# Print the correlation report for the test split.
analysis(test)

Correlation between labels and predictions: (0.12620413484659374, 5.316862134224309e-19)

Number of links and predictions: (0.027718369549795317, 0.05135934146597337)

Number of images and predictions: (-0.094203862852062, 3.232812922051197e-11)

Day of the week and predictions: (0.005186818102421701, 0.7154546559242575)

Number of quotes and predictions: (-0.15638327079423117, 1.9837009945094355e-28)

