In [None]:
import numpy as np
import pandas as pd
import torch
import transformers as ppb # pytorch transformers
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split

## Data Prep

In [None]:
# Load the article table and the ratings table from the local Data/ folder.
df_knights = pd.read_csv('Data/knight_data_articles.csv')
df_knights_ratings = pd.read_csv('Data/knight_data_ratings.csv')

# Attach every rating to its article through the shared `content_id` key,
# then narrow the joined frame down to the six columns listed below.
df_merge = df_knights.merge(df_knights_ratings, on='content_id')[
    [
        'content_id',
        'content_body_clean',
        'content_title_clean',
        'content_source_desc',
        'rating_scale_response',
        'blind',
    ]
]
df_merge.head()

In [None]:
# Keep only the rows whose `blind` flag equals 1 (the blind ratings).
df_merge_blind = df_merge.loc[df_merge['blind'] == 1]

# Average `rating_scale_response` per article. Grouping on both the id and the
# body text keeps the body available as a regular column of the result.
df_mean_blind_ratings = (
    df_merge_blind
    .loc[:, ['content_id', 'content_body_clean', 'rating_scale_response']]
    .groupby(['content_id', 'content_body_clean'], as_index=False)
    .mean()
)
df_mean_blind_ratings.head()

In [None]:
# Model input: the cleaned article body of every averaged row.
df_article_text = df_mean_blind_ratings.loc[:, 'content_body_clean']
# Regression target: the mean blind rating of the same rows.
df_ratings = df_mean_blind_ratings.loc[:, 'rating_scale_response']

## BERT Model

In [None]:
# Alternative configuration: distilBERT (lighter version of BERT).
# model_class, tokenizer_class, pretrained_weights = (ppb.DistilBertModel, ppb.DistilBertTokenizer, 'distilbert-base-uncased')

# Active configuration: BERT with the 'bert-base-uncased' checkpoint.
model_class = ppb.BertModel
tokenizer_class = ppb.BertTokenizer
pretrained_weights = 'bert-base-uncased'

# Instantiate the tokenizer and the model from the pretrained checkpoint.
tokenizer = tokenizer_class.from_pretrained(pretrained_weights)
model = model_class.from_pretrained(pretrained_weights)


In [None]:
# Turn every article body into a list of token ids. `add_special_tokens=True`
# asks the tokenizer to include the model's special tokens in each sequence.
tokenized = df_article_text.apply(
    lambda article: tokenizer.encode(article, add_special_tokens=True)
)

In [None]:
# Bring every token sequence to exactly `max_len` ids, since 512 tokens is the
# most BERT can handle.
#   * Longer articles are truncated to Head (first 129 tokens) + Tail (last 383 tokens).
#   * Shorter articles are right-padded with 0, the value the masking step
#     treats as padding.
#   * Articles that are already exactly `max_len` long are kept unchanged.

max_len = 512
head_len = 129                  # tokens kept from the start of a long article
tail_len = max_len - head_len   # tokens kept from the end (383)
pad_token_id = 0                # padding value; must stay in sync with the attention mask

updated_tokens = []

for token_ids in tokenized.values:
    if len(token_ids) > max_len:
        fixed_ids = token_ids[:head_len] + token_ids[-tail_len:]
    else:
        # Also covers len == max_len: zero padding ids are appended, so the
        # sequence passes through as-is instead of reusing a stale value from
        # the previous iteration.
        fixed_ids = token_ids + [pad_token_id] * (max_len - len(token_ids))

    updated_tokens.append(fixed_ids)

# Every row now has the same length, so this is a 2-D (articles x max_len) array.
padded = np.array(updated_tokens)

In [None]:
# Attention mask: 1 wherever `padded` holds a non-zero id, 0 wherever it holds
# the padding value 0.
attention_mask = (padded != 0).astype(int)
attention_mask.shape

In [None]:
# Produce the BERT outputs that serve as feature input to the Linear Regression Model.
# First wrap the padded ids and their mask as torch tensors.
input_ids = torch.tensor(padded)
attention_mask = torch.tensor(attention_mask)

# Gradients are disabled because the model is only used for a forward pass.
# Note: every article is sent through the model in one single batch.
with torch.no_grad():
    last_hidden_states = model(input_ids=input_ids, attention_mask=attention_mask)

## Linear Regression Model

In [None]:
# Feature vector per article: element 0 of the model output, taken at sequence
# position 0 (the first token of every article), converted to a NumPy array.
features = last_hidden_states[0][:, 0].numpy()

In [None]:
# Regression targets: the mean blind ratings copied into a plain NumPy array.
labels = df_ratings.to_numpy(copy=True)
# labels = labels[:100]
labels

In [None]:
# Hold out part of the data for evaluation (scikit-learn's default split
# proportions); the fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    features,
    labels,
    random_state=0,
)

In [None]:
# Fit an ordinary least-squares regression on the BERT features (`fit` returns
# the estimator itself, so the call can be chained onto construction).
lr_model = LinearRegression().fit(X_train, y_train)

# Score of the fitted model on the held-out split.
lr_model.score(X_test, y_test)