In [1]:
import pandas as pd
import numpy as np

import spacy
import warnings

from tqdm import tqdm
from sklearn.metrics.pairwise import cosine_similarity

# Register `progress_apply` on pandas objects so long-running applies show a progress bar.
tqdm.pandas()
# Suppress every warning for the rest of the session (this hides pandas/numpy warnings too).
warnings.filterwarnings('ignore')

#### Training Data

In [2]:
# Load the spaCy `en_core_web_trf` pipeline once; `compute_embeddings` calls it for every text.
embeddings_model = spacy.load("en_core_web_trf")

In [3]:
# Column names are supplied explicitly via `names=` when reading the tab-separated news file.
news_column_headers = ["News ID", "Category", "SubCategory", "Title", "Abstract", "URL", "Title Entities", "Abstract Entities"]
news_data = pd.read_csv('../data/mind-news-dataset/MINDsmall_train/news.tsv', delimiter='\t', names=news_column_headers)

# Drop the URL and entity columns.
news_data = news_data.drop(columns=['URL', 'Title Entities', 'Abstract Entities'])
# Replace missing abstracts with '' so the string concatenation below does not yield NaN.
news_data['Abstract'] = news_data['Abstract'].fillna('')

# Merge title and abstract into one free-text 'Context' field, then drop the two source columns.
news_data['Context'] = news_data['Title'] + ' ' + news_data['Abstract']
news_data = news_data.drop(columns=["Title", "Abstract"])

# Index the frame by article id.
news_data = news_data.set_index('News ID')
news_data

Unnamed: 0_level_0,Category,SubCategory,Context
News ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
N55528,lifestyle,lifestyleroyals,"The Brands Queen Elizabeth, Prince Charles, an..."
N19639,health,weightloss,50 Worst Habits For Belly Fat These seemingly ...
N61837,news,newsworld,The Cost of Trump's Aid Freeze in the Trenches...
N53526,health,voices,I Was An NBA Wife. Here's How It Affected My M...
N38324,health,medical,"How to Get Rid of Skin Tags, According to a De..."
...,...,...,...
N16909,weather,weathertopstories,"Adapting, Learning And Soul Searching: Reflect..."
N47585,lifestyle,lifestylefamily,Family says 13-year-old Broadway star died fro...
N7482,sports,more_sports,St. Dominic soccer player tries to kick cancer...
N34418,sports,soccer_epl,"How the Sounders won MLS Cup Mark, Jeremiah an..."


In [4]:
def compute_embeddings(row):
    """Embed one text value as a single vector.

    The text is run through the module-level spaCy pipeline
    ``embeddings_model`` and the transformer's last hidden layer state is
    averaged along axis 0.

    Args:
        row: The text to embed (one cell of the column being applied over).

    Returns:
        list: The elements of the averaged vector.
    """
    doc = embeddings_model(row)
    hidden_states = doc._.trf_data.last_hidden_layer_state.data
    pooled = hidden_states.mean(axis=0)
    return list(pooled)

# Embed each text column with `compute_embeddings`; `progress_apply` shows one tqdm bar per column.
# Every row is embedded independently, so repeated Category / SubCategory values are re-embedded.
news_data['category_embeddings'] = news_data['Category'].progress_apply(compute_embeddings)
news_data['subcategory_embeddings'] = news_data['SubCategory'].progress_apply(compute_embeddings)
news_data['context_embeddings'] = news_data['Context'].progress_apply(compute_embeddings)

100%|██████████| 51282/51282 [15:41<00:00, 54.49it/s]
100%|██████████| 51282/51282 [16:36<00:00, 51.48it/s]
100%|██████████| 51282/51282 [40:31<00:00, 21.09it/s]


In [5]:
# Preview the first rows, including the three new embedding columns.
news_data.head()

Unnamed: 0_level_0,Category,SubCategory,Context,category_embeddings,subcategory_embeddings,context_embeddings
News ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
N55528,lifestyle,lifestyleroyals,"The Brands Queen Elizabeth, Prince Charles, an...","[-0.38372427, -1.547945, -0.82934594, 1.253516...","[-0.23410589, -0.9984167, -0.82643366, -0.1785...","[-0.09358272, -0.34953436, -0.19622236, -0.099..."
N19639,health,weightloss,50 Worst Habits For Belly Fat These seemingly ...,"[-0.34300137, -1.337414, -0.5679851, 1.2203748...","[-0.25600094, -1.4119117, -0.7917525, 1.027251...","[-0.1317841, -0.26250145, -0.11892246, 0.12496..."
N61837,news,newsworld,The Cost of Trump's Aid Freeze in the Trenches...,"[-0.04904593, -1.2764076, -0.6322053, 0.248834...","[0.33087507, -1.0746021, -1.1772493, -0.256131...","[-0.20576456, -0.5350486, -0.20848566, 0.07347..."
N53526,health,voices,I Was An NBA Wife. Here's How It Affected My M...,"[-0.34300137, -1.337414, -0.5679851, 1.2203748...","[0.015156612, -0.81792057, -1.4806063, 0.22450...","[-0.36664656, -0.3240311, -0.26827332, 0.00374..."
N38324,health,medical,"How to Get Rid of Skin Tags, According to a De...","[-0.34300137, -1.337414, -0.5679851, 1.2203748...","[-0.39024988, -0.3095015, -1.0689181, 0.339397...","[-0.1855902, -0.46774507, -0.17179142, 0.02046..."


In [6]:
# Column names are supplied explicitly via `names=` when reading the tab-separated behaviours file.
behaviour_column_headers = ["Impression ID", "User ID", "Time", "History", "Impressions"]

# Rows with a missing 'History' are dropped right after loading.
customer_behaviour_data = pd.read_csv(
    '../data/mind-news-dataset/MINDsmall_train/behaviors.tsv', delimiter='\t', names=behaviour_column_headers).dropna(subset=['History'])

# Both columns hold space-separated strings; turn them into Python lists.
customer_behaviour_data['History'] = customer_behaviour_data['History'].str.split(' ')
customer_behaviour_data['Impressions'] = customer_behaviour_data['Impressions'].str.split(' ')

def clean_impressions(impression_list):
    """Reduce a list of impression entries to the ids of the entries ending in '1'.

    Args:
        impression_list: Iterable of strings such as ``'N123-1'``.

    Returns:
        list: For every entry that ends with ``'1'``, the text before its
        first ``'-'``, in the original order.
    """
    kept_ids = []
    for entry in impression_list:
        if not entry.endswith('1'):
            continue
        kept_ids.append(entry.split('-')[0])
    return kept_ids

# Reduce each impression list with `clean_impressions`, then drop the per-impression metadata.
customer_behaviour_data['Impressions'] = customer_behaviour_data['Impressions'].apply(clean_impressions)
customer_behaviour_data = customer_behaviour_data.drop(columns=['Impression ID', 'Time'])

# One row per user: impressions are concatenated (duplicates kept), histories are unioned (deduplicated).
impression_data = customer_behaviour_data.groupby('User ID', as_index=False)['Impressions'].agg(lambda x: [item for sublist in x for item in sublist])
user_history_data = customer_behaviour_data.groupby('User ID', as_index=False)['History'].agg(lambda x: list(set().union(*x)))

# Join the two per-user aggregates back into a single frame keyed on 'User ID'.
customer_behaviour_data = pd.merge(user_history_data, impression_data, on='User ID', how='left')
customer_behaviour_data

Unnamed: 0,User ID,History,Impressions
0,U100,"[N53465, N50095, N55743, N20121, N42330, N2057...",[N7800]
1,U1000,"[N41244, N29641, N1789]","[N29739, N7670, N58656, N53875]"
2,U10001,"[N56814, N51706, N20639, N47937, N34562, N5644...","[N1031, N10833, N35937]"
3,U10003,"[N26619, N31431, N41668, N61052, N39074, N2825...","[N18708, N57090, N55689]"
4,U10008,"[N40977, N46754, N30148, N63422, N32312, N6000...",[N15405]
...,...,...,...
49103,U9993,"[N47458, N14114]","[N22257, N30648]"
49104,U9995,"[N48449, N49289, N2186, N18132, N15501, N47873...","[N11817, N37204, N57426, N19444, N10812, N4714..."
49105,U9996,"[N28296, N8448, N60340, N4719, N31165]","[N287, N47098, N23446]"
49106,U9997,"[N55285, N38367, N11929, N90, N46990, N11727, ...","[N48410, N35738, N39269, N23081, N16502, N4245..."


In [7]:
def compute_user_profile(history):
    """Average the embeddings of the articles listed in a user's history.

    Rows of the module-level ``news_data`` frame whose index is contained in
    ``history`` are selected, and each of the three embedding columns is
    stacked and averaged along axis 0.

    Args:
        history: Collection of article ids.

    Returns:
        dict: Keys ``'category'``, ``'subcategory'`` and ``'context'``, each
        holding the mean vector of the matching embedding column.
    """
    embedding_columns = ['category_embeddings', 'subcategory_embeddings', 'context_embeddings']
    in_history = news_data.index.isin(history)
    user_news_history = news_data[in_history][embedding_columns]

    profile = {}
    for key, column in (('category', 'category_embeddings'),
                        ('subcategory', 'subcategory_embeddings'),
                        ('context', 'context_embeddings')):
        vectors = user_news_history[column].values
        profile[key] = np.stack(vectors).mean(axis=0)
    return profile

# Build one profile dict per user from the user's aggregated history.
customer_behaviour_data['user_profile'] = customer_behaviour_data['History'].progress_apply(compute_user_profile)
customer_behaviour_data

100%|██████████| 49108/49108 [02:33<00:00, 320.07it/s]


Unnamed: 0,User ID,History,Impressions,user_profile
0,U100,"[N53465, N50095, N55743, N20121, N42330, N2057...",[N7800],"{'category': [-0.22084086, -1.3919017, -0.8351..."
1,U1000,"[N41244, N29641, N1789]","[N29739, N7670, N58656, N53875]","{'category': [-0.424755, -1.4512954, -0.812635..."
2,U10001,"[N56814, N51706, N20639, N47937, N34562, N5644...","[N1031, N10833, N35937]","{'category': [0.11339963, -0.9142725, -0.90669..."
3,U10003,"[N26619, N31431, N41668, N61052, N39074, N2825...","[N18708, N57090, N55689]","{'category': [-0.092008725, -1.2081567, -0.813..."
4,U10008,"[N40977, N46754, N30148, N63422, N32312, N6000...",[N15405],"{'category': [-0.11400258, -1.344169, -0.66041..."
...,...,...,...,...
49103,U9993,"[N47458, N14114]","[N22257, N30648]","{'category': [-0.27062765, -1.283045, -0.47168..."
49104,U9995,"[N48449, N49289, N2186, N18132, N15501, N47873...","[N11817, N37204, N57426, N19444, N10812, N4714...","{'category': [-0.09834073, -1.486022, -0.75745..."
49105,U9996,"[N28296, N8448, N60340, N4719, N31165]","[N287, N47098, N23446]","{'category': [-0.33215192, -1.4870102, -0.7542..."
49106,U9997,"[N55285, N38367, N11929, N90, N46990, N11727, ...","[N48410, N35738, N39269, N23081, N16502, N4245...","{'category': [-0.1354178, -1.5284661, -0.96939..."


In [10]:
# Keep only the first 100 users. `.copy()` makes the subset an independent frame, so the
# column assignments made on it later do not write into a slice of the full table.
customer_behaviour_data = customer_behaviour_data.iloc[:100].copy()

In [11]:
def recommend_user_attention(row, top_k=20):
    """Rank the articles in ``news_data`` for one user with attention-weighted embeddings.

    For every article, the cosine similarity between the user's category /
    subcategory / context profile vector and the article's matching embedding
    is computed.  A softmax over those three similarities yields per-article
    weights, which blend the article's three embeddings into one vector.
    Articles are then ranked by the cosine similarity between that blended
    vector and the mean of the user's three profile vectors.

    Args:
        row: A row (or mapping) whose ``'user_profile'`` entry is a dict with
            ``'category'``, ``'subcategory'`` and ``'context'`` vectors.
        top_k: Number of article ids to return. Defaults to 20.

    Returns:
        list: Up to ``top_k`` index labels of ``news_data``, most similar first.
    """
    profile = row['user_profile']

    # Each matrix is built from its own column. Previously all three were stacked from
    # 'category_embeddings', so subcategory and context information was never used.
    # NOTE: these matrices do not depend on `row` and are rebuilt on every call.
    category_embeddings = np.stack(news_data['category_embeddings'].values)
    subcategory_embeddings = np.stack(news_data['subcategory_embeddings'].values)
    context_embeddings = np.stack(news_data['context_embeddings'].values)

    # Shape (3, n_articles): one row of similarities per feature.
    user_item_similarity_matrix = np.array([
        cosine_similarity([profile['category']], category_embeddings)[0],
        cosine_similarity([profile['subcategory']], subcategory_embeddings)[0],
        cosine_similarity([profile['context']], context_embeddings)[0]])

    # Softmax across the three features for each article, transposed to (n_articles, 3).
    exp_similarity = np.exp(user_item_similarity_matrix)
    attention_weight = (exp_similarity / np.sum(exp_similarity, axis=0)).T

    attention_weighted_embeddings = ((attention_weight[:, 0].reshape(-1, 1) * category_embeddings) +
                                     (attention_weight[:, 1].reshape(-1, 1) * subcategory_embeddings) +
                                     (attention_weight[:, 2].reshape(-1, 1) * context_embeddings))

    user_profile = (profile['category'] + profile['subcategory'] + profile['context']) / 3

    scores = cosine_similarity([user_profile], attention_weighted_embeddings)[0]
    # argsort is ascending, so sort the negated scores to get the MOST similar articles first
    # (the previous `argsort(scores)[:20]` returned the 20 least similar ones).
    best_first = np.argsort(-scores)[:top_k]
    return list(news_data.index[best_first])


# Produce a recommendation list per user; axis=1 passes each row to the function.
customer_behaviour_data['Recommended Articles'] = customer_behaviour_data.progress_apply(recommend_user_attention, axis=1)
# Drop the history and profile columns; the evaluation cells below read 'Impressions' and 'Recommended Articles'.
user_impression_data = customer_behaviour_data.drop(columns=['History', 'user_profile'])
user_impression_data

 15%|█▌        | 15/100 [00:53<05:03,  3.57s/it]


KeyboardInterrupt: 

In [9]:
def find_intersection(row):
    """Return the distinct ids present in both 'Recommended Articles' and 'Impressions'.

    Args:
        row: A row (or mapping) providing both list-valued fields.

    Returns:
        list: The common ids, without duplicates; the order is that of the
        underlying set and is not guaranteed.
    """
    recommended = set(row['Recommended Articles'])
    clicked = set(row['Impressions'])
    common = recommended.intersection(clicked)
    return list(common)

# Per user: the recommended articles that also appear in the user's impressions.
user_impression_data['Intersection'] = user_impression_data.apply(find_intersection, axis=1)
user_impression_data

NameError: name 'user_impression_data' is not defined

In [None]:
def calculate_precision(row):
    """Share of the recommended articles that are in the row's 'Intersection'.

    Args:
        row: A row (or mapping) with list-valued ``"Recommended Articles"``
            and ``"Intersection"`` fields.

    Returns:
        0 when nothing was recommended, otherwise hits / (hits + misses).
    """
    recommended_count = len(row["Recommended Articles"])
    if recommended_count == 0:
        return 0

    hit_count = len(row["Intersection"])
    miss_count = recommended_count - hit_count
    return hit_count / (hit_count + miss_count)

# Per-user precision, followed by its mean over all users as the cell output.
user_impression_data['Precision'] = user_impression_data.apply(calculate_precision, axis=1)
user_impression_data["Precision"].mean()

In [None]:
def calculate_recall(row):
    """Share of the user's distinct impressions that were recommended.

    Args:
        row: A row (or mapping) with list-valued ``"Intersection"``,
            ``"Impressions"`` and ``"Recommended Articles"`` fields.

    Returns:
        hits / (hits + missed), or 0 when that denominator is zero, i.e. the
        user has no impressions to recover.  The guard mirrors the
        empty-recommendation guard in ``calculate_precision`` and avoids a
        ``ZeroDivisionError``.
    """
    hit_count = len(row["Intersection"])
    # Impressions that the recommender did not return.
    missed_count = len(set(row["Impressions"]) - set(row["Recommended Articles"]))

    relevant_count = hit_count + missed_count
    if relevant_count == 0:
        return 0
    return hit_count / relevant_count

# Per-user recall, followed by its mean over all users as the cell output.
user_impression_data['Recall'] = user_impression_data.apply(calculate_recall, axis=1)
user_impression_data["Recall"].mean()

In [None]:
def calculate_f1_score(row):
    """Harmonic mean of the row's 'Precision' and 'Recall'.

    Args:
        row: A row (or mapping) with numeric ``'Precision'`` and ``'Recall'``.

    Returns:
        0 when both values are zero, otherwise 2PR / (P + R).
    """
    precision = row['Precision']
    recall = row['Recall']

    both_zero = precision == 0 and recall == 0
    if both_zero:
        return 0

    numerator = 2 * precision * recall
    return numerator / (precision + recall)

# Per-user F1 (needs the 'Precision' and 'Recall' columns), followed by its mean as the cell output.
user_impression_data['F1 Score'] = user_impression_data.apply(calculate_f1_score, axis=1)
user_impression_data['F1 Score'].mean()

In [None]:
def calculate_f_beta(row, beta=2):
    """F-beta score of the row's 'Precision' and 'Recall'.

    Args:
        row: A row (or mapping) with numeric ``'Precision'`` and ``'Recall'``.
        beta: Weight of recall relative to precision. Defaults to 2, the
            value that was previously hard-coded, so existing
            ``apply(calculate_f_beta, axis=1)`` calls are unaffected.

    Returns:
        0 when both values are zero, otherwise
        (1 + beta^2) * P * R / (beta^2 * P + R).
    """
    precision = row['Precision']
    recall = row['Recall']

    # Both zero would make the denominator zero.
    if precision == 0 and recall == 0:
        return 0

    beta_squared = beta ** 2
    return ((1 + beta_squared) * precision * recall) / (beta_squared * precision + recall)

# Per-user F-beta (needs the 'Precision' and 'Recall' columns), followed by its mean as the cell output.
user_impression_data['F-Beta Score'] = user_impression_data.apply(calculate_f_beta, axis=1)
user_impression_data['F-Beta Score'].mean()

In [11]:
# Write the per-user results table to CSV without the index column.
user_impression_data.to_csv('../data/results/user_item_similarity_approach_results_train.csv', index=False)

#### Validation Data

In [None]:
# Column names are supplied explicitly via `names=` when reading the tab-separated news files.
news_column_headers = ["News ID", "Category", "SubCategory", "Title", "Abstract", "URL", "Title Entities", "Abstract Entities"]

# Read both the train and the dev news files.
news_data = pd.read_csv('../data/mind-news-dataset/MINDsmall_train/news.tsv', delimiter='\t', names=news_column_headers)
news_data_val = pd.read_csv('../data/mind-news-dataset/MINDsmall_dev/news.tsv', delimiter='\t', names=news_column_headers)

# Combine both splits into one catalogue, keeping the first occurrence of each 'News ID'.
news_data = pd.concat([news_data, news_data_val], ignore_index=True).drop_duplicates(subset=['News ID']).reset_index(drop=True)
news_data = news_data.drop(columns=['URL', 'Title Entities', 'Abstract Entities'])

# Replace missing abstracts with '' so the string concatenation below does not yield NaN.
news_data['Abstract'] = news_data['Abstract'].fillna('')

# Merge title and abstract into one free-text 'Context' field, then drop the two source columns.
news_data['Context'] = news_data['Title'] + ' ' + news_data['Abstract']
news_data = news_data.drop(columns=["Title", "Abstract"])

# Index the frame by article id.
news_data = news_data.set_index('News ID')
news_data

In [None]:
def compute_embeddings(row):
    """Embed one text value as a single vector.

    The text is run through the module-level spaCy pipeline
    ``embeddings_model`` and the transformer's last hidden layer state is
    averaged along axis 0.

    Args:
        row: The text to embed (one cell of the column being applied over).

    Returns:
        list: The elements of the averaged vector.
    """
    doc = embeddings_model(row)
    hidden_states = doc._.trf_data.last_hidden_layer_state.data
    pooled = hidden_states.mean(axis=0)
    return list(pooled)

# Embed each text column of the combined catalogue; `progress_apply` shows one tqdm bar per column.
# Every row is embedded independently, so repeated Category / SubCategory values are re-embedded.
news_data['category_embeddings'] = news_data['Category'].progress_apply(compute_embeddings)
news_data['subcategory_embeddings'] = news_data['SubCategory'].progress_apply(compute_embeddings)
news_data['context_embeddings'] = news_data['Context'].progress_apply(compute_embeddings)

In [None]:
# Preview the first rows, including the three new embedding columns.
news_data.head()

In [None]:
# Column names are supplied explicitly via `names=` when reading the tab-separated behaviours file.
behaviour_column_headers = ["Impression ID", "User ID", "Time", "History", "Impressions"]

# Validation run: read the dev split's behaviours. This cell previously re-read the
# MINDsmall_train file, so the "validation" metrics were computed on the training users
# even though the dev news had been merged into the catalogue above.
# Rows with a missing 'History' are dropped right after loading.
customer_behaviour_data = pd.read_csv(
    '../data/mind-news-dataset/MINDsmall_dev/behaviors.tsv', delimiter='\t', names=behaviour_column_headers).dropna(subset=['History'])

# Both columns hold space-separated strings; turn them into Python lists.
customer_behaviour_data['History'] = customer_behaviour_data['History'].str.split(' ')
customer_behaviour_data['Impressions'] = customer_behaviour_data['Impressions'].str.split(' ')

def clean_impressions(impression_list):
    """Reduce a list of impression entries to the ids of the entries ending in '1'.

    Args:
        impression_list: Iterable of strings such as ``'N123-1'``.

    Returns:
        list: For every entry that ends with ``'1'``, the text before its
        first ``'-'``, in the original order.
    """
    kept_ids = []
    for entry in impression_list:
        if not entry.endswith('1'):
            continue
        kept_ids.append(entry.split('-')[0])
    return kept_ids

# Reduce each impression list with `clean_impressions`, then drop the per-impression metadata.
customer_behaviour_data['Impressions'] = customer_behaviour_data['Impressions'].apply(clean_impressions)
customer_behaviour_data = customer_behaviour_data.drop(columns=['Impression ID', 'Time'])

# One row per user: impressions are concatenated (duplicates kept), histories are unioned (deduplicated).
impression_data = customer_behaviour_data.groupby('User ID', as_index=False)['Impressions'].agg(lambda x: [item for sublist in x for item in sublist])
user_history_data = customer_behaviour_data.groupby('User ID', as_index=False)['History'].agg(lambda x: list(set().union(*x)))

# Join the two per-user aggregates back into a single frame keyed on 'User ID'.
customer_behaviour_data = pd.merge(user_history_data, impression_data, on='User ID', how='left')
customer_behaviour_data

In [None]:
def compute_user_profile(history):
    """Average the embeddings of the articles listed in a user's history.

    Rows of the module-level ``news_data`` frame whose index is contained in
    ``history`` are selected; each embedding column is converted to a 2-D
    array and averaged along axis 0.

    Args:
        history: Collection of article ids.

    Returns:
        dict: Keys ``'category'``, ``'subcategory'`` and ``'context'``, each
        holding the mean of the matching embedding column.
    """
    embedding_columns = ['category_embeddings', 'subcategory_embeddings', 'context_embeddings']
    in_history = news_data.index.isin(history)
    user_news_history = news_data[in_history][embedding_columns]

    profile = {}
    for key, column in (('category', 'category_embeddings'),
                        ('subcategory', 'subcategory_embeddings'),
                        ('context', 'context_embeddings')):
        rows = [list(embeddings) for embeddings in user_news_history[column].values]
        profile[key] = np.array(rows).mean(axis=0)
    return profile

# Build one profile dict per user from the user's aggregated history.
customer_behaviour_data['user_profile'] = customer_behaviour_data['History'].progress_apply(compute_user_profile)
customer_behaviour_data

In [None]:
def recommend_user_attention(row, top_k=20):
    """Rank the articles in ``news_data`` for one user with attention-weighted embeddings.

    For every article, the cosine similarity between the user's category /
    subcategory / context profile vector and the article's matching embedding
    is computed.  A softmax over those three similarities yields per-article
    weights, which blend the article's three embeddings into one vector.
    Articles are then ranked by the cosine similarity between that blended
    vector and the mean of the user's three profile vectors.

    Args:
        row: A row (or mapping) whose ``'user_profile'`` entry is a dict with
            ``'category'``, ``'subcategory'`` and ``'context'`` vectors.
        top_k: Number of article ids to return. Defaults to 20.

    Returns:
        list: Up to ``top_k`` index labels of ``news_data``, most similar first.
    """
    profile = row['user_profile']

    # Each matrix is built from its own column. Previously all three were stacked from
    # 'category_embeddings', so subcategory and context information was never used.
    # NOTE: these matrices do not depend on `row` and are rebuilt on every call.
    category_embeddings = np.stack(news_data['category_embeddings'].values)
    subcategory_embeddings = np.stack(news_data['subcategory_embeddings'].values)
    context_embeddings = np.stack(news_data['context_embeddings'].values)

    # Shape (3, n_articles): one row of similarities per feature.
    user_item_similarity_matrix = np.array([
        cosine_similarity([profile['category']], category_embeddings)[0],
        cosine_similarity([profile['subcategory']], subcategory_embeddings)[0],
        cosine_similarity([profile['context']], context_embeddings)[0]])

    # Softmax across the three features for each article, transposed to (n_articles, 3).
    exp_similarity = np.exp(user_item_similarity_matrix)
    attention_weight = (exp_similarity / np.sum(exp_similarity, axis=0)).T

    attention_weighted_embeddings = ((attention_weight[:, 0].reshape(-1, 1) * category_embeddings) +
                                     (attention_weight[:, 1].reshape(-1, 1) * subcategory_embeddings) +
                                     (attention_weight[:, 2].reshape(-1, 1) * context_embeddings))

    user_profile = (profile['category'] + profile['subcategory'] + profile['context']) / 3

    scores = cosine_similarity([user_profile], attention_weighted_embeddings)[0]
    # argsort is ascending, so sort the negated scores to get the MOST similar articles first
    # (the previous `argsort(scores)[:20]` returned the 20 least similar ones).
    best_first = np.argsort(-scores)[:top_k]
    return list(news_data.index[best_first])


# Produce a recommendation list per user; axis=1 passes each row to the function.
customer_behaviour_data['Recommended Articles'] = customer_behaviour_data.progress_apply(recommend_user_attention, axis=1)
# Drop the history and profile columns; the evaluation cells below read 'Impressions' and 'Recommended Articles'.
user_impression_data = customer_behaviour_data.drop(columns=['History', 'user_profile'])
user_impression_data

In [None]:
def find_intersection(row):
    """Return the distinct ids present in both 'Recommended Articles' and 'Impressions'.

    Args:
        row: A row (or mapping) providing both list-valued fields.

    Returns:
        list: The common ids, without duplicates; the order is that of the
        underlying set and is not guaranteed.
    """
    recommended = set(row['Recommended Articles'])
    clicked = set(row['Impressions'])
    common = recommended.intersection(clicked)
    return list(common)

# Per user: the recommended articles that also appear in the user's impressions.
user_impression_data['Intersection'] = user_impression_data.apply(find_intersection, axis=1)
user_impression_data

In [None]:
def calculate_precision(row):
    """Share of the recommended articles that are in the row's 'Intersection'.

    Args:
        row: A row (or mapping) with list-valued ``"Recommended Articles"``
            and ``"Intersection"`` fields.

    Returns:
        0 when nothing was recommended, otherwise hits / (hits + misses).
    """
    recommended_count = len(row["Recommended Articles"])
    if recommended_count == 0:
        return 0

    hit_count = len(row["Intersection"])
    miss_count = recommended_count - hit_count
    return hit_count / (hit_count + miss_count)

# Per-user precision, followed by its mean over all users as the cell output.
user_impression_data['Precision'] = user_impression_data.apply(calculate_precision, axis=1)
user_impression_data["Precision"].mean()

In [None]:
def calculate_recall(row):
    """Share of the user's distinct impressions that were recommended.

    Args:
        row: A row (or mapping) with list-valued ``"Intersection"``,
            ``"Impressions"`` and ``"Recommended Articles"`` fields.

    Returns:
        hits / (hits + missed), or 0 when that denominator is zero, i.e. the
        user has no impressions to recover.  The guard mirrors the
        empty-recommendation guard in ``calculate_precision`` and avoids a
        ``ZeroDivisionError``.
    """
    hit_count = len(row["Intersection"])
    # Impressions that the recommender did not return.
    missed_count = len(set(row["Impressions"]) - set(row["Recommended Articles"]))

    relevant_count = hit_count + missed_count
    if relevant_count == 0:
        return 0
    return hit_count / relevant_count

# Per-user recall, followed by its mean over all users as the cell output.
user_impression_data['Recall'] = user_impression_data.apply(calculate_recall, axis=1)
user_impression_data["Recall"].mean()

In [None]:
def calculate_f1_score(row):
    """Harmonic mean of the row's 'Precision' and 'Recall'.

    Args:
        row: A row (or mapping) with numeric ``'Precision'`` and ``'Recall'``.

    Returns:
        0 when both values are zero, otherwise 2PR / (P + R).
    """
    precision = row['Precision']
    recall = row['Recall']

    both_zero = precision == 0 and recall == 0
    if both_zero:
        return 0

    numerator = 2 * precision * recall
    return numerator / (precision + recall)

# Per-user F1 (needs the 'Precision' and 'Recall' columns), followed by its mean as the cell output.
user_impression_data['F1 Score'] = user_impression_data.apply(calculate_f1_score, axis=1)
user_impression_data['F1 Score'].mean()

In [None]:
def calculate_f_beta(row, beta=2):
    """F-beta score of the row's 'Precision' and 'Recall'.

    Args:
        row: A row (or mapping) with numeric ``'Precision'`` and ``'Recall'``.
        beta: Weight of recall relative to precision. Defaults to 2, the
            value that was previously hard-coded, so existing
            ``apply(calculate_f_beta, axis=1)`` calls are unaffected.

    Returns:
        0 when both values are zero, otherwise
        (1 + beta^2) * P * R / (beta^2 * P + R).
    """
    precision = row['Precision']
    recall = row['Recall']

    # Both zero would make the denominator zero.
    if precision == 0 and recall == 0:
        return 0

    beta_squared = beta ** 2
    return ((1 + beta_squared) * precision * recall) / (beta_squared * precision + recall)

# Per-user F-beta (needs the 'Precision' and 'Recall' columns), followed by its mean as the cell output.
user_impression_data['F-Beta Score'] = user_impression_data.apply(calculate_f_beta, axis=1)
user_impression_data['F-Beta Score'].mean()

In [20]:
# Write the per-user results table to CSV without the index column.
# NOTE(review): the file name says "content_based ... test", while the training section writes
# "user_item_similarity_approach_results_train.csv" - confirm this is the intended output name.
user_impression_data.to_csv('../data/results/content_based_approach_results_test.csv', index=False)