In [2]:
import pandas as pd
import numpy as np

import spacy
import warnings

from tqdm import tqdm
from sklearn.metrics.pairwise import cosine_similarity
from scipy.sparse import csr_matrix

tqdm.pandas()
warnings.filterwarnings('ignore')

#### Training Data

In [3]:
embeddings_model = spacy.load("en_core_web_trf")

In [4]:
def create_embeddings(row):
    category_vector = embeddings_model(row["Category"])._.trf_data.last_hidden_layer_state.data.mean(axis=0)
    subcategory_vector = embeddings_model(row["SubCategory"])._.trf_data.last_hidden_layer_state.data.mean(axis=0)
    context_vector = embeddings_model(row["Context"])._.trf_data.last_hidden_layer_state.data.mean(axis=0)

    return list(category_vector) + list(subcategory_vector) + list(context_vector)

In [5]:
news_column_headers = ["News ID", "Category", "SubCategory", "Title", "Abstract", "URL", "Title Entities", "Abstract Entities"]
news_data = pd.read_csv('../data/mind-news-dataset/MINDsmall_train/news.tsv', delimiter='\t', names=news_column_headers)

news_data = news_data.drop(columns=['URL', 'Title Entities', 'Abstract Entities'])

news_data['Abstract'] = news_data['Abstract'].fillna('')

news_data['Context'] = news_data['Title'] + ' ' + news_data['Abstract']
news_data = news_data.drop(columns=["Title", "Abstract"])

news_data = news_data.set_index('News ID')

news_data['Embedding'] = news_data.progress_apply(lambda row: create_embeddings(row), axis=1)
news_data = pd.DataFrame(data=np.stack(news_data['Embedding'].values), index=news_data.index)

news_data.head()

100%|██████████| 51282/51282 [1:29:40<00:00,  9.53it/s]  


Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,...,2294,2295,2296,2297,2298,2299,2300,2301,2302,2303
News ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
N55528,-0.383724,-1.547945,-0.829346,1.253517,1.268836,-0.662258,-0.137644,0.974485,0.825314,-0.002158,...,-0.207909,0.325078,0.601655,0.150507,-0.400716,-0.296924,-0.32323,0.153161,-0.190733,0.214007
N19639,-0.343001,-1.337414,-0.567985,1.220375,1.153712,-0.418675,0.115724,0.739819,1.103576,-0.015558,...,0.509726,0.331881,-0.093307,-0.159236,-0.169413,-0.062995,0.250308,0.764319,-0.14807,-0.043718
N61837,-0.049046,-1.276408,-0.632205,0.248834,2.083739,-1.391234,0.284319,0.541727,0.634959,-0.346889,...,0.254583,0.249418,0.160578,-0.14776,-0.112158,-0.108611,0.056507,0.680737,-0.123355,-0.077637
N53526,-0.343001,-1.337414,-0.567985,1.220375,1.153712,-0.418675,0.115724,0.739819,1.103576,-0.015558,...,0.59606,0.375594,-0.325976,-0.207872,-0.171114,-0.171538,-0.019665,-0.16576,-0.081754,0.34604
N38324,-0.343001,-1.337414,-0.567985,1.220375,1.153712,-0.418675,0.115724,0.739819,1.103576,-0.015558,...,0.394966,0.32027,-0.011746,-0.102617,-0.24865,-0.262519,0.176324,0.546731,-0.106276,0.262711


In [6]:
behaviour_column_headers = ["Impression ID", "User ID", "Time", "History", "Impressions"]

customer_behaviour_data = pd.read_csv(
    '../data/mind-news-dataset/MINDsmall_train/behaviors.tsv', delimiter='\t', names=behaviour_column_headers).dropna(subset=['History'])

customer_behaviour_data['History'] = customer_behaviour_data['History'].str.split(' ')
customer_behaviour_data['Impressions'] = customer_behaviour_data['Impressions'].str.split(' ')

def clean_impressions(impression_list):
    return [x.split('-')[0] for x in impression_list if x.endswith('1')]

customer_behaviour_data['Impressions'] = customer_behaviour_data['Impressions'].apply(clean_impressions)
customer_behaviour_data = customer_behaviour_data.drop(columns=['Impression ID', 'Time'])

impression_data = customer_behaviour_data.groupby('User ID', as_index=False)['Impressions'].agg(lambda x: [item for sublist in x for item in sublist])
user_history_data = customer_behaviour_data.groupby('User ID', as_index=False)['History'].agg(lambda x: list(set().union(*x)))

customer_behaviour_data = pd.merge(user_history_data, impression_data, on='User ID', how='left')
customer_behaviour_data

Unnamed: 0,User ID,History,Impressions
0,U100,"[N53465, N45954, N55743, N51705, N18870, N2057...",[N7800]
1,U1000,"[N29641, N41244, N1789]","[N29739, N7670, N58656, N53875]"
2,U10001,"[N34562, N33976, N47937, N2735, N61319, N51706...","[N1031, N10833, N35937]"
3,U10003,"[N31431, N41668, N26619, N50839, N39074, N6105...","[N18708, N57090, N55689]"
4,U10008,"[N63422, N38870, N36526, N33876, N23614, N3530...",[N15405]
...,...,...,...
49103,U9993,"[N14114, N47458]","[N22257, N30648]"
49104,U9995,"[N42415, N6727, N12608, N4415, N36888, N45146,...","[N11817, N37204, N57426, N19444, N10812, N4714..."
49105,U9996,"[N4719, N8448, N31165, N28296, N60340]","[N287, N47098, N23446]"
49106,U9997,"[N46759, N11929, N64459, N55285, N64836, N9072...","[N48410, N35738, N39269, N23081, N16502, N4245..."


In [9]:
customer_behaviour_data = customer_behaviour_data.iloc[:5000, :]

In [10]:
def user_preference_recomendation(row):
    history_data = news_data[news_data.index.isin(row["History"])].mean().values.T.reshape(1, -1)
    similarity_matrix = pd.DataFrame(data=cosine_similarity(news_data, history_data), index=news_data.index, columns=['News Articles'])

    num_aritcles = (int(len(row['Impressions']) * 1.5) + 1) if len(row['Impressions']) >= 5 else 16
    similarity_matrix = similarity_matrix.apply(lambda col: col.nlargest(num_aritcles).index).iloc[1:, :]

    return list(set(similarity_matrix.values.flatten()) - set(row["History"]))

customer_behaviour_data['Recommended Articles'] = customer_behaviour_data.progress_apply(lambda row: user_preference_recomendation(row), axis=1)
user_impression_data = customer_behaviour_data.drop(columns=['History'])

user_impression_data

100%|██████████| 5000/5000 [32:34<00:00,  2.56it/s]


Unnamed: 0,User ID,Impressions,Recommended Articles
0,U100,[N7800],"[N33170, N14780, N58271, N28299, N61313, N5255..."
1,U1000,"[N29739, N7670, N58656, N53875]","[N25551, N28074, N771, N2070, N56077, N33943, ..."
2,U10001,"[N1031, N10833, N35937]","[N65114, N46373, N39663, N15095, N43810, N2155..."
3,U10003,"[N18708, N57090, N55689]","[N17876, N24573, N45330, N10426, N2042, N63797..."
4,U10008,[N15405],"[N45330, N2042, N51852, N35960, N28889, N61858..."
...,...,...,...
4995,U18527,[N7821],"[N55786, N27840, N56988, N48014, N59019, N2521..."
4996,U18530,"[N38779, N7128, N13801, N49279, N26043, N64734...","[N27617, N26941, N35973, N41002, N32257, N7421..."
4997,U18535,"[N18708, N9009, N31448, N60858]","[N16044, N45106, N45183, N22077, N57628, N5913..."
4998,U18538,[N64174],"[N8798, N34920, N26367, N53570, N33382, N1895,..."


In [11]:
def find_intersection(row):
    return list(set(row['Recommended Articles']).intersection(set(row['Impressions'])))

user_impression_data['Intersection'] = user_impression_data.apply(find_intersection, axis=1)
user_impression_data

Unnamed: 0,User ID,Impressions,Recommended Articles,Intersection
0,U100,[N7800],"[N33170, N14780, N58271, N28299, N61313, N5255...",[]
1,U1000,"[N29739, N7670, N58656, N53875]","[N25551, N28074, N771, N2070, N56077, N33943, ...",[]
2,U10001,"[N1031, N10833, N35937]","[N65114, N46373, N39663, N15095, N43810, N2155...",[]
3,U10003,"[N18708, N57090, N55689]","[N17876, N24573, N45330, N10426, N2042, N63797...",[]
4,U10008,[N15405],"[N45330, N2042, N51852, N35960, N28889, N61858...",[]
...,...,...,...,...
4995,U18527,[N7821],"[N55786, N27840, N56988, N48014, N59019, N2521...",[]
4996,U18530,"[N38779, N7128, N13801, N49279, N26043, N64734...","[N27617, N26941, N35973, N41002, N32257, N7421...",[]
4997,U18535,"[N18708, N9009, N31448, N60858]","[N16044, N45106, N45183, N22077, N57628, N5913...",[]
4998,U18538,[N64174],"[N8798, N34920, N26367, N53570, N33382, N1895,...",[]


In [12]:
def calculate_precision(row):
    if len(row["Recommended Articles"]) == 0:
        return 0
    return len(row["Intersection"]) / (len(row["Intersection"]) + (len(row["Recommended Articles"]) - len(row["Intersection"])))

user_impression_data['Precision'] = user_impression_data.apply(calculate_precision, axis=1)
user_impression_data["Precision"].mean()

0.0004878449883609204

In [13]:
def calculate_recall(row):
    return len(row["Intersection"]) / (len(row["Intersection"]) + len(list(set(row["Impressions"]) - set(row["Recommended Articles"]))))

user_impression_data['Recall'] = user_impression_data.apply(calculate_recall, axis=1)
user_impression_data["Recall"].mean()

0.0018289585606782403

In [14]:
def calculate_f1_score(row):
    if row['Precision'] == 0 and row['Recall'] == 0:
        return 0
    return (2 * row['Precision'] * row['Recall']) / (row['Precision'] + row['Recall'])

user_impression_data['F1 Score'] = user_impression_data.apply(calculate_f1_score, axis=1)
user_impression_data['F1 Score'].mean()

0.0006798926177504385

In [15]:
def calculate_f_beta(row):
    beta = 2

    if row['Precision'] == 0 and row['Recall'] == 0:
        return 0
    return ((1 + (beta ** 2)) * row['Precision'] * row['Recall']) / ((beta ** 2) * (row['Precision']) + row['Recall'])

user_impression_data['F-Beta Score'] = user_impression_data.apply(calculate_f_beta, axis=1)
user_impression_data['F-Beta Score'].mean()

0.0009885079065687012

In [16]:
user_impression_data.to_csv('../data/results/content_based_user_approach_results_train.csv', index=False)

#### Validation Data

In [18]:
news_column_headers = ["News ID", "Category", "SubCategory", "Title", "Abstract", "URL", "Title Entities", "Abstract Entities"]

news_data = pd.read_csv('../data/mind-news-dataset/MINDsmall_train/news.tsv', delimiter='\t', names=news_column_headers)
news_data_val = pd.read_csv('../data/mind-news-dataset/MINDsmall_dev/news.tsv', delimiter='\t', names=news_column_headers)

news_data = pd.concat([news_data, news_data_val], ignore_index=True).drop_duplicates(subset=['News ID']).reset_index(drop=True)
news_data = news_data.drop(columns=['URL', 'Title Entities', 'Abstract Entities'])

news_data['Abstract'] = news_data['Abstract'].fillna('')

news_data['Context'] = news_data['Title'] + ' ' + news_data['Abstract']
news_data = news_data.drop(columns=["Title", "Abstract"])

news_data = news_data.set_index('News ID')

news_data['Embedding'] = news_data.progress_apply(lambda row: create_embeddings(row), axis=1)
news_data = pd.DataFrame(data=np.stack(news_data['Embedding'].values), index=news_data.index)

news_data.head()

100%|██████████| 65238/65238 [1:55:48<00:00,  9.39it/s]  


Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,...,2294,2295,2296,2297,2298,2299,2300,2301,2302,2303
News ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
N55528,-0.383724,-1.547945,-0.829346,1.253517,1.268836,-0.662258,-0.137644,0.974485,0.825314,-0.002158,...,-0.207909,0.325078,0.601655,0.150507,-0.400716,-0.296924,-0.32323,0.153161,-0.190733,0.214007
N19639,-0.343001,-1.337414,-0.567985,1.220375,1.153712,-0.418675,0.115724,0.739819,1.103576,-0.015558,...,0.509726,0.331881,-0.093307,-0.159236,-0.169413,-0.062995,0.250308,0.764319,-0.14807,-0.043718
N61837,-0.049046,-1.276408,-0.632205,0.248834,2.083739,-1.391234,0.284319,0.541727,0.634959,-0.346889,...,0.254583,0.249418,0.160578,-0.14776,-0.112158,-0.108611,0.056507,0.680737,-0.123355,-0.077637
N53526,-0.343001,-1.337414,-0.567985,1.220375,1.153712,-0.418675,0.115724,0.739819,1.103576,-0.015558,...,0.59606,0.375594,-0.325976,-0.207872,-0.171114,-0.171538,-0.019665,-0.16576,-0.081754,0.34604
N38324,-0.343001,-1.337414,-0.567985,1.220375,1.153712,-0.418675,0.115724,0.739819,1.103576,-0.015558,...,0.394966,0.32027,-0.011746,-0.102617,-0.24865,-0.262519,0.176324,0.546731,-0.106276,0.262711


In [21]:
behaviour_column_headers = ["Impression ID", "User ID", "Time", "History", "Impressions"]

customer_behaviour_data = pd.read_csv(
    '../data/mind-news-dataset/MINDsmall_dev/behaviors.tsv', delimiter='\t', names=behaviour_column_headers).dropna(subset=['History'])

customer_behaviour_data['History'] = customer_behaviour_data['History'].str.split(' ')
customer_behaviour_data['Impressions'] = customer_behaviour_data['Impressions'].str.split(' ')

def clean_impressions(impression_list):
    return [x.split('-')[0] for x in impression_list if x.endswith('1')]

customer_behaviour_data['Impressions'] = customer_behaviour_data['Impressions'].apply(clean_impressions)
customer_behaviour_data = customer_behaviour_data.drop(columns=['Impression ID', 'Time'])

impression_data = customer_behaviour_data.groupby('User ID', as_index=False)['Impressions'].agg(lambda x: [item for sublist in x for item in sublist])
user_history_data = customer_behaviour_data.groupby('User ID', as_index=False)['History'].agg(lambda x: list(set().union(*x)))

customer_behaviour_data = pd.merge(user_history_data, impression_data, on='User ID', how='left')
customer_behaviour_data

Unnamed: 0,User ID,History,Impressions
0,U1,"[N58267, N40207, N62058, N32607, N13374, N2568...",[N20036]
1,U10,"[N9120, N27612, N36699, N57967, N9803, N64777,...",[N32536]
2,U10000,"[N56753, N35560, N50049, N47348, N62058, N9155...","[N50775, N60215, N31958]"
3,U10002,"[N28467, N48098, N32203, N50, N4082, N25113, N...","[N35676, N5940, N20477, N9284, N57560, N25673,..."
4,U10004,"[N55805, N52665, N18870, N33859, N43482, N1562...","[N33176, N36779]"
...,...,...,...
48588,U9990,[N6616],[N36779]
48589,U9994,[N52551],[N23513]
48590,U9996,"[N4719, N8448, N31165, N28296, N60340]","[N30290, N496, N20187, N9284]"
48591,U9998,"[N24593, N22519, N47993, N54271, N20483, N8422...",[N53615]


In [22]:
customer_behaviour_data = customer_behaviour_data.iloc[:5000, :]

In [35]:
def user_preference_recomendation(row):
    history_data = news_data[news_data.index.isin(row["History"])].mean().values.T.reshape(1, -1)
    similarity_matrix = pd.DataFrame(data=cosine_similarity(history_data, news_data).reshape(-1, 1), index=news_data.index, columns=['Similarity'])

    num_aritcles = (int(len(row['Impressions']) * 1.5) + 1) if len(row['Impressions']) >= 5 else 16
    similarity_matrix = similarity_matrix.apply(lambda col: col.nlargest(num_aritcles).index).iloc[1:, :]

    return list(set(similarity_matrix.values.flatten()) - set(row["History"]))

customer_behaviour_data['Recommended Articles'] = customer_behaviour_data.progress_apply(lambda row: user_preference_recomendation(row), axis=1)
user_impression_data = customer_behaviour_data.drop(columns=['History'])

user_impression_data

100%|██████████| 5000/5000 [34:33<00:00,  2.41it/s]


Unnamed: 0,User ID,Impressions,Recommended Articles
0,U1,[N20036],"[N45740, N48204, N22832, N36743, N48203, N3597..."
1,U10,[N32536],"[N55599, N12664, N25949, N57972, N42176, N2970..."
2,U10000,"[N50775, N60215, N31958]","[N55786, N54270, N20511, N62501, N20484, N2352..."
3,U10002,"[N35676, N5940, N20477, N9284, N57560, N25673,...","[N54380, N24573, N37595, N2042, N62914, N56385..."
4,U10004,"[N33176, N36779]","[N23062, N43648, N33063, N17593, N17068, N4296..."
...,...,...,...
4995,U18883,[N58251],"[N33765, N26941, N12285, N30873, N41002, N6007..."
4996,U18887,[N36779],"[N33170, N4020, N58271, N12935, N35492, N22, N..."
4997,U18895,"[N50775, N57007, N55036, N53572, N13270, N5357...","[N19217, N36743, N7421, N33401, N16247, N14981..."
4998,U18899,[N36779],"[N14988, N42579, N45183, N22209, N29235, N1286..."


In [36]:
def find_intersection(row):
    return list(set(row['Recommended Articles']).intersection(set(row['Impressions'])))

user_impression_data['Intersection'] = user_impression_data.apply(find_intersection, axis=1)
user_impression_data

Unnamed: 0,User ID,Impressions,Recommended Articles,Intersection
0,U1,[N20036],"[N45740, N48204, N22832, N36743, N48203, N3597...",[]
1,U10,[N32536],"[N55599, N12664, N25949, N57972, N42176, N2970...",[]
2,U10000,"[N50775, N60215, N31958]","[N55786, N54270, N20511, N62501, N20484, N2352...",[]
3,U10002,"[N35676, N5940, N20477, N9284, N57560, N25673,...","[N54380, N24573, N37595, N2042, N62914, N56385...",[]
4,U10004,"[N33176, N36779]","[N23062, N43648, N33063, N17593, N17068, N4296...",[]
...,...,...,...,...
4995,U18883,[N58251],"[N33765, N26941, N12285, N30873, N41002, N6007...",[]
4996,U18887,[N36779],"[N33170, N4020, N58271, N12935, N35492, N22, N...",[]
4997,U18895,"[N50775, N57007, N55036, N53572, N13270, N5357...","[N19217, N36743, N7421, N33401, N16247, N14981...",[]
4998,U18899,[N36779],"[N14988, N42579, N45183, N22209, N29235, N1286...",[]


In [37]:
def calculate_precision(row):
    if len(row["Recommended Articles"]) == 0:
        return 0
    return len(row["Intersection"]) / (len(row["Intersection"]) + (len(row["Recommended Articles"]) - len(row["Intersection"])))

user_impression_data['Precision'] = user_impression_data.apply(calculate_precision, axis=1)
user_impression_data["Precision"].mean()

0.00014362637362637362

In [38]:
def calculate_recall(row):
    return len(row["Intersection"]) / (len(row["Intersection"]) + len(list(set(row["Impressions"]) - set(row["Recommended Articles"]))))

user_impression_data['Recall'] = user_impression_data.apply(calculate_recall, axis=1)
user_impression_data["Recall"].mean()

0.0009816666666666667

In [39]:
def calculate_f1_score(row):
    if row['Precision'] == 0 and row['Recall'] == 0:
        return 0
    return (2 * row['Precision'] * row['Recall']) / (row['Precision'] + row['Recall'])

user_impression_data['F1 Score'] = user_impression_data.apply(calculate_f1_score, axis=1)
user_impression_data['F1 Score'].mean()

0.00022594771241830065

In [40]:
def calculate_f_beta(row):
    beta = 2

    if row['Precision'] == 0 and row['Recall'] == 0:
        return 0
    return ((1 + (beta ** 2)) * row['Precision'] * row['Recall']) / ((beta ** 2) * (row['Precision']) + row['Recall'])

user_impression_data['F-Beta Score'] = user_impression_data.apply(calculate_f_beta, axis=1)
user_impression_data['F-Beta Score'].mean()

0.0003832001273609417

In [41]:
user_impression_data.to_csv('../data/results/content_based_user_approach_results_test.csv', index=False)