In [1]:
import pandas as pd
import numpy as np

import spacy
import warnings

from tqdm import tqdm
from sklearn.metrics.pairwise import cosine_similarity
from scipy.sparse import csr_matrix

# Register tqdm's pandas integration so that `progress_apply` is available on DataFrames.
tqdm.pandas()
# Silence every warning for the rest of the session (this also hides deprecation notices).
warnings.filterwarnings('ignore')

#### Training Data

In [2]:
# Load the spaCy pipeline whose document vectors are used as text embeddings;
# create_embeddings() concatenates three of them per article (900 columns in the table below).
embeddings_model = spacy.load("en_core_web_lg")

In [3]:
def create_embeddings(row):
    """Build one flat embedding for a news row.

    The "Category", "SubCategory" and "Context" texts of ``row`` are each run
    through ``embeddings_model`` and their document vectors are concatenated,
    in that order, into a single Python list.
    """
    embedding = []
    for field in ("Category", "SubCategory", "Context"):
        # Extending with the vector appends its components one by one.
        embedding.extend(embeddings_model(row[field]).vector)
    return embedding

In [4]:
# Column names for news.tsv, which is read without a header row.
news_column_headers = [
    "News ID", "Category", "SubCategory", "Title",
    "Abstract", "URL", "Title Entities", "Abstract Entities",
]

# Load the training articles and reduce them to the three text fields that get embedded:
# Category, SubCategory and Context (title + abstract, with missing abstracts as '').
news_data = (
    pd.read_csv(
        '../data/mind-news-dataset/MINDsmall_train/news.tsv',
        delimiter='\t',
        names=news_column_headers,
    )
    .drop(columns=['URL', 'Title Entities', 'Abstract Entities'])
    .assign(Abstract=lambda frame: frame['Abstract'].fillna(''))
    .assign(Context=lambda frame: frame['Title'] + ' ' + frame['Abstract'])
    .drop(columns=["Title", "Abstract"])
    .set_index('News ID')
)

# Embed every article (slow: one spaCy call per text field), then replace the text
# frame with a numeric matrix holding one embedding component per column.
embedding_lists = news_data.progress_apply(create_embeddings, axis=1)
news_data = pd.DataFrame(data=np.stack(embedding_lists.values), index=news_data.index)

news_data.head()

100%|██████████| 51282/51282 [08:21<00:00, 102.25it/s]


Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,...,890,891,892,893,894,895,896,897,898,899
News ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
N55528,0.7229,0.8321,-1.7988,-3.6058,1.2708,-1.7303,0.42605,2.8627,-0.94957,-1.6504,...,0.804701,0.036395,-0.167648,-0.280542,-1.452165,1.441531,1.009135,-0.64842,-1.655276,0.335003
N19639,1.3115,1.6485,-5.8189,2.7559,4.8246,-1.1699,1.1501,5.0687,-0.58208,-1.1435,...,0.634347,-2.146799,1.355682,-1.423024,-1.606487,0.306612,0.152569,0.092978,-3.074269,1.047438
N61837,7.8999,-2.4716,-0.66704,-2.0706,6.573,-4.6976,-0.31608,5.5686,0.45894,-5.3438,...,-0.055678,0.758951,-0.0307,1.019456,-2.1542,2.223886,-0.075903,-0.688906,-1.995316,0.060624
N53526,1.3115,1.6485,-5.8189,2.7559,4.8246,-1.1699,1.1501,5.0687,-0.58208,-1.1435,...,2.013857,-0.809737,0.91782,0.15031,-1.46526,0.435829,1.73881,1.646224,-2.670809,0.871495
N38324,1.3115,1.6485,-5.8189,2.7559,4.8246,-1.1699,1.1501,5.0687,-0.58208,-1.1435,...,0.356733,-1.070814,0.421982,0.551095,-1.583753,-0.233303,0.870333,0.14705,-2.636249,0.910148


In [5]:
# Column names for behaviors.tsv, which is read without a header row.
behaviour_column_headers = ["Impression ID", "User ID", "Time", "History", "Impressions"]

# Load the training interaction log and discard rows that have no reading history.
customer_behaviour_data = (
    pd.read_csv(
        '../data/mind-news-dataset/MINDsmall_train/behaviors.tsv',
        delimiter='\t',
        names=behaviour_column_headers,
    )
    .dropna(subset=['History'])
)

# Both columns hold space-separated tokens; turn each cell into a list of tokens.
for list_column in ('History', 'Impressions'):
    customer_behaviour_data[list_column] = customer_behaviour_data[list_column].str.split(' ')

def clean_impressions(impression_list):
    """Return the news IDs of the positively labelled entries of an impression list.

    Each entry is expected to look like ``"<news id>-<label>"``. Only entries
    whose label suffix is exactly ``-1`` are kept, and the label is stripped.
    Matching on ``-1`` rather than a trailing ``1`` means an unlabelled ID that
    merely ends in the digit 1 (e.g. ``"N31"``) is no longer taken as a hit.

    Parameters
    ----------
    impression_list : list of str
        Tokens of one impression log.

    Returns
    -------
    list of str
        News IDs (text before the first ``-``) of the entries labelled ``1``,
        in their original order.
    """
    return [entry.split('-')[0] for entry in impression_list if entry.endswith('-1')]

# Keep only the positively labelled impressions per log row and drop the unused columns.
customer_behaviour_data = customer_behaviour_data.assign(
    Impressions=customer_behaviour_data['Impressions'].apply(clean_impressions)
).drop(columns=['Impression ID', 'Time'])

# Collapse the log to one row per user.
sessions_by_user = customer_behaviour_data.groupby('User ID', as_index=False)

# Impressions: concatenation of all per-row lists (duplicates are kept).
impression_data = sessions_by_user['Impressions'].agg(
    lambda sessions: [news_id for clicks in sessions for news_id in clicks]
)
# History: union of all per-row lists (duplicates removed, order arbitrary).
user_history_data = sessions_by_user['History'].agg(
    lambda sessions: list(set().union(*sessions))
)

customer_behaviour_data = user_history_data.merge(impression_data, on='User ID', how='left')
customer_behaviour_data

Unnamed: 0,User ID,History,Impressions
0,U100,"[N42330, N45954, N55743, N51705, N33998, N2012...",[N7800]
1,U1000,"[N1789, N29641, N41244]","[N29739, N7670, N58656, N53875]"
2,U10001,"[N27256, N20639, N34562, N47937, N27644, N1803...","[N1031, N10833, N35937]"
3,U10003,"[N31431, N1282, N61052, N41668, N26619, N28257...","[N18708, N57090, N55689]"
4,U10008,"[N33876, N9619, N32312, N60000, N59704, N33117...",[N15405]
...,...,...,...
49103,U9993,"[N47458, N14114]","[N22257, N30648]"
49104,U9995,"[N44399, N39520, N49103, N60671, N13707, N1629...","[N11817, N37204, N57426, N19444, N10812, N4714..."
49105,U9996,"[N8448, N60340, N28296, N4719, N31165]","[N287, N47098, N23446]"
49106,U9997,"[N90, N38367, N64836, N9072, N50744, N11929, N...","[N48410, N35738, N39269, N23081, N16502, N4245..."


In [6]:
def user_preference_recomendation(row):
    """Recommend unseen articles that resemble the ones in a user's history.

    For every history article present in the global ``news_data`` embedding
    table, the six most cosine-similar articles are looked up and the
    top-ranked one (expected to be the article itself) is discarded. The
    remaining neighbours of all history articles are pooled and everything the
    user has already read is removed.

    Parameters
    ----------
    row : mapping or pandas.Series
        Must provide ``row["History"]``, an iterable of news IDs.

    Returns
    -------
    list
        Unique recommended news IDs in arbitrary order. Empty when none of the
        history IDs exist in ``news_data``.
    """
    history_data = news_data[news_data.index.isin(row["History"])]

    # cosine_similarity rejects a matrix with zero rows; without this guard a single
    # user whose history has no embedded article would abort the whole apply().
    if history_data.empty:
        return []

    similarity_matrix = pd.DataFrame(
        data=cosine_similarity(news_data, history_data),
        index=news_data.index,
        columns=history_data.index,
    )

    # One column per history article holding the IDs of its 6 nearest articles, best first;
    # the first row is dropped as the presumed self-match.
    nearest_ids = similarity_matrix.apply(lambda col: col.nlargest(6).index).iloc[1:, :]

    return list(set(nearest_ids.values.flatten()) - set(row["History"]))

# Compute recommendations for every user (with a tqdm progress bar; this is the slow step).
customer_behaviour_data['Recommended Articles'] = customer_behaviour_data.progress_apply(
    user_preference_recomendation,
    axis=1,
)

# The history is not needed for evaluation, only impressions and recommendations.
user_impression_data = customer_behaviour_data.drop(columns=['History'])

user_impression_data

100%|██████████| 49108/49108 [2:35:55<00:00,  5.25it/s]  


Unnamed: 0,User ID,Impressions,Recommended Articles
0,U100,[N7800],"[N30682, N60967, N37820, N24038, N44251, N4118..."
1,U1000,"[N29739, N7670, N58656, N53875]","[N32622, N17358, N38520, N23129, N6533, N26319..."
2,U10001,"[N1031, N10833, N35937]","[N25490, N43881, N1970, N44251, N47533, N20232..."
3,U10003,"[N18708, N57090, N55689]","[N64791, N61648, N27659, N16582, N9255, N61504..."
4,U10008,[N15405],"[N17338, N13693, N11097, N30518, N46763, N2661..."
...,...,...,...
49103,U9993,"[N22257, N30648]","[N2293, N5557, N18198, N1376, N47983, N54249, ..."
49104,U9995,"[N11817, N37204, N57426, N19444, N10812, N4714...","[N17595, N64427, N28825, N26081, N33468, N5946..."
49105,U9996,"[N287, N47098, N23446]","[N19494, N12160, N17740, N27063, N60148, N7487..."
49106,U9997,"[N48410, N35738, N39269, N23081, N16502, N4245...","[N22098, N3875, N51112, N47261, N6962, N32983,..."


In [7]:
def find_intersection(row):
    """Return the IDs found in both ``row['Recommended Articles']`` and ``row['Impressions']``.

    The result is a list of unique IDs in arbitrary order.
    """
    recommended = set(row['Recommended Articles'])
    impressions = set(row['Impressions'])
    return list(recommended & impressions)

# Row-wise overlap between each user's recommendations and their cleaned impressions.
user_impression_data['Intersection'] = user_impression_data.apply(find_intersection, axis=1)
# Display the frame with the new column.
user_impression_data

Unnamed: 0,User ID,Impressions,Recommended Articles,Intersection
0,U100,[N7800],"[N30682, N60967, N37820, N24038, N44251, N4118...",[]
1,U1000,"[N29739, N7670, N58656, N53875]","[N32622, N17358, N38520, N23129, N6533, N26319...",[]
2,U10001,"[N1031, N10833, N35937]","[N25490, N43881, N1970, N44251, N47533, N20232...",[]
3,U10003,"[N18708, N57090, N55689]","[N64791, N61648, N27659, N16582, N9255, N61504...",[]
4,U10008,[N15405],"[N17338, N13693, N11097, N30518, N46763, N2661...",[]
...,...,...,...,...
49103,U9993,"[N22257, N30648]","[N2293, N5557, N18198, N1376, N47983, N54249, ...",[]
49104,U9995,"[N11817, N37204, N57426, N19444, N10812, N4714...","[N17595, N64427, N28825, N26081, N33468, N5946...",[]
49105,U9996,"[N287, N47098, N23446]","[N19494, N12160, N17740, N27063, N60148, N7487...",[]
49106,U9997,"[N48410, N35738, N39269, N23081, N16502, N4245...","[N22098, N3875, N51112, N47261, N6962, N32983,...",[N16502]


In [8]:
def calculate_precision(row):
    """Share of a user's recommended articles that are also in ``row["Intersection"]``.

    Returns 0 when nothing was recommended.
    """
    recommended_total = len(row["Recommended Articles"])
    if recommended_total == 0:
        return 0
    # hits + (recommended - hits) is simply the number of recommendations.
    return len(row["Intersection"]) / recommended_total

# Per-user precision, followed by its unweighted mean over all users.
user_impression_data['Precision'] = user_impression_data.apply(calculate_precision, axis=1)
user_impression_data["Precision"].mean()

0.0007717723787252481

In [9]:
def calculate_recall(row):
    """Share of a user's impressions that were recommended.

    The denominator is the number of hits (``row["Intersection"]``) plus the
    number of distinct impressions that were not recommended.

    Returns 0 when that denominator is zero (for example a user without any
    impressions) instead of raising ``ZeroDivisionError``, mirroring the
    empty-input handling of ``calculate_precision``.
    """
    hits = len(row["Intersection"])
    missed = len(set(row["Impressions"]) - set(row["Recommended Articles"]))
    relevant_total = hits + missed
    if relevant_total == 0:
        return 0
    return hits / relevant_total

# Per-user recall, followed by its unweighted mean over all users.
user_impression_data['Recall'] = user_impression_data.apply(calculate_recall, axis=1)
user_impression_data["Recall"].mean()

0.013568069350095116

In [10]:
def calculate_f1_score(row):
    """Harmonic mean of ``row['Precision']`` and ``row['Recall']``.

    Returns 0 when both inputs are 0, which would otherwise divide by zero.
    """
    precision, recall = row['Precision'], row['Recall']
    if precision != 0 or recall != 0:
        return (2 * precision * recall) / (precision + recall)
    return 0

# Per-user F1 (computed from the Precision and Recall columns), followed by its mean over all users.
user_impression_data['F1 Score'] = user_impression_data.apply(calculate_f1_score, axis=1)
user_impression_data['F1 Score'].mean()

0.0013916479374896653

In [11]:
def calculate_f_beta(row, beta=2):
    """F-beta score computed from ``row['Precision']`` and ``row['Recall']``.

    Parameters
    ----------
    row : mapping or pandas.Series
        Must provide ``'Precision'`` and ``'Recall'``.
    beta : float, optional
        Weight of recall relative to precision. Defaults to 2, the value that
        was previously hard-coded, so existing single-argument callers
        (e.g. ``DataFrame.apply(calculate_f_beta, axis=1)``) are unaffected.

    Returns
    -------
    float or int
        ``(1 + beta**2) * P * R / (beta**2 * P + R)``, or 0 when the
        denominator is zero (both inputs 0, or ``beta == 0`` with zero recall).
    """
    precision, recall = row['Precision'], row['Recall']
    beta_squared = beta ** 2

    denominator = beta_squared * precision + recall
    if denominator == 0:
        return 0
    return ((1 + beta_squared) * precision * recall) / denominator

# Per-user F-beta score (computed from the Precision and Recall columns), followed by its mean over all users.
user_impression_data['F-Beta Score'] = user_impression_data.apply(calculate_f_beta, axis=1)
user_impression_data['F-Beta Score'].mean()

0.0028155905630475406

In [12]:
# Persist the per-user training-split results (row index omitted from the file).
user_impression_data.to_csv(
    '../data/results/content_based_individual_approach_results_train.csv',
    index=False,
)

#### Validation Data

In [13]:
# Column names for news.tsv, which is read without a header row.
news_column_headers = [
    "News ID", "Category", "SubCategory", "Title",
    "Abstract", "URL", "Title Entities", "Abstract Entities",
]

news_data = pd.read_csv(
    '../data/mind-news-dataset/MINDsmall_train/news.tsv',
    delimiter='\t',
    names=news_column_headers,
)
news_data_val = pd.read_csv(
    '../data/mind-news-dataset/MINDsmall_dev/news.tsv',
    delimiter='\t',
    names=news_column_headers,
)

# Pool the train and dev articles (first occurrence of each News ID wins) and reduce them
# to the three text fields that get embedded: Category, SubCategory and Context
# (title + abstract, with missing abstracts as '').
news_data = (
    pd.concat([news_data, news_data_val], ignore_index=True)
    .drop_duplicates(subset=['News ID'])
    .reset_index(drop=True)
    .drop(columns=['URL', 'Title Entities', 'Abstract Entities'])
    .assign(Abstract=lambda frame: frame['Abstract'].fillna(''))
    .assign(Context=lambda frame: frame['Title'] + ' ' + frame['Abstract'])
    .drop(columns=["Title", "Abstract"])
    .set_index('News ID')
)

# Embed every article (slow: one spaCy call per text field), then replace the text
# frame with a numeric matrix holding one embedding component per column.
embedding_lists = news_data.progress_apply(create_embeddings, axis=1)
news_data = pd.DataFrame(data=np.stack(embedding_lists.values), index=news_data.index)

news_data.head()

100%|██████████| 65238/65238 [19:24<00:00, 56.00it/s]


Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,...,890,891,892,893,894,895,896,897,898,899
News ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
N55528,0.7229,0.8321,-1.7988,-3.6058,1.2708,-1.7303,0.42605,2.8627,-0.94957,-1.6504,...,0.804701,0.036395,-0.167648,-0.280542,-1.452165,1.441531,1.009135,-0.64842,-1.655276,0.335003
N19639,1.3115,1.6485,-5.8189,2.7559,4.8246,-1.1699,1.1501,5.0687,-0.58208,-1.1435,...,0.634347,-2.146799,1.355682,-1.423024,-1.606487,0.306612,0.152569,0.092978,-3.074269,1.047438
N61837,7.8999,-2.4716,-0.66704,-2.0706,6.573,-4.6976,-0.31608,5.5686,0.45894,-5.3438,...,-0.055678,0.758951,-0.0307,1.019456,-2.1542,2.223886,-0.075903,-0.688906,-1.995316,0.060624
N53526,1.3115,1.6485,-5.8189,2.7559,4.8246,-1.1699,1.1501,5.0687,-0.58208,-1.1435,...,2.013857,-0.809737,0.91782,0.15031,-1.46526,0.435829,1.73881,1.646224,-2.670809,0.871495
N38324,1.3115,1.6485,-5.8189,2.7559,4.8246,-1.1699,1.1501,5.0687,-0.58208,-1.1435,...,0.356733,-1.070814,0.421982,0.551095,-1.583753,-0.233303,0.870333,0.14705,-2.636249,0.910148


In [14]:
# Column names for behaviors.tsv, which is read without a header row.
behaviour_column_headers = ["Impression ID", "User ID", "Time", "History", "Impressions"]

# Load the dev-split interaction log and discard rows that have no reading history.
customer_behaviour_data = (
    pd.read_csv(
        '../data/mind-news-dataset/MINDsmall_dev/behaviors.tsv',
        delimiter='\t',
        names=behaviour_column_headers,
    )
    .dropna(subset=['History'])
)

# Both columns hold space-separated tokens; turn each cell into a list of tokens.
for list_column in ('History', 'Impressions'):
    customer_behaviour_data[list_column] = customer_behaviour_data[list_column].str.split(' ')

def clean_impressions(impression_list):
    """Return the news IDs of the positively labelled entries of an impression list.

    Each entry is expected to look like ``"<news id>-<label>"``. Only entries
    whose label suffix is exactly ``-1`` are kept, and the label is stripped.
    Matching on ``-1`` rather than a trailing ``1`` means an unlabelled ID that
    merely ends in the digit 1 (e.g. ``"N31"``) is no longer taken as a hit.

    Parameters
    ----------
    impression_list : list of str
        Tokens of one impression log.

    Returns
    -------
    list of str
        News IDs (text before the first ``-``) of the entries labelled ``1``,
        in their original order.
    """
    return [entry.split('-')[0] for entry in impression_list if entry.endswith('-1')]

# Keep only the positively labelled impressions per log row and drop the unused columns.
customer_behaviour_data = customer_behaviour_data.assign(
    Impressions=customer_behaviour_data['Impressions'].apply(clean_impressions)
).drop(columns=['Impression ID', 'Time'])

# Collapse the log to one row per user.
sessions_by_user = customer_behaviour_data.groupby('User ID', as_index=False)

# Impressions: concatenation of all per-row lists (duplicates are kept).
impression_data = sessions_by_user['Impressions'].agg(
    lambda sessions: [news_id for clicks in sessions for news_id in clicks]
)
# History: union of all per-row lists (duplicates removed, order arbitrary).
user_history_data = sessions_by_user['History'].agg(
    lambda sessions: list(set().union(*sessions))
)

customer_behaviour_data = user_history_data.merge(impression_data, on='User ID', how='left')
customer_behaviour_data

Unnamed: 0,User ID,History,Impressions
0,U1,"[N57737, N58267, N32607, N13374, N40207, N5230...",[N20036]
1,U10,"[N9803, N57967, N9120, N36699, N27612, N2945, ...",[N32536]
2,U10000,"[N18094, N48998, N60516, N47847, N47348, N1005...","[N50775, N60215, N31958]"
3,U10002,"[N50126, N48098, N2618, N46169, N54300, N64777...","[N35676, N5940, N20477, N9284, N57560, N25673,..."
4,U10004,"[N27251, N15627, N52665, N33859, N15402, N1887...","[N33176, N36779]"
...,...,...,...
48588,U9990,[N6616],[N36779]
48589,U9994,[N52551],[N23513]
48590,U9996,"[N8448, N60340, N28296, N4719, N31165]","[N30290, N496, N20187, N9284]"
48591,U9998,"[N951, N20483, N10449, N11512, N32643, N5102, ...",[N53615]


In [16]:
def user_preference_recomendation(row):
    """Recommend unseen articles that resemble the ones in a user's history.

    For every history article present in the global ``news_data`` embedding
    table, the six most cosine-similar articles are looked up and the
    top-ranked one (expected to be the article itself) is discarded. The
    remaining neighbours of all history articles are pooled and everything the
    user has already read is removed.

    Parameters
    ----------
    row : mapping or pandas.Series
        Must provide ``row["History"]``, an iterable of news IDs.

    Returns
    -------
    list
        Unique recommended news IDs in arbitrary order. Empty when none of the
        history IDs exist in ``news_data``.
    """
    history_data = news_data[news_data.index.isin(row["History"])]

    # cosine_similarity rejects a matrix with zero rows; without this guard a single
    # user whose history has no embedded article would abort the whole apply().
    if history_data.empty:
        return []

    similarity_matrix = pd.DataFrame(
        data=cosine_similarity(news_data, history_data),
        index=news_data.index,
        columns=history_data.index,
    )

    # One column per history article holding the IDs of its 6 nearest articles, best first;
    # the first row is dropped as the presumed self-match.
    nearest_ids = similarity_matrix.apply(lambda col: col.nlargest(6).index).iloc[1:, :]

    return list(set(nearest_ids.values.flatten()) - set(row["History"]))

# Compute recommendations for every user (with a tqdm progress bar; this is the slow step).
customer_behaviour_data['Recommended Articles'] = customer_behaviour_data.progress_apply(
    user_preference_recomendation,
    axis=1,
)

# The history is not needed for evaluation, only impressions and recommendations.
user_impression_data = customer_behaviour_data.drop(columns=['History'])

user_impression_data

100%|██████████| 48593/48593 [4:45:48<00:00,  2.83it/s]  


Unnamed: 0,User ID,Impressions,Recommended Articles
0,U1,[N20036],"[N40433, N19851, N20940, N26095, N5310, N32991..."
1,U10,[N32536],"[N5270, N15354, N3672, N51991, N53643, N42250,..."
2,U10000,"[N50775, N60215, N31958]","[N47707, N49847, N52480, N52829, N44251, N4305..."
3,U10002,"[N35676, N5940, N20477, N9284, N57560, N25673,...","[N15675, N52040, N19724, N32670, N26924, N2365..."
4,U10004,"[N33176, N36779]","[N41654, N32183, N21045, N34937, N5078, N22249..."
...,...,...,...
48588,U9990,[N36779],"[N37148, N39243, N22923, N39009, N5339]"
48589,U9994,[N23513],"[N6102, N22, N2910, N8073, N22288]"
48590,U9996,"[N30290, N496, N20187, N9284]","[N19494, N12160, N17740, N27063, N60148, N7487..."
48591,U9998,[N53615],"[N63695, N28144, N4830, N33579, N27182, N48928..."


In [17]:
def find_intersection(row):
    """Return the IDs found in both ``row['Recommended Articles']`` and ``row['Impressions']``.

    The result is a list of unique IDs in arbitrary order.
    """
    recommended = set(row['Recommended Articles'])
    impressions = set(row['Impressions'])
    return list(recommended & impressions)

# Row-wise overlap between each user's recommendations and their cleaned impressions.
user_impression_data['Intersection'] = user_impression_data.apply(find_intersection, axis=1)
# Display the frame with the new column.
user_impression_data

Unnamed: 0,User ID,Impressions,Recommended Articles,Intersection
0,U1,[N20036],"[N40433, N19851, N20940, N26095, N5310, N32991...",[]
1,U10,[N32536],"[N5270, N15354, N3672, N51991, N53643, N42250,...",[]
2,U10000,"[N50775, N60215, N31958]","[N47707, N49847, N52480, N52829, N44251, N4305...",[]
3,U10002,"[N35676, N5940, N20477, N9284, N57560, N25673,...","[N15675, N52040, N19724, N32670, N26924, N2365...",[]
4,U10004,"[N33176, N36779]","[N41654, N32183, N21045, N34937, N5078, N22249...",[]
...,...,...,...,...
48588,U9990,[N36779],"[N37148, N39243, N22923, N39009, N5339]",[]
48589,U9994,[N23513],"[N6102, N22, N2910, N8073, N22288]",[]
48590,U9996,"[N30290, N496, N20187, N9284]","[N19494, N12160, N17740, N27063, N60148, N7487...",[]
48591,U9998,[N53615],"[N63695, N28144, N4830, N33579, N27182, N48928...",[]


In [18]:
def calculate_precision(row):
    """Share of a user's recommended articles that are also in ``row["Intersection"]``.

    Returns 0 when nothing was recommended.
    """
    recommended_total = len(row["Recommended Articles"])
    if recommended_total == 0:
        return 0
    # hits + (recommended - hits) is simply the number of recommendations.
    return len(row["Intersection"]) / recommended_total

# Per-user precision, followed by its unweighted mean over all users.
user_impression_data['Precision'] = user_impression_data.apply(calculate_precision, axis=1)
user_impression_data["Precision"].mean()

0.00017869769086816195

In [19]:
def calculate_recall(row):
    """Share of a user's impressions that were recommended.

    The denominator is the number of hits (``row["Intersection"]``) plus the
    number of distinct impressions that were not recommended.

    Returns 0 when that denominator is zero (for example a user without any
    impressions) instead of raising ``ZeroDivisionError``, mirroring the
    empty-input handling of ``calculate_precision``.
    """
    hits = len(row["Intersection"])
    missed = len(set(row["Impressions"]) - set(row["Recommended Articles"]))
    relevant_total = hits + missed
    if relevant_total == 0:
        return 0
    return hits / relevant_total

# Per-user recall, followed by its unweighted mean over all users.
user_impression_data['Recall'] = user_impression_data.apply(calculate_recall, axis=1)
user_impression_data["Recall"].mean()

0.008549096639590786

In [20]:
def calculate_f1_score(row):
    """Harmonic mean of ``row['Precision']`` and ``row['Recall']``.

    Returns 0 when both inputs are 0, which would otherwise divide by zero.
    """
    precision, recall = row['Precision'], row['Recall']
    if precision != 0 or recall != 0:
        return (2 * precision * recall) / (precision + recall)
    return 0

# Per-user F1 (computed from the Precision and Recall columns), followed by its mean over all users.
user_impression_data['F1 Score'] = user_impression_data.apply(calculate_f1_score, axis=1)
user_impression_data['F1 Score'].mean()

0.0003405922341288291

In [21]:
def calculate_f_beta(row, beta=2):
    """F-beta score computed from ``row['Precision']`` and ``row['Recall']``.

    Parameters
    ----------
    row : mapping or pandas.Series
        Must provide ``'Precision'`` and ``'Recall'``.
    beta : float, optional
        Weight of recall relative to precision. Defaults to 2, the value that
        was previously hard-coded, so existing single-argument callers
        (e.g. ``DataFrame.apply(calculate_f_beta, axis=1)``) are unaffected.

    Returns
    -------
    float or int
        ``(1 + beta**2) * P * R / (beta**2 * P + R)``, or 0 when the
        denominator is zero (both inputs 0, or ``beta == 0`` with zero recall).
    """
    precision, recall = row['Precision'], row['Recall']
    beta_squared = beta ** 2

    denominator = beta_squared * precision + recall
    if denominator == 0:
        return 0
    return ((1 + beta_squared) * precision * recall) / denominator

# Per-user F-beta score (computed from the Precision and Recall columns), followed by its mean over all users.
user_impression_data['F-Beta Score'] = user_impression_data.apply(calculate_f_beta, axis=1)
user_impression_data['F-Beta Score'].mean()

0.0007603829155379303

In [22]:
# Persist the per-user results for the MINDsmall_dev behaviours (row index omitted from the file).
# NOTE(review): the file name ends in "_test" although this section evaluates the dev split;
# confirm that downstream readers expect this name before renaming.
user_impression_data.to_csv(
    '../data/results/content_based_individual_approach_results_test.csv',
    index=False,
)