In [1]:
import pandas as pd
import opendatasets as od

import os

from tqdm import tqdm

tqdm.pandas()

In [2]:
if not (os.path.exists('../data/mind-news-dataset')):
    od.download(dataset_id_or_url="https://www.kaggle.com/datasets/arashnic/mind-news-dataset/data", data_dir='../data/')

#### Training Data

In [3]:
news_column_headers = ["News ID", "Category", "SubCategory", "Title", "Abstract", "URL", "Title Entities", "Abstract Entities"]
news_data = pd.read_csv('../data/mind-news-dataset/MINDsmall_train/news.tsv', delimiter='\t', names=news_column_headers)

news_data = news_data[['News ID', 'Category', 'SubCategory']]

news_data.head()

Unnamed: 0,News ID,Category,SubCategory
0,N55528,lifestyle,lifestyleroyals
1,N19639,health,weightloss
2,N61837,news,newsworld
3,N53526,health,voices
4,N38324,health,medical


In [4]:
behaviour_column_headers = ["Impression ID", "User ID", "Time", "History", "Impressions"]
customer_behaviour_data = pd.read_csv('../data/mind-news-dataset/MINDsmall_train/behaviors.tsv', delimiter='\t', names=behaviour_column_headers)

cold_start_users = customer_behaviour_data[customer_behaviour_data['History'].isna()]

customer_behaviour_data = customer_behaviour_data.dropna(subset=['History'])

customer_behaviour_data['History'] = customer_behaviour_data['History'].str.split(' ')
customer_behaviour_data['Impressions'] = customer_behaviour_data['Impressions'].str.split(' ')
cold_start_users['Impressions'] = cold_start_users['Impressions'].str.split(' ')

cold_start_users['History'] = [[] for _ in range(len(cold_start_users))]

customer_behaviour_data = pd.concat([customer_behaviour_data, cold_start_users], ignore_index=True)

def clean_impressions(row: list):
    return list(map(lambda x: x.split('-')[0], list(filter(lambda x: x[-1] == '1', row))))

customer_behaviour_data = customer_behaviour_data.drop(columns=['Impression ID', 'Time'])
customer_behaviour_data['Impressions'] = customer_behaviour_data['Impressions'].apply(lambda x: clean_impressions(x))

impression_data = customer_behaviour_data[['User ID', 'Impressions']]
customer_behaviour_data = customer_behaviour_data.drop(columns=['Impressions'])
impression_data = impression_data.groupby(by='User ID').agg('sum').reset_index()

user_history_data = customer_behaviour_data[["User ID", "History"]]
user_history_data = user_history_data.groupby('User ID')['History'].agg(lambda x: list(set().union(*x))).reset_index()

customer_behaviour_data = pd.merge(left=user_history_data, right=impression_data, on='User ID', how='left')

customer_behaviour_data

Unnamed: 0,User ID,History,Impressions
0,U100,"[N45954, N42330, N18870, N55743, N50095, N3399...",[N7800]
1,U1000,"[N29641, N1789, N41244]","[N29739, N7670, N58656, N53875]"
2,U10001,"[N33976, N46444, N51706, N2735, N22816, N47937...","[N1031, N10833, N35937]"
3,U10003,"[N50839, N31431, N41668, N26619, N61052, N3907...","[N18708, N57090, N55689]"
4,U10008,"[N59704, N23614, N5812, N33117, N10376, N33876...",[N15405]
...,...,...,...
49995,U9993,"[N47458, N14114]","[N22257, N30648]"
49996,U9995,"[N37123, N5102, N7943, N4415, N17166, N32117, ...","[N11817, N37204, N57426, N19444, N10812, N4714..."
49997,U9996,"[N60340, N31165, N8448, N28296, N4719]","[N287, N47098, N23446]"
49998,U9997,"[N11929, N11727, N55285, N9072, N90, N50744, N...","[N48410, N35738, N39269, N23081, N16502, N4245..."


In [5]:
user_history_data = customer_behaviour_data[['User ID', 'History']]
user_impression_data = customer_behaviour_data[['User ID', 'History', 'Impressions']]

In [6]:
user_history_data = user_history_data.explode('History').reset_index(drop=True)
user_history_data['Clicked'] = 1

user_history_data = pd.merge(left=user_history_data, right=news_data, left_on='History', right_on='News ID', how='left').drop(columns=['News ID'])

subcategory_popularity = user_history_data.groupby(['History', 'Category', 'SubCategory'])['Clicked'].sum().reset_index()
subcategory_popularity.columns = ['News ID', 'Category', 'SubCategory', 'Popularity']

subcategory_popularity = subcategory_popularity.sort_values('Popularity', ascending=False)

subcategory_popularity

Unnamed: 0,News ID,Category,SubCategory,Popularity
11718,N306,movies,movies-celebrity,4747
18507,N42620,lifestyle,lifestylebuzz,3998
20321,N45794,news,newscrime,3283
12375,N31801,news,newspolitics,3207
25615,N55189,tv,tvnews,3045
...,...,...,...,...
33110,N9852,sports,football_ncaa,1
33113,N9856,news,newspolitics,1
33117,N9861,news,newscrime,1
33120,N9868,sports,football_nfl,1


In [7]:
user_impression_data

Unnamed: 0,User ID,History,Impressions
0,U100,"[N45954, N42330, N18870, N55743, N50095, N3399...",[N7800]
1,U1000,"[N29641, N1789, N41244]","[N29739, N7670, N58656, N53875]"
2,U10001,"[N33976, N46444, N51706, N2735, N22816, N47937...","[N1031, N10833, N35937]"
3,U10003,"[N50839, N31431, N41668, N26619, N61052, N3907...","[N18708, N57090, N55689]"
4,U10008,"[N59704, N23614, N5812, N33117, N10376, N33876...",[N15405]
...,...,...,...
49995,U9993,"[N47458, N14114]","[N22257, N30648]"
49996,U9995,"[N37123, N5102, N7943, N4415, N17166, N32117, ...","[N11817, N37204, N57426, N19444, N10812, N4714..."
49997,U9996,"[N60340, N31165, N8448, N28296, N4719]","[N287, N47098, N23446]"
49998,U9997,"[N11929, N11727, N55285, N9072, N90, N50744, N...","[N48410, N35738, N39269, N23081, N16502, N4245..."


In [8]:
def recommend_articles(row):
    if len(row) < 1:
        return subcategory_popularity.head(10)['News ID'].tolist()

    user_categories = subcategory_popularity[subcategory_popularity['News ID'].isin(row)]['Category'].values.tolist()
    user_subcategories = subcategory_popularity[subcategory_popularity['News ID'].isin(row)]['SubCategory'].values.tolist()

    similar_articles_cat = subcategory_popularity[subcategory_popularity['Category'].isin(user_categories)]
    similar_articles_subcat = subcategory_popularity[subcategory_popularity['SubCategory'].isin(user_subcategories)]

    recommended_articles = similar_articles_cat.groupby('Category').head(2)['News ID'].values.tolist()
    recommended_articles += similar_articles_subcat.groupby('SubCategory').head(2)['News ID'].values.tolist()

    return list(set(recommended_articles).union(set(subcategory_popularity.head(2)['News ID'].tolist())) - set(row))

user_impression_data['Recommended Articles'] = user_impression_data['History'].progress_apply(recommend_articles)

100%|██████████| 50000/50000 [08:22<00:00, 99.57it/s] 


In [9]:
user_impression_data

Unnamed: 0,User ID,History,Impressions,Recommended Articles
0,U100,"[N45954, N42330, N18870, N55743, N50095, N3399...",[N7800],"[N55189, N871, N51706, N45794, N306, N2203, N2..."
1,U1000,"[N29641, N1789, N41244]","[N29739, N7670, N58656, N53875]","[N5978, N18870, N41375, N306, N8448, N24075, N..."
2,U10001,"[N33976, N46444, N51706, N2735, N22816, N47937...","[N1031, N10833, N35937]","[N55189, N54827, N55846, N45794, N18870, N306,..."
3,U10003,"[N50839, N31431, N41668, N26619, N61052, N3907...","[N18708, N57090, N55689]","[N44559, N4607, N306, N16715, N43142, N42989, ..."
4,U10008,"[N59704, N23614, N5812, N33117, N10376, N33876...",[N15405],"[N14761, N44559, N8448, N306, N11101, N16715, ..."
...,...,...,...,...
49995,U9993,"[N47458, N14114]","[N22257, N30648]","[N45794, N8448, N306, N44007, N32312, N31801, ..."
49996,U9995,"[N37123, N5102, N7943, N4415, N17166, N32117, ...","[N11817, N37204, N57426, N19444, N10812, N4714...","[N4966, N63779, N13933, N44559, N33276, N62853..."
49997,U9996,"[N60340, N31165, N8448, N28296, N4719]","[N287, N47098, N23446]","[N55846, N22816, N18870, N33276, N55326, N306,..."
49998,U9997,"[N11929, N11727, N55285, N9072, N90, N50744, N...","[N48410, N35738, N39269, N23081, N16502, N4245...","[N32089, N306, N10406, N47277, N3388, N22816, ..."


In [10]:
def find_intersection(row):
    return list(set(row['Recommended Articles']).intersection(set(row['Impressions'])))

user_impression_data['Intersection'] = user_impression_data.apply(find_intersection, axis=1)
user_impression_data

Unnamed: 0,User ID,History,Impressions,Recommended Articles,Intersection
0,U100,"[N45954, N42330, N18870, N55743, N50095, N3399...",[N7800],"[N55189, N871, N51706, N45794, N306, N2203, N2...",[]
1,U1000,"[N29641, N1789, N41244]","[N29739, N7670, N58656, N53875]","[N5978, N18870, N41375, N306, N8448, N24075, N...",[]
2,U10001,"[N33976, N46444, N51706, N2735, N22816, N47937...","[N1031, N10833, N35937]","[N55189, N54827, N55846, N45794, N18870, N306,...",[]
3,U10003,"[N50839, N31431, N41668, N26619, N61052, N3907...","[N18708, N57090, N55689]","[N44559, N4607, N306, N16715, N43142, N42989, ...",[]
4,U10008,"[N59704, N23614, N5812, N33117, N10376, N33876...",[N15405],"[N14761, N44559, N8448, N306, N11101, N16715, ...",[]
...,...,...,...,...,...
49995,U9993,"[N47458, N14114]","[N22257, N30648]","[N45794, N8448, N306, N44007, N32312, N31801, ...",[]
49996,U9995,"[N37123, N5102, N7943, N4415, N17166, N32117, ...","[N11817, N37204, N57426, N19444, N10812, N4714...","[N4966, N63779, N13933, N44559, N33276, N62853...",[]
49997,U9996,"[N60340, N31165, N8448, N28296, N4719]","[N287, N47098, N23446]","[N55846, N22816, N18870, N33276, N55326, N306,...",[]
49998,U9997,"[N11929, N11727, N55285, N9072, N90, N50744, N...","[N48410, N35738, N39269, N23081, N16502, N4245...","[N32089, N306, N10406, N47277, N3388, N22816, ...",[]


In [11]:
def calculate_precision(row):
    return len(row["Intersection"]) / len(row["Recommended Articles"])

user_impression_data['Precision'] = user_impression_data.apply(calculate_precision, axis=1)
user_impression_data["Precision"].mean()

np.float64(0.0006711573059203221)

In [17]:
def calculate_recall(row):
    return len(row["Intersection"]) / (len(row["Intersection"]) + len(list(set(row["Impressions"]) - set(row["Recommended Articles"]))))

user_impression_data['Recall'] = user_impression_data.apply(calculate_recall, axis=1)
user_impression_data["Recall"].mean()

np.float64(0.003657452366491421)

In [13]:
def calculate_f1_score(row):
    if row['Precision'] == 0 and row['Recall'] == 0:
        return 0
    return (2 * row['Precision'] * row['Recall']) / (row['Precision'] + row['Recall'])

user_impression_data['F1 Score'] = user_impression_data.apply(calculate_f1_score, axis=1)
user_impression_data['F1 Score'].mean()

np.float64(0.0010388157863278197)

In [14]:
def calculate_f_beta(row):
    beta = 2

    if row['Precision'] == 0 and row['Recall'] == 0:
        return 0
    return ((1 + (beta ** 2)) * row['Precision'] * row['Recall']) / ((beta ** 2) * (row['Precision']) + row['Recall'])

user_impression_data['F-Beta Score'] = user_impression_data.apply(calculate_f_beta, axis=1)
user_impression_data['F-Beta Score'].mean()

np.float64(0.0016706409972376255)

In [15]:
user_impression_data.to_csv('../data/results/popularity_approach_results_train.csv', index=False)

#### Validation Data

In [18]:
news_column_headers = ["News ID", "Category", "SubCategory", "Title", "Abstract", "URL", "Title Entities", "Abstract Entities"]
news_data = pd.read_csv('../data/mind-news-dataset/MINDsmall_dev/news.tsv', delimiter='\t', names=news_column_headers)

news_data = news_data[['News ID', 'Category', 'SubCategory']]

news_data.head()

Unnamed: 0,News ID,Category,SubCategory
0,N55528,lifestyle,lifestyleroyals
1,N18955,health,medical
2,N61837,news,newsworld
3,N53526,health,voices
4,N38324,health,medical


In [19]:
behaviour_column_headers = ["Impression ID", "User ID", "Time", "History", "Impressions"]
customer_behaviour_data = pd.read_csv('../data/mind-news-dataset/MINDsmall_dev/behaviors.tsv', delimiter='\t', names=behaviour_column_headers)

cold_start_users = customer_behaviour_data[customer_behaviour_data['History'].isna()]

customer_behaviour_data = customer_behaviour_data.dropna(subset=['History'])

customer_behaviour_data['History'] = customer_behaviour_data['History'].str.split(' ')
customer_behaviour_data['Impressions'] = customer_behaviour_data['Impressions'].str.split(' ')
cold_start_users['Impressions'] = cold_start_users['Impressions'].str.split(' ')

cold_start_users['History'] = [[] for _ in range(len(cold_start_users))]

customer_behaviour_data = pd.concat([customer_behaviour_data, cold_start_users], ignore_index=True)

def clean_impressions(row: list):
    return list(map(lambda x: x.split('-')[0], list(filter(lambda x: x[-1] == '1', row))))

customer_behaviour_data = customer_behaviour_data.drop(columns=['Impression ID', 'Time'])
customer_behaviour_data['Impressions'] = customer_behaviour_data['Impressions'].apply(lambda x: clean_impressions(x))

impression_data = customer_behaviour_data[['User ID', 'Impressions']]
customer_behaviour_data = customer_behaviour_data.drop(columns=['Impressions'])
impression_data = impression_data.groupby(by='User ID').agg('sum').reset_index()

user_history_data = customer_behaviour_data[["User ID", "History"]]
user_history_data = user_history_data.groupby('User ID')['History'].agg(lambda x: list(set().union(*x))).reset_index()

customer_behaviour_data = pd.merge(left=user_history_data, right=impression_data, on='User ID', how='left')

customer_behaviour_data

Unnamed: 0,User ID,History,Impressions
0,U1,"[N23571, N24356, N57737, N10646, N40207, N6205...",[N20036]
1,U10,"[N64777, N36699, N9803, N57967, N2945, N9120, ...",[N32536]
2,U10000,"[N25933, N63709, N64273, N47847, N2479, N8572,...","[N50775, N60215, N31958]"
3,U10002,"[N24356, N4607, N49475, N42136, N18030, N10470...","[N35676, N5940, N20477, N9284, N57560, N25673,..."
4,U10004,"[N52665, N18870, N33859, N55805, N43482, N2725...","[N33176, N36779]"
...,...,...,...
49995,U9990,[N6616],[N36779]
49996,U9994,[N52551],[N23513]
49997,U9996,"[N60340, N31165, N8448, N28296, N4719]","[N30290, N496, N20187, N9284]"
49998,U9998,"[N29867, N5102, N49146, N32643, N16233, N8422,...",[N53615]


In [20]:
user_history_data = customer_behaviour_data[['User ID', 'History']]
user_impression_data = customer_behaviour_data[['User ID', 'History', 'Impressions']]

In [21]:
user_history_data = user_history_data.explode('History').reset_index(drop=True)
user_history_data['Clicked'] = 1

user_history_data = pd.merge(left=user_history_data, right=news_data, left_on='History', right_on='News ID', how='left').drop(columns=['News ID'])

subcategory_popularity = user_history_data.groupby(['History', 'Category', 'SubCategory'])['Clicked'].sum().reset_index()
subcategory_popularity.columns = ['News ID', 'Category', 'SubCategory', 'Popularity']

subcategory_popularity = subcategory_popularity.sort_values('Popularity', ascending=False)

subcategory_popularity

Unnamed: 0,News ID,Category,SubCategory,Popularity
13302,N306,movies,movies-celebrity,5659
21002,N42620,lifestyle,lifestylebuzz,5628
23062,N45794,news,newscrime,4323
21341,N43142,sports,basketball_nba,3943
14063,N31801,news,newspolitics,3923
...,...,...,...,...
15870,N3463,sports,mma,1
15871,N34630,sports,football_nfl,1
15872,N34632,news,newsscienceandtechnology,1
21569,N43472,news,newsworld,1


In [22]:
user_impression_data

Unnamed: 0,User ID,History,Impressions
0,U1,"[N23571, N24356, N57737, N10646, N40207, N6205...",[N20036]
1,U10,"[N64777, N36699, N9803, N57967, N2945, N9120, ...",[N32536]
2,U10000,"[N25933, N63709, N64273, N47847, N2479, N8572,...","[N50775, N60215, N31958]"
3,U10002,"[N24356, N4607, N49475, N42136, N18030, N10470...","[N35676, N5940, N20477, N9284, N57560, N25673,..."
4,U10004,"[N52665, N18870, N33859, N55805, N43482, N2725...","[N33176, N36779]"
...,...,...,...
49995,U9990,[N6616],[N36779]
49996,U9994,[N52551],[N23513]
49997,U9996,"[N60340, N31165, N8448, N28296, N4719]","[N30290, N496, N20187, N9284]"
49998,U9998,"[N29867, N5102, N49146, N32643, N16233, N8422,...",[N53615]


In [23]:
def recommend_articles(row):
    if len(row) < 1:
        return subcategory_popularity.head(10)['News ID'].tolist()

    user_categories = subcategory_popularity[subcategory_popularity['News ID'].isin(row)]['Category'].values.tolist()
    user_subcategories = subcategory_popularity[subcategory_popularity['News ID'].isin(row)]['SubCategory'].values.tolist()

    similar_articles_cat = subcategory_popularity[subcategory_popularity['Category'].isin(user_categories)]
    similar_articles_subcat = subcategory_popularity[subcategory_popularity['SubCategory'].isin(user_subcategories)]

    recommended_articles = similar_articles_cat.groupby('Category').head(2)['News ID'].values.tolist()
    recommended_articles += similar_articles_subcat.groupby('SubCategory').head(2)['News ID'].values.tolist()

    return list(set(recommended_articles).union(set(subcategory_popularity.head(2)['News ID'].tolist())) - set(row))

user_impression_data['Recommended Articles'] = user_impression_data['History'].progress_apply(recommend_articles)

100%|██████████| 50000/50000 [09:33<00:00, 87.20it/s] 


In [24]:
def find_intersection(row):
    return list(set(row['Recommended Articles']).intersection(set(row['Impressions'])))

user_impression_data['Intersection'] = user_impression_data.apply(find_intersection, axis=1)
user_impression_data

Unnamed: 0,User ID,History,Impressions,Recommended Articles,Intersection
0,U1,"[N23571, N24356, N57737, N10646, N40207, N6205...",[N20036],"[N44559, N306, N16715, N43142, N47558, N51706,...",[]
1,U10,"[N64777, N36699, N9803, N57967, N2945, N9120, ...",[N32536],"[N8448, N306, N43142, N58264, N42137, N7422, N...",[]
2,U10000,"[N25933, N63709, N64273, N47847, N2479, N8572,...","[N50775, N60215, N31958]","[N4607, N32089, N306, N8448, N12349, N16715, N...",[]
3,U10002,"[N24356, N4607, N49475, N42136, N18030, N10470...","[N35676, N5940, N20477, N9284, N57560, N25673,...","[N44559, N32089, N64408, N12349, N33969, N4314...",[]
4,U10004,"[N52665, N18870, N33859, N55805, N43482, N2725...","[N33176, N36779]","[N306, N43142, N46392, N7422, N51706, N64555, ...",[]
...,...,...,...,...,...
49995,U9990,[N6616],[N36779],"[N45794, N306, N31801, N42620, N46392]",[]
49996,U9994,[N52551],[N23513],"[N55189, N306, N29177, N42620, N871]",[]
49997,U9996,"[N60340, N31165, N8448, N28296, N4719]","[N30290, N496, N20187, N9284]","[N55846, N22816, N33276, N55326, N55743, N306,...",[]
49998,U9998,"[N29867, N5102, N49146, N32643, N16233, N8422,...",[N53615],"[N14761, N33276, N306, N8448, N11101, N33969, ...",[]


In [25]:
def calculate_precision(row):
    return len(row["Intersection"]) / len(row["Recommended Articles"])

user_impression_data['Precision'] = user_impression_data.apply(calculate_precision, axis=1)
user_impression_data["Precision"].mean()

np.float64(8.848235810179634e-05)

In [26]:
def calculate_recall(row):
    return len(row["Intersection"]) / (len(row["Intersection"]) + len(list(set(row["Impressions"]) - set(row["Recommended Articles"]))))

user_impression_data['Recall'] = user_impression_data.apply(calculate_recall, axis=1)
user_impression_data["Recall"].mean()

np.float64(0.001091280747030747)

In [27]:
def calculate_f1_score(row):
    if row['Precision'] == 0 and row['Recall'] == 0:
        return 0
    return (2 * row['Precision'] * row['Recall']) / (row['Precision'] + row['Recall'])

user_impression_data['F1 Score'] = user_impression_data.apply(calculate_f1_score, axis=1)
user_impression_data['F1 Score'].mean()

np.float64(0.00015621377549575883)

In [28]:
def calculate_f_beta(row):
    beta = 2

    if row['Precision'] == 0 and row['Recall'] == 0:
        return 0
    return ((1 + (beta ** 2)) * row['Precision'] * row['Recall']) / ((beta ** 2) * (row['Precision']) + row['Recall'])

user_impression_data['F-Beta Score'] = user_impression_data.apply(calculate_f_beta, axis=1)
user_impression_data['F-Beta Score'].mean()

np.float64(0.0003019557215245324)

In [29]:
user_impression_data.to_csv('../data/results/popularity_approach_results_test.csv', index=False)