## Importing Libraries

In [1]:
import pandas as pd
import opendatasets as od

import os

from sklearn.feature_extraction.text import TfidfVectorizer

## Downloading Dataset

In [2]:
# Fetch the MIND news dataset from Kaggle only when the local copy is absent.
dataset_already_present = os.path.exists('../data/mind-news-dataset')
if not dataset_already_present:
    od.download(
        dataset_id_or_url="https://www.kaggle.com/datasets/arashnic/mind-news-dataset/data",
        data_dir='../data/',
    )

## Reading Data

In [3]:
# Column names are supplied explicitly through `names` when reading the TSV.
news_column_headers = [
    "News ID",
    "Category",
    "SubCategory",
    "Title",
    "Abstract",
    "URL",
    "Title Entities",
    "Abstract Entities",
]
# Columns that are read but not used by the rest of the notebook.
unused_news_columns = ['URL', 'Title Entities', 'Abstract Entities']

news_data = pd.read_csv(
    '../data/mind-news-dataset/MINDsmall_train/news.tsv',
    names=news_column_headers,
    delimiter='\t',
).drop(columns=unused_news_columns)

news_data.head()

Unnamed: 0,News ID,Category,SubCategory,Title,Abstract
0,N55528,lifestyle,lifestyleroyals,"The Brands Queen Elizabeth, Prince Charles, an...","Shop the notebooks, jackets, and more that the..."
1,N19639,health,weightloss,50 Worst Habits For Belly Fat,These seemingly harmless habits are holding yo...
2,N61837,news,newsworld,The Cost of Trump's Aid Freeze in the Trenches...,Lt. Ivan Molchanets peeked over a parapet of s...
3,N53526,health,voices,I Was An NBA Wife. Here's How It Affected My M...,"I felt like I was a fraud, and being an NBA wi..."
4,N38324,health,medical,"How to Get Rid of Skin Tags, According to a De...","They seem harmless, but there's a very good re..."


## Preprocessing News Article Data

In [4]:
# Full-range positional slice: every row and every column is selected, so the
# contents of the frame are unchanged by this cell.
# NOTE(review): presumably a leftover hook for sub-sampling during development — confirm.
news_data = news_data.iloc[:, :]

In [5]:
news_data.shape

(51282, 5)

In [6]:
# Missing abstracts become empty strings so the concatenation below does not yield NaN for them.
news_data = news_data.assign(Abstract=news_data['Abstract'].fillna(''))

# Join the four text fields, left to right, with a single space between each.
text_columns = ["Category", "SubCategory", "Title", "Abstract"]
meta_data = news_data[text_columns[0]]
for column in text_columns[1:]:
    meta_data = meta_data + ' ' + news_data[column]

# Keep only the combined text, keyed by article ID.
news_data = (
    news_data
    .assign(**{'News Meta-Data': meta_data})
    .drop(columns=text_columns)
    .set_index('News ID')
)

In [7]:
# Fit a TF-IDF model on the combined article text: English stop words are removed
# and the vocabulary is capped at 500 terms.
# NOTE(review): the outputs saved with this notebook show 5000 feature columns, which
# does not match max_features=500 — re-run to refresh them or confirm the intended cap.
tfidf_vectorizer = TfidfVectorizer(max_features=500, stop_words='english')
tfidf_matrix = tfidf_vectorizer.fit_transform(news_data['News Meta-Data'])

# Convert the result to a dense array and re-attach the 'News ID' index so each row
# stays tied to its article.
news_data = pd.DataFrame(tfidf_matrix.toarray(), index=news_data.index)

news_data

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,...,4990,4991,4992,4993,4994,4995,4996,4997,4998,4999
News ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
N55528,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
N19639,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
N61837,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
N53526,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
N38324,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
N16909,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
N47585,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
N7482,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
N34418,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [8]:
# Persist the per-article feature frame to Parquet.
# NOTE(review): the file name says "cosine_sim_matrix", but `news_data` at this point holds
# the TF-IDF feature values computed in the previous cell, not pairwise similarities —
# confirm that downstream readers expect this.
news_data.to_parquet('../data/item_cosine_sim_matrix.parquet')

In [9]:
news_data.shape

(51282, 5000)

In [10]:
news_data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 51282 entries, N55528 to N44276
Columns: 5000 entries, 0 to 4999
dtypes: float64(5000)
memory usage: 1.9+ GB


## Preprocessing User Behavior Data

In [11]:
# Column names are supplied explicitly through `names` when reading the TSV.
behaviour_column_headers = [
    "Impression ID",
    "User ID",
    "Time",
    "History",
    "Impressions",
]
customer_behaviour_data = pd.read_csv(
    '../data/mind-news-dataset/MINDsmall_train/behaviors.tsv',
    names=behaviour_column_headers,
    delimiter='\t',
)

In [12]:
customer_behaviour_data.head()

Unnamed: 0,Impression ID,User ID,Time,History,Impressions
0,1,U13740,11/11/2019 9:05:58 AM,N55189 N42782 N34694 N45794 N18445 N63302 N104...,N55689-1 N35729-0
1,2,U91836,11/12/2019 6:11:30 PM,N31739 N6072 N63045 N23979 N35656 N43353 N8129...,N20678-0 N39317-0 N58114-0 N20495-0 N42977-0 N...
2,3,U73700,11/14/2019 7:01:48 AM,N10732 N25792 N7563 N21087 N41087 N5445 N60384...,N50014-0 N23877-0 N35389-0 N49712-0 N16844-0 N...
3,4,U34670,11/11/2019 5:28:05 AM,N45729 N2203 N871 N53880 N41375 N43142 N33013 ...,N35729-0 N33632-0 N49685-1 N27581-0
4,5,U8125,11/12/2019 4:11:21 PM,N10078 N56514 N14904 N33740,N39985-0 N36050-0 N16096-0 N8400-1 N22407-0 N6...


In [13]:
# Keep only the rows that have a non-missing 'History' value.
customer_behaviour_data = customer_behaviour_data.dropna(subset=['History'])

In [14]:
# Turn the space-separated 'History' and 'Impressions' strings into Python lists.
customer_behaviour_data = customer_behaviour_data.assign(
    History=customer_behaviour_data['History'].str.split(' '),
    Impressions=customer_behaviour_data['Impressions'].str.split(' '),
)

In [15]:
def clean_impressions(row: list):
    """Return the IDs of the clicked articles from one impression log.

    Each token in ``row`` is expected to have the form ``"<news id>-<label>"``.
    Only tokens whose label is ``1`` are kept, and the ``-1`` suffix is removed
    from them.

    Args:
        row: Impression tokens such as ``["N55689-1", "N35729-0"]``. ``None`` or a
            float (how pandas represents a missing cell) is treated as "no clicks".

    Returns:
        A list with the news IDs of the clicked tokens, in their original order.
    """
    # A missing impressions cell reaches this function as None/NaN rather than a list.
    if row is None or isinstance(row, float):
        return []

    clicked_suffix = '-1'
    # `endswith` is safe on empty tokens (produced by consecutive or trailing spaces
    # when splitting on ' ') and does not mistake an unlabelled ID ending in "1"
    # for a click.
    return [token[:-len(clicked_suffix)] for token in row if token.endswith(clicked_suffix)]

# 'Impression ID' and 'Time' are not used by the per-user aggregation below.
customer_behaviour_data = customer_behaviour_data.drop(columns=['Impression ID', 'Time'])
# Reduce every impression log to the IDs the user actually clicked.
customer_behaviour_data['Impressions'] = customer_behaviour_data['Impressions'].apply(clean_impressions)

impression_data = customer_behaviour_data[['User ID', 'Impressions']]
customer_behaviour_data = customer_behaviour_data.drop(columns=['Impressions'])
# Summing Python lists concatenates them, giving one combined click list per user.
impression_data = impression_data.groupby(by='User ID').agg('sum').reset_index()

user_history_data = customer_behaviour_data[["User ID", "History"]]
# Merge each user's history lists into one de-duplicated list. `dict.fromkeys` keeps the
# first-seen order, so the result is the same on every run; a `set` union would return
# the IDs in an arbitrary order that can change between kernel sessions.
user_history_data = (
    user_history_data
    .groupby('User ID')['History']
    .agg(lambda histories: list(dict.fromkeys(
        news_id for history in histories for news_id in history
    )))
    .reset_index()
)

# Left join: every user with a history is kept, with their clicked impressions attached.
customer_behaviour_data = pd.merge(left=user_history_data, right=impression_data, on='User ID', how='left')

customer_behaviour_data

Unnamed: 0,User ID,History,Impressions
0,U100,"[N53465, N50095, N33998, N42330, N20121, N5574...",[N7800]
1,U1000,"[N1789, N29641, N41244]","[N29739, N7670, N58656, N53875]"
2,U10001,"[N47937, N61319, N34562, N56447, N56814, N2725...","[N1031, N10833, N35937]"
3,U10003,"[N61052, N1282, N50839, N28257, N41668, N31431...","[N18708, N57090, N55689]"
4,U10008,"[N36526, N32312, N10376, N33117, N59704, N6304...",[N15405]
...,...,...,...
49103,U9993,"[N47458, N14114]","[N22257, N30648]"
49104,U9995,"[N16292, N11629, N44399, N36053, N16043, N4844...","[N11817, N37204, N57426, N19444, N10812, N4714..."
49105,U9996,"[N8448, N4719, N31165, N28296, N60340]","[N287, N47098, N23446]"
49106,U9997,"[N11929, N11727, N64836, N9072, N46990, N46759...","[N48410, N35738, N39269, N23081, N16502, N4245..."


In [16]:
# Split the merged frame into two per-user views, each keeping the 'User ID' key.
user_history_data, user_impression_data = (
    customer_behaviour_data[['User ID', list_column]]
    for list_column in ('History', 'Impressions')
)

In [17]:
# Full-range positional slices: every row and every column is selected, so the
# contents of both frames are unchanged by this cell.
# NOTE(review): presumably a leftover hook for sub-sampling during development — confirm.
user_history_data = user_history_data.iloc[:, :]
user_impression_data = user_impression_data.iloc[:, :]

In [18]:
user_history_data.shape, user_impression_data.shape

((49108, 2), (49108, 2))

In [19]:
# Persist both per-user frames (list-valued columns) to Parquet, impressions first.
for parquet_path, frame in (
    ('../data/user_impression_data.parquet', user_impression_data),
    ('../data/user_history_data.parquet', user_history_data),
):
    frame.to_parquet(parquet_path)

In [20]:
# Build a user-by-article matrix: one row per (user, article) pair after exploding the
# history lists, counted, then pivoted wide with 0 where a user has no entry for an article.
user_history_data = (
    user_history_data
    .explode('History')
    .groupby(['User ID', 'History'])
    .size()
    .reset_index(name='User Clicks')
    .pivot(index='User ID', columns='History', values='User Clicks')
    .fillna(0)
)

In [21]:
user_history_data

History,N10,N100,N1000,N10000,N10001,N10002,N10003,N10004,N10009,N1001,...,N9977,N9978,N998,N9980,N9984,N9987,N9988,N9990,N9992,N9993
User ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
U100,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
U1000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
U10001,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
U10003,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
U10008,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
U9993,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
U9995,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
U9996,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
U9997,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [22]:
# Persist the pivoted user-by-article matrix (index: 'User ID', columns: news IDs
# taken from 'History') to Parquet.
user_history_data.to_parquet('../data/user_history_matrix.parquet')

In [2]:
# NOTE(review): bare numeric literal in a cell whose execution count is out of sequence;
# no other cell shown here refers to it — presumably scratch work, consider removing.
3.5269028871391074e-05

3.5269028871391074e-05

In [None]:
# NOTE(review): unexecuted cell holding only a numeric literal; looks like leftover
# scratch work — confirm and remove.
0.00003