<a href="https://colab.research.google.com/github/gokulbytes/personalized-news-recommendation-engine/blob/main/main.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Requirements
import pandas as pd

# behaviors_df

In [2]:
# Load the behaviors log from its headerless TSV file.
# The recorded output (index labels starting at 1 instead of a RangeIndex)
# shows that the file has one more leading column than the four names that
# were originally supplied, so pandas was implicitly promoting that column to
# the index. Declare it explicitly instead of relying on that inference.
# NOTE(review): 'ImpressionID' is an assumed label for that leading column;
# confirm against the dataset description.
behaviors_df = pd.read_csv(
    '/content/behaviors.tsv',
    sep='\t',
    header=None,
    names=['ImpressionID', 'UserID', 'TimeStamp', 'History', 'Impressions'],
    index_col='ImpressionID',
)
behaviors_df.head()

Unnamed: 0,UserID,TimeStamp,History,Impressions
1,U13740,11/11/2019 9:05:58 AM,N55189 N42782 N34694 N45794 N18445 N63302 N104...,N55689-1 N35729-0
2,U91836,11/12/2019 6:11:30 PM,N31739 N6072 N63045 N23979 N35656 N43353 N8129...,N20678-0 N39317-0 N58114-0 N20495-0 N42977-0 N...
3,U73700,11/14/2019 7:01:48 AM,N10732 N25792 N7563 N21087 N41087 N5445 N60384...,N50014-0 N23877-0 N35389-0 N49712-0 N16844-0 N...
4,U34670,11/11/2019 5:28:05 AM,N45729 N2203 N871 N53880 N41375 N43142 N33013 ...,N35729-0 N33632-0 N49685-1 N27581-0
5,U8125,11/12/2019 4:11:21 PM,N10078 N56514 N14904 N33740,N39985-0 N36050-0 N16096-0 N8400-1 N22407-0 N6...


In [3]:
# Print the frame summary: index range, per-column non-null counts and dtypes, memory usage
behaviors_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 156965 entries, 1 to 156965
Data columns (total 4 columns):
 #   Column       Non-Null Count   Dtype 
---  ------       --------------   ----- 
 0   UserID       156965 non-null  object
 1   TimeStamp    156965 non-null  object
 2   History      153727 non-null  object
 3   Impressions  156965 non-null  object
dtypes: object(4)
memory usage: 6.0+ MB


# news_df

In [4]:
# Read the headerless news TSV file, labelling its eight columns explicitly
news_df = pd.read_csv(
    '/content/news.tsv',
    sep='\t',
    header=None,
    names=[
        'NewsID',
        'Category',
        'SubCategory',
        'Title',
        'Abstract',
        'URL',
        'TitleEntities',
        'AbstractEntities',
    ],
)
news_df.head()

Unnamed: 0,NewsID,Category,SubCategory,Title,Abstract,URL,TitleEntities,AbstractEntities
0,N55528,lifestyle,lifestyleroyals,"The Brands Queen Elizabeth, Prince Charles, an...","Shop the notebooks, jackets, and more that the...",https://assets.msn.com/labs/mind/AAGH0ET.html,"[{""Label"": ""Prince Philip, Duke of Edinburgh"",...",[]
1,N19639,health,weightloss,50 Worst Habits For Belly Fat,These seemingly harmless habits are holding yo...,https://assets.msn.com/labs/mind/AAB19MK.html,"[{""Label"": ""Adipose tissue"", ""Type"": ""C"", ""Wik...","[{""Label"": ""Adipose tissue"", ""Type"": ""C"", ""Wik..."
2,N61837,news,newsworld,The Cost of Trump's Aid Freeze in the Trenches...,Lt. Ivan Molchanets peeked over a parapet of s...,https://assets.msn.com/labs/mind/AAJgNsz.html,[],"[{""Label"": ""Ukraine"", ""Type"": ""G"", ""WikidataId..."
3,N53526,health,voices,I Was An NBA Wife. Here's How It Affected My M...,"I felt like I was a fraud, and being an NBA wi...",https://assets.msn.com/labs/mind/AACk2N6.html,[],"[{""Label"": ""National Basketball Association"", ..."
4,N38324,health,medical,"How to Get Rid of Skin Tags, According to a De...","They seem harmless, but there's a very good re...",https://assets.msn.com/labs/mind/AAAKEkt.html,"[{""Label"": ""Skin tag"", ""Type"": ""C"", ""WikidataI...","[{""Label"": ""Skin tag"", ""Type"": ""C"", ""WikidataI..."


In [5]:
# Print the frame summary: index range, per-column non-null counts and dtypes, memory usage
news_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 51282 entries, 0 to 51281
Data columns (total 8 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   NewsID            51282 non-null  object
 1   Category          51282 non-null  object
 2   SubCategory       51282 non-null  object
 3   Title             51282 non-null  object
 4   Abstract          48616 non-null  object
 5   URL               51282 non-null  object
 6   TitleEntities     51279 non-null  object
 7   AbstractEntities  51278 non-null  object
dtypes: object(8)
memory usage: 3.1+ MB


In [6]:
# Discard rows that are exact duplicates across every column and renumber the index from 0
news_df = news_df.drop_duplicates(ignore_index=True)

# Data Transformation

## user_impressions_df

In [7]:
# Take an independent copy of just the UserID and Impressions columns
user_impressions_df = behaviors_df.loc[:, ['UserID', 'Impressions']].copy()

In [8]:
# Discard repeated (UserID, Impressions) rows and renumber the index from 0
user_impressions_df = user_impressions_df.drop_duplicates(ignore_index=True)

In [9]:
# Turn each whitespace-separated Impressions string into a list of tokens
user_impressions_df = user_impressions_df.assign(
    Impressions=user_impressions_df['Impressions'].str.split()
)

In [10]:
# Keep only the impression tokens suffixed with '-1' (clicked) and store the
# part of each token that precedes the first '-' as its NewsID
user_impressions_df['Clicked'] = user_impressions_df['Impressions'].map(
    lambda tokens: [
        token.partition('-')[0]
        for token in tokens
        if token.endswith('-1')
    ]
)

In [11]:
# The raw Impressions lists are no longer needed once Clicked has been derived
user_impressions_df = user_impressions_df.drop(columns=['Impressions'])

In [12]:
# Relabel the 'Clicked' column as 'NewsID' for clarity
user_impressions_df = user_impressions_df.rename(
    {'Clicked': 'NewsID'},
    axis='columns',
)

### user_item_matrix

In [13]:
# Expand the NewsID lists so every list element becomes its own row, with a fresh 0-based index
user_impressions_df_exploded = user_impressions_df.explode('NewsID', ignore_index=True)

In [14]:
# Discard repeated (UserID, NewsID) rows and renumber the index from 0
user_impressions_df_exploded = user_impressions_df_exploded.drop_duplicates(ignore_index=True)

In [15]:
# Build the user-item matrix: one row per UserID, one column per NewsID.
# Every (UserID, NewsID) pair present in the exploded frame is tagged with 1,
# and fill_value=0 marks the combinations that do not occur.
user_item_matrix = (
    user_impressions_df_exploded
    .assign(clicked=1)
    .pivot_table(
        index='UserID',
        columns='NewsID',
        values='clicked',
        aggfunc='mean',  # pandas' default aggregation, spelled out
        fill_value=0,
    )
)

## news_df

In [16]:
# Combine 'Title' and 'Abstract' into a new 'Content' column.
# Missing abstracts are treated as empty text. The final strip removes the
# dangling separator space that would otherwise trail the title in those rows
# (it also trims any other leading/trailing whitespace in the combined text).
news_df['Content'] = (
    news_df['Title'] + ' ' + news_df['Abstract'].fillna('')
).str.strip()

In [17]:
# Remove the 'Abstract', 'TitleEntities', and 'AbstractEntities' columns from the news frame
news_df = news_df.drop(
    ['Abstract', 'TitleEntities', 'AbstractEntities'],
    axis='columns',
)

# Save Files

In [18]:
# Write user_item_matrix (including its index) to a CSV file in the working directory
user_item_matrix.to_csv(path_or_buf='user_item_matrix.csv')

In [19]:
# Write news_df to a CSV file in the working directory, omitting the index
news_df.to_csv(path_or_buf='news_df.csv', index=False)