In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
import seaborn as sns
import matplotlib.pyplot as plt
import random
from sklearn.metrics.pairwise import cosine_similarity
from gensim.corpora.dictionary import Dictionary
from gensim.models.tfidfmodel import TfidfModel
from gensim.similarities import MatrixSimilarity



In [2]:
# Load consumer transactions the datasets
consumer_transactions = pd.read_csv('data/consumer_transanctions.csv')
consumer_transactions.head(2)

  consumer_transactions = pd.read_csv('data/consumer_transanctions.csv')


Unnamed: 0,event_timestamp,interaction_type,item_id,consumer_id,consumer_session_id,consumer_device_info,consumer_location,country
0,1465413032,content_watched,-3499919498720038879,-8845298781299428018,1264196770339959068,,,
1,1465412560,content_watched,8890720798209849691,-1032019229384696495,3621737643587579081,Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_2...,NY,US


In [3]:
platform_content = pd.read_csv('data/platform_content.csv')
platform_content.head(2)

Unnamed: 0,event_timestamp,interaction_type,item_id,producer_id,producer_session_id,producer_device_info,producer_location,producer_country,item_type,item_url,title,text_description,language
0,1459192779,content_pulled_out,-6451309518266745024,4340306774493623681,8940341205206233829,,,,HTML,http://www.nytimes.com/2016/03/28/business/dea...,"Ethereum, a Virtual Currency, Enables Transact...",All of this work is still very early. The firs...,en
1,1459193988,content_present,-4110354420726924665,4340306774493623681,8940341205206233829,,,,HTML,http://www.nytimes.com/2016/03/28/business/dea...,"Ethereum, a Virtual Currency, Enables Transact...",All of this work is still very early. The firs...,en


In [4]:
# Step 1: Impute Ratings Based on Interaction Type
# Assign a rating score to each interaction type
interaction_weights = {
    "content_followed" : 5,
    "content_commented_on":4,
    "content_saved":3,
    "content_liked":2,
    "content_watched":1,
}

# Map interaction weights to consumer transactions
consumer_transactions['rating'] = consumer_transactions['interaction_type'].map(interaction_weights)

# Drop interactions without a defined weight
consumer_transactions = consumer_transactions.dropna(subset=['rating'])

# Step 2: Filter English Content
# Filter platform content to retain only English articles
platform_content = platform_content[platform_content['language'] == 'en']

# Step 3: Merge Datasets
# Merge consumer transactions and platform content on `item_id` to connect interactions with content
merged_data = pd.merge(
    consumer_transactions,
    platform_content[['item_id', 'title', 'text_description', 'item_type', 'language']],
    on='item_id',
    how='inner'
)
merged_data
# # Step 4: Data Cleanup
# # Remove articles that have been pulled out by filtering for only active content in platform content
# active_content_ids = platform_content[platform_content['interaction_type'] == 'content_present']['item_id']
# merged_data = merged_data[merged_data['item_id'].isin(active_content_ids)]

# # Step 5: Create User-Item Matrix for Collaborative Filtering
# # Pivot the data to create a user-item matrix
# user_item_matrix = merged_data.pivot_table(index='consumer_id',
#                                            columns='item_id', 
#                                            values='rating', 
#                                            fill_value=0)


Unnamed: 0,event_timestamp,interaction_type,item_id,consumer_id,consumer_session_id,consumer_device_info,consumer_location,country,rating,title,text_description,item_type,language
0,1465413032,content_watched,-3499919498720038879,-8845298781299428018,1264196770339959068,,,,1,Hiri wants to fix the workplace email problem,Hiri is the latest startup trying to fix email...,HTML,en
1,1465412560,content_watched,8890720798209849691,-1032019229384696495,3621737643587579081,Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_2...,NY,US,1,Top 10 Intranet Trends of 2016,"Summary: Hero images, carousels, fat footers, ...",HTML,en
2,1465412290,content_watched,-7820640624231356730,-445337111692715325,561148 1178424124714,,,,1,How This Googler Redesigned The Workweek,Jake Knapp has always been concerned about the...,HTML,en
3,1465415066,content_watched,-1492913151930215984,4254153380739593270,8743229464706506141,Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/53...,SP,BR,1,Chrome DevTools - Console API Reference,The DevTools docs have moved! Read the latest ...,HTML,en
4,1465413771,content_watched,3064370296170038610,3609194402293569455,1143207167886864524,,,,1,"Google, Amazon and the upcoming battle over AI...",The stage is set for the coming battle between...,HTML,en
...,...,...,...,...,...,...,...,...,...,...,...,...,...
47257,1485187998,content_watched,569574447134368517,-5230721907253934520,-1055756461332933762,Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_2...,SP,BR,1,Mastering Bash and Terminal,If there is one tool that every developer uses...,HTML,en
47258,1485190133,content_watched,5484061377044071389,3609194402293569455,-344378995821744418,Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebK...,SP,BR,1,The Ultimate Solution to Versioning REST APIs:...,Versioning your API is terrifying. If you push...,HTML,en
47259,1485190425,content_watched,-5813211845057621660,102305705598210278,5527770709392883642,Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/53...,SP,BR,1,Communication patterns in a Microservice world,"Ok, so you drank the microservice kool-aid and...",HTML,en
47260,1485190072,content_watched,-1999468346928419252,-9196668942822132778,-8300596454915870873,Mozilla/5.0 (Windows NT 10.0; Win64; x64) Appl...,SP,BR,1,"Docker - Build, Ship, and Run Any App, Anywhere",AWS users are now using Docker containers to h...,HTML,en


In [5]:
# Step 4: Data Cleanup
# Remove articles that have been pulled out by filtering for only active content in platform content
active_content_ids = platform_content[platform_content['interaction_type'] == 'content_present']['item_id']
merged_data = merged_data[merged_data['item_id'].isin(active_content_ids)]

# Step 5: Create User-Item Matrix for Collaborative Filtering
# Pivot the data to create a user-item matrix
user_item_matrix = merged_data.pivot_table(index='consumer_id',
                                           columns='item_id', 
                                           values='rating', 
                                           fill_value=0)


In [6]:
user_item_matrix

item_id,-9192549002213406534,-9190737901804729417,-9189659052158407108,-9171475473795142532,-9166778629773133902,-9160910454530522563,-9153494109165200346,-9152398073968262186,-9137036168156595470,-9128741757954228992,...,9151634133568930081,9158289456544908688,9168028029170358424,9207286802575546269,9208127165664287660,9209629151177723638,9215261273565326920,9217155070834564627,9220445660318725468,9222265156747237864
consumer_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
-9223121837663643404,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
-9212075797126931087,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
-9207251133131336884,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
-9199575329909162940,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
-9196668942822132778,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9165571805999894845,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9187866633451383747,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9191849144618614467,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9199170757466086545,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [7]:
# compute user similarity matrix
user_similarity = cosine_similarity(user_item_matrix)
user_similarity

array([[1.        , 0.        , 0.        , ..., 0.05752766, 0.        ,
        0.06447092],
       [0.        , 1.        , 0.29277002, ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.29277002, 1.        , ..., 0.41403934, 0.36514837,
        0.        ],
       ...,
       [0.05752766, 0.        , 0.41403934, ..., 1.        , 0.18898224,
        0.        ],
       [0.        , 0.        , 0.36514837, ..., 0.18898224, 1.        ,
        0.        ],
       [0.06447092, 0.        , 0.        , ..., 0.        , 0.        ,
        1.        ]])