In [19]:
import math
import os
import random

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

In [20]:
# Read the shared-articles log directly from the zipped CSV and preview the
# first five rows (five is the default for head()).
articles_df = pd.read_csv(filepath_or_buffer='data/shared_articles.csv.zip')
articles_df.head()

Unnamed: 0,timestamp,eventType,contentId,authorPersonId,authorSessionId,authorUserAgent,authorRegion,authorCountry,contentType,url,title,text,lang
0,1459192779,CONTENT REMOVED,-6451309518266745024,4340306774493623681,8940341205206233829,,,,HTML,http://www.nytimes.com/2016/03/28/business/dea...,"Ethereum, a Virtual Currency, Enables Transact...",All of this work is still very early. The firs...,en
1,1459193988,CONTENT SHARED,-4110354420726924665,4340306774493623681,8940341205206233829,,,,HTML,http://www.nytimes.com/2016/03/28/business/dea...,"Ethereum, a Virtual Currency, Enables Transact...",All of this work is still very early. The firs...,en
2,1459194146,CONTENT SHARED,-7292285110016212249,4340306774493623681,8940341205206233829,,,,HTML,http://cointelegraph.com/news/bitcoin-future-w...,Bitcoin Future: When GBPcoin of Branson Wins O...,The alarm clock wakes me at 8:00 with stream o...,en
3,1459194474,CONTENT SHARED,-6151852268067518688,3891637997717104548,-1457532940883382585,,,,HTML,https://cloudplatform.googleblog.com/2016/03/G...,Google Data Center 360° Tour,We're excited to share the Google Data Center ...,en
4,1459194497,CONTENT SHARED,2448026894306402386,4340306774493623681,8940341205206233829,,,,HTML,https://bitcoinmagazine.com/articles/ibm-wants...,"IBM Wants to ""Evolve the Internet"" With Blockc...",The Aite Group projects the blockchain market ...,en


In [21]:
# Keep only the rows whose eventType is exactly 'CONTENT SHARED'; every other
# event type (e.g. the 'CONTENT REMOVED' row in the preview above) is dropped.
articles_df = articles_df.loc[articles_df['eventType'].eq('CONTENT SHARED')]

In [22]:
# Combine title and body text into a single 'content' column, separated by a space.
# `assign` returns a new frame rather than writing a column into the
# boolean-filtered frame, which avoids pandas' SettingWithCopyWarning.
articles_df = articles_df.assign(content=articles_df['title'] + " " + articles_df['text'])

In [23]:
# Re-index the articles by contentId and keep only the four columns that are
# used from here on (same column order as before: content, title, url, lang).
articles_df = articles_df.set_index('contentId')[['content', 'title', 'url', 'lang']]

In [24]:
# Display the article table, now indexed by contentId, as this cell's output.
articles_df

Unnamed: 0_level_0,content,title,url,lang
contentId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
-4110354420726924665,"Ethereum, a Virtual Currency, Enables Transact...","Ethereum, a Virtual Currency, Enables Transact...",http://www.nytimes.com/2016/03/28/business/dea...,en
-7292285110016212249,Bitcoin Future: When GBPcoin of Branson Wins O...,Bitcoin Future: When GBPcoin of Branson Wins O...,http://cointelegraph.com/news/bitcoin-future-w...,en
-6151852268067518688,Google Data Center 360° Tour We're excited to ...,Google Data Center 360° Tour,https://cloudplatform.googleblog.com/2016/03/G...,en
2448026894306402386,"IBM Wants to ""Evolve the Internet"" With Blockc...","IBM Wants to ""Evolve the Internet"" With Blockc...",https://bitcoinmagazine.com/articles/ibm-wants...,en
-2826566343807132236,IEEE to Talk Blockchain at Cloud Computing Oxf...,IEEE to Talk Blockchain at Cloud Computing Oxf...,http://www.coindesk.com/ieee-blockchain-oxford...,en
...,...,...,...,...
9213260650272029784,"Conheça a Liga IoT, plataforma de inovação abe...","Conheça a Liga IoT, plataforma de inovação abe...",https://startupi.com.br/2017/02/liga-ventures-...,pt
-3295913657316686039,Amazon takes on Skype and GoToMeeting with its...,Amazon takes on Skype and GoToMeeting with its...,https://thenextweb.com/apps/2017/02/14/amazon-...,en
3618271604906293310,"Code.org 2016 Annual Report February 9, 2017 -...",Code.org 2016 Annual Report,https://code.org/about/2016,en
6607431762270322325,JPMorgan Software Does in Seconds What Took La...,JPMorgan Software Does in Seconds What Took La...,https://www.bloomberg.com/news/articles/2017-0...,en


In [25]:
# Read the user-interaction log from the zipped CSV and preview the first ten rows.
interactions_df = pd.read_csv(filepath_or_buffer='data/users_interactions.csv.zip')
interactions_df.head(n=10)

Unnamed: 0,timestamp,eventType,contentId,personId,sessionId,userAgent,userRegion,userCountry
0,1465413032,VIEW,-3499919498720038879,-8845298781299428018,1264196770339959068,,,
1,1465412560,VIEW,8890720798209849691,-1032019229384696495,3621737643587579081,Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_2...,NY,US
2,1465416190,VIEW,310515487419366995,-1130272294246983140,2631864456530402479,,,
3,1465413895,FOLLOW,310515487419366995,344280948527967603,-3167637573980064150,,,
4,1465412290,VIEW,-7820640624231356730,-445337111692715325,5611481178424124714,,,
5,1465413742,VIEW,310515487419366995,-8763398617720485024,1395789369402380392,Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebK...,MG,BR
6,1465415950,VIEW,-8864073373672512525,3609194402293569455,1143207167886864524,,,
7,1465415066,VIEW,-1492913151930215984,4254153380739593270,8743229464706506141,Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/53...,SP,BR
8,1465413762,VIEW,310515487419366995,344280948527967603,-3167637573980064150,,,
9,1465413771,VIEW,3064370296170038610,3609194402293569455,1143207167886864524,,,


In [26]:
# Discard interactions whose contentId is not present in the article table's
# index, so every remaining interaction refers to a known article.
interactions_df = interactions_df.loc[interactions_df['contentId'].isin(articles_df.index)]

In [27]:
# Numeric weight assigned to each interaction event type.
event_type_strength = {
    'VIEW': 1.0,
    'LIKE': 2.0,
    'BOOKMARK': 2.5,
    'FOLLOW': 3.0,
    'COMMENT CREATED': 4.0,
}

# Fail fast (with a KeyError, as a direct dictionary lookup would) if the data
# contains an event type that has no weight, instead of silently producing NaN.
unmapped_event_types = set(interactions_df['eventType'].unique()) - set(event_type_strength)
if unmapped_event_types:
    raise KeyError(f'No strength defined for event types: {sorted(unmapped_event_types, key=str)}')

# Vectorised dictionary lookup. `assign` builds a new frame rather than adding a
# column to the boolean-filtered frame, which avoids SettingWithCopyWarning.
interactions_df = interactions_df.assign(
    eventStrength=interactions_df['eventType'].map(event_type_strength)
)

In [28]:
# Minimum number of distinct articles a user must have interacted with to be kept.
INTERACTIONS_THRESHOLD = 15

# The first size() collapses repeated events on the same (personId, contentId)
# pair into one row; the second size() then counts those pairs per user.
users_interactions_count_df = (
    interactions_df
    .groupby(['personId', 'contentId'])
    .size()
    .groupby('personId')
    .size()
)
print('# users: %d' % len(users_interactions_count_df))

# One-column frame (personId) listing the users that reach the threshold.
users_with_enough_interactions_df = (
    users_interactions_count_df
    .loc[users_interactions_count_df.ge(INTERACTIONS_THRESHOLD)]
    .reset_index()
    .loc[:, ['personId']]
)
print(f'# users with at least {INTERACTIONS_THRESHOLD} interactions: {len(users_with_enough_interactions_df)}')

# users: 1895
# users with at least 15 interactions: 579


In [29]:
print('# of interactions: %d' % len(interactions_df))

# Keep only the interactions made by users that passed the activity threshold.
# The join key has the same name on both sides, so a single `on=` is enough.
interactions_from_selected_users_df = interactions_df.merge(
    users_with_enough_interactions_df,
    how='right',
    on='personId',
)

# Report the threshold from the constant so the message cannot drift from the
# value that was actually used for filtering.
print(f'# of interactions from users with at least {INTERACTIONS_THRESHOLD} interactions: '
      f'{len(interactions_from_selected_users_df)}')

# of interactions: 72269
# of interactions from users with at least 15 interactions: 62687


In [30]:
def smooth_user_preference(x):
    """Compress an accumulated interaction strength with a base-2 logarithm.

    Args:
        x: Non-negative number (the summed event strength of one user/item pair).

    Returns:
        float: log2(1 + x); 0 maps to 0.0 and 1 maps to 1.0.

    Raises:
        ValueError: if ``1 + x`` is not strictly positive.
    """
    # math.log2 is usually more accurate than math.log(value, 2), which divides
    # two natural logarithms and can be off by one ulp on exact powers of two.
    return math.log2(1 + x)
    
# Add up the strengths of all events a user generated on the same article,
# pass each total through smooth_user_preference, and flatten the result back
# into personId / contentId / eventStrength columns.
interactions_full_df = (
    interactions_from_selected_users_df
    .groupby(['personId', 'contentId'])['eventStrength']
    .sum()
    .apply(smooth_user_preference)
    .reset_index()
)
print('# of unique user/item interactions: %d' % len(interactions_full_df))
interactions_full_df.head(10)

# of unique user/item interactions: 34397


Unnamed: 0,personId,contentId,eventStrength
0,-9223121837663643404,-8949113594875411859,1.0
1,-9223121837663643404,-8377626164558006982,1.0
2,-9223121837663643404,-8208801367848627943,1.0
3,-9223121837663643404,-8187220755213888616,1.0
4,-9223121837663643404,-7423191370472335463,3.169925
5,-9223121837663643404,-7331393944609614247,1.0
6,-9223121837663643404,-6872546942144599345,1.0
7,-9223121837663643404,-6728844082024523434,1.0
8,-9223121837663643404,-6590819806697898649,1.0
9,-9223121837663643404,-6558712014192834002,1.584963


# Train-test split

In [26]:
# 80/20 hold-out split. Stratifying on personId spreads every user's
# interactions proportionally over both parts; the fixed seed makes the split
# reproducible.
interactions_train_df, interactions_test_df = train_test_split(
    interactions_full_df,
    test_size=0.20,
    random_state=42,
    stratify=interactions_full_df['personId'],
)

print('# interactions on Train set: %d' % len(interactions_train_df))
print('# interactions on Test set: %d' % len(interactions_test_df))

# interactions on Train set: 27517
# interactions on Test set: 6880


In [27]:
# Number of distinct users in the full, train and test frames (in that order);
# equal values mean no user was lost by the split.
tuple(
    split_df['personId'].nunique()
    for split_df in (interactions_full_df, interactions_train_df, interactions_test_df)
)

(579, 579, 579)

In [28]:
# Per-user interaction counts in each split, side by side (aligned on personId).
# `keys` supplies the column labels for the concatenated Series.
interaction_per_person_count = pd.concat(
    [
        interactions_train_df.groupby('personId')['contentId'].count(),
        interactions_test_df.groupby('personId')['contentId'].count(),
    ],
    axis=1,
    keys=['train', 'test'],
)
interaction_per_person_count

Unnamed: 0_level_0,train,test
personId,Unnamed: 1_level_1,Unnamed: 2_level_1
-9223121837663643404,34,9
-9172914609055320039,19,5
-9120685872592674274,19,5
-9109785559521267180,34,9
-9047547311469006438,19,5
...,...,...
9105269044962898535,14,3
9109075639526981934,58,15
9148269800512008413,41,10
9187866633451383747,14,3


In [29]:
# Re-index the full, train and test frames by personId so that evaluation can
# look up a user's interactions through the index.
interactions_full_indexed_df, interactions_train_indexed_df, interactions_test_indexed_df = (
    split_df.set_index('personId')
    for split_df in (interactions_full_df, interactions_train_df, interactions_test_df)
)

In [30]:
# Display the training interactions, now indexed by personId.
interactions_train_indexed_df

Unnamed: 0_level_0,contentId,eventStrength
personId,Unnamed: 1_level_1,Unnamed: 2_level_1
6120111409505648242,7202701636941380671,1.000000
-1479311724257856983,3149164017776669829,2.000000
-5070718731712624222,-2097075598039554565,2.700440
4313045637915476309,-7047448754687279385,1.000000
-7531858294361854119,-9081753261356157170,2.169925
...,...,...
-8719462623048086192,-6999287066519531005,1.584963
8968131284214320024,-7130425546996181071,1.000000
-1352064057049251194,8119274153437896343,1.000000
-8853658195208337106,3818189513627822856,1.584963


In [31]:
# Display the test interactions, now indexed by personId.
interactions_test_indexed_df

Unnamed: 0_level_0,contentId,eventStrength
personId,Unnamed: 1_level_1,Unnamed: 2_level_1
-8020832670974472349,3906974906788964502,1.000000
-1496589699638750920,3170775058142440102,1.000000
-4355190001103094206,-4754223659064624252,1.000000
-3216836291203563246,4068119742428755460,1.000000
-174458633445209100,8657408509986329668,2.000000
...,...,...
2901546026085255870,1992928170409443117,1.000000
-5823562314929634489,-8722526817358894474,2.000000
2944202749145946530,6340108943344143104,1.000000
1895326251577378793,7065704533945771463,1.584963


In [40]:
# Create the output directory first: to_csv does not create missing parent
# directories, so on a fresh checkout the export would otherwise fail.
os.makedirs('processed_data', exist_ok=True)

# Persist the personId-indexed training interactions (the index is written as
# the first CSV column).
# NOTE(review): `interactions_train_indexed_df` is rebound by the
# train/validation/test section further down; run this cell before that
# section so the 80/20 training split is the one that gets saved.
interactions_train_indexed_df.to_csv('processed_data/interactions_train.csv')

In [47]:
# Persist the personId-indexed test interactions.
# NOTE(review): `interactions_test_indexed_df` is rebound by the
# train/validation/test section further down; run this cell before that
# section so the 80/20 test split is the one that gets saved.
interactions_test_indexed_df.to_csv(path_or_buf='processed_data/interactions_test.csv')

In [55]:
# Persist the complete (unsplit) personId-indexed interaction table.
interactions_full_indexed_df.to_csv(path_or_buf='processed_data/interactions_full.csv')

In [53]:
# Persist the cleaned article table; contentId is its index and is written as
# the first CSV column.
articles_df.to_csv(path_or_buf='processed_data/articles_df.csv')

# Train-validation-test split

In [31]:
# Three-way split done in two steps, both stratified on personId and seeded:
#   1. hold out 40% of all interactions;
#   2. cut that hold-out in half, giving roughly 60/20/20 train/validation/test.
# NOTE: this rebinds `interactions_train_df` and `interactions_test_df`, which
# the 80/20 section above also defines.
interactions_train_df, interactions_validation_and_test_df = train_test_split(
    interactions_full_df,
    test_size=0.40,
    random_state=42,
    stratify=interactions_full_df['personId'],
)

interactions_validation_df, interactions_test_df = train_test_split(
    interactions_validation_and_test_df,
    test_size=0.50,
    random_state=42,
    stratify=interactions_validation_and_test_df['personId'],
)

print('# interactions on Train set: %d' % len(interactions_train_df))
print('# interactions on Validation set: %d' % len(interactions_validation_df))
print('# interactions on Test set: %d' % len(interactions_test_df))

# interactions on Train set: 20638
# interactions on Validation set: 6879
# interactions on Test set: 6880


In [33]:
# Per-user interaction counts in the train, validation and test splits, side by
# side (aligned on personId). `keys` supplies the column labels.
interaction_per_person_count = pd.concat(
    [
        interactions_train_df.groupby('personId')['contentId'].count(),
        interactions_validation_df.groupby('personId')['contentId'].count(),
        interactions_test_df.groupby('personId')['contentId'].count(),
    ],
    axis=1,
    keys=['train', 'validation', 'test'],
)
interaction_per_person_count

Unnamed: 0_level_0,train,validation,test
personId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
-9223121837663643404,26,8,9
-9172914609055320039,14,5,5
-9120685872592674274,14,5,5
-9109785559521267180,26,8,9
-9047547311469006438,14,5,5
...,...,...,...
9105269044962898535,10,4,3
9109075639526981934,44,14,15
9148269800512008413,31,10,10
9187866633451383747,10,4,3


In [34]:
# Re-index the full frame and all three splits by personId so that evaluation
# can look up a user's interactions through the index.
(
    interactions_full_indexed_df,
    interactions_train_indexed_df,
    interactions_validation_indexed_df,
    interactions_test_indexed_df,
) = (
    split_df.set_index('personId')
    for split_df in (
        interactions_full_df,
        interactions_train_df,
        interactions_validation_df,
        interactions_test_df,
    )
)

In [35]:
# Persist the three-way split. The train and test files carry a
# '_with_validation' suffix, so they do not overwrite the 80/20 exports.
interactions_train_indexed_df.to_csv(path_or_buf='processed_data/interactions_train_with_validation.csv')
interactions_validation_indexed_df.to_csv(path_or_buf='processed_data/interactions_validation.csv')
interactions_test_indexed_df.to_csv(path_or_buf='processed_data/interactions_test_with_validation.csv')