In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import OneHotEncoder
from scipy.sparse import coo_array, hstack, coo_matrix

In [2]:
# Load the behaviours log: file column 0 becomes the index, columns 1, 3 and 4 are kept.
interactions = pd.read_csv(
    'Sample data/train/behaviors.tsv',
    sep='\t',
    header=None,
    index_col=0,
    usecols=[0, 1, 3, 4],
)
interactions.columns = ['uID', 'history', 'impLog']

# Each impression log is a space-separated run of "<id>-<flag>" tokens;
# turn it into a list of (id, int(flag)) tuples, splitting every token once.
interactions['impLog'] = interactions['impLog'].apply(
    lambda log: [
        (parts[0], int(parts[1]))
        for parts in (token.split('-') for token in log.split(' '))
    ]
)
# Whitespace-split the history string into a list of tokens.
interactions['history'] = interactions['history'].str.split()

In [3]:
# Read the news table; file column 0 is used as the index.
news = pd.read_csv(
    'Sample data/train/news.tsv',
    sep='\t',
    header=None,
    index_col=0,
)
news.columns = ["Category", "SubCategory", "Title", "Abstract", "URL", "Entities", "RelatedEntities"]
# Keep only the columns used further down (rebinding instead of mutating in place).
news = news.drop(columns=["URL", "Entities", "RelatedEntities"])

In [4]:
# tf-idf vectorization
def vectorize(data):
    """Return the TF-IDF document-term matrix of ``data``.

    A fresh ``TfidfVectorizer`` capped at 1000 features is fitted on ``data``
    and the same ``data`` is transformed with it.

    Args:
        data: iterable of text documents.

    Returns:
        The matrix produced by the vectorizer, one row per document.
    """
    return TfidfVectorizer(max_features=1000).fit_transform(data)
# TF-IDF features of the news titles (one row per row of `news`).
tfidf = vectorize(news['Title'])
# one-hot encoding
def one_hot(data):
    """Fit a fresh ``OneHotEncoder`` on ``data`` and return the encoded result.

    Args:
        data: 2-D array-like / DataFrame of categorical columns.

    Returns:
        Whatever ``OneHotEncoder.fit_transform`` returns for ``data``.
    """
    return OneHotEncoder().fit_transform(data)
# One-hot matrices for the two label columns (double brackets select a one-column frame).
one_hot_category = one_hot(news[['Category']])
one_hot_subcategory = one_hot(news[['SubCategory']])
# Column-stack the TF-IDF, category and sub-category features into one sparse matrix.
news_vector = hstack([tfidf, one_hot_category, one_hot_subcategory])
# Map every index label of `news` to its dense feature row.
news_map = {
    news_id: feature_row
    for news_id, feature_row in zip(news.index, news_vector.toarray())
}


In [5]:
# Category / sub-category count encoding of each user's history

def process_row(row, categories):
    """Count how often each known category occurs in ``row``.

    Args:
        row: iterable of category labels; labels not present in
            ``categories`` are ignored.
        categories: iterable of the labels to count.

    Returns:
        A list of counts, one per distinct label in ``categories``,
        ordered by the sorted labels.
    """
    tally = dict.fromkeys(categories, 0)
    for label in row:
        if label in tally:
            tally[label] += 1
    # Emit the counts in sorted-label order so every row uses the same layout.
    return [tally[label] for label in sorted(tally)]

def labels_one_hot(category):
    """Build per-row count vectors of ``news[category]`` labels found in each history.

    For every row of the global ``interactions`` frame, the IDs listed in
    ``history`` are translated to their ``news[category]`` label and the
    labels are counted with ``process_row``.

    Side effect: the intermediate label lists are stored in
    ``interactions[category]``.

    Args:
        category: name of a column of the global ``news`` frame.

    Returns:
        A Series aligned with ``interactions`` whose values are the count
        lists returned by ``process_row``.

    Raises:
        KeyError: if a history ID is not present in ``news.index``.
    """
    categories = news[category].unique()
    # Build the ID -> label lookup once instead of a full row lookup per ID.
    label_of = news[category].to_dict()
    # Rows without a history are not lists after ``.str.split()``. Test for that
    # with isinstance: an identity check against NaN is fragile, and the
    # ``np.NaN`` alias no longer exists in NumPy 2.0.
    interactions[category] = interactions['history'].apply(
        lambda hist: [label_of[nID] for nID in hist] if isinstance(hist, list) else []
    )
    return interactions[category].apply(lambda labels: process_row(labels, categories))

# Add one history-count column per label level (category first, then sub-category).
for _target_column, _label_column in (
    ('category_hist_encoded', 'Category'),
    ('subcategory_hist_encoded', 'SubCategory'),
):
    interactions[_target_column] = labels_one_hot(_label_column)

In [6]:
# One row per impression-log entry; the user id and history encodings are repeated on each row.
interactions_explode = interactions[
    ['uID', 'impLog', 'category_hist_encoded', 'subcategory_hist_encoded']
].explode('impLog')

# Fallback for entries whose id is missing from news_map: an all-zero vector of the
# same width as the stored vectors. Built once up front so a miss does not
# re-materialise every value of news_map.
_missing_news_vector = [0] * len(next(iter(news_map.values())))
interactions_explode['nID'] = interactions_explode['impLog'].apply(
    lambda imp: news_map.get(imp[0], _missing_news_vector)
)
# Map the flag to a {-1, +1} label: 1 stays 1, anything else becomes -1.
interactions_explode['label'] = interactions_explode['impLog'].apply(
    lambda imp: 1 if imp[1] == 1 else -1
)

In [7]:
# Preview the first rows of the exploded frame.
interactions_explode.head()

Unnamed: 0_level_0,uID,impLog,category_hist_encoded,subcategory_hist_encoded,nID,label
0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,U13740,"(N55689, 1)","[0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 3, 0, 2, 0, 2, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",1
1,U13740,"(N35729, 0)","[0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 3, 0, 2, 0, 2, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",-1
2,U91836,"(N20678, 0)","[0, 0, 5, 3, 1, 0, 5, 0, 1, 1, 54, 0, 3, 5, 0,...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",-1
2,U91836,"(N39317, 0)","[0, 0, 5, 3, 1, 0, 5, 0, 1, 1, 54, 0, 3, 5, 0,...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",-1
2,U91836,"(N58114, 0)","[0, 0, 5, 3, 1, 0, 5, 0, 1, 1, 54, 0, 3, 5, 0,...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",-1


In [8]:
# One-hot encode the uID column of the exploded frame
# (double brackets pass a one-column frame to the encoder).
encoder = OneHotEncoder()
encoded_user = encoder.fit_transform(interactions_explode[['uID']])

In [9]:
# Bring every feature group into COO sparse form so they can be column-stacked.
coo_encoded_user = coo_array(encoded_user)
# The history encodings are per-row Python lists; .to_list() yields a list of lists.
coo_category_hist_encoded = coo_array(interactions_explode['category_hist_encoded'].to_list())
coo_subcategory_hist_encoded = coo_array(interactions_explode['subcategory_hist_encoded'].to_list())

In [10]:
# Per-row news feature vectors (the 'nID' column) as one COO sparse array.
coo_news = coo_array(interactions_explode['nID'].to_list())

In [15]:
# Design matrix: user one-hot | category history counts | sub-category history counts | news features.
X_train = hstack([coo_encoded_user, coo_category_hist_encoded, coo_subcategory_hist_encoded, coo_news])

In [16]:
import scipy.sparse as sp

# Persist the assembled sparse design matrix to X_train.npz in the working directory.
sp.save_npz('X_train.npz', X_train)

In [16]:
# Requires the third-party fastFM package to be installed in the kernel's environment.
from fastFM import sgd

# Targets: the 'label' column (-1 / +1) of interactions_explode. X_train is built
# from that same exploded frame, so the rows line up one-to-one.
y_train = interactions_explode['label'].to_numpy()

fm = sgd.FMClassification(n_iter=1000, init_stdev=0.1, l2_reg_w=0,
                          l2_reg_V=0, rank=2, step_size=0.1)
fm.fit(X_train, y_train)
# NOTE(review): X_test is not defined in any visible cell. It has to be built
# from the held-out data with the same feature layout as X_train before this runs.
y_pred = fm.predict(X_test)

ModuleNotFoundError: No module named 'fastFM'