In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import OneHotEncoder
from scipy.sparse import csr_matrix, hstack, lil_matrix
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import roc_auc_score
from FactorizationMachineClassifier import FactorizationMachineClassifier

In [2]:
def process_dataset(news_filepath, interactions_filepath):
    """Load the news and interactions TSV files and return them as ``(news, interactions)``.

    Both files are read as tab-separated, header-less tables whose first
    column becomes the index.

    * ``news`` keeps the ``Category``, ``SubCategory``, ``Title`` and
      ``Abstract`` columns; ``URL``, ``Entities`` and ``RelatedEntities``
      are dropped.
    * ``interactions`` is built from file columns 1, 3 and 4, named ``uID``,
      ``history`` and ``impLog``. ``history`` is whitespace-split into a list
      (missing values stay missing) and ``impLog`` is parsed from
      ``"<id>-<int> <id>-<int> ..."`` into a list of ``(id, int)`` tuples.
    """

    def parse_impressions(raw):
        # "N1-1 N2-0" -> [("N1", 1), ("N2", 0)]
        parsed = []
        for token in raw.split(' '):
            fields = token.split('-')
            parsed.append((fields[0], int(fields[1])))
        return parsed

    # --- news table ---
    news = pd.read_csv(news_filepath, sep='\t', header=None, index_col=0)
    news.columns = ["Category", "SubCategory", "Title", "Abstract", "URL", "Entities", "RelatedEntities"]
    news = news.drop(columns=["URL", "Entities", "RelatedEntities"])

    # --- interactions table ---
    interactions = pd.read_csv(
        interactions_filepath,
        sep='\t',
        header=None,
        index_col=0,
        usecols=[0, 1, 3, 4],
    )
    interactions.columns = ['uID', 'history', 'impLog']
    # convert the two space-separated string columns into Python lists
    interactions['impLog'] = interactions['impLog'].apply(parse_impressions)
    interactions['history'] = interactions['history'].str.split()

    return news, interactions

In [3]:
def feature_label(news, interactions):
    """Given the news dataset and interactions dataset, return the feature matrix (X) and the label vector (y).

    One output row is produced per ``(news id, label)`` entry found in
    ``interactions['impLog']``, in input order. Each row is the horizontal
    concatenation of:

    * min-max scaled counts of the ``Category`` values of the news ids in
      the row's ``history``,
    * min-max scaled counts of the ``SubCategory`` values of those news ids,
    * the vector of the impression's news id: TF-IDF of ``Title`` (at most
      1000 terms), then one-hot ``Category``, then one-hot ``SubCategory``.

    Parameters
    ----------
    news : pandas.DataFrame
        Indexed by news id, with ``Title``, ``Category`` and ``SubCategory``
        columns.
    interactions : pandas.DataFrame
        Needs a ``history`` column (list of news ids; any non-list value,
        e.g. NaN, is treated as an empty history) and an ``impLog`` column
        (list of ``(news id, label)`` tuples). The frame is not modified.

    Returns
    -------
    X : scipy.sparse matrix in CSR format
        One row per ``impLog`` entry.
    y : numpy.ndarray
        The second element of every ``impLog`` entry, aligned with ``X``.

    Notes
    -----
    * Every transformer (TF-IDF, one-hot, min-max) is fitted on the frames
      passed to this call, so the column layout depends on that data.
      Matrices produced by separate calls on different data are not
      guaranteed to share the same columns.
    * News ids that are not in ``news.index`` are skipped when counting the
      history and get an all-zero news vector when they appear in ``impLog``.
    """

    # ---- news side: TF-IDF of the title + one-hot category / subcategory ----
    tfidf = TfidfVectorizer(max_features=1000).fit_transform(news['Title'])
    one_hot_category = OneHotEncoder().fit_transform(news[['Category']])
    one_hot_subcategory = OneHotEncoder().fit_transform(news[['SubCategory']])
    # Keep the news vectors sparse; rows are picked by position further down.
    news_vector = hstack([tfidf, one_hot_category, one_hot_subcategory]).tocsr()
    # news id -> row position in news_vector (for a repeated id the last row wins)
    row_of_news = {news_id: position for position, news_id in enumerate(news.index)}

    # ---- user side: per-label counts over the history ----
    def history_counts(column):
        """Return, per interaction, the count of each ``news[column]`` value in its history."""
        label_of = news[column].to_dict()           # news id -> label, O(1) lookups
        ordered_labels = sorted(news[column].unique())  # fixes the output column order

        def count_row(history):
            counts = dict.fromkeys(ordered_labels, 0)
            # A missing history arrives as a non-list value (NaN after str.split()).
            if isinstance(history, list):
                for news_id in history:
                    label = label_of.get(news_id)   # None for ids absent from `news`
                    if label in counts:
                        counts[label] += 1
            return list(counts.values())

        return interactions['history'].apply(count_row)

    # Work on a fresh frame so the caller's `interactions` is left untouched.
    # explode -> one impLog pair per row
    pairs = interactions[['impLog']].assign(
        category_hist_encoded=history_counts('Category'),
        subcategory_hist_encoded=history_counts('SubCategory'),
    ).explode('impLog')

    labels = pairs['impLog'].apply(lambda pair: pair[1]).to_numpy()

    # ---- scale the history counts to [0, 1] per column ----
    category_hist = MinMaxScaler().fit_transform(pairs['category_hist_encoded'].to_list())
    subcategory_hist = MinMaxScaler().fit_transform(pairs['subcategory_hist_encoded'].to_list())

    # ---- news vector of every impression, without densifying ----
    # -1 marks a news id that is not present in `news`.
    news_rows = np.array(
        [row_of_news.get(pair[0], -1) for pair in pairs['impLog']],
        dtype=np.int64,
    )
    known = np.flatnonzero(news_rows >= 0)
    # Row-selection matrix: a single 1 per known impression, pointing at its news row.
    # Rows of unknown ids stay empty, so the product gives them an all-zero vector.
    selector = csr_matrix(
        (np.ones(len(known)), (known, news_rows[known])),
        shape=(len(news_rows), news_vector.shape[0]),
    )
    news_encoded = selector.dot(news_vector)

    features = hstack([
        csr_matrix(category_hist),
        csr_matrix(subcategory_hist),
        news_encoded,
    ]).tocsr()
    return features, labels


In [5]:
# Load the train and test splits (news table + interactions table for each).
news_train, interactions_train = process_dataset(
    'Sample data/train/news.tsv',
    'Sample data/train/behaviors.tsv',
)
news_test, interactions_test = process_dataset(
    'Sample data/test/news.tsv',
    'Sample data/test/behaviors.tsv',
)

In [6]:
# Build the train and test features in ONE feature_label call so both splits share
# the same columns. feature_label fits its TF-IDF vocabulary, one-hot encoders and
# min-max scalers on whatever frames it receives; calling it once per split would
# give X_train and X_test differently fitted (misaligned) columns, so a model fitted
# on one could not be scored meaningfully on the other.
# NOTE: the unsupervised transformers therefore also see the test rows; labels are
# never shared between the splits.
news_all = pd.concat([news_train, news_test])
# keep one row per news id in case an id is present in both splits
news_all = news_all[~news_all.index.duplicated(keep='first')]
interactions_all = pd.concat([interactions_train, interactions_test], ignore_index=True)

X_all, y_all = feature_label(news_all, interactions_all)

# feature_label emits one row per impLog entry, in input order, so the first
# n_train rows belong to the train split and the remainder to the test split.
n_train = int(interactions_train['impLog'].map(len).sum())
X_train, y_train = X_all[:n_train], y_all[:n_train]
X_test, y_test = X_all[n_train:], y_all[n_train:]

In [None]:
# Fit the factorization machine on the training features.
fm = FactorizationMachineClassifier(
    n_iter=10,
    learning_rate=0.01,
    n_factors=10,
    verbose=True,
)
fm.fit(X_train, y_train)

In [None]:
# Score the model on the data it was trained on.
# NOTE(review): ROC AUC is only informative if fm.predict returns scores or
# probabilities rather than hard 0/1 labels - confirm against the classifier.
y_pred_train = fm.predict(X_train)
print(
    "Train AUC score: "
    + str(roc_auc_score(y_true=y_train, y_score=y_pred_train))
)

In [None]:
# Score the model on the held-out test features.
# NOTE(review): ROC AUC is only informative if fm.predict returns scores or
# probabilities rather than hard 0/1 labels - confirm against the classifier.
y_pred_test = fm.predict(X_test)
print(
    "Test AUC score: "
    + str(roc_auc_score(y_true=y_test, y_score=y_pred_test))
)