In [1]:
import gc # garbage collector
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import OneHotEncoder
from scipy.sparse import csr_matrix, hstack, lil_matrix
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import roc_auc_score
from collections import Counter
from FactorizationMachineClassifier import FactorizationMachineClassifier

In [2]:
def process_dataset(news_filepath, interactions_filepath, fraction):
    """Read the news and behaviors TSV files and return cleaned DataFrames.

    Parameters
    ----------
    news_filepath : str or path-like
        Tab-separated, header-less news file. The first column becomes the
        index; the remaining seven are named Category, SubCategory, Title,
        Abstract, URL, Entities and RelatedEntities.
    interactions_filepath : str or path-like
        Tab-separated, header-less behaviors file. The first column becomes
        the index; the remaining four are named uID, timestamp, history and
        impLog.
    fraction : float
        Fraction of behavior rows to keep (``DataFrame.sample`` with
        ``random_state=1``, so the selection is reproducible).

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        ``news`` with columns Category, SubCategory, Title and Abstract
        (missing abstracts replaced by the string ``'missing'``), and the
        sampled ``interactions`` where ``impLog`` is a list of
        ``(news_id, label)`` tuples, ``history`` is a list of news ids (or
        NaN when the field was empty) and ``timestamp`` is a datetime.
    """

    def parse_impressions(log):
        # Each whitespace-separated token looks like "<newsID>-<0|1>".
        # split() without an argument tolerates repeated/trailing spaces, and
        # rpartition splits on the LAST '-' so ids containing '-' stay intact.
        parsed = []
        for token in log.split():
            news_id, _, clicked = token.rpartition('-')
            parsed.append((news_id, int(clicked)))
        return parsed

    news = pd.read_csv(news_filepath, sep='\t', header=None, index_col=0)
    news.columns = ["Category", "SubCategory", "Title", "Abstract", "URL", "Entities", "RelatedEntities"]
    news = news.drop(columns=["URL", "Entities", "RelatedEntities"])
    news['Abstract'] = news['Abstract'].fillna('missing')

    interactions = pd.read_csv(interactions_filepath, sep='\t', header=None, index_col=0)
    interactions.columns = ['uID', 'timestamp', 'history', 'impLog']

    # Sample before converting types: the chosen rows depend only on the row
    # count and the seed, so the result is the same but only the kept rows
    # pay for parsing.
    interactions = interactions.sample(frac=fraction, random_state=1)
    interactions = interactions.assign(
        impLog=interactions['impLog'].apply(parse_impressions),
        history=interactions['history'].str.split(),
        timestamp=pd.to_datetime(interactions['timestamp'], format='%m/%d/%Y %I:%M:%S %p'),
    )

    return news, interactions

In [3]:
def feature_label(news, interactions, max_feature, fitted=None):
    """Build the feature matrix (X) and the label vector (y), one row per impression.

    Every ``(news_id, label)`` entry of ``interactions['impLog']`` becomes one
    sample. Its feature columns are, in order: TF-IDF of the news title,
    TF-IDF of the news abstract, one-hot of the news SubCategory, and the
    min-max scaled per-subcategory counts of the user's click history.

    Parameters
    ----------
    news : pandas.DataFrame
        Indexed by news id, with 'Title', 'Abstract' and 'SubCategory' columns.
    interactions : pandas.DataFrame
        Must contain 'history' (list of news ids, or NaN when empty) and
        'impLog' (list of ``(news_id, label)`` tuples). It is not modified.
    max_feature : int
        ``max_features`` for each TF-IDF vectorizer.
    fitted : dict, optional
        Store for fitted transformers. Entries that are missing are fitted on
        this call's data and saved into the dict; entries that are present
        are reused as-is (``max_feature`` is then ignored for them). Passing
        the same dict for the training and the evaluation call keeps the
        TF-IDF vocabulary and the scaling identical across both. With the
        default ``None`` everything is re-fitted on the data given.

    Returns
    -------
    tuple of (scipy.sparse.csr_matrix, numpy.ndarray)
        The feature matrix and the labels taken from ``impLog``.

    Notes
    -----
    The subcategory vocabulary is read from the module-level
    ``category_dict['SubCategory']``, which must be defined before calling.
    Impressions whose news id is not in ``news`` get all-zero news features;
    history ids that are not in ``news`` are ignored.
    """
    state = {} if fitted is None else fitted
    subcategories = category_dict['SubCategory']

    # ---- news-side features ------------------------------------------------
    def tfidf_features(column):
        key = 'tfidf_' + column
        if key not in state:
            state[key] = TfidfVectorizer(max_features=max_feature, stop_words='english').fit(news[column])
        return state[key].transform(news[column])

    # The category list is fixed up front, so fitting here never changes the
    # column layout; values outside the list encode as all zeros.
    one_hot_subcategory = OneHotEncoder(
        categories=[subcategories], handle_unknown='ignore'
    ).fit_transform(news[['SubCategory']])
    news_vector = hstack(
        [tfidf_features('Title'), tfidf_features('Abstract'), one_hot_subcategory],
        format='csr',
    )
    # news id -> row position in news_vector
    news_row = {news_id: position for position, news_id in enumerate(news.index)}

    # ---- user-side features ------------------------------------------------
    subcategory_of = news['SubCategory'].to_dict()

    def history_counts(history):
        # An empty history field is read as NaN rather than a list; test the
        # type instead of comparing identity with the np.NaN alias (removed
        # in NumPy 2.0).
        if not isinstance(history, list):
            return [0] * len(subcategories)
        seen = Counter(subcategory_of[news_id] for news_id in history if news_id in subcategory_of)
        return [seen.get(subcategory, 0) for subcategory in subcategories]

    # Work on a fresh frame so the caller's DataFrame keeps its columns intact
    # and the surrounding notebook cell stays re-runnable.
    impressions = (
        interactions[['impLog']]
        .assign(subcategory_hist_encoded=interactions['history'].apply(history_counts))
        .explode('impLog')
    )

    # ---- assemble one row per impression -----------------------------------
    # Pick the news row for every impression with a sparse 0/1 selector
    # instead of materialising a dense vector per impression. Rows for
    # unknown ids stay empty in the selector and therefore come out all-zero.
    impression_ids = [impression[0] for impression in impressions['impLog']]
    hit_rows, hit_cols = [], []
    for sample_pos, news_id in enumerate(impression_ids):
        news_pos = news_row.get(news_id)
        if news_pos is not None:
            hit_rows.append(sample_pos)
            hit_cols.append(news_pos)
    selector = csr_matrix(
        (np.ones(len(hit_rows)), (hit_rows, hit_cols)),
        shape=(len(impression_ids), news_vector.shape[0]),
    )
    news_encoded = selector @ news_vector

    # Min-max scale the history counts column by column.
    history_matrix = impressions['subcategory_hist_encoded'].to_list()
    if 'history_scaler' not in state:
        state['history_scaler'] = MinMaxScaler().fit(history_matrix)
    subcategory_hist_encoded = state['history_scaler'].transform(history_matrix)

    labels = impressions['impLog'].apply(lambda impression: impression[1]).to_numpy()  # y

    return hstack([news_encoded, subcategory_hist_encoded], format='csr'), labels

In [4]:
# Load the training split; process_dataset keeps a seeded 50% sample of the behavior rows.
news_train, interactions_train = process_dataset(
    news_filepath='Sample data/train/news.tsv',
    interactions_filepath='Sample data/train/behaviors.tsv',
    fraction=0.5,
)

In [5]:
# feature_label reads this module-level dict, so it has to exist before the call below.
# The category vocabularies come from the training news only.
category_dict = {
    column: news_train[column].unique()
    for column in ('Category', 'SubCategory')
}

X_train, y_train = feature_label(
    news=news_train,
    interactions=interactions_train,
    max_feature=1000,
)
print("train data collected")

train data collected


In [6]:
# Fit the factorization machine on the training features.
fm = FactorizationMachineClassifier(
    n_iter=10,
    learning_rate=0.01,
    n_factors=1,
    verbose=True,
)
fm.fit(X_train, y_train)
print("model training completed")

# AUC on the same data the model was fitted on.
# NOTE(review): roc_auc_score ranks continuous scores; confirm that fm.predict
# returns scores/probabilities rather than hard 0/1 labels.
y_pred_train = fm.predict(X_train)
print("Train AUC score: " + str(roc_auc_score(y_true=y_train, y_score=y_pred_train)))

  0%|          | 0/10 [00:00<?, ?it/s]

100%|██████████| 10/10 [00:05<00:00,  1.87it/s]


model training completed
Train AUC score: 0.6248767519769859


In [7]:
# Load the test split the same way as the training split (seeded 50% sample of behaviors).
news_test, interactions_test = process_dataset(
    news_filepath='Sample data/test/news.tsv',
    interactions_filepath='Sample data/test/behaviors.tsv',
    fraction=0.5,
)

In [8]:
# Build test features with the same max_feature as training, then score the fitted model.
# NOTE(review): called this way, feature_label fits its TF-IDF vocabulary on the
# news frame it receives, so the test TF-IDF columns may not correspond to the
# columns the model was trained on -- confirm before trusting this score.
X_test, y_test = feature_label(
    news=news_test,
    interactions=interactions_test,
    max_feature=1000,
)
print("test data collected")

y_pred_test = fm.predict(X_test)
print("Test AUC score: " + str(roc_auc_score(y_true=y_test, y_score=y_pred_test)))

test data collected
Test AUC score: 0.5810629501877567
