In [207]:
import glob
import pandas as pd

from sklearn import linear_model
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

In [189]:
# Seed passed as `random_state` to both train_test_split calls in split_data,
# so the train/eval/test partition is the same on every run.
SEED = 42

In [217]:
def load_products():
    """
    Load the products data into a dataframe.

    Every file matching ``dataset/products-data-*.tsv`` is read as a
    header-less, tab-separated file whose columns are named ``id``,
    ``category`` and ``product``. The per-file frames are stacked into a
    single dataframe with a fresh 0..n-1 index.

    Returns:
        dataframe: Pandas Dataframe containing the products data.

    Raises:
        FileNotFoundError: If no file matches the pattern (for example when
            the notebook is run from a different working directory).
    """
    pattern = "dataset/products-data-*.tsv"

    # glob.glob gives no ordering guarantee. Sorting keeps the row order,
    # and therefore any seeded split of the rows, identical across machines.
    product_filenames = sorted(glob.glob(pattern))
    if not product_filenames:
        raise FileNotFoundError(f"No products files found matching '{pattern}'")

    product_dfs = [
        pd.read_csv(product_filename, sep='\t', names=["id", "category", "product"])
        for product_filename in product_filenames
    ]
    return pd.concat(product_dfs, ignore_index=True)

In [218]:
def load_reviews():
    """
    Load the reviews data into a dataframe.

    Every file matching ``dataset/reviews-*.tsv`` is read as a header-less,
    tab-separated file whose columns are named ``id``, ``rating`` and
    ``review_text``. The per-file frames are stacked into a single dataframe
    with a fresh 0..n-1 index.

    Returns:
        dataframe: Pandas Dataframe containing the reviews data.

    Raises:
        FileNotFoundError: If no file matches the pattern (for example when
            the notebook is run from a different working directory).
    """
    pattern = "dataset/reviews-*.tsv"

    # glob.glob gives no ordering guarantee. Sorting keeps the row order
    # identical across machines and runs.
    review_filenames = sorted(glob.glob(pattern))
    if not review_filenames:
        raise FileNotFoundError(f"No reviews files found matching '{pattern}'")

    review_dfs = [
        pd.read_csv(review_filename, sep='\t', names=["id", "rating", "review_text"])
        for review_filename in review_filenames
    ]
    return pd.concat(review_dfs, ignore_index=True)

In [210]:
def load_data():
    """
    Build the modelling table from the products and reviews files.

    Products are left-joined with reviews on ``id``, so every product row is
    kept and products without a matching review get missing ``rating`` /
    ``review_text`` values. The ``category`` column is integer-encoded into a
    new ``label`` column, after which ``id`` and ``category`` are dropped.

    The fitted encoder is not returned, so the mapping from integer label
    back to category name is not available to the caller.

    Returns:
        dataframe: Pandas dataframe containing the products and reviews data.
    """
    merged = load_products().merge(load_reviews(), on='id', how='left')
    encoded = merged.assign(label=LabelEncoder().fit_transform(merged['category']))
    return encoded.drop(columns=['id', 'category'])

In [219]:
def split_data(data):
    """
    Partition the dataset into train, evaluation and test parts.

    20% of the rows are held out first; that hold-out is then divided in
    half into the evaluation and test parts. Both splits use ``SEED`` as
    their random state. For every part the ``label`` column is separated
    from the remaining feature columns.

    Args:
        data (dataframe): Dataframe containing all products and reviews data,
            including a ``label`` column.

    Returns:
        dataframe: Training data
        series: Training labels
        dataframe: Evaluation data
        series: Evaluation labels
        dataframe: Test data
        series: Test labels
    """
    train_part, holdout_part = train_test_split(data, test_size=0.2, random_state=SEED)
    eval_part, test_part = train_test_split(holdout_part, test_size=0.5, random_state=SEED)

    # Emit (features, labels) for each part, in train / eval / test order.
    pieces = []
    for part in (train_part, eval_part, test_part):
        pieces.append(part.drop(columns='label'))
        pieces.append(part['label'])
    return tuple(pieces)

In [212]:
def text_to_tfidf(data, column, max_features):
    """
    Encode a text column in a dataframe to tf-idf.
    Only the max_features most used words will be used.

    The text column is replaced by one numeric column per vocabulary term.
    The new columns are appended after the remaining original columns and
    are named by their position ("0", "1", ...); all column names in the
    result are strings. The vectorizer is fitted on every row of ``data``.

    Args:
        data (dataframe): Data with column to be encoded.
        column (string): Name of the column to be encoded.
        max_features (int): Amount of most used words to encode.

    Returns:
        dataframe: Dataframe with the encoded column, carrying the same
            index (and therefore the same rows) as ``data``.
    """
    vectorizer = TfidfVectorizer(max_features=max_features)
    products_tfidf = vectorizer.fit_transform(data[column])

    # Build the tf-idf frame on the caller's index. pd.concat(axis=1) aligns
    # rows by index label, so a default 0..n-1 index would misalign with any
    # `data` that was filtered, shuffled or split, yielding extra rows and NaNs.
    products_tfidf_df = pd.DataFrame(products_tfidf.toarray(), index=data.index)

    X = pd.concat([data, products_tfidf_df], axis=1)
    X = X.drop(column, axis=1)
    # Mixed str/int column labels are normalised to strings.
    X.columns = X.columns.astype(str)

    return X

In [226]:
# Load the joined data and replace the 'product' text column with tf-idf
# features built from its 1000 most used words.
# NOTE(review): the vectorizer is fitted on the full dataset before the split
# below, so evaluation/test text contributes to the vocabulary and idf weights.
data = text_to_tfidf(load_data(), 'product', 1000)

# Encoding 'review_text' the same way (1000 tf-idf features) was tried on the
# evaluation data and seemed to slightly decrease performance, so both
# review-related columns are left out of the feature set.
data = data.drop(columns=['rating', 'review_text'])

X_train, y_train, X_eval, y_eval, X_test, y_test = split_data(data)

In [227]:
# Train a logistic-regression classifier with default settings on the training split
model = linear_model.LogisticRegression().fit(X_train, y_train)

# Accuracy on the evaluation split (the one used for experimentation)
eval_accuracy = model.score(X_eval, y_eval)
print("Evaluation Accuracy:", eval_accuracy)

Evaluation Accuracy: 0.92


In [228]:
# Score the fitted model on the held-out test split
test_accuracy = model.score(X=X_test, y=y_test)
print("Test Accuracy:", test_accuracy)

Test Accuracy: 0.8920454545454546


In [236]:
# Compute baseline accuracy
# `y` is the label column of the complete dataset (before splitting). It was
# previously left over from kernel state and undefined on a fresh run.
y = data['label']
print('Different labels: ' + str(set(y)))
print('Amount of samples: ' + str(len(y)))

# Accuracy a constant classifier would reach on the test split, one line per
# label in ascending label order. The largest value is the majority-class baseline.
print('\nBaseline accuracy if we only predict one class:')
for label in sorted(set(y)):
    print((y_test == label).sum() / len(y_test))

Different labels: {0, 1, 2}
Amount of samples: 1754

Baseline accuracy if we only predict one class:
0.48295454545454547
0.42045454545454547
0.09659090909090909
