In [2]:
import pandas as pd
import scipy

In [3]:
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectKBest, chi2, mutual_info_classif

def preprocess_training(useDoc2Vec = False, data_dir = "./project_data_files"):
    """Load the training data and assemble the feature matrix and label vector.

    The text attributes (name, authors, description) are not parsed here; their
    pre-computed vector representations are read from files under ``data_dir``
    and joined column-wise with the one-hot encoded categorical attributes and
    the numeric attributes of ``book_rating_train.csv``.

    Parameters
    ----------
    useDoc2Vec : bool, default False
        If True, read the dense doc2vec CSV features from
        ``<data_dir>/book_text_features_doc2vec``. Otherwise read the sparse
        count-vectoriser ``.npz`` matrices from
        ``<data_dir>/book_text_features_countvec`` and wrap them in sparse
        DataFrames.
    data_dir : str or path-like, default "./project_data_files"
        Directory that holds ``book_rating_train.csv`` and the two feature
        sub-directories. The default reproduces the previously hard-coded paths.

    Returns
    -------
    X_train : pandas.DataFrame
        Columns, in order: ``name-*``, ``author-*``, ``desc-*``, the dummy
        columns for ``Publisher`` and ``Language``, then ``PublishYear``,
        ``PublishMonth``, ``PublishDay`` and ``pagesNumber``.
    y_train : pandas.Series
        The ``rating_label`` column.
    """
    train_data = pd.read_csv(f"{data_dir}/book_rating_train.csv")

    # (suffix in the file name, prefix for the generated column names)
    text_parts = (("name", "name-"), ("authors", "author-"), ("desc", "desc-"))

    if useDoc2Vec:
        # Dense embeddings: header-less CSVs; the width is part of the file name.
        doc2vec_dir = f"{data_dir}/book_text_features_doc2vec"
        doc2vec_width = {"name": 100, "authors": 20, "desc": 100}
        raw_blocks = [
            pd.read_csv(f"{doc2vec_dir}/train_{part}_doc2vec{doc2vec_width[part]}.csv",
                        index_col = False, delimiter = ',', header=None)
            for part, _ in text_parts
        ]
    else:
        # Import the submodule explicitly: a bare `import scipy` does not
        # guarantee that `scipy.sparse` is loaded on every SciPy version.
        from scipy.sparse import load_npz

        countvec_dir = f"{data_dir}/book_text_features_countvec"
        raw_blocks = [
            pd.DataFrame.sparse.from_spmatrix(load_npz(f"{countvec_dir}/train_{part}_vec.npz"))
            for part, _ in text_parts
        ]

    # The loaded frames all have integer column labels 0..k-1; prefix them so
    # the three blocks do not collide (and are not ints) once concatenated.
    text_blocks = [
        block.add_prefix(prefix)
        for block, (_, prefix) in zip(raw_blocks, text_parts)
    ]

    # Transform categorical values into useful vectors.
    # NOTE(review): the dummy columns depend on the categories present in this
    # file; presumably the test set must be aligned to the same columns - confirm.
    train_categorical = pd.get_dummies(train_data[['Publisher', 'Language']])

    # Remaining attributes are appended unchanged.
    train_numeric = train_data[['PublishYear', 'PublishMonth', 'PublishDay', 'pagesNumber']]

    X_train = pd.concat(text_blocks + [train_categorical, train_numeric], axis=1)
    y_train = train_data['rating_label']

    return X_train, y_train

# Build the full training matrix and labels (count-vectoriser text features by default).
X, y = preprocess_training()

# Temporarily, we want a validation set as the test set gives us no feedback
# When submitting, train on all data
# The seed is fixed so that the split - and therefore every accuracy reported
# below - is identical after a kernel restart.
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)



In [4]:
from sklearn import ensemble
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB, MultinomialNB
from sklearn.neural_network import MLPClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.svm import SVC, LinearSVC
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import BaggingClassifier

import time

def run_models(X_train, y_train, X_val, y_val):
    """Fit a fixed line-up of classifiers and print each one's validation accuracy.

    Every model is fitted on ``(X_train, y_train)`` and then scored with its
    ``score`` method on ``(X_val, y_val)``. One line is printed per model:
    its title, the accuracy, and the wall-clock seconds spent in ``score``
    (fitting time is not included in the reported time).

    Parameters
    ----------
    X_train, y_train
        Training features and labels passed to ``model.fit``.
    X_val, y_val
        Held-out features and labels passed to ``model.score``.

    Returns
    -------
    None
        Results are only printed.
    """
    # Each title is kept next to its model so the two cannot drift out of sync.
    candidates = [
        ('GNB', GaussianNB()),
        ('MLP', MLPClassifier()),
        ('LinearSVC', LinearSVC()),
        ('Polynomial (3) SVC', SVC(kernel='poly', degree=3)),
        ('Decision Tree', DecisionTreeClassifier()),
        # The wrapped estimator is passed positionally: its keyword was renamed
        # from `base_estimator` to `estimator` in scikit-learn 1.2 and the old
        # name was removed in 1.4, so the positional form works on all versions.
        # NOTE(review): printed as 'Random Forest', but this is a bagging
        # ensemble of decision trees; the label is kept so output stays comparable.
        ('Random Forest', BaggingClassifier(DecisionTreeClassifier(), n_estimators=10,
                                            max_samples=0.8, max_features=0.8)),
        # n_neighbors is left unset, so KNeighborsClassifier uses its default of 5.
        ('KNN', KNeighborsClassifier()),
        ('Logistic Regression', LogisticRegression(max_iter = 1000)),
    ]

    for title, model in candidates:
        model.fit(X_train, y_train)
        # perf_counter is monotonic, so the interval cannot be skewed by
        # system clock adjustments the way time.time() can.
        start = time.perf_counter()
        acc = model.score(X_val, y_val)
        t = time.perf_counter() - start
        print(title, "Accuracy:",acc, 'Time:', t)

run_models(X_train, y_train, X_val, y_val)






GNB Accuracy: 0.5048775200520269 Time: 25.787254333496094




MLP Accuracy: 0.5794493821807934 Time: 18.90970754623413




In [7]:
def preprocess_testing(useDoc2Vec = True):
    test_data = pd.read_csv("./project_data_files/book_rating_test.csv")

    if (useDoc2Vec):
        test_name = pd.read_csv("./project_data_files/book_text_features_doc2vec/test_name_doc2vec100.csv", index_col = False, delimiter = ',', header=None)
        test_authors = pd.read_csv("./project_data_files/book_text_features_doc2vec/test_authors_doc2vec20.csv", index_col = False, delimiter = ',', header=None)
        test_name = pd.read_csv("./project_data_files/book_text_features_doc2vec/test_desc_doc2vec100.csv", index_col = False, delimiter = ',', header=None)
    else:
        test_name = scipy.sparse.load_npz('./project_data_files/book_text_features_countvec/test_name_vec.npz')
        test_authors = scipy.sparse.load_npz('./project_data_files/book_text_features_countvec/test_authors_vec.npz')
        test_name = scipy.sparse.load_npz('./project_data_files/book_text_features_countvec/test_desc_vec.npz')