In [1]:
import os

import numpy as np
import pandas as pd
import scipy

In [30]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_selection import mutual_info_regression, mutual_info_classif

def preprocess_training(data_dir="./project_data_files"):
    """Load the training CSVs and assemble the feature matrix and labels.

    Parameters
    ----------
    data_dir : str, optional
        Directory holding ``book_rating_train.csv`` and the
        ``book_text_features_doc2vec`` sub-directory. Defaults to the
        location the notebook has always used.

    Returns
    -------
    X_train : pandas.DataFrame
        Column-wise concatenation of the name/description/author doc2vec
        frames, the publisher count frame, the language dummies and the
        ``PublishYear``/``PublishMonth``/``PublishDay``/``pagesNumber``
        columns, with missing values replaced by 0.
    y_train : pandas.Series
        The ``rating_label`` column of the training CSV.
    """
    doc2vec_dir = f"{data_dir}/book_text_features_doc2vec"

    train_data = pd.read_csv(f"{data_dir}/book_rating_train.csv")

    def _load_embedding(file_name, prefix):
        # The doc2vec files have no header row; give every column a string
        # name so they cannot collide once the frames are concatenated.
        frame = pd.read_csv(f"{doc2vec_dir}/{file_name}", index_col=False, delimiter=',', header=None)
        return frame.set_axis([f"{prefix}-{x}" for x in frame.columns], axis=1)

    train_name = _load_embedding("train_name_doc2vec100.csv", "name")
    train_authors = _load_embedding("train_authors_doc2vec20.csv", "author")
    train_desc = _load_embedding("train_desc_doc2vec100.csv", "desc")

    # One-hot encode the language, then discard a fixed list of dummies.
    # errors="ignore" keeps this working when one of the listed languages
    # does not occur in the CSV (a plain drop would raise KeyError).
    train_lang = pd.get_dummies(train_data[['Language']])
    train_lang = train_lang.drop(
        ['Language_ara', 'Language_frs', 'Language_heb', 'Language_zho', 'Language_lat'],
        axis=1,
        errors="ignore",
    )

    # NOTE(review): CountVectorizer is handed a one-column DataFrame here.
    # Iterating a DataFrame yields its column labels, so the vectorizer most
    # likely sees the single document "Publisher" and returns a 1x1 matrix,
    # i.e. the publisher text is not actually encoded. A real fix needs one
    # vectorizer fitted on the training publishers and reused for the test
    # set, so the behaviour is deliberately left unchanged here.
    cv = CountVectorizer()
    train_publisher = pd.DataFrame(cv.fit_transform(train_data[['Publisher']]).todense())
    train_publisher = train_publisher.set_axis([f"publisher-char-{x}" for x in train_publisher.columns], axis=1)

    # Merge all the pieces together
    combine = pd.concat([train_name, train_desc, train_authors, train_publisher, train_lang], axis=1)

    # Add the remaining numeric attributes and fill every gap with 0
    X_train = pd.concat(
        [combine, train_data[['PublishYear', 'PublishMonth', 'PublishDay', 'pagesNumber']]],
        axis=1,
    ).fillna(0)
    y_train = train_data['rating_label']

    return X_train, y_train

# Build the training feature matrix (X1) and the rating labels (y1) from the
# raw CSV plus the doc2vec text features.
X1, y1 = preprocess_training()

def getFeatureCorr(X1, y1, excludeFours = False):
    """Return the 15 features with the highest correlation to the label.

    Parameters
    ----------
    X1 : pandas.DataFrame
        Feature matrix, one column per feature.
    y1 : pandas.Series
        Labels, sharing ``X1``'s index.
    excludeFours : bool, optional
        When True, rows whose label equals 4 are removed before the
        correlations are computed.

    Returns
    -------
    pandas.Index
        Up to 15 column names ordered by descending (signed) correlation,
        so strongly negative correlations rank last.
    """
    if excludeFours:
        # Filter with a boolean mask so the label stays a Series. Passing a
        # one-column DataFrame to corrwith aligns on column labels and
        # produces NaN for every feature.
        keep = y1 != 4
        X1 = X1.loc[keep]
        y1 = y1.loc[keep]

    label_corr = X1.corrwith(y1)
    return label_corr.sort_values(ascending=False).head(15).index

def getFeatureMI(X1, y1, excludeFours = False, random_state=None):
    """Return the 15 features with the highest mutual information with the label.

    Parameters
    ----------
    X1 : pandas.DataFrame
        Feature matrix, one column per feature.
    y1 : pandas.Series
        Labels, sharing ``X1``'s index.
    excludeFours : bool, optional
        When True, rows whose label equals 4 are removed before scoring.
    random_state : int or None, optional
        Forwarded to ``mutual_info_classif``. Pass an integer to make the
        ranking repeatable; the default leaves the estimator unseeded, as
        before.

    Returns
    -------
    pandas.Index
        Up to 15 column names ordered by descending mutual information.
    """
    if excludeFours:
        # Keep the label one-dimensional; a one-column DataFrame would be
        # handed to scikit-learn as a column vector.
        keep = y1 != 4
        X1 = X1.loc[keep]
        y1 = y1.loc[keep]

    mi_scores = pd.Series(
        mutual_info_classif(X1, y1, random_state=random_state),
        index=X1.columns,
        name='Coefficient',
    )
    return mi_scores.sort_values(ascending=False).head(15).index

# Feature selection: keep the union of the correlation-ranked and the
# mutual-information-ranked features. The names are sorted because iterating
# a set of strings gives an order that can change between interpreter runs,
# which would make the column order of X1 irreproducible.
selected_features = sorted(set(getFeatureCorr(X1, y1)) | set(getFeatureMI(X1, y1)))
X1 = X1[selected_features]

print(X1.columns)
print(X1.shape)
print(y1.shape)




Index(['desc-1', 'desc-60', 'desc-67', 'author-14', 'desc-64', 'desc-41',
       'name-57', 'pagesNumber', 'PublishMonth', 'PublishYear', 'name-99',
       'desc-87', 'desc-94', 'desc-55', 'name-79', 'desc-81', 'desc-78',
       'desc-18', 'name-94', 'name-27', 'desc-62', 'name-16', 'desc-72',
       'desc-77', 'desc-75', 'desc-33', 'desc-38', 'desc-29', 'name-98'],
      dtype='object')
(23063, 29)
(23063,)


In [31]:
def preprocess_testing(data_dir="./project_data_files"):
    """Load the test CSVs and assemble the test feature matrix.

    Parameters
    ----------
    data_dir : str, optional
        Directory holding ``book_rating_test.csv`` and the
        ``book_text_features_doc2vec`` sub-directory. Defaults to the
        location the notebook has always used.

    Returns
    -------
    pandas.DataFrame
        Column-wise concatenation of the description/name/author doc2vec
        frames, the publisher count frame, the language dummies and the
        ``PublishYear``/``PublishMonth``/``PublishDay``/``pagesNumber``
        columns, with missing values replaced by 0.
    """
    doc2vec_dir = f"{data_dir}/book_text_features_doc2vec"

    test_data = pd.read_csv(f"{data_dir}/book_rating_test.csv")

    def _load_embedding(file_name, prefix):
        # The doc2vec files have no header row; give every column a string
        # name so they cannot collide once the frames are concatenated.
        frame = pd.read_csv(f"{doc2vec_dir}/{file_name}", index_col=False, delimiter=',', header=None)
        return frame.set_axis([f"{prefix}-{x}" for x in frame.columns], axis=1)

    test_name = _load_embedding("test_name_doc2vec100.csv", "name")
    test_authors = _load_embedding("test_authors_doc2vec20.csv", "author")
    test_desc = _load_embedding("test_desc_doc2vec100.csv", "desc")

    # One-hot encode the language, then remove languages not found in the
    # training data - they tell us nothing. errors="ignore" keeps this
    # working when a listed language is absent from the CSV (a plain drop
    # would raise KeyError).
    test_lang = pd.get_dummies(test_data[['Language']])
    test_lang = test_lang.drop(
        ['Language_hun', 'Language_urd', 'Language_tha', 'Language_glg'],
        axis=1,
        errors="ignore",
    )

    # NOTE(review): CountVectorizer is handed a one-column DataFrame here.
    # Iterating a DataFrame yields its column labels, so the vectorizer most
    # likely sees the single document "Publisher" and returns a 1x1 matrix.
    # It is also fitted independently of the training vectorizer, so the
    # column numbering would not correspond to the training columns even if
    # the input were fixed. Behaviour is deliberately left unchanged here.
    cv = CountVectorizer()
    test_publisher = pd.DataFrame(cv.fit_transform(test_data[['Publisher']]).todense())
    test_publisher = test_publisher.set_axis([f"publisher-char-{x}" for x in test_publisher.columns], axis=1)

    # Merge all the pieces together
    combine = pd.concat([test_desc, test_name, test_authors, test_publisher, test_lang], axis=1)

    # Add the remaining numeric attributes and fill every gap with 0
    X_test = pd.concat(
        [combine, test_data[['PublishYear', 'PublishMonth', 'PublishDay', 'pagesNumber']]],
        axis=1,
    ).fillna(0)

    return X_test

# Keep exactly the columns, in the same order, that the training matrix X1
# ended up with. Re-running the feature selection here would rank the
# already-reduced X1 with a stochastic scorer and could return a different
# column set than the one the models are trained on.
X_test = preprocess_testing()[X1.columns]


In [35]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_validate
from sklearn.svm import SVC, LinearSVC
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import BaggingClassifier
from sklearn.dummy import DummyClassifier
from sklearn.feature_selection import SelectKBest, chi2, mutual_info_classif
from sklearn.ensemble import AdaBoostClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score


import time

def format_test_pred(model, X_train, y_train, X_test, title, output_dir="./model_predictions"):
    """Fit ``model`` and write its predictions for ``X_test`` to a CSV file.

    Parameters
    ----------
    model : estimator
        Object exposing ``fit(X, y)`` and ``predict(X)``.
    X_train, y_train
        Training features and labels passed to ``model.fit``.
    X_test
        Features passed to ``model.predict``.
    title : str
        File stem; the predictions are written to ``<output_dir>/<title>.csv``.
    output_dir : str, optional
        Destination directory. It is created when missing, so a fresh
        checkout no longer fails at the ``to_csv`` call.

    The CSV has an ``id`` index column counting from 1 and a
    ``rating_label`` column holding the predictions.
    """
    model.fit(X_train, y_train)
    print("Fitted model")
    y_pred = model.predict(X_test)
    print("Made predictions")
    print("Max prediction", max(y_pred))

    # Row ids in the output file are 1-based.
    y_pred = pd.Series(y_pred, name="rating_label", index=range(1, len(y_pred) + 1))
    y_pred.index.name = "id"

    os.makedirs(output_dir, exist_ok=True)
    y_pred.to_csv(os.path.join(output_dir, f"{title}.csv"))

def feature_filter(X_train, y_train, X_test, k=20):
    """Reduce the train and test matrices to the ``k`` best features.

    Features are scored against ``y_train`` with ``mutual_info_classif``
    through ``SelectKBest``; the selector is fitted on the training data
    only and then applied to the test data.

    Parameters
    ----------
    X_train, y_train
        Training features and labels used to fit the selector.
    X_test
        Test features, reduced to the same columns as ``X_train``.
    k : int, optional
        Number of features to keep (default 20, the previous fixed value).

    Returns
    -------
    tuple
        ``(X_train, y_train, X_test)`` where both matrices are new
        DataFrames with default integer column labels, and ``y_train`` is
        returned unchanged.
    """
    selector = SelectKBest(k=k, score_func=mutual_info_classif)
    print("Filtering features")
    X_train = pd.DataFrame(selector.fit_transform(X_train, y_train))

    # get_feature_names_out() already contains only the selected features.
    # Indexing it with get_support(indices=True), which holds positions in
    # the original matrix, printed wrong names or raised IndexError.
    for feat_name in selector.get_feature_names_out():
        print(feat_name)

    X_test = pd.DataFrame(selector.transform(X_test))
    print("Chose best features as", X_train.head())
    return X_train, y_train, X_test


def run_models(X_train, y_train, X_test, write_predictions=False, random_state=None):
    """Cross-validate a fixed set of candidate models and print their scores.

    Parameters
    ----------
    X_train, y_train
        Training features and labels handed to ``cross_validate``.
    X_test
        Test features; only used when ``write_predictions`` is True.
    write_predictions : bool, optional
        When True, each model is also refitted on the full training data
        and its test predictions are written through ``format_test_pred``.
        Off by default, matching the previous behaviour where that call
        was commented out.
    random_state : int or None, optional
        Seed forwarded to the estimators that accept one. The default
        leaves them unseeded, as before.

    Prints one line per model with the mean ``test_score`` over the folds
    and the elapsed wall-clock time. Returns nothing.
    """
    # Title and estimator are kept together so they cannot drift out of step
    # the way two parallel lists can.
    experiments = [
        ('ADA Boosting', AdaBoostClassifier(random_state=random_state)),
        ('GNB', GaussianNB()),
        ('MLP', MLPClassifier(random_state=random_state)),
        ('LinearSVC', LinearSVC(random_state=random_state)),
        # NOTE(review): this is a regressor, so cross_validate falls back to
        # its default scorer (R^2); the value printed as "Acc" below is not
        # an accuracy for this entry.
        ('LinearRegression', LinearRegression()),
        ('Polynomial (3) SVC', SVC(kernel='poly', degree=3)),
        ('Decision Tree', DecisionTreeClassifier(criterion="log_loss", random_state=random_state)),
        # The base estimator is passed positionally: its keyword was renamed
        # from ``base_estimator`` to ``estimator`` in newer scikit-learn, and
        # the positional form works with both spellings.
        ('Random Forest', BaggingClassifier(DecisionTreeClassifier(splitter="random"),
                                            n_estimators=10,
                                            max_samples=0.8,
                                            max_features=0.8,
                                            random_state=random_state)),
        ('KNN-2', KNeighborsClassifier(n_neighbors=2)),
        ('KNN-7', KNeighborsClassifier(n_neighbors=7)),
        ('Logistic Regression', LogisticRegression(max_iter = 5000)),
        ('ZeroR', DummyClassifier(strategy="most_frequent")),
    ]

    for title, model in experiments:
        print("Running model", title)
        start = time.perf_counter()

        cv_result = cross_validate(model, X_train, y_train)

        if write_predictions:
            format_test_pred(model, X_train, y_train, X_test, title)

        elapsed = time.perf_counter() - start
        print("Acc was", np.mean(cv_result['test_score']), "for model", title, "in time", elapsed)
        # Only claim predictions were produced when they actually were.
        if write_predictions:
            print("Generated predictions for", title)

# Cross-validate every candidate model on the selected doc2vec-based features.
print("Running models on doc2vec set")
run_models(X1, y1, X_test)




Running models on doc2vec set
Running model Polynomial (3) SVC
