# Relevant imports

In [12]:
import pandas as pd
import numpy as np

from sklearn.feature_selection import mutual_info_classif
from sklearn.metrics import classification_report, accuracy_score
from sklearn.model_selection import train_test_split

from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_val_score
from sklearn.svm import SVC, LinearSVC
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import BaggingClassifier
from sklearn.dummy import DummyClassifier
from sklearn.ensemble import AdaBoostClassifier

import time

# Feature Selection Utils

In [13]:
def getFeatureCorr(X1, y1, printed = 5, returned = 7, threshold = 0):
    """
    Get the features with the highest absolute value correlation with the target variable
    Print the first <printed> rows, return the names of the top <returned> features.
    """
    corr_matrix = X1.corrwith(y1, )
    corr_matrix = corr_matrix.sort_values(key = abs, ascending=False)
    print("Best correlation features:\n", corr_matrix.head(printed))

    # If we're only return those with correlation above a certain threshold, filter this way
    if (threshold > 0):
        best_features = corr_matrix[abs(corr_matrix) > threshold].index 
    else:
        best_features = corr_matrix.sort_values(key = abs, ascending=False).head(returned).index
    #print("Returning", best_features)
    return best_features

def getFeatureMI(X1, y1, printed = 10, returned = 7):
    """
    Get the features with the highest mutual information with the target variable
    Print the first <printedFeatures> rows, return the names of the top <returnedFeatures> features.
    """
    coeff_df = pd.DataFrame(mutual_info_classif(X1, y1).reshape(-1, 1), columns=['Coefficient'], index=X1.columns)
    print("Best mutual information features:\n", coeff_df.sort_values(by=['Coefficient'], ascending=False).head(printed))
    best_features = coeff_df.sort_values(by=['Coefficient'], ascending=False).head(returned).index

    return best_features


# Relevant preprocessing

In [14]:
from sklearn.feature_extraction.text import CountVectorizer

REDUCE_FEATURES = False
# doc2vec features used, for the sake of test preprocessing to reference
feats_to_use = {}

def preprocess_training():
    train_data = pd.read_csv("./project_data_files/book_rating_train.csv")
    y_train = train_data['rating_label']

    # These are strings we need to handle with provided files
    train_name = pd.read_csv("./my_doc2vec_files/train_name50.csv", index_col = False, delimiter = ',', header=None)
    train_authors = pd.read_csv("./my_doc2vec_files/train_authors50.csv", index_col = False, delimiter = ',', header=None)
    train_desc = pd.read_csv("./my_doc2vec_files/train_desc100.csv", index_col = False, delimiter = ',', header=None)
    train_pub = pd.read_csv("./my_doc2vec_files/train_publisher50.csv", index_col = False, delimiter = ',', header=None)


    # Give these all column names which aren't ints
    train_name = train_name.set_axis([f"name-{x}" for x in train_name.columns], axis=1)
    train_authors = train_authors.set_axis([f"author-{x}" for x in train_authors.columns], axis=1)
    train_desc = train_desc.set_axis([f"desc-{x}" for x in train_desc.columns], axis=1)
    train_pub = train_pub.set_axis([f"pub-{x}" for x in train_pub.columns], axis=1)

    # Take the doc2vec features which have correlation above 0.035
    limit = 0.035
    feats_to_use['name'] = getFeatureCorr(train_name, y_train, threshold=limit)
    feats_to_use['authors'] = getFeatureCorr(train_authors, y_train, threshold=limit)
    feats_to_use['desc'] = getFeatureCorr(train_desc, y_train, threshold=limit)
    feats_to_use['pub'] = getFeatureCorr(train_pub, y_train, threshold=limit)

    train_name = train_name[feats_to_use['name']]
    train_authors = train_authors[feats_to_use['authors']]
    train_desc = train_desc[feats_to_use['desc']]
    train_pub = train_pub[feats_to_use['pub']]

    # Transform categorical values into useful vectors
    train_lang = pd.get_dummies(train_data[['Language']])
    # These languages are not in the test set - should discard
    train_lang = train_lang.drop(['Language_ara', 'Language_frs', 'Language_heb', 'Language_zho', 'Language_lat'], axis=1)
    
    # Merge all the pieces together
    combine = pd.concat([train_name, train_desc, train_authors, train_pub, train_lang], axis=1)

    # Add the remaining attributes to the finalised preprocessed set
    X_train = pd.concat([combine, train_data[['PublishYear', 'PublishMonth', 'PublishDay', 'pagesNumber']]], axis=1).fillna(0)
    

    return X_train, y_train

# Get the preprocessed data for doc2vec
X1, y1 = preprocess_training()

# See how many columns/features we have
print(X1.shape)
print(y1.shape)



Best correlation features:
 name-27    0.083490
name-6     0.055268
name-24   -0.046594
name-5    -0.044607
name-32    0.043722
dtype: float64
Best correlation features:
 author-3    -0.024333
author-9     0.023891
author-38    0.022675
author-31   -0.021979
author-40   -0.019455
dtype: float64
Best correlation features:
 desc-87   -0.099600
desc-79   -0.088456
desc-41    0.080505
desc-1     0.076676
desc-62    0.072713
dtype: float64
Best correlation features:
 pub-46    0.043906
pub-15   -0.042931
pub-7     0.035258
pub-40   -0.034245
pub-5     0.033788
dtype: float64
(23063, 71)
(23063,)


In [15]:
def preprocess_testing():
    test_data = pd.read_csv("./project_data_files/book_rating_test.csv")

    # These are strings we need to handle with provided files
    test_name = pd.read_csv("./my_doc2vec_files/test_name50.csv", index_col = False, delimiter = ',', header=None)
    test_authors = pd.read_csv("./my_doc2vec_files/test_authors50.csv", index_col = False, delimiter = ',', header=None)
    test_desc = pd.read_csv("./my_doc2vec_files/test_desc100.csv", index_col = False, delimiter = ',', header=None)
    test_pub = pd.read_csv("./my_doc2vec_files/test_publisher50.csv", index_col = False, delimiter = ',', header=None)

    # Give these all column names which aren't ints
    test_name = test_name.set_axis([f"name-{x}" for x in test_name.columns], axis=1)
    test_authors = test_authors.set_axis([f"author-{x}" for x in test_authors.columns], axis=1)
    test_desc = test_desc.set_axis([f"desc-{x}" for x in test_desc.columns], axis=1)
    test_pub = test_pub.set_axis([f"pub-{x}" for x in test_pub.columns], axis=1)

    # Take the doc2vec features which have correlation above 0.035
    test_name = test_name[feats_to_use['name']]
    test_authors = test_authors[feats_to_use['authors']]
    test_desc = test_desc[feats_to_use['desc']]
    test_pub = test_pub[feats_to_use['pub']]

    # Transform categorical values into useful vectors
    test_lang = pd.get_dummies(test_data[['Language']])
    # Remove languages not found in the training data - they tell us nothing
    test_lang = test_lang.drop(['Language_hun', 'Language_urd', 'Language_tha', 'Language_glg'], axis=1)\
    
    # Merge all the pieces together
    combine = pd.concat([test_name, test_desc, test_authors, test_pub, test_lang], axis=1)

    # Add the remaining attributes
    X_train = pd.concat([combine, test_data[['PublishYear', 'PublishMonth', 'PublishDay', 'pagesNumber']]], axis=1).fillna(0)

    return X_train

X_test = preprocess_testing()



# Ensure both sets have the same features
print("Features match?", set(X_test.columns).union(set(X1.columns)) == set(X_test.columns))


Features match? True


# Cursory evaluation

In [16]:
# For creating Kaggle prediction files
def format_test_pred(model, X_train, y_train, X_test, title):
        model.fit(X_train, y_train)
        print("Fitted model")
        y_pred = model.predict(X_test)
        print("Made predictions")
        y_pred = pd.Series(y_pred, name="rating_label", index=range(1,len(y_pred)+1))
        y_pred.index.name = "id"
        y_pred.to_csv(f"./model_predictions/{title}.csv")

In [17]:
def run_models(X_train, y_train, X_test):
    fixed = 43 # Include this so the results from the report are repeatable
    models = [
          AdaBoostClassifier(estimator=LogisticRegression(max_iter = 5000, random_state=fixed), random_state=fixed),
          GaussianNB(),
          MLPClassifier(max_iter = 1000, random_state=fixed),
          LinearSVC(random_state=fixed),
          SVC(kernel='poly', degree=3, random_state=fixed, max_iter=5000),
          DecisionTreeClassifier(criterion="log_loss", random_state=fixed),
          BaggingClassifier(base_estimator=DecisionTreeClassifier(),n_estimators=15,\
                              max_samples=0.8, max_features=0.8, random_state=fixed),
          KNeighborsClassifier(n_neighbors=3),
          KNeighborsClassifier(n_neighbors=7),
          LogisticRegression(max_iter = 5000, random_state=fixed),
          DummyClassifier(strategy="most_frequent"),
          ]
    titles = [
        'ADA Boosting',
        'GNB',
            'MLP',
            'LinearSVC',
            'Polynomial (3) SVC',
            'Decision Tree',
            'Random Forest',
            'KNN-3',
            'KNN-7',
            'Logistic Regression',
            'ZeroR',]
    

    for title, model in zip(titles, models):
        print("Running model", title)
        start = time.time()
        
        acc = np.mean(cross_val_score(model, X1, y1, cv=5))
        
        format_test_pred(model, X_train, y_train, X_test, "cursory/cursory-"+title)

        end = time.time()
        t = end - start
        print("Acc was", acc, "for model", title, "in time", t)
        print("Generated predictions for", title)

print("Running models on doc2vec set")
run_models(X1, y1, X_test)




Running models on doc2vec set
Running model Logistic Regression


KeyboardInterrupt: 

# Ensemble Methods

In [None]:
def run_best_models(X_train, y_train, X_test):
    fixed = 43 # Include this so the results from the report are repeatable
    models = [
          AdaBoostClassifier(estimator=LogisticRegression(max_iter = 5000, multi_class='ovr', random_state=fixed), random_state=fixed),
          MLPClassifier(max_iter = 1000, random_state=fixed, activation='tanh'),
          MLPClassifier(max_iter = 1000, random_state=fixed, activation='logistic'),
          MLPClassifier(max_iter = 1000, random_state=fixed, activation='identity'),
          BaggingClassifier(base_estimator=DecisionTreeClassifier(),n_estimators=30,\
                              max_samples=0.8, max_features=0.8, random_state=fixed),
          ]
    titles = [
            'Logistic Boosting',
            'MLP_logistic',
            'MLP_tanh',
            'MLP id',
            'Random forest (30)'
            ]
    
    X1, X_val, y1, y_val = train_test_split(X_train, y_train, test_size=0.2)

    for title, model in zip(titles, models):
        print("Running model", title)
        
        
        model.fit(X1, y1)
        print("Fitted model")
        y_pred = model.predict(X_val)
        print("Made predictions")

        print(classification_report(y_val, y_pred))
        print(accuracy_score(y_val, y_pred))
        
        format_test_pred(model, X_train, y_train, X_test, "hyper_tuning/"+title)

        print("Generated predictions for", title)

run_best_models(X1, y1, X_test)

Running model Logistic Boosting
Fitted model
Made predictions
              precision    recall  f1-score   support

         3.0       0.00      0.00      0.00      1213
         4.0       0.70      1.00      0.82      3213
         5.0       0.00      0.00      0.00       187

    accuracy                           0.70      4613
   macro avg       0.23      0.33      0.27      4613
weighted avg       0.49      0.70      0.57      4613

0.6965098634294385


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Fitted model
Made predictions
Generated predictions for Logistic Boosting
Running model MLP_logistic
Fitted model
Made predictions
              precision    recall  f1-score   support

         3.0       0.00      0.00      0.00      1213
         4.0       0.70      1.00      0.82      3213
         5.0       0.33      0.01      0.01       187

    accuracy                           0.70      4613
   macro avg       0.34      0.33      0.28      4613
weighted avg       0.50      0.70      0.57      4613

0.6962930847604596


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Fitted model
Made predictions
Generated predictions for MLP_logistic
Running model MLP_tanh
Fitted model
Made predictions
              precision    recall  f1-score   support

         3.0       0.00      0.00      0.00      1213
         4.0       0.70      1.00      0.82      3213
         5.0       0.00      0.00      0.00       187

    accuracy                           0.70      4613
   macro avg       0.23      0.33      0.27      4613
weighted avg       0.49      0.70      0.57      4613

0.6962930847604596


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Fitted model
Made predictions
Generated predictions for MLP_tanh
Running model MLP id
Fitted model
Made predictions
              precision    recall  f1-score   support

         3.0       0.32      0.66      0.43      1213
         4.0       0.75      0.28      0.41      3213
         5.0       0.06      0.29      0.10       187

    accuracy                           0.38      4613
   macro avg       0.38      0.41      0.31      4613
weighted avg       0.61      0.38      0.41      4613

0.3830479080858444
Fitted model
Made predictions
Generated predictions for MLP id
Running model Random forest (30)




Fitted model
Made predictions
              precision    recall  f1-score   support

         3.0       0.38      0.09      0.14      1213
         4.0       0.70      0.95      0.81      3213
         5.0       0.44      0.02      0.04       187

    accuracy                           0.68      4613
   macro avg       0.51      0.35      0.33      4613
weighted avg       0.61      0.68      0.60      4613

0.6843702579666161




Fitted model
Made predictions
Generated predictions for Random forest (30)


# Tighter Feature Selection

In [None]:
# Feature selection
X1 = X1[list(set(getFeatureCorr(X1, y1)).union(set(getFeatureMI(X1, y1))))]
X_test = X_test[list(set(getFeatureCorr(X1, y1)).union(set(getFeatureMI(X1, y1))))]

# See how many columns/features we have now
print(X1.shape)
print(X1.columns)

def run_small_models(X_train, y_train, X_test):
    fixed = 43 # Include this so the results from the report are repeatable
    models = [
          AdaBoostClassifier(estimator=LogisticRegression(max_iter = 5000, multi_class='ovr', random_state=fixed), random_state=fixed),
          LogisticRegression(max_iter = 5000, multi_class='ovr', random_state=fixed),
          MLPClassifier(max_iter = 1000, random_state=fixed),
          BaggingClassifier(base_estimator=DecisionTreeClassifier(),n_estimators=15,\
                              max_samples=0.8, max_features=0.8, random_state=fixed),
          ]
    titles = [
            'Logistic Boosting',
            'Logistic',
            'MLP',
            'Random forest'
            ]
    
    X1, X_val, y1, y_val = train_test_split(X_train, y_train, test_size=0.2)

    for title, model in zip(titles, models):
        print("Running model", title)
        
        format_test_pred(model, X_train, y_train, X_test, "small/small-"+title)

        print("Generated predictions for", title)

run_small_models(X1, y1, X_test)


Best correlation features:
 desc-87       -0.099600
desc-79       -0.088456
name-27        0.083490
pagesNumber    0.079376
PublishYear   -0.073535
dtype: float64
Best mutual information features:
              Coefficient
pagesNumber     0.020620
desc-87         0.011428
name-27         0.005580
desc-79         0.004392
PublishYear     0.001771
Best correlation features:
 desc-87       -0.099600
desc-79       -0.088456
name-27        0.083490
pagesNumber    0.079376
PublishYear   -0.073535
dtype: float64
Best mutual information features:
              Coefficient
pagesNumber     0.017258
desc-87         0.011472
PublishYear     0.009885
name-27         0.005571
desc-79         0.004392
(23063, 5)
Index(['name-27', 'pagesNumber', 'desc-79', 'desc-87', 'PublishYear'], dtype='object')
Running model Logistic Boosting
Fitted model
Made predictions
Generated predictions for Logistic Boosting
Running model Logistic
Fitted model
Made predictions
Generated predictions for Logistic
Running mode



Fitted model
Made predictions
Generated predictions for Random forest
