# Review Helpfulness Classification - ML Models
* Model Baselines - RF (Random Forest), SVM (Support Vector Machine), MLR (Logistic Regression), XGB (XGBoost), DNN (MLP Classifier)
* Dataset - Amazon (Toys and Games, CDs and Vinyl)
* Features - LIWC Contents(Linguistic, Psychological), Readability, Subjectivity, Sentiment

In [None]:
import random, os
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.svm import SVC
from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

In [None]:
# Columns kept from the raw CSVs: the label column followed by the ten feature columns.
selected_cols = [
    'helpfulness label', 'pronoun', 'article', 'prep', 'auxverb', 'focuspresent',
    'relativ', 'space', 'subjectivity', 'sentiment', 'readability',
]

# The training and evaluation cells index the label as 'Review_Helpfulness',
# so expose the CSV label under that name; otherwise they fail with KeyError.
# NOTE(review): assumes 'helpfulness label' is the binary target those cells expect - confirm.
label_rename = {'helpfulness label': 'Review_Helpfulness'}

amzn_train = pd.read_csv('/datasets/amzn_tg_train_final.csv')
amzn_test = pd.read_csv('/datasets/amzn_tg_test_final.csv')

# Keep only the selected columns, rename the label, and drop rows with any missing value.
amzn_train = amzn_train[selected_cols].rename(columns=label_rename).dropna(axis=0)
amzn_test = amzn_test[selected_cols].rename(columns=label_rename).dropna(axis=0)

In [None]:
def seed_everything(seed):
    """Fix the random seeds used by this notebook.

    Seeds Python's ``random`` module, stores the seed as a string in the
    ``PYTHONHASHSEED`` environment variable, and seeds NumPy's global
    random state.
    """
    random.seed(seed)
    os.environ.update(PYTHONHASHSEED=str(seed))
    np.random.seed(seed)


# Seed shared with the estimators that accept ``random_state``.
seed_num = 42
seed_everything(seed=seed_num)

In [None]:
# Scaling - MinMaxScaler()

def normalize(train, test, cols=None):
  """Min-max scale selected columns, fitting the scaler on ``train`` only.

  Parameters:
    train - training DataFrame; the scaler is fitted on it
    test  - test DataFrame; transformed with the scaler fitted on ``train``
    cols  - column names to scale; when None, the notebook's default list
            below is used

  Columns from ``cols`` that are absent from ``train`` are skipped instead of
  raising KeyError, so the function still works when only a subset of the
  features has been kept. A column present in ``train`` but missing from
  ``test`` still raises KeyError.

  Returns:
    The same ``train`` and ``test`` objects; the scaled values are written
    back into them in place.
  """
  if cols is None:
    cols = ['Item_Rating', 'Item_Review_Count', 'Review_Rating', 'User_Reviews_Count',
            'User_Helpful_Votes', 'Review_Length', 'readability', 'sentiment']

  # Keep the requested order, but only for columns that actually exist in train.
  present = [col for col in cols if col in train.columns]
  if not present:
    # Nothing to scale; fitting a scaler on zero columns would fail.
    return train, test

  scaler = MinMaxScaler()
  # Fit on the training data only so no test-set statistics leak into the scaling.
  scaler.fit(train[present])
  train[present] = scaler.transform(train[present])
  test[present] = scaler.transform(test[present])

  return train, test

# Scale both splits with a scaler fitted on the training split, then rebind the results.
amzn_train, amzn_test = normalize(amzn_train, amzn_test)

In [None]:
def train_model(train=None, model=None):
  """Fit one of the baseline classifiers on the training data.

  Parameters:
    train - training DataFrame; must contain the 'Review_Helpfulness' label
            column, and every other column is used as a feature
    model - key of the model to fit: 'SVC', 'RF', 'MLR', 'XGB' or 'DNN'

  Returns:
    The fitted estimator.

  Raises:
    ValueError - if ``model`` is not one of the supported keys (previously
                 this surfaced as an UnboundLocalError at the return).
  """
  # Each entry builds its estimator lazily, so only the requested one is constructed.
  # NOTE(review): the iteration caps are deliberately left as-is to keep results
  # comparable; the notebook's captured output shows a convergence warning
  # pointing at logistic regression, so raising max_iter may be worth a look.
  builders = {
    'SVC': lambda: SVC(max_iter=10),
    'RF': lambda: RandomForestClassifier(n_estimators=10, random_state=seed_num, n_jobs=-1),
    'MLR': lambda: LogisticRegression(max_iter=10),
    'XGB': lambda: xgb.XGBClassifier(n_estimators=10, random_state=seed_num, n_jobs=-1),
    'DNN': lambda: MLPClassifier(max_iter=10, random_state=seed_num),
  }

  if model not in builders:
    raise ValueError(
      f"Unknown model {model!r}; expected one of: {', '.join(builders)}"
    )

  X_train = train.drop(columns='Review_Helpfulness')
  y_train = train['Review_Helpfulness']

  estimator = builders[model]()
  estimator.fit(X_train, y_train)
  return estimator

# Fit every baseline on the training split, in the same order as the model keys.
svc, rfc, mlr, xgc, dnn = [
  train_model(amzn_train, model=key) for key in ('SVC', 'RF', 'MLR', 'XGB', 'DNN')
]

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [None]:
def model_test(model, test=None):
  """Print accuracy, precision, recall and F1 of a fitted model on ``test``.

  Parameters:
    model - fitted estimator exposing ``predict``
    test  - DataFrame containing the 'Review_Helpfulness' label column; all
            other columns are passed to ``model.predict`` as features

  Each metric is printed on its own line, prefixed with the model's string
  representation. Nothing is returned.
  """
  features = test.drop(columns='Review_Helpfulness')
  labels = test['Review_Helpfulness']
  predictions = model.predict(features)

  scorers = (
    ('Accuracy', accuracy_score),
    ('Precision', precision_score),
    ('Recall', recall_score),
    ('F1-Score', f1_score),
  )
  for title, scorer in scorers:
    print(f'{model} {title} : {scorer(labels, predictions)}')

In [None]:
# SVM - print accuracy, precision, recall and F1 of the fitted SVC on the test split
model_test(svc, amzn_test)

SVC(max_iter=10) Accuracy : 0.6164720904779277
SVC(max_iter=10) Precision : 0.6471275105091079
SVC(max_iter=10) Recall : 0.5070448307410796
SVC(max_iter=10) F1-Score : 0.5685852057043193


In [None]:
# RF - print accuracy, precision, recall and F1 of the fitted random forest on the test split
model_test(rfc, amzn_test)

RandomForestClassifier(n_estimators=10, n_jobs=-1, random_state=100) Accuracy : 0.8666545056548705
RandomForestClassifier(n_estimators=10, n_jobs=-1, random_state=100) Precision : 0.8673151036887502
RandomForestClassifier(n_estimators=10, n_jobs=-1, random_state=100) Recall : 0.864775846294602
RandomForestClassifier(n_estimators=10, n_jobs=-1, random_state=100) F1-Score : 0.8660436137071651


In [None]:
# MLR - print accuracy, precision, recall and F1 of the fitted logistic regression on the test split
model_test(mlr, amzn_test)

LogisticRegression(max_iter=10) Accuracy : 0.7099598686610726
LogisticRegression(max_iter=10) Precision : 0.7325463057195196
LogisticRegression(max_iter=10) Recall : 0.6585544373284538
LogisticRegression(max_iter=10) F1-Score : 0.693582578531509


In [None]:
# XGBoost - print accuracy, precision, recall and F1 of the fitted XGBClassifier on the test split
model_test(xgc, amzn_test)

XGBClassifier(base_score=None, booster=None, callbacks=None,
              colsample_bylevel=None, colsample_bynode=None,
              colsample_bytree=None, early_stopping_rounds=None,
              enable_categorical=False, eval_metric=None, feature_types=None,
              gamma=None, gpu_id=None, grow_policy=None, importance_type=None,
              interaction_constraints=None, learning_rate=None, max_bin=None,
              max_cat_threshold=None, max_cat_to_onehot=None,
              max_delta_step=None, max_depth=None, max_leaves=None,
              min_child_weight=None, missing=nan, monotone_constraints=None,
              n_estimators=10, n_jobs=-1, num_parallel_tree=None,
              predictor=None, random_state=100, ...) Accuracy : 0.8710324699014959
XGBClassifier(base_score=None, booster=None, callbacks=None,
              colsample_bylevel=None, colsample_bynode=None,
              colsample_bytree=None, early_stopping_rounds=None,
              enable_categorical=Fa

In [None]:
# DNN - print accuracy, precision, recall and F1 of the fitted MLPClassifier on the test split
model_test(dnn, amzn_test)

MLPClassifier(max_iter=10, random_state=100) Accuracy : 0.7928675665815396
MLPClassifier(max_iter=10, random_state=100) Precision : 0.8085394126738794
MLPClassifier(max_iter=10, random_state=100) Recall : 0.7657822506861848
MLPClassifier(max_iter=10, random_state=100) F1-Score : 0.7865802086270087
