In [None]:
!apt-get install default-jre
!java -version

In [None]:
!pip install h2o

In [None]:
import h2o


In [None]:
h2o.init()

In [None]:
import tensorflow_datasets as tfds

In [None]:
# Load the Personal Care Appliances subset of the Amazon US reviews corpus
# together with its metadata. batch_size=-1 requests the data as one full
# batch rather than an iterator (see the tfds.load documentation).
dataset,info = tfds.load("amazon_us_reviews/Personal_Care_Appliances_v1_00",with_info=True,batch_size=-1)

In [None]:
# Only the "train" split is used from here on.
train_dataset = dataset["train"]

In [None]:
# Display the dataset metadata returned by tfds.load.
info

In [None]:
# NOTE: this rebinds `dataset`. Before this cell it is the mapping of splits
# returned by tfds.load; after it, it is the NumPy conversion of the train
# split. Re-running the cells above/below out of order will break on this.
dataset = tfds.as_numpy(train_dataset)

In [None]:
# Inspect the converted structure (fields live under the "data" key).
dataset

In [None]:
# Pull out the four fields used to build the H2O frame below.
helpful_votes = dataset["data"]["helpful_votes"]
review_headline = dataset["data"]["review_headline"]
review_body = dataset["data"]["review_body"]
star_rating = dataset["data"]["star_rating"]


In [None]:
import numpy as np

In [None]:
def _as_text(values):
    """Return an object array in which every bytes entry is decoded as UTF-8.

    Entries that are not bytes are passed through unchanged, so the helper
    is safe whether or not the source arrays hold raw bytes.
    """
    return np.array(
        [v.decode("utf-8", errors="replace") if isinstance(v, bytes) else v for v in values],
        dtype=object,
    )


# tfds.as_numpy yields text features as bytes; uploading those unchanged
# would stringify them as b'...' literals (prefix, quotes and \x escapes
# included) instead of the review text itself, so decode them first.
review_columns = np.hstack((
    helpful_votes[:, None],
    _as_text(review_headline)[:, None],
    _as_text(review_body)[:, None],
    star_rating[:, None],
))

# One row per review; columns are stacked side by side in the order named below.
reviewdf = h2o.H2OFrame(
    review_columns,
    column_names=["votes", "headline", "review", "rating"],
    column_types=["numeric", "string", "string", "numeric"],
)

In [None]:
# Peek at the first two rows to check the frame was built as expected.
reviewdf.head(2)

In [None]:
# Binary label: "1" when the star rating is 4 or higher, otherwise "0".
# This adds the column to reviewdf in place.
reviewdf["target"] = (reviewdf["rating"] >= 4).ifelse("1","0")

In [None]:
# Same two rows again, now including the new "target" column.
reviewdf.head(2)

In [None]:
reviewdf.shape

In [None]:
# Counts per label value, to see how balanced the two classes are.
reviewdf["target"].table()

In [None]:
import nltk
from nltk.corpus import stopwords
# Fetch NLTK's stop-word corpus, then keep the English entries in a set
# for the stop-word filter used during tokenisation.
nltk.download("stopwords")
stop_words = {word for word in stopwords.words("english")}

In [None]:
def preprocess_text(line, stop_word_list=None):
    """Tokenise a text column into cleaned, lower-cased words.

    Steps, in order: split on runs of non-word characters, trim, lower-case,
    drop tokens shorter than two characters, drop stop words. Rows that are
    NA are kept by both filters (H2O's tokenize is documented to use NA rows
    as separators between the original strings, which the word2vec steps
    downstream rely on).

    Args:
        line: single-column H2O frame of strings to tokenise.
        stop_word_list: optional iterable of words to remove. Defaults to the
            notebook-level ``stop_words`` collection.

    Returns:
        The tokenised frame with short tokens and stop words removed.
    """
    if stop_word_list is None:
        stop_word_list = stop_words
    # Hand isin() a plain list. NOTE(review): list input is the form used in
    # H2O's own examples; whether a set is accepted is not assumed here.
    excluded = sorted(stop_word_list)

    words = line.tokenize("\\W+").trim().tolower()

    long_enough = (words.nchar() >= 2) | words.isna()
    words = words[long_enough, :]

    # The original computed this filter but returned the unfiltered frame,
    # so stop words were never actually removed.
    not_stop_word = words.isna() | (~words.isin(excluded))
    return words[not_stop_word, :]

In [None]:
# Tokenise the review bodies; this frame is what the word2vec model is trained on.
word_reviews = preprocess_text(reviewdf["review"])

In [None]:
word_reviews.head(5)

In [None]:
# Tokenise the headlines the same way so they can be embedded with the same model later.
word_headline = preprocess_text(reviewdf["headline"])

In [None]:
word_headline.head()

In [None]:
from h2o.estimators import H2OWord2vecEstimator, H2OGradientBoostingEstimator,H2OXGBoostEstimator

In [None]:
vec_model = H2OWord2vecEstimator(vec_size = 100, model_id = "reviews_w2v.model")
vec_model.train(training_frame=word_reviews)

In [None]:
vec_model.find_synonyms("toothpaste")

In [None]:
h2o.save_model(vec_model,path="./")

In [None]:
review_vecs = vec_model.transform(word_reviews,aggregate_method="AVERAGE")

In [None]:
reviewdf_ext = reviewdf.cbind(review_vecs)

In [None]:
df_train,df_valid = reviewdf_ext.split_frame(ratios=[0.8])

In [None]:
df_valid["target"].table()

In [None]:
gbm_baseline = H2OGradientBoostingEstimator(
    stopping_metric="AUC",stopping_tolerance=0.001,stopping_rounds=5,score_tree_interval=10
)

In [None]:
gbm_baseline.train(x=review_vecs.names,y="target",training_frame=df_train,validation_frame=df_valid)

In [None]:
print("Baseline Auc",round( gbm_baseline.auc(valid=True), 3 ) )

In [None]:
gbm_baseline.confusion_matrix(valid=True)

In [None]:
gbm_balanced = H2OGradientBoostingEstimator(
    stopping_metric="AUC",stopping_tolerance=0.001,stopping_rounds=5,score_tree_interval=10,
    balance_classes=True
)

In [None]:
gbm_balanced.train(x=review_vecs.names,y="target",training_frame=df_train,validation_frame=df_valid)

In [None]:
print("Balaced AUC : ",round( gbm_balanced.auc(valid=True), 3 ) )

In [None]:
gbm_balanced.confusion_matrix(valid=True)

In [None]:
gbm_baseline_add_col = H2OGradientBoostingEstimator(
    stopping_metric="AUC",stopping_tolerance=0.001,stopping_rounds=5,score_tree_interval=10
)

In [None]:
gbm_baseline_add_col.train(x=["votes"] + review_vecs.names,
                           y="target",training_frame=df_train,validation_frame=df_valid)

In [None]:
print("Baseline Add colAUC : ",round( gbm_baseline_add_col.auc(valid=True), 3 ) )

In [None]:
gbm_baseline_add_col.confusion_matrix(valid=True)

In [None]:
headline_vecs = vec_model.transform(word_headline, aggregate_method="AVERAGE")
headline_vecs.names = ["headline_" + s for s in headline_vecs.names]

In [None]:
headline_vecs.head(2)

In [None]:
reviewdf_ext = reviewdf_ext.cbind(headline_vecs)

In [None]:
df_train,df_valid = reviewdf_ext.split_frame(ratios=[0.8,])

In [None]:
gbm_baseline_all_col = H2OGradientBoostingEstimator(
    stopping_metric="AUC",stopping_tolerance=0.001,stopping_rounds=5,score_tree_interval=10
)

In [None]:
gbm_baseline_all_col.train(
    x=["votes"] + review_vecs.names + headline_vecs.names,
                           y="target",training_frame=df_train,validation_frame=df_valid
    )

In [None]:
print("Baseline All col AUC : ",round( gbm_baseline_all_col.auc(valid=True), 3 ) )

In [None]:
gbm_baseline_all_col.confusion_matrix(valid=True)

In [None]:
gbm_baseline_all_col.score_history()

In [None]:
gbm_baseline_all_col.varimp_plot()

In [None]:
gbm_hyper= H2OGradientBoostingEstimator(
    ntrees=100,max_depth= 6, learn_rate=0.1
)

In [None]:
gbm_hyper.train(
    x=["votes"] + review_vecs.names + headline_vecs.names,
                           y="target",training_frame=df_train,validation_frame=df_valid
    )

In [None]:
print("Hyperparameter AUC : ",round( gbm_hyper.auc(valid=True), 3 ) )

In [None]:
gbm_hyper.confusion_matrix(valid=True)

In [None]:
gbm_hyper.score_history()

In [None]:
gbm_hyper.varimp_plot()


In [None]:
xgb_feat_all= H2OXGBoostEstimator(
    ntrees=100,max_depth= 6, learn_rate=0.1,max_leaves = 6,tree_method="hist", grow_policy="lossguide"
)

In [None]:
xgb_feat_all.train(
    x=["votes"] + review_vecs.names + headline_vecs.names,
                           y="target",training_frame=df_train,validation_frame=df_valid
    )

In [None]:
print("Hyperparameter AUC : ",round( xgb_feat_all.auc(valid=True), 3 ) )

In [None]:
xgb_feat_all.confusion_matrix(valid=True)

In [None]:
df_train,df_valid,df_test = reviewdf_ext.split_frame(ratios=[0.7,0.15])

In [None]:
gbm_final= H2OXGBoostEstimator(
    ntrees=200,max_depth= 6, learn_rate=0.1,
)

In [None]:
gbm_final.train(
    x=["votes"] + review_vecs.names + headline_vecs.names,
                           y="target",training_frame=df_train,validation_frame=df_valid
    )

In [None]:
print("Xgbost Final AUC : ",round( gbm_final.auc(valid=True), 3 ) )

In [None]:
gbm_final.confusion_matrix(valid=True)

In [None]:
gbm_final.model_performance(df_test)

In [None]:
gbm_final.varimp_plot()