In [0]:
#!pip install --upgrade pip
!pip install nltk
import nltk
nltk.download('punkt')
import pyspark.sql.functions as F
from pyspark.sql.types import *
from pyspark.sql.functions import udf

You should consider upgrading via the '/databricks/python3/bin/python -m pip install --upgrade pip' command.[0m
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [0]:
def tokenize1(text):
    words = nltk.word_tokenize(text)
    return words  
tokenize_word = udf(lambda x: tokenize1(x)  , ArrayType(StringType()))

In [0]:
def tokenize2(text):
    sents = nltk.sent_tokenize(text)
    return sents  
tokenize_sent = udf(lambda x: tokenize2(x)  , ArrayType(StringType()))

In [0]:
from nltk.corpus import stopwords
nltk.download('stopwords')
nltk.download('punkt')
stop_en = stopwords.words('english')
def remove_stopwords1(word_list):
    filtered_words = [word for word in word_list if word not in stop_en]
    return filtered_words
remove_stopwords = udf(lambda x: remove_stopwords1(x) , ArrayType(StringType()))

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [0]:
def remove_noise1(word_list):
    filtered_words = [word for word in word_list if word.isalnum() and len(word)>2]
    return filtered_words
remove_noise = udf(lambda x: remove_noise1(x) , ArrayType(StringType()))

In [0]:
from nltk.stem import SnowballStemmer
def stem1(word_list):
    snowball = SnowballStemmer(language='english')
    stemmed_words = [snowball.stem(word) for word in word_list]
    return stemmed_words
stem = udf(lambda x: stem1(x) , ArrayType(StringType()))

In [0]:
from nltk.sentiment import SentimentIntensityAnalyzer
nltk.download('vader_lexicon')
def sentiment1(text):
  sia = SentimentIntensityAnalyzer()
  return sia.polarity_scores(text)['compound']
sentiment = udf(lambda x: sentiment1(x) , FloatType())

[nltk_data] Downloading package vader_lexicon to /root/nltk_data...


In [0]:
#greg
#create udf for adding length of reviews by word count for all alphanumeric "words"

def get_length1(word_list):
  alphanum_words = [word for word in word_list if word.isalnum()]
  N = len(alphanum_words)
  return N

get_length = udf(lambda x: get_length1(x), IntegerType())

In [0]:
#greg
#create udf for adding average word length by review

def get_average_word_length1(word_list):
  alphanum_word_lengths = [len(word) for word in word_list if word.isalnum()]
  avg_word_len = sum(alphanum_word_lengths)/len(alphanum_word_lengths)
  return avg_word_len

get_average_word_length = udf(lambda x: get_average_word_length1(x), FloatType())

In [0]:
#greg
#create udf for ratio of capitalized characters to total characters (n), with no white spaces

def capital_ratio1(word_list):
  s = "".join(word_list)
  n = len(s)
  cap_n = len([char for char in s if char.isupper()])
  ratio_cap_to_total = cap_n / n
  return ratio_cap_to_total

capital_ratio = udf(lambda x: capital_ratio1(x), FloatType())

In [0]:
#greg
#create udf for ratio of long words (>5 letters) to total words (N)

def long_word_ratio1(word_list):
  alphanum_words = [word for word in word_list if word.isalnum()]
  N = len(alphanum_words)
  long_words = [word for word in alphanum_words if len(word) > 5]
  long_words_count = len(long_words)
  ratio_long_to_total = long_words_count / N
  return ratio_long_to_total

long_word_ratio = udf(lambda x: long_word_ratio1(x), FloatType())

In [0]:
#greg
#create udf for ratio of words after stop word removal vs total words

stop_en = stopwords.words('english')

def filtered_word_ratio1(word_list):
  alphanum_words = [word for word in word_list if word.isalnum()]
  N = len(alphanum_words)
  non_filtered_words = [word for word in alphanum_words if word not in stop_en]
  num_non_filtered_words = len(non_filtered_words)
  ratio_non_filtered_to_total = num_non_filtered_words / N
  return ratio_non_filtered_to_total

filtered_word_ratio = udf(lambda x: filtered_word_ratio1(x), FloatType())

In [0]:
#greg
#create udf for ratio of punctuation "words" vs total alphanumeric words

def punctuation_word_ratio1(word_list):
  alphanum_words = [word for word in word_list if word.isalnum()]
  N = len(alphanum_words)
  punctuation_words = [word for word in word_list if not(word.isalnum())]
  num_punctuation_words = len(punctuation_words)
  ratio_punctuation_to_total = num_punctuation_words / N
  return ratio_punctuation_to_total

punctuation_word_ratio = udf(lambda x: punctuation_word_ratio1(x), FloatType())

In [0]:
from pyspark.sql.functions import lower
def process_data(df):
  dfText = df.select("Index","Review" ,"polarity","real_fake", tokenize_word("Review").alias("tokenized_words"))
  dfText = dfText.withColumn("sentiment" ,sentiment("Review"))
  dfText = dfText.withColumn("tokenized_sents" ,tokenize_sent("Review"))
  dfText = dfText.withColumn("no_stopwords", remove_stopwords("tokenized_words"))
  dfText = dfText.withColumn("no_noise", remove_noise("no_stopwords"))
  dfText = dfText.withColumn("stemmed", stem("no_noise"))
  #dfText = dfText.select('*', F.concat_ws("_","real_fake","polarity").alias("target"))
  dfText = dfText.withColumn("length_in_words", get_length("tokenized_words"))
  dfText = dfText.withColumn("average_word_length", get_average_word_length("tokenized_words"))
  dfText = dfText.withColumn("capital_char_ratio", capital_ratio("tokenized_words"))
  dfText = dfText.withColumn("long_word_ratio", long_word_ratio("tokenized_words"))
  dfText = dfText.withColumn("non_stop_word_ratio", filtered_word_ratio("tokenized_words"))
  dfText = dfText.withColumn("punctuation_ratio", punctuation_word_ratio("tokenized_words"))

  return dfText

In [0]:
df_raw_old = spark.read.option("escape","\"").option("header",True).csv("/FileStore/tables/Original_data.csv")
df_old_all = process_data(df_raw_old)
#df_test = df_test_all.select("Review","stemmed","sentiment","target","real_fake")
df_old = df_old_all.select("Review","stemmed","sentiment","polarity", "length_in_words", "average_word_length", \
                             "capital_char_ratio", "long_word_ratio", "non_stop_word_ratio", "punctuation_ratio", "real_fake")

old_train_split, old_test_split = df_old.randomSplit(weights = [0.80, 0.20], seed = 1)
old_train_split.cache()
old_test_split.cache()

df_raw_new = spark.read.option("escape","\"").option("header",True).csv("/FileStore/tables/Hotel_Reviews_Calgary.csv")
df_new_all = process_data(df_raw_new)
#df_train = df_train.select("Review","stemmed","sentiment","target","real_fake")
df_new = df_new_all.select("Review","stemmed","sentiment","polarity", "length_in_words", "average_word_length", \
                             "capital_char_ratio", "long_word_ratio", "non_stop_word_ratio", "punctuation_ratio", "real_fake")

new_train_split, new_test_split = df_new.randomSplit(weights = [0.80, 0.20], seed = 1)
new_train_split.cache()
new_test_split.cache()

#df_train.display()
combined_train = old_train_split.union(new_train_split)
combined_test = old_test_split.union(new_test_split)

combined_train.cache()
combined_test.cache()

Out[17]: DataFrame[Review: string, stemmed: array<string>, sentiment: float, polarity: string, length_in_words: int, average_word_length: float, capital_char_ratio: float, long_word_ratio: float, non_stop_word_ratio: float, punctuation_ratio: float, real_fake: string]

In [0]:
print((combined_train.count(), len(combined_train.columns)))

(2119, 11)


In [0]:
combined_train.show(1)

+--------------------+--------------------+---------+--------+---------------+-------------------+------------------+---------------+-------------------+-----------------+---------+
|              Review|             stemmed|sentiment|polarity|length_in_words|average_word_length|capital_char_ratio|long_word_ratio|non_stop_word_ratio|punctuation_ratio|real_fake|
+--------------------+--------------------+---------+--------+---------------+-------------------+------------------+---------------+-------------------+-----------------+---------+
| Barely Average H...|[bare, averag, ho...|   0.1901|negative|            187|           4.390374|         0.0391924|     0.26737967|          0.5828877|      0.112299465|     real|
+--------------------+--------------------+---------+--------+---------------+-------------------+------------------+---------------+-------------------+-----------------+---------+
only showing top 1 row



# Logistic Regression

In [0]:
from pyspark.ml.feature import HashingTF, IDF, IndexToString, StringIndexer
from pyspark.ml import Pipeline
from pyspark.ml.feature import StandardScaler
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.classification import LinearSVC
from pyspark.ml.classification import NaiveBayes
from pyspark.ml.classification import GBTClassifier
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

columns_to_be_scaled = ['length_in_words', 'average_word_length', 'capital_char_ratio', 'long_word_ratio', 'non_stop_word_ratio', 'punctuation_ratio']
assemblers = [VectorAssembler(inputCols=[col], outputCol=col + "_vec") for col in columns_to_be_scaled]
scalers  = [StandardScaler(inputCol=col + "_vec", outputCol=col + "_scaled") for col in columns_to_be_scaled]
scaling_pipeline = Pipeline(stages=assemblers + scalers)

hashingTF = HashingTF(inputCol="stemmed", outputCol="rawFeatures", numFeatures=20000)
idf = IDF(inputCol="rawFeatures", outputCol="TF_IDF", minDocFreq=2)
label_strIdx1 = StringIndexer(inputCol="polarity", outputCol="polarity_idx")
assembles = VectorAssembler(inputCols = ['TF_IDF','sentiment','polarity_idx', 'length_in_words_scaled', 'average_word_length_scaled', 'capital_char_ratio_scaled', 'long_word_ratio_scaled', 'non_stop_word_ratio_scaled', 'punctuation_ratio_scaled'],outputCol="features")
label_strIdx2 = StringIndexer(inputCol="real_fake", outputCol="label")
lr = LogisticRegression(regParam = 0.3)
label_idxStr = IndexToString(inputCol = "label", outputCol = "article_class")

pipeline = Pipeline(stages=[scaling_pipeline, hashingTF, idf,label_strIdx1, assembles, label_strIdx2, lr ,label_idxStr])

In [0]:
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder

paramGrid = ParamGridBuilder() \
    .addGrid(hashingTF.numFeatures, [10000,20000,50000]) \
    .addGrid(lr.regParam, [0.1, 0.3 ,0.5]) \
    .build()
crossval_lr = CrossValidator(estimator=pipeline, estimatorParamMaps=paramGrid,evaluator= MulticlassClassificationEvaluator(),numFolds=3,parallelism = 100 )  # use 3+ folds in practice

### ON REFINEMENT DO 5 FOLD? MAYBE JUST FOR BEST PRELIMINARY MODEL?###

In [0]:
cvModel_lr = crossval_lr.fit(combined_train)

In [0]:
pred_old_lr = cvModel_lr.transform(old_test_split)
pred_new_lr = cvModel_lr.transform(new_test_split)
pred_combined_lr = cvModel_lr.transform(combined_test)

pred_old_lr.cache()
pred_new_lr.cache()
pred_combined_lr.cache()

In [0]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
eval = MulticlassClassificationEvaluator(labelCol="label",predictionCol="prediction",metricName="accuracy")
acc_our_data = eval.evaluate(pred_new_lr)
print("our data: ", acc_our_data)
acc_original_data = eval.evaluate(pred_old_lr)
print("original data: ", acc_original_data)
acc_original_data = eval.evaluate(pred_combined_lr)
print("combined data: ", acc_original_data)

In [0]:
params = [{
      p.name: v
      for p,
      v in m.items()
   }
   for m in cvModel_lr.getEstimatorParamMaps()
]
import pandas as pd

pd.DataFrame.from_dict([{
      cvModel_lr.getEvaluator().getMetricName(): metric,
      ** ps
   }
   for ps, metric in zip(params, cvModel_lr.avgMetrics)
])

# Random Forest Classifier

In [0]:
#greg

from pyspark.ml.feature import HashingTF, IDF, IndexToString, StringIndexer
from pyspark.ml import Pipeline
from pyspark.ml.feature import StandardScaler
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.classification import LinearSVC
from pyspark.ml.classification import NaiveBayes
from pyspark.ml.classification import GBTClassifier
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

columns_to_be_scaled = ['length_in_words', 'average_word_length', 'capital_char_ratio', 'long_word_ratio', 'non_stop_word_ratio', 'punctuation_ratio']
assemblers = [VectorAssembler(inputCols=[col], outputCol=col + "_vec") for col in columns_to_be_scaled]
scalers  = [StandardScaler(inputCol=col + "_vec", outputCol=col + "_scaled") for col in columns_to_be_scaled]
scaling_pipeline = Pipeline(stages=assemblers + scalers)

hashingTF = HashingTF(inputCol="stemmed", outputCol="rawFeatures")
idf = IDF(inputCol="rawFeatures", outputCol="TF_IDF", minDocFreq=2)
label_strIdx1 = StringIndexer(inputCol="polarity", outputCol="polarity_idx")
assembles = VectorAssembler(inputCols = ['TF_IDF','sentiment','polarity_idx', 'length_in_words_scaled', 'average_word_length_scaled', 'capital_char_ratio_scaled', 'long_word_ratio_scaled', 'non_stop_word_ratio_scaled', 'punctuation_ratio_scaled'],outputCol="features")
label_strIdx2 = StringIndexer(inputCol="real_fake", outputCol="label")
rfc = RandomForestClassifier(featuresCol="features",labelCol="label")
label_idxStr = IndexToString(inputCol = "label", outputCol = "article_class")

pipeline_rfc = Pipeline(stages=[scaling_pipeline, hashingTF, idf,label_strIdx1 , assembles, label_strIdx2, rfc ,label_idxStr])

In [0]:
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder

paramGrid = ParamGridBuilder() \
    .addGrid(hashingTF.numFeatures, [10000,20000,50000]) \
    .addGrid(rfc.maxDepth, [3, 5, 9]) \
    .addGrid(rfc.numTrees, [10, 20, 50]) \
    .build()
crossval_rfc = CrossValidator(estimator=pipeline_rfc, estimatorParamMaps=paramGrid,evaluator= MulticlassClassificationEvaluator(),numFolds=2,parallelism = 100 )  # use 3+ folds in practice

In [0]:
cvModel_rfc = crossval_rfc.fit(combined_train)



In [0]:
pred_old_rfc = cvModel_rfc.transform(old_test_split)
pred_new_rfc = cvModel_rfc.transform(new_test_split)
pred_combined_rfc = cvModel_rfc.transform(combined_test)

pred_old_rfc.cache()
pred_new_rfc.cache()
pred_combined_rfc.cache()

Out[23]: DataFrame[Review: string, stemmed: array<string>, sentiment: float, polarity: string, length_in_words: int, average_word_length: float, capital_char_ratio: float, long_word_ratio: float, non_stop_word_ratio: float, punctuation_ratio: float, real_fake: string, length_in_words_vec: vector, average_word_length_vec: vector, capital_char_ratio_vec: vector, long_word_ratio_vec: vector, non_stop_word_ratio_vec: vector, punctuation_ratio_vec: vector, length_in_words_scaled: vector, average_word_length_scaled: vector, capital_char_ratio_scaled: vector, long_word_ratio_scaled: vector, non_stop_word_ratio_scaled: vector, punctuation_ratio_scaled: vector, rawFeatures: vector, TF_IDF: vector, polarity_idx: double, features: vector, label: double, rawPrediction: vector, probability: vector, prediction: double, article_class: string]

In [0]:
eval = MulticlassClassificationEvaluator(labelCol="label",predictionCol="prediction",metricName="accuracy")
acc_new_data = eval.evaluate(pred_new_rfc)
print("new data: ", acc_new_data)
acc_old_data = eval.evaluate(pred_old_rfc)
print("old data: ", acc_old_data)
acc_combined_data = eval.evaluate(pred_combined_rfc)
print("combined data: ", acc_combined_data)

new data:  0.8100558659217877
old data:  0.5761589403973509
combined data:  0.6632016632016632


In [0]:
params = [{
      p.name: v
      for p,
      v in m.items()
   }
   for m in cvModel_rfc.getEstimatorParamMaps()
]
import pandas as pd

pd.DataFrame.from_dict([{
      cvModel_rfc.getEvaluator().getMetricName(): metric,
      ** ps
   } for ps, metric in zip(params, cvModel_rfc.avgMetrics)
])

Unnamed: 0,f1,numFeatures,maxDepth,numTrees
0,0.490034,10000,3,10
1,0.48398,10000,3,20
2,0.484092,10000,3,50
3,0.538304,10000,5,10
4,0.515189,10000,5,20
5,0.504999,10000,5,50
6,0.593719,10000,9,10
7,0.616286,10000,9,20
8,0.573703,10000,9,50
9,0.498955,20000,3,10


In [0]:
print("Best regularization parameter: ", cvModel_rfc.bestModel.stages[6]._java_obj.getRegParam())
print("Best hashing number of features: ", cvModel_rfc.bestModel.stages[1]._java_obj.getNumFeatures())

[0;31m---------------------------------------------------------------------------[0m
[0;31mPy4JError[0m                                 Traceback (most recent call last)
[0;32m<command-3521022214098944>[0m in [0;36m<module>[0;34m[0m
[0;32m----> 1[0;31m [0mprint[0m[0;34m([0m[0;34m"Best regularization parameter: "[0m[0;34m,[0m [0mcvModel_rfc[0m[0;34m.[0m[0mbestModel[0m[0;34m.[0m[0mstages[0m[0;34m[[0m[0;36m6[0m[0;34m][0m[0;34m.[0m[0m_java_obj[0m[0;34m.[0m[0mgetRegParam[0m[0;34m([0m[0;34m)[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m[1;32m      2[0m [0mprint[0m[0;34m([0m[0;34m"Best hashing number of features: "[0m[0;34m,[0m [0mcvModel_rfc[0m[0;34m.[0m[0mbestModel[0m[0;34m.[0m[0mstages[0m[0;34m[[0m[0;36m1[0m[0;34m][0m[0;34m.[0m[0m_java_obj[0m[0;34m.[0m[0mgetNumFeatures[0m[0;34m([0m[0;34m)[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m

[0;32m/databricks/spark/python/lib/py4j-0.10.9-src.zip/py4j/java_gateway.py

# Gradient Boosting Classifier

In [0]:
# #greg

# from pyspark.ml.feature import HashingTF, IDF, IndexToString, StringIndexer
# from pyspark.ml import Pipeline
# from pyspark.ml.feature import StandardScaler
# from pyspark.ml.classification import LogisticRegression
# from pyspark.ml.classification import RandomForestClassifier
# from pyspark.ml.classification import LinearSVC
# from pyspark.ml.classification import NaiveBayes
# from pyspark.ml.classification import GBTClassifier
# from pyspark.ml.feature import VectorAssembler
# from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# columns_to_be_scaled = ['length_in_words', 'average_word_length', 'capital_char_ratio', 'long_word_ratio', 'non_stop_word_ratio', 'punctuation_ratio']
# assemblers = [VectorAssembler(inputCols=[col], outputCol=col + "_vec") for col in columns_to_be_scaled]
# scalers  = [StandardScaler(inputCol=col + "_vec", outputCol=col + "_scaled") for col in columns_to_be_scaled]
# scaling_pipeline = Pipeline(stages=assemblers + scalers)

# hashingTF = HashingTF(inputCol="stemmed", outputCol="rawFeatures")
# idf = IDF(inputCol="rawFeatures", outputCol="TF_IDF", minDocFreq=2)
# label_strIdx1 = StringIndexer(inputCol="polarity", outputCol="polarity_idx")
# assembles = VectorAssembler(inputCols = ['TF_IDF','sentiment','polarity_idx', 'length_in_words_scaled', 'average_word_length_scaled', 'capital_char_ratio_scaled', 'long_word_ratio_scaled', 'non_stop_word_ratio_scaled', 'punctuation_ratio_scaled'],outputCol="features")
# label_strIdx2 = StringIndexer(inputCol="real_fake", outputCol="label")
# gbtc = GBTClassifier(featuresCol="features",labelCol="label")
# label_idxStr = IndexToString(inputCol = "label", outputCol = "article_class")

# pipeline_gbtc = Pipeline(stages=[scaling_pipeline, hashingTF, idf,label_strIdx1 , assembles, label_strIdx2, gbtc ,label_idxStr])

In [0]:
# from pyspark.ml.tuning import CrossValidator, ParamGridBuilder

# paramGrid = ParamGridBuilder() \
#     .addGrid(hashingTF.numFeatures, [10000,16384,32768]) \
#     .addGrid(gbtc.maxDepth, [3, 5, 7]) \
#     .addGrid(gbtc.stepSize, [0.1, 0.5, 1.0]) \
#     .build()
# crossval_gbtc = CrossValidator(estimator=pipeline_gbtc, estimatorParamMaps=paramGrid,evaluator= MulticlassClassificationEvaluator(),numFolds=2,parallelism = 100 )  # use 3+ folds in practice

In [0]:
# cvModel_gbtc = crossval_gbtc.fit(combined_train)

In [0]:
# pred_old_gbtc = cvModel_gbtc.transform(old_test_split)
# pred_new_gbtc = cvModel_gbtc.transform(new_test_split)
# pred_combined_gbtc = cvModel_gbtc.transform(combined_test)

# pred_old_gbtc.cache()
# pred_new_gbtc.cache()
# pred_combined_gbtc.cache()

In [0]:
# eval = MulticlassClassificationEvaluator(labelCol="label",predictionCol="prediction",metricName="accuracy")
# acc_new_data = eval.evaluate(pred_new_gbtc)
# print("new data: ", acc_new_data)
# acc_old_data = eval.evaluate(pred_old_gbtc)
# print("old data: ", acc_old_data)
# acc_combined_data = eval.evaluate(pred_combined_gbtc)
# print("combined data: ", acc_combined_data)

In [0]:
# params = [{
#       p.name: v
#       for p,
#       v in m.items()
#    }
#    for m in cvModel_gbtc.getEstimatorParamMaps()
# ]
# import pandas as pd

# pd.DataFrame.from_dict([{
#       cvModel_gbtc.getEvaluator().getMetricName(): metric,
#       ** ps
#    } for ps, metric in zip(params, cvModel_gbtc.avgMetrics)
# ])

In [0]:
# print("Best regularization parameter: ", cvModel_gbtc.bestModel.stages[6]._java_obj.getRegParam())
# print("Best hashing number of features: ", cvModel_gbtc.bestModel.stages[1]._java_obj.getNumFeatures())

[0;31m---------------------------------------------------------------------------[0m
[0;31mNameError[0m                                 Traceback (most recent call last)
[0;32m<command-3521022214098952>[0m in [0;36m<module>[0;34m[0m
[0;32m----> 1[0;31m [0mprint[0m[0;34m([0m[0;34m"Best regularization parameter: "[0m[0;34m,[0m [0mcvModel_gbtc[0m[0;34m.[0m[0mbestModel[0m[0;34m.[0m[0mstages[0m[0;34m[[0m[0;36m6[0m[0;34m][0m[0;34m.[0m[0m_java_obj[0m[0;34m.[0m[0mgetRegParam[0m[0;34m([0m[0;34m)[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m[1;32m      2[0m [0mprint[0m[0;34m([0m[0;34m"Best hashing number of features: "[0m[0;34m,[0m [0mcvModel_gbtc[0m[0;34m.[0m[0mbestModel[0m[0;34m.[0m[0mstages[0m[0;34m[[0m[0;36m1[0m[0;34m][0m[0;34m.[0m[0m_java_obj[0m[0;34m.[0m[0mgetNumFeatures[0m[0;34m([0m[0;34m)[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m

[0;31mNameError[0m: name 'cvModel_gbtc' is not defined

#