In [None]:
## Train model

In [None]:
import os
import json
from joblib import dump
from snowflake.snowpark import functions as fn
from snowflake.snowpark.session import Session
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV
from snowflake.snowpark.types import Variant
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

# Read the connection parameters inside a context manager so the file handle is
# closed deterministically instead of being left to the garbage collector.
with open("./creds.json") as creds_file:
  connection_parameters = json.load(creds_file)

session = Session.builder.configs(connection_parameters).create()

# Reset imports/packages first so re-running this cell starts from a clean state,
# then declare the packages to attach to this session.
session.clear_imports()
session.clear_packages()
session.add_packages("snowflake-snowpark-python")
session.add_packages("scikit-learn", "pandas", "numpy", "nltk", "joblib", "cachetools")


In [None]:

def _load_labelled_reviews(session: Session, table_name: str, row_limit: int):
  """Fetch at most `row_limit` rows of `table_name` as `(reviews, labels)` arrays.

  A `SENTIMENT_FLAG` column is derived from `SENTIMENT`: 1 when the value is
  "positive", 2 otherwise. Both returned arrays come from one pandas
  materialisation, so element i of each array belongs to the same row.
  """
  dataset = session.table(table_name)
  labelled = dataset.withColumn(
    "SENTIMENT_FLAG",
    fn.when(dataset.SENTIMENT == "positive", 1).otherwise(2)
  ).limit(row_limit)
  # Materialise exactly once: the LIMIT has no ORDER BY, so two separate
  # queries are not guaranteed to return the same rows (or the same order),
  # which would silently misalign reviews and labels.
  frame = labelled.toPandas()
  return frame.REVIEW.values, frame.SENTIMENT_FLAG.values


def train_model(session: Session, train_dataset_name: str, test_dataset_name: str) -> Variant:
  """Train, tune and publish a sentiment classifier for review text.

  Steps:
    1. Load up to 100 training rows and 20 test rows and label them
       (1 = "positive", 2 = anything else).
    2. Fit a TF-IDF vectorizer on the training reviews.
    3. Hold out 20% of the training rows for validation and pick the classifier
       (Random Forest or Gradient Boosting) with the best validation accuracy.
    4. Grid-search hyper-parameters for the winning classifier (3-fold CV).
    5. Write a classification report for the test rows to /tmp, and upload the
       tuned model and the vectorizer (joblib files) to the `@models` stage.

  Args:
    session: Active Snowpark session.
    train_dataset_name: Name of the table holding training rows; must expose
      `REVIEW` and `SENTIMENT` columns.
    test_dataset_name: Name of the table holding test rows; same columns.

  Returns:
    A dict with the selected model name, its validation accuracy, the tuned
    hyper-parameters and the classification report text for the test rows.

  Raises:
    ValueError: If the training table yields no rows.
  """
  train_reviews, train_labels = _load_labelled_reviews(session, train_dataset_name, 100)
  test_reviews, test_labels = _load_labelled_reviews(session, test_dataset_name, 20)

  if len(train_reviews) == 0:
    # Fail with a clear message instead of a ZeroDivisionError when computing min_df below.
    raise ValueError(f'Training dataset {train_dataset_name!r} returned no rows')

  print('Configuring parameters ...')
  analyzer = 'word'
  ngram_range = (1, 2)  # unigrams and bigrams

  token = "[\\w']+\\w\\b"
  max_df = 0.02  # drop terms present in more than 2% of the training documents
  min_df = 1 * 1. / len(train_reviews)  # i.e. a term must occur in at least one document

  binary = True

  print('Building Sparse Matrix ...')
  vectorizer = TfidfVectorizer(
    token_pattern=token,
    ngram_range=ngram_range,
    analyzer=analyzer,
    max_df=max_df,
    min_df=min_df,
    vocabulary=None,
    binary=binary
  )
  train_tfidf = vectorizer.fit_transform(train_reviews)
  # The test reviews are only transformed: the vocabulary comes from the training data.
  test_tfidf = vectorizer.transform(test_reviews)

  # Split the data into training and validation sets
  fit_x, val_x, fit_y, val_y = train_test_split(train_tfidf, train_labels, test_size=0.2, random_state=42)

  # Define classifiers (seeded so repeated runs on the same rows give the same models)
  classifiers = [
    ('Random Forest', RandomForestClassifier(random_state=42)),
    ('Gradient Boosting', GradientBoostingClassifier(random_state=42))
  ]

  best_name = None
  best_classifier = None
  # Start below any achievable accuracy so a candidate is always selected,
  # even when every model scores 0.0 on the validation split.
  best_accuracy = -1.0

  for clf_name, clf in classifiers:
    print(f'Fitting {clf_name} model ...')
    clf.fit(fit_x, fit_y)
    val_pred = clf.predict(val_x)
    accuracy = accuracy_score(val_y, val_pred)
    # This score is measured on the held-out validation split, not the training rows.
    print(f'Validation accuracy ({clf_name}): {accuracy}')
    if accuracy > best_accuracy:
      best_accuracy = accuracy
      best_classifier = clf
      best_name = clf_name

  print('Best model selected:', best_classifier)

  # Parameter tuning for the best model (every key is accepted by both candidate classifiers)
  param_grid = {
    'n_estimators': [100, 200],
    'max_depth': [5, 10],
    'min_samples_split': [2, 5],
    'min_samples_leaf': [1, 2]
  }

  print('Performing parameter tuning ...')
  grid_search = GridSearchCV(estimator=best_classifier, param_grid=param_grid, scoring='accuracy', cv=3)
  grid_search.fit(fit_x, fit_y)

  best_model = grid_search.best_estimator_
  print('Best model after parameter tuning:', best_model)
  test_predictions = best_model.predict(test_tfidf)

  # Generate classification report
  report_text = classification_report(test_labels, test_predictions)
  classification_report_output_dir = '/tmp'
  classification_report_file = os.path.join(classification_report_output_dir, 'classification_report.txt')
  with open(classification_report_file, 'w') as file:
    file.write(report_text)

  # Save the best model
  model_output_dir = '/tmp'
  model_file = os.path.join(model_output_dir, 'best_model.joblib')
  dump(best_model, model_file, compress=True)
  session.file.put(model_file, "@models", auto_compress=True, overwrite=True)

  # Upload the Vectorizer to a stage
  vectorizer_output_dir = '/tmp'
  vectorizer_file = os.path.join(vectorizer_output_dir, 'vectorizer.joblib')
  dump(vectorizer, vectorizer_file, compress=True)
  session.file.put(vectorizer_file, "@models", auto_compress=True, overwrite=True)

  # Return a JSON-friendly summary so the caller receives the evaluation
  # results; the report file above is only written to the local /tmp directory.
  return {
    'best_model': best_name,
    'validation_accuracy': float(best_accuracy),
    'best_params': grid_search.best_params_,
    'classification_report': report_text
  }

In [None]:
try:
  # Register the training routine as the "train_model" stored procedure, replacing any previous definition.
  session.sproc.register(func=train_model, name="train_model", replace=True)
  # Run the procedure against the training and test tables.
  session.call("train_model", "TRAIN_DATASET", "TEST_DATASET")
finally:
  # Always release the connection, even if registration or training fails.
  session.close()