In [2]:
import optuna
from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator, TransformerMixin
import pandas as pd
import numpy as np
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
import re
from scipy.sparse import csr_matrix
import asyncio
import nest_asyncio

# Patch asyncio to permit nested event loops (nest_asyncio's documented purpose).
# Presumably needed because the notebook kernel already runs a loop when
# asyncio.run(main()) is called at the bottom of the notebook -- TODO confirm.
nest_asyncio.apply()

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
class TextPreprocessor(BaseEstimator, TransformerMixin):
    """Configurable text cleaner usable as a scikit-learn transformer.

    Each flag switches one step on or off. Enabled steps run in this order:
    regex cleanup, stop-word removal, stemming, lemmatization.

    Parameters
    ----------
    use_stem : bool, default=False
        Stem every token with ``PorterStemmer``.
    use_lem : bool, default=False
        Lemmatize every token with ``WordNetLemmatizer`` (runs after stemming
        when both are enabled).
    use_stop : bool, default=False
        Drop tokens found in NLTK's English stop-word list (case-insensitive).
    use_regex : bool, default=False
        Lowercase the text and delete every character that is neither an
        ASCII letter nor whitespace.
    """

    # Matches everything the regex step removes: non-letters, non-whitespace.
    _NON_LETTER = re.compile(r'[^a-zA-Z\s]')

    def __init__(self, use_stem=False, use_lem=False, use_stop=False, use_regex=False):
        self.use_stem = use_stem
        self.use_lem = use_lem
        self.use_stop = use_stop
        self.use_regex = use_regex
        self.stemmer = PorterStemmer()
        self.lemmatizer = WordNetLemmatizer()
        self.stop_words = set(stopwords.words('english'))

    def fit(self, X, y=None):
        """No-op: the transformer learns nothing from the data. Returns ``self``."""
        return self

    def transform(self, X):
        """Clean every document in ``X`` according to the configured flags.

        Parameters
        ----------
        X : pandas.Series or other 1-D sequence of str
            Raw documents. Non-Series input is wrapped in a Series so that
            plain lists and arrays work as well.

        Returns
        -------
        pandas.Series
            A new Series of cleaned documents; tokens are re-joined with single
            spaces. The input is never modified.
        """
        # dtype=object keeps the ``.str`` accessor usable even for empty input.
        texts = X if isinstance(X, pd.Series) else pd.Series(X, dtype=object)
        if self.use_regex:
            texts = texts.str.lower()
            texts = texts.apply(lambda doc: self._NON_LETTER.sub('', doc))
        # ``apply`` always returns a fresh Series, so no explicit copy is needed.
        return texts.apply(self._process_text)

    def _process_text(self, text):
        """Apply the token-level steps (stop words, stem, lemmatize) to one document."""
        words = text.split()
        if self.use_stop:
            # NLTK's English stop words are lowercase; compare case-insensitively
            # so "The"/"AND" are still removed when the lowercasing regex step is off.
            words = [w for w in words if w.lower() not in self.stop_words]
        if self.use_stem:
            words = [self.stemmer.stem(w) for w in words]
        if self.use_lem:
            words = [self.lemmatizer.lemmatize(w) for w in words]
        return " ".join(words)

def objective(trial, X_train, y_train, X_val, y_val, vectorizer):
    """Evaluate one Optuna trial and return its weighted F1 on the validation split.

    This is a plain (synchronous) function because Optuna calls the objective
    directly and needs a float back; a coroutine function would hand Optuna an
    un-awaited coroutine object instead of a score.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Source of the sampled hyperparameters.
    X_train, X_val : pandas.Series of str
        Raw training / validation documents.
    y_train, y_val : array-like
        Matching class labels.
    vectorizer : scikit-learn vectorizer
        Unfitted template (e.g. ``TfidfVectorizer``). It is cloned for every
        trial, so the caller's instance is never fitted or mutated.

    Returns
    -------
    float
        ``f1_score(..., average='weighted')`` of the fitted classifier on the
        validation data.
    """
    # Local import: only this function needs ``clone``.
    from sklearn.base import clone

    # Same parameter names and sampling order as before: one on/off switch
    # per preprocessing step.
    preprocessing_config = {
        flag: trial.suggest_categorical(flag, [True, False])
        for flag in ('use_stem', 'use_lem', 'use_stop', 'use_regex')
    }

    gb_config = {
        'n_estimators': trial.suggest_int('n_estimators', 50, 300),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 1.0, log=True),
        'max_depth': trial.suggest_int('max_depth', 1, 7),
        'min_samples_split': trial.suggest_int('min_samples_split', 2, 150),
        'min_samples_leaf': trial.suggest_int('min_samples_leaf', 1, 60),
        # Early stopping: hold out 10% of the training data and stop once the
        # score has not improved by ``tol`` for 5 consecutive iterations.
        'validation_fraction': 0.1,
        'n_iter_no_change': 5,
        'tol': 0.01,
        # Previously the seed was set on a placeholder classifier that was
        # immediately replaced, so the fitted model was unseeded.
        'random_state': 42,
    }

    preprocessor = TextPreprocessor(**preprocessing_config)
    X_train_processed = preprocessor.fit_transform(X_train)
    X_val_processed = preprocessor.transform(X_val)

    # Fit a fresh copy per trial instead of re-fitting the shared instance.
    trial_vectorizer = clone(vectorizer)
    X_train_vectorized = trial_vectorizer.fit_transform(X_train_processed)
    X_val_vectorized = trial_vectorizer.transform(X_val_processed)

    classifier = GradientBoostingClassifier(**gb_config)
    classifier.fit(X_train_vectorized, y_train)
    y_pred = classifier.predict(X_val_vectorized)
    return f1_score(y_val, y_pred, average='weighted')


async def run_study(study, objective, n_trials):
    """Run ``n_trials`` trials of ``objective`` on ``study``.

    Kept as a coroutine so existing ``await run_study(...)`` callers keep
    working. ``study.optimize`` is a blocking call that returns ``None``, so it
    cannot be wrapped in ``asyncio.create_task`` (which requires a coroutine);
    it is simply invoked once for all trials. The training work is CPU-bound,
    so scheduling it as asyncio tasks would not make it run concurrently.

    Parameters
    ----------
    study : optuna.study.Study
        Study that records the trials.
    objective : callable
        Synchronous ``objective(trial) -> float``.
    n_trials : int
        Number of trials to run.
    """
    study.optimize(objective, n_trials=n_trials)

async def main():
    """Load the review data, split it, and tune the text pipeline with Optuna.

    Reads ``text.csv``, keeps the ``text`` and ``rating`` columns as ``str`` and
    ``int``, makes two stratified 80/20 splits (64% train, 16% validation,
    20% test overall), runs a 50-trial study that maximizes the objective, and
    prints the best trial's value and parameters.
    """
    reviews = (
        pd.read_csv('text.csv')
        .drop(columns=['Unnamed: 0', "company", "review"])
        .astype({'text': str, 'rating': int})
    )
    texts = reviews['text']
    ratings = reviews['rating']

    # First split carves off the test set; it is not used any further here.
    X_temp, X_test, y_temp, y_test = train_test_split(
        texts, ratings, test_size=0.2, random_state=42, stratify=ratings
    )
    # Second split divides the remainder into training and validation data.
    X_train, X_val, y_train, y_val = train_test_split(
        X_temp, y_temp, test_size=0.2, random_state=42, stratify=y_temp
    )

    vectorizer = TfidfVectorizer(max_features=10000, ngram_range=(1, 2))

    study = optuna.create_study(direction='maximize')

    def objective_with_data(trial):
        """Bind the data splits and vectorizer so Optuna only supplies ``trial``."""
        return objective(trial, X_train, y_train, X_val, y_val, vectorizer)

    await run_study(study, objective_with_data, n_trials=50)

    best = study.best_trial
    print("Best trial:")
    print("  Value: ", best.value)
    print("  Params: ")
    for key, value in best.params.items():
        print(f"    {key}: {value}")


# Entry point: drive the coroutine to completion when executed as a script/cell.
if __name__ == "__main__":
    asyncio.run(main())