In [1]:
from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator, TransformerMixin
import pandas as pd
import numpy as np 
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, classification_report
import re
import emoji



In [2]:
# Location of the review CSV; the experiment below reads its 'text' and 'rating' columns.
# NOTE(review): absolute, machine-specific path - adjust before running on another machine.
DATA_PATH = '/Users/notagain/Desktop/Trust_pilot-1/fabian/NLP/text.csv'
df = pd.read_csv(DATA_PATH)



In [3]:
from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator, TransformerMixin
import pandas as pd
import numpy as np 
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, classification_report
import re
import emoji



def remove_emojis(text: str) -> str:
    """Return ``text`` with every emoji removed.

    Emoji are located with ``emoji.emoji_list`` and cut out using the
    ``match_start``/``match_end`` offsets it reports. Removing by span (rather
    than testing each single character against the matched emoji strings)
    also strips emoji made of several code points, e.g. flags, keycaps,
    skin-tone modifiers and ZWJ sequences.

    Args:
        text: Input string; may be empty or contain no emoji.

    Returns:
        The input with all matched emoji spans dropped; every other character
        is kept in its original order.
    """
    kept_parts = []
    cursor = 0
    for match in emoji.emoji_list(text):
        # Keep the plain text between the previous emoji and this one.
        kept_parts.append(text[cursor:match['match_start']])
        # 'match_end' is exclusive, so it is the index where plain text resumes.
        cursor = match['match_end']
    kept_parts.append(text[cursor:])
    return ''.join(kept_parts)


class TextPreprocessor(BaseEstimator, TransformerMixin):
    """Scikit-learn transformer applying optional clean-up steps to raw text.

    Enabled steps always run in this order:

    1. ``use_regex`` - lowercase, then delete every character that is not an
       ASCII letter or whitespace.
    2. ``use_stop``  - drop English stop words (matched case-insensitively).
    3. ``use_stem``  - Porter-stem each whitespace-separated token.
    4. ``use_lem``   - WordNet-lemmatize each whitespace-separated token.

    Parameters
    ----------
    use_stem, use_lem, use_stop, use_regex : bool, default False
        Switches for the individual steps described above.
    """

    def __init__(self, use_stem=False, use_lem=False, use_stop=False, use_regex=False):
        self.use_stem = use_stem
        self.use_lem = use_lem
        self.use_stop = use_stop
        self.use_regex = use_regex
        # Helper objects are created eagerly; the stop-word list is read from
        # NLTK's English stop-word corpus at construction time.
        self.stemmer = PorterStemmer()
        self.lemmatizer = WordNetLemmatizer()
        self.stop_words = set(stopwords.words('english'))

    def fit(self, X, y=None):
        """No-op: the transformer learns nothing from the data. Returns ``self``."""
        return self

    def transform(self, X):
        """Clean every document in ``X`` and return them as a pandas Series.

        Parameters
        ----------
        X : pandas.Series or iterable of str
            Documents to clean. A Series keeps its index; any other iterable
            is wrapped in a new Series. The input is never modified.

        Returns
        -------
        pandas.Series
            One cleaned string per input document. Missing values become the
            empty string so that the string operations below (and a following
            vectorizer) do not fail on NaN.
        """
        texts = X if isinstance(X, pd.Series) else pd.Series(list(X), dtype=object)
        # Normalise to plain Python strings; `where` returns a new object, so
        # the caller's data is left untouched.
        texts = texts.astype(object)
        texts = texts.where(texts.notna(), '').astype(str)

        if self.use_regex:
            texts = texts.str.lower()
            texts = texts.apply(lambda x: re.sub(r'[^a-zA-Z\s]', '', x))

        # Tokenise once and run all token-level steps in a single pass.
        # Skipped entirely when no such step is enabled so that the original
        # whitespace is preserved in that case.
        if self.use_stop or self.use_stem or self.use_lem:
            texts = texts.apply(self._process_tokens)

        return texts

    def _process_tokens(self, text):
        """Apply the enabled token-level steps to one document and re-join with spaces."""
        tokens = text.split()
        if self.use_stop:
            # The stop-word list is lowercase; compare case-insensitively so
            # "The" is removed even when `use_regex` has not lowercased the text.
            tokens = [w for w in tokens if w.lower() not in self.stop_words]
        if self.use_stem:
            tokens = [self.stemmer.stem(w) for w in tokens]
        if self.use_lem:
            tokens = [self.lemmatizer.lemmatize(w) for w in tokens]
        return ' '.join(tokens)

# Preprocessing variants to benchmark: one row per pipeline run, one column
# per TextPreprocessor keyword argument (same order as _FLAG_NAMES).
_FLAG_NAMES = ('use_stem', 'use_lem', 'use_stop', 'use_regex')
_FLAG_ROWS = [
    (False, False, False, False),  # raw text, no preprocessing
    (True, False, False, False),   # stemming only
    (False, True, False, False),   # lemmatization only
    (False, False, True, True),    # regex clean-up + stop-word removal
    (True, True, True, True),      # every step enabled
]
pipeline_configs = [dict(zip(_FLAG_NAMES, row)) for row in _FLAG_ROWS]

# Shared seed so the split and the classifier are reproducible across runs.
RANDOM_STATE = 42

# Scores per preprocessing variant, keyed by the string form of its config dict.
results = {}

# The split does not depend on the preprocessing config and uses a fixed seed,
# so it is computed once: every variant is trained and evaluated on the same rows.
X_train, X_test, y_train, y_test = train_test_split(
    df['text'], df['rating'], test_size=0.2, random_state=RANDOM_STATE
)

for config in pipeline_configs:
    pipeline = Pipeline([
        ('preprocessor', TextPreprocessor(**config)),
        ('vectorizer', TfidfVectorizer(max_features=10000)),
        # Seeded so that score differences between configs come from the
        # preprocessing, not from run-to-run randomness in the classifier.
        ('classifier', GradientBoostingClassifier(
            n_estimators=100, learning_rate=0.1, random_state=RANDOM_STATE)),
    ])

    pipeline.fit(X_train, y_train)
    y_pred = pipeline.predict(X_test)

    results[str(config)] = {
        'f1': f1_score(y_test, y_pred, average='weighted'),
        'report': classification_report(y_test, y_pred)
    }

# Print one summary per evaluated config: a blank line, the config string,
# its weighted F1 score and the full classification report.
for config, result in results.items():
    summary_lines = (
        f"\nConfig: {config}",
        f"F1 Score: {result['f1']}",
        f"Report:\n{result['report']}",
    )
    print("\n".join(summary_lines))