In [3]:
import pandas as pd
from textblob import TextBlob
import matplotlib.pyplot as plt
import re
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet


#Preprocessing
df_categories = pd.read_csv("trustpilot_categories.csv")
df_companies = pd.read_csv("trustpilot_companies.csv")
df_reviews_sports = pd.read_csv("trustpilot_reviews_1000.csv")
df_reviews_sports['comment_length'] = df_reviews_sports['cust_review_text'].fillna('').apply(lambda x: len(str(x)))
df_reviews_sports['cust_review_text'] = df_reviews_sports['cust_review_text'].fillna('')
df_reviews_sports['sentiment'] = df_reviews_sports['cust_review_text'].apply(lambda x: TextBlob(x).sentiment.polarity)

#convert to small letters
df_reviews_sports['cust_review_text'] = df_reviews_sports['cust_review_text'].str.lower()

#delete special characters, numbers and html-tags
df_reviews_sports['cust_review_text'] = df_reviews_sports['cust_review_text'].apply(lambda x: re.sub(r'[^a-zA-Z\s]', '', str(x)))

#delete stopwords
stop_words = set(stopwords.words('english'))

df_reviews_sports['cust_review_text'] = df_reviews_sports['cust_review_text'].apply(
    lambda x: ' '.join([word for word in x.split() if word not in stop_words])
)

#stemming (i.e. running --> run)
lemmatizer = WordNetLemmatizer()

df_reviews_sports['cust_review_text'] = df_reviews_sports['cust_review_text'].apply(
    lambda x: ' '.join([lemmatizer.lemmatize(word) for word in x.split()])
)
def lemmatize_with_pos(text):
    return ' '.join([lemmatizer.lemmatize(word, pos=wordnet.VERB) for word in text.split()])

df_reviews_sports['cust_review_text'] = df_reviews_sports['cust_review_text'].apply(lemmatize_with_pos)
df_reviews_sports.head(10)

Unnamed: 0,review_title,cust_name,cust_rating,cust_review_text,date_experience,company,comment_length,sentiment
0,10/10 would recommend,David Silva,5,shop need buck cheaper stock guy,2025-01-14T01:18:40.000Z,esilencers.com,97,-0.1
1,When I had a question they got back to…,Riley Brown,5,question get back right away make sure everyth...,2025-01-14T11:32:00.000Z,esilencers.com,89,0.261905
2,eSilencers has the best pricing,William Harding,5,esilencers best price service company always c...,2025-01-10T06:52:30.000Z,esilencers.com,107,0.386667
3,AEM5 MUZZLE DEVICE AND COLLAR KIT 5.56 (LONG C...,Cesar Marroquin,5,aem muzzle device collar kit long collar perfe...,2024-12-26T07:39:39.000Z,esilencers.com,485,0.300595
4,Great range of in stock items and awesome cust...,CHRISTOPHER SMITH,5,great range stock item last two item need fini...,2025-01-04T15:55:03.000Z,esilencers.com,121,0.5
5,They were efficient in all phases,Danny Neal Huffines,5,efficient phase take time explain area entire ...,2024-12-29T14:56:15.000Z,esilencers.com,139,0.08
6,Great dealer!,B.W.,5,order bt tp factory sbr form submit next day f...,2024-12-24T13:37:28.000Z,esilencers.com,229,0.04
7,Esilencers is always fast and easy,Weston C,5,esilencers always fast easy didnt call always ...,2024-12-29T01:22:41.000Z,esilencers.com,133,0.277778
8,HIGHLY RECOMMEND A++,gho3st,5,great customer service answer phone right away,2025-01-08T23:25:18.000Z,esilencers.com,52,0.542857
9,Follow up email after not submitting…,Patrick A,5,follow email submit order ask small discount g...,2024-12-16T22:10:22.000Z,esilencers.com,176,0.375


In [5]:
from sklearn.feature_extraction.text import TfidfVectorizer
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix

#tuning the review column

#string to numeric
df_reviews_sports['cust_rating'] = pd.to_numeric(df_reviews_sports['cust_rating'], errors='coerce')

# deleting NaN
df_reviews_sports = df_reviews_sports.dropna(subset=['cust_rating', 'cust_review_text'])

#TF-IDF-vectorizing (converting text to numeric format)
tfidf = TfidfVectorizer(stop_words='english', max_features=5000)
X = tfidf.fit_transform(df_reviews_sports['cust_review_text'])

#target variable (Ratings)
y = df_reviews_sports['cust_rating']

#SMOTE (balancing of the Ratings)
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X, y)

#test and training data 
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=42)

#Logistic Regression

#training the model (Logistic Regression)
model = LogisticRegression(max_iter=1000, random_state=42)
model.fit(X_train, y_train)

#prediction and evaluation
y_pred = model.predict(X_test)

print("Classification Report:")
print(classification_report(y_test, y_pred))
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))

Classification Report:
              precision    recall  f1-score   support

           0       0.97      0.98      0.97      6380
           1       0.98      0.96      0.97      6563
           2       0.96      0.99      0.98      6559
           3       0.83      0.85      0.84      6471
           4       0.86      0.81      0.83      6352

    accuracy                           0.92     32325
   macro avg       0.92      0.92      0.92     32325
weighted avg       0.92      0.92      0.92     32325

Confusion Matrix:
[[6268   28   22   49   13]
 [  16 6303   12  230    2]
 [  43    0 6479    3   34]
 [  53   30   82 5516  790]
 [ 110   98  136  877 5131]]


In [8]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix

# Random Forest Classifier
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)

# prediction
y_pred_rf = rf_model.predict(X_test)

# evaluation
print("Random Forest Classification Report:")
print(classification_report(y_test, y_pred_rf))
print("Random Forest Confusion Matrix:")
print(confusion_matrix(y_test, y_pred_rf))


Random Forest Classification Report:
              precision    recall  f1-score   support

         1.0       0.99      1.00      1.00      6380
         2.0       1.00      1.00      1.00      6563
         3.0       1.00      1.00      1.00      6559
         4.0       0.98      0.99      0.98      6471
         5.0       0.98      0.98      0.98      6352

    accuracy                           0.99     32325
   macro avg       0.99      0.99      0.99     32325
weighted avg       0.99      0.99      0.99     32325

Random Forest Confusion Matrix:
[[6373    0    0    0    7]
 [   2 6546    0    9    6]
 [   1    0 6546    0   12]
 [   1    0    0 6388   82]
 [  29    2    2  110 6209]]


In [9]:
from sklearn.svm import SVC
#from sklearn.metrics import classification_report, confusion_matrix

# Support Vector Machine Classifier
svm_model = SVC(kernel='linear', random_state=42)
svm_model.fit(X_train, y_train)

# prediction
y_pred_svm = svm_model.predict(X_test)

# evaluation
print("SVM Classification Report:")
print(classification_report(y_test, y_pred_svm))
print("SVM Confusion Matrix:")
print(confusion_matrix(y_test, y_pred_svm))


SVM Classification Report:
              precision    recall  f1-score   support

         1.0       0.97      1.00      0.98      6380
         2.0       0.98      0.94      0.96      6563
         3.0       0.98      0.99      0.99      6559
         4.0       0.82      0.89      0.85      6471
         5.0       0.89      0.81      0.85      6352

    accuracy                           0.93     32325
   macro avg       0.93      0.93      0.93     32325
weighted avg       0.93      0.93      0.93     32325

SVM Confusion Matrix:
[[6353   12    0   10    5]
 [  27 6162    0  374    0]
 [  21    0 6520    0   18]
 [  27    2   36 5763  643]
 [  97   97  107  892 5159]]


In [12]:
from xgboost import XGBClassifier
from sklearn.metrics import classification_report, confusion_matrix

# Zielvariable umwandeln: Werte auf ganze Zahlen und bei 0 beginnend verschieben
y_train_int = y_train.astype(int) - 1
y_test_int = y_test.astype(int) - 1

# Gradient Boosting Classifier (XGBoost)
xgb_model = XGBClassifier(use_label_encoder=False, eval_metric='mlogloss', random_state=42)
xgb_model.fit(X_train, y_train_int)

# Vorhersage
y_pred_xgb = xgb_model.predict(X_test)

# Vorhersagen zurückverschieben, um originalen Bereich (1-5) wiederherzustellen
y_pred_xgb_original = y_pred_xgb + 1

# Bewertung
print("XGBoost Classification Report:")
print(classification_report(y_test, y_pred_xgb_original))
print("XGBoost Confusion Matrix:")
print(confusion_matrix(y_test, y_pred_xgb_original))


Parameters: { "use_label_encoder" } are not used.



XGBoost Classification Report:
              precision    recall  f1-score   support

         1.0       0.98      0.98      0.98      6380
         2.0       0.99      0.98      0.98      6563
         3.0       0.99      0.99      0.99      6559
         4.0       0.91      0.82      0.86      6471
         5.0       0.82      0.91      0.86      6352

    accuracy                           0.94     32325
   macro avg       0.94      0.94      0.94     32325
weighted avg       0.94      0.94      0.94     32325

XGBoost Confusion Matrix:
[[6259   35   10   26   50]
 [  28 6423    0   65   47]
 [  17    4 6472   23   43]
 [  24   21    3 5336 1087]
 [  70   32   59  427 5764]]


In [8]:
import pandas as pd
import torch
from torch.utils.data import DataLoader
from transformers import BertTokenizer, BertForSequenceClassification, AdamW
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from torch.utils.data import Dataset

# Prüfen, ob CPU verwendet wird
device = torch.device('cpu')
print(f"Using device: {device}")

# Schritt 1: Daten vorbereiten
# Zielvariable filtern: Nur Werte zwischen 1 und 5 zulassen
df_reviews_sports['cust_rating'] = pd.to_numeric(df_reviews_sports['cust_rating'], errors='coerce')
df_reviews_sports = df_reviews_sports.dropna(subset=['cust_rating', 'cust_review_text'])
df_reviews_sports = df_reviews_sports[df_reviews_sports['cust_rating'].between(1, 5)]

# Zielwerte in den Bereich [0, 4] umwandeln
df_reviews_sports['cust_rating'] = df_reviews_sports['cust_rating'].astype(int) - 1

# Aufteilen in Training und Test
X_train, X_test, y_train, y_test = train_test_split(
    df_reviews_sports['cust_review_text'],
    df_reviews_sports['cust_rating'],
    test_size=0.2,
    random_state=42
)

# Debugging: Überprüfen der Labels
print("Einzigartige Labels in y_train:", set(y_train))
print("Einzigartige Labels in y_test:", set(y_test))

# Schritt 2: Tokenizer laden
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Tokenizer-Funktion
def tokenize_data(texts, labels, tokenizer, max_len=64):  # Kürzere maximale Länge für Effizienz
    tokens = tokenizer(
        list(texts),  # Sicherstellen, dass Texte als Liste vorliegen
        max_length=max_len,
        truncation=True,
        padding='max_length',
        return_tensors='pt'
    )
    return tokens, torch.tensor(labels, dtype=torch.long)  # Labels als Long-Tensors zurückgeben

# Training-Daten tokenisieren
train_tokens, train_labels = tokenize_data(X_train, y_train.tolist(), tokenizer)
test_tokens, test_labels = tokenize_data(X_test, y_test.tolist(), tokenizer)

# Schritt 3: Dataset und DataLoader
class ReviewsDataset(Dataset):
    def __init__(self, tokens, labels):
        self.tokens = tokens
        self.labels = labels

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        return {
            'input_ids': self.tokens['input_ids'][idx],
            'attention_mask': self.tokens['attention_mask'][idx],
            'labels': self.labels[idx]
        }

train_dataset = ReviewsDataset(train_tokens, train_labels)
test_dataset = ReviewsDataset(test_tokens, test_labels)

train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)  # Kleinere Batch-Größe für CPU
test_loader = DataLoader(test_dataset, batch_size=8)

# Schritt 4: BERT-Modell laden
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=5)  # 5 Klassen
model.to(device)

# Schritt 5: Optimizer und Loss-Funktion
optimizer = AdamW(model.parameters(), lr=5e-5)

# Schritt 6: Training
epochs = 2  # Reduzierte Anzahl der Epochen für Effizienz

for epoch in range(epochs):
    model.train()
    total_loss = 0
    for batch in train_loader:
        optimizer.zero_grad()

        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    avg_loss = total_loss / len(train_loader)
    print(f"Epoch {epoch+1}/{epochs}, Loss: {avg_loss}")

# Schritt 7: Evaluation
model.eval()
predictions = []
true_labels = []

with torch.no_grad():
    for batch in test_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        outputs = model(input_ids, attention_mask=attention_mask)
        logits = outputs.logits
        preds = torch.argmax(logits, dim=1)

        predictions.extend(preds.cpu().numpy())
        true_labels.extend(labels.cpu().numpy())

# Schritt 8: Bericht und Metriken
print("Classification Report:")
print(classification_report(true_labels, predictions, target_names=['1', '2', '3', '4', '5']))



Using device: cpu
Einzigartige Labels in y_train: {0, 1}
Einzigartige Labels in y_test: {0, 1}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


: 