In [13]:
import pandas as pd
from textblob import TextBlob
import matplotlib.pyplot as plt
import re
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet


#Preprocessing
df_categories = pd.read_csv("trustpilot_categories.csv")
df_companies = pd.read_csv("trustpilot_companies.csv")
df_sports_companies = pd.read_csv("trustpilot_sports_companies.csv")
df_reviews_sports = pd.read_csv("trustpilot_sports_reviews.csv")
df_reviews_sports['comment_length'] = df_reviews_sports['cust_review_text'].fillna('').apply(lambda x: len(str(x)))
df_reviews_sports['cust_review_text'] = df_reviews_sports['cust_review_text'].fillna('')
df_reviews_sports['sentiment'] = df_reviews_sports['cust_review_text'].apply(lambda x: TextBlob(x).sentiment.polarity)

#convert to small letters
df_reviews_sports['cust_review_text'] = df_reviews_sports['cust_review_text'].str.lower()

#delete special characters, numbers and html-tags
df_reviews_sports['cust_review_text'] = df_reviews_sports['cust_review_text'].apply(lambda x: re.sub(r'[^a-zA-Z\s]', '', str(x)))

#delete stopwords
stop_words = set(stopwords.words('english'))

df_reviews_sports['cust_review_text'] = df_reviews_sports['cust_review_text'].apply(
    lambda x: ' '.join([word for word in x.split() if word not in stop_words])
)

#stemming (i.e. running --> run)
lemmatizer = WordNetLemmatizer()

df_reviews_sports['cust_review_text'] = df_reviews_sports['cust_review_text'].apply(
    lambda x: ' '.join([lemmatizer.lemmatize(word) for word in x.split()])
)
def lemmatize_with_pos(text):
    return ' '.join([lemmatizer.lemmatize(word, pos=wordnet.VERB) for word in text.split()])

df_reviews_sports['cust_review_text'] = df_reviews_sports['cust_review_text'].apply(lemmatize_with_pos)
df_reviews_sports.head(10)


Unnamed: 0,review_title,cust_name,cust_location,cust_reviews,cust_rating,cust_review_text,seller_response,date_experience,comment_length,sentiment
0,Great range of in stock items and awesome cust...,CHRISTOPHER SMITH,US,1,5,great range stock item last two item need fini...,False,"December 28, 2024",121,0.5
1,AEM5 MUZZLE DEVICE AND COLLAR KIT 5.56 (LONG C...,Cesar Marroquin,US,1,5,aem muzzle device collar kit long collar perfe...,False,"December 19, 2024",485,0.300595
2,They were efficient in all phases,Danny Neal Huffines,US,3,5,efficient phase take time explain area entire ...,False,"December 28, 2024",138,0.08
3,Esilencers is always fast and easy,Weston C,US,2,5,esilencers always fast easy didnt call always ...,False,"December 21, 2024",132,0.277778
4,Great dealer!,B.W.,US,1,5,order bt tp factory sbr form submit next day f...,False,"December 17, 2024",229,0.04
5,Gordy clone complete!!,Kyle Fortin,US,1,5,ease use hand great experience business sure,False,"November 14, 2024",79,0.381481
6,Follow up email after not submitting…,Patrick A,US,3,5,follow email submit order ask small discount g...,False,"December 16, 2024",176,0.375
7,Fast shipping and packaged well,Zachary Lawson,US,1,5,fast ship package well many item stock others ...,False,"December 19, 2024",76,0.35
8,Great support,Christopher,US,10,5,order fill ship quickly part order compatible ...,False,"December 21, 2024",134,0.416667
9,Ships faster than anyone I have ordered…,Christopher,US,10,5,ship faster anyone order fromesilencers ship i...,False,"November 27, 2024",179,0.14375


In [15]:
from sklearn.feature_extraction.text import TfidfVectorizer
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix

#TF-IDF-vectorizing (converting text to numeric format)
tfidf = TfidfVectorizer(stop_words='english', max_features=5000)
X = tfidf.fit_transform(df_reviews_sports['cust_review_text'])

#target variable (Ratings)
y = df_reviews_sports['cust_rating']

#SMOTE (balancing of the Ratings)
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X, y)

#test and training data 
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=42)

#training the model (Logistic Regression)
model = LogisticRegression(max_iter=1000, random_state=42)
model.fit(X_train, y_train)

#prediction and evaluation
y_pred = model.predict(X_test)

print("Classification Report:")
print(classification_report(y_test, y_pred))
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))


Classification Report:
              precision    recall  f1-score   support

           1       1.00      1.00      1.00       219
           2       1.00      1.00      1.00       219
           3       1.00      1.00      1.00       188
           4       0.98      1.00      0.99       209
           5       1.00      0.98      0.99       233

    accuracy                           1.00      1068
   macro avg       1.00      1.00      1.00      1068
weighted avg       1.00      1.00      1.00      1068

Confusion Matrix:
[[219   0   0   0   0]
 [  0 219   0   0   0]
 [  0   0 188   0   0]
 [  0   0   0 209   0]
 [  0   0   0   5 228]]
