# Module Importieren

In [1]:
# Module Importieren
import pandas as pd
import numpy as np
import nltk
import spacy
import re
import string
from sklearn.feature_extraction.text import CountVectorizer

# Kaggle Datensatz:
# https://www.kaggle.com/datasets/nicapotato/womens-ecommerce-clothing-reviews?resource=download

In [2]:
# Datensatz einlesen
data = pd.read_csv("Datensätze/Womens Clothing E-Commerce Reviews.csv", index_col="Unnamed: 0")

# Daten verstehen

In [3]:
data.head()

Unnamed: 0,Clothing ID,Age,Title,Review Text,Rating,Recommended IND,Positive Feedback Count,Division Name,Department Name,Class Name
0,767,33,,Absolutely wonderful - silky and sexy and comf...,4,1,0,Initmates,Intimate,Intimates
1,1080,34,,Love this dress! it's sooo pretty. i happene...,5,1,4,General,Dresses,Dresses
2,1077,60,Some major design flaws,I had such high hopes for this dress and reall...,3,0,0,General,Dresses,Dresses
3,1049,50,My favorite buy!,"I love, love, love this jumpsuit. it's fun, fl...",5,1,0,General Petite,Bottoms,Pants
4,847,47,Flattering shirt,This shirt is very flattering to all due to th...,5,1,6,General,Tops,Blouses


In [4]:
# Einzigartige Einträge plotten
for column in data.columns:
    space = " " * (23 - len(column)) 
    print(f"{column.upper()}: ", space, len(data.loc[:, column].unique()))

CLOTHING ID:               1206
AGE:                       77
TITLE:                     13994
REVIEW TEXT:               22635
RATING:                    5
RECOMMENDED IND:           2
POSITIVE FEEDBACK COUNT:   82
DIVISION NAME:             4
DEPARTMENT NAME:           7
CLASS NAME:                21


In [6]:
# Fehlende Werte
data.isna().sum()

Clothing ID                   0
Age                           0
Title                      3810
Review Text                 845
Rating                        0
Recommended IND               0
Positive Feedback Count       0
Division Name                14
Department Name              14
Class Name                   14
dtype: int64

In [7]:
# Describe
data.describe()

Unnamed: 0,Clothing ID,Age,Rating,Recommended IND,Positive Feedback Count
count,23486.0,23486.0,23486.0,23486.0,23486.0
mean,918.118709,43.198544,4.196032,0.822362,2.535936
std,203.29898,12.279544,1.110031,0.382216,5.702202
min,0.0,18.0,1.0,0.0,0.0
25%,861.0,34.0,4.0,1.0,0.0
50%,936.0,41.0,5.0,1.0,1.0
75%,1078.0,52.0,5.0,1.0,3.0
max,1205.0,99.0,5.0,1.0,122.0


# Preparation

In [8]:
# Nan's durch lehre Strings in 'Review Text' ersetzen
data["Review Text"] = data["Review Text"].fillna("")

In [9]:
# Funktion: Text Cleaner (ohne Stopwörter)

# Satzzeichen einlesen
punctuation = string.punctuation

# nlp-Funktion initialisieren
nlp = spacy.load("en_core_web_sm")

# Stopwörter einlesen
stopwords = nltk.corpus.stopwords.words("english")

# text_cleaner
def text_cleaner(text):
    doc = nlp(text)
    text_list_lemmatized = [token.lemma_.lower() for token in doc if token.text not in punctuation and token.lemma_.lower() not in stopwords]
    final_text = " ".join(text_list_lemmatized)
    return final_text

# Lemmatisierung der Spalte 'Review Text'
data["Review Text Lemmatized"] = data["Review Text"].apply(text_cleaner)

In [73]:
# Funktion: Text Cleaner (mit Stopwörter)
def text_cleaner_with_stopwords(text):
    doc = nlp(text)
    text_list_lemmatized = [token.lemma_.lower() for token in doc if token.text not in punctuation]
    final_text = " ".join(text_list_lemmatized)
    return final_text

# Lemmatisierung der Spalte 'Review Text' mit Stopwörtern
data["Review Text Lemmatized with Stopwords"] = data["Review Text"].apply(text_cleaner_with_stopwords)

In [27]:
# Funktion: Word Count
def text_word_count(text):
    doc = nlp(text)
    text_only_words = [token.text for token in doc if token.text not in punctuation]
    word_count = len(text_only_words)
    return word_count

# Zählung aller Wörter
data["Review Text Word Count"] = data["Review Text"].apply(text_word_count)

# Aufgabe 1

In [38]:
# Aufgabe 1:
# Wie viele Wörter enthält durchschnittlich ein Review?
data["Review Text Word Count"].describe()

count    23486.000000
mean        59.751469
std         31.148234
min          0.000000
25%         34.000000
50%         59.000000
75%         90.000000
max        119.000000
Name: Review Text Word Count, dtype: float64

# CountVectorizer

In [10]:
# Vektorisieren mit CountVectorizer (ohne Stopwörter)

# Initialisierung
count_vectorizer = CountVectorizer()

# fit_transform
vectorized_review = count_vectorizer.fit_transform(data["Review Text Lemmatized"])

# Umwandeln in DataFrame
features_train_transformed = pd.DataFrame(vectorized_review.toarray(), columns=count_vectorizer.get_feature_names_out())

In [74]:
# Vektorisieren mit CountVectorizer (mit Stopwörter)

# Initialisierung
count_vectorizer_ws = CountVectorizer()

# fit_transform
vectorized_review_ws = count_vectorizer_ws.fit_transform(data["Review Text Lemmatized with Stopwords"])

# Umwandeln in DataFrame
features_train_transformed_ws = pd.DataFrame(vectorized_review_ws.toarray(), columns=count_vectorizer_ws.get_feature_names_out())

# Aufgabe 2

In [72]:
# Aufgabe 2
# Was sind die 10 häufigsten Wörter in allen Reviews?

# Ohne Stopwörter:

# Initialisierung eines DataFrames
word_count = pd.DataFrame(columns=["absolute", "relative [%]"])

# Bestimmung der absoluten Anzahl aller Wörter + absteigende Sortierung
word_count["absolute"] = np.sum(features_train_transformed, axis=0).sort_values(ascending=False)

# Bestimmung der relativen Anzahl aller Wörter
word_count["relative [%]"] = round(100 * word_count["absolute"]/len(data), 2)

# 10 häufigsten Wörter
word_count[:10]

Unnamed: 0,absolute,relative [%]
dress,12105,51.54
fit,10892,46.38
love,10317,43.93
size,10300,43.86
wear,10081,42.92
look,9237,39.33
top,8367,35.63
like,7738,32.95
color,7087,30.18
great,6126,26.08


In [82]:
# Ohne Stopwörter:

# Initialisierung eines DataFrames
word_count_ws = pd.DataFrame(columns=["absolute", "relative [%]"])

# Bestimmung der absoluten Anzahl aller Wörter + absteigende Sortierung
word_count_ws["absolute"] = np.sum(features_train_transformed_ws, axis=0).sort_values(ascending=False)

# Bestimmung der relativen Anzahl aller Wörter
word_count_ws["relative [%]"] = round(100 * word_count_ws["absolute"]/len(data), 2)

# 10 häufigsten Wörter
word_count_ws[:10]

Unnamed: 0,absolute,relative [%]
be,81998,349.14
the,76168,324.31
it,49290,209.87
and,49010,208.68
this,25761,109.69
to,24599,104.74
in,20754,88.37
not,18258,77.74
but,16556,70.49
on,15331,65.28


# Aufgabe 3

In [89]:
# Aufgabe 3
# Wie viele einzigartige Wörter gibt es insgesamt?

print("Azahl einzigartiger Wörter ohne Stopwörter:  ", len(word_count))
print("Azahl einzigartiger Wörter mit Stopwörter:   ", len(word_count_ws))

Azahl einzigartiger Wörter ohne Stopwörter:   11436
Azahl einzigartiger Wörter mit Stopwörter:    11494
