In [None]:
### Data preprocessing

In [1]:
import pandas as pd
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
import re
from collections import Counter
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
import matplotlib.pyplot as plt

In [2]:
# load Aldi
df = pd.read_csv('/Users/esthercorcoran/Desktop/Thesis/Aldi/aldi_dataset.csv')


# load Lidl
df = pd.read_csv('/Users/esthercorcoran/Desktop/Thesis/Lidl/lidl_dataset.csv')

In [3]:
df

Unnamed: 0,Review Number,Review Text,Store
0,1,Aldi's generally receives positive reviews for...,Aldi
1,2,"Well laid out shop. Good selection of foods, ...",Aldi
2,3,Good store. The problem with some of these dis...,Aldi
3,4,Super-fantastic!!!! 💖💕💞💖💕💞\nVery good products...,Aldi
4,5,The quality of meat at Aldi doesn't looks well...,Aldi
...,...,...,...
4731,324,Very very nice....,Aldi
4732,325,Nice place to shop,Aldi
4733,326,Good value shame about the queasy,Aldi
4734,327,The tellers are very rude here,Aldi


In [4]:
### initial organisation 

df['Review Number'] = range(1, len(df) + 1)
df.set_index('Review Number', inplace=True)

In [5]:
### data cleaning 

In [6]:
# Preprocessing and tokenizing the text data
def preprocess_text(text):
    # Convert text to lowercase
    text = text.lower()
    # Remove punctuation and numbers
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    # Split text into words
    words = text.split()
    # Remove stop words
    words = [word for word in words if word not in ENGLISH_STOP_WORDS]
    return words

df['Processed Text'] = df['Review Text'].apply(preprocess_text)

In [7]:
### sentiment analysis

In [8]:
analyzer = SentimentIntensityAnalyzer()

def get_vader_sentiment(text):
    sentiment_dict = analyzer.polarity_scores(text)
    compound_score = sentiment_dict['compound']
    if compound_score >= 0.05:
        return 'positive', compound_score
    elif compound_score <= -0.05:
        return 'negative', compound_score
    else:
        return 'neutral', compound_score

df['Sentiment Label'], df['Sentiment Score'] = zip(*df['Review Text'].apply(get_vader_sentiment))

In [9]:
### label distribution 

In [10]:
Label_Distribution = df['Sentiment Label'].value_counts()
print(Label_Distribution)

Sentiment Label
positive    3668
neutral      637
negative     431
Name: count, dtype: int64


In [11]:
### word frequency analysis

In [12]:
word_list = df['Processed Text'].sum()
word_freq = Counter(word_list)

word_freq

Counter({'good': 1331,
         'great': 1312,
         'staff': 1004,
         'aldi': 898,
         'shop': 749,
         'store': 582,
         'prices': 512,
         'value': 397,
         'friendly': 388,
         'nice': 386,
         'shopping': 375,
         'parking': 374,
         'place': 334,
         'products': 315,
         'food': 301,
         'helpful': 274,
         'quality': 262,
         'love': 241,
         'service': 220,
         'cheap': 217,
         'selection': 213,
         'price': 210,
         'like': 208,
         'best': 189,
         'clean': 186,
         'excellent': 173,
         'need': 168,
         'just': 159,
         'supermarket': 158,
         'stocked': 155,
         'items': 139,
         'time': 139,
         'busy': 133,
         'lovely': 133,
         'fresh': 132,
         'really': 127,
         'lidl': 123,
         'stock': 121,
         'easy': 121,
         'dont': 119,
         'big': 118,
         'money': 116,
         'va

In [13]:
### remove low freq words

In [14]:
low_stop_words = [word for word, freq in word_freq.items() if freq <= 10]
low_stop_words = set(low_stop_words)

In [15]:
low_stop_words

{'families',
 'lucky',
 'die',
 'facing',
 'manoeuvre',
 'fruitveg',
 'independent',
 'posted',
 'exploitation',
 'sparkling',
 'leaving',
 'comaderie',
 'listen',
 'onestop',
 'cake',
 'customary',
 'holidays',
 'spaceous',
 'unknown',
 'medical',
 'shelfsover',
 'including',
 'follows',
 'exiting',
 'lessthe',
 'checked',
 'updating',
 'rugsack',
 'verity',
 'firepits',
 'polish',
 'healthy',
 'cardsdriver',
 'wafers',
 'browse',
 'lov',
 'newspapers',
 'sandyfordstillorgan',
 'causes',
 'starving',
 'planter',
 'wellkept',
 'pull',
 'lamb',
 'scented',
 'insist',
 'loving',
 'forming',
 'ridiculous',
 'dayi',
 'monitor',
 'knocked',
 'jim',
 'leaping',
 'vegeta',
 'lasted',
 'knees',
 'overseas',
 'matt',
 'shampoo',
 'vouchers',
 'kean',
 'piece',
 'dump',
 'cuz',
 'order',
 'hit',
 'caused',
 'promotion',
 'greek',
 'handling',
 'reopen',
 'cafe',
 'width',
 'streaked',
 'northern',
 'deserves',
 'unsure',
 'ilac',
 'carpets',
 'suffers',
 'ma',
 'fingers',
 'prior',
 'win',
 'sho

In [16]:
new_stop_words = ENGLISH_STOP_WORDS.union(set(low_stop_words))

In [17]:
def preprocess_text(text, new_stop_words):
    # Check if the input text is a list
    if isinstance(text, list):
        # Join the list elements into a single string
        text = ' '.join(text)
    # Convert text to lowercase
    text = text.lower()
    # Remove punctuation and numbers
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    # Split text into words
    words = text.split()
    # Remove stop words
    words = [word for word in words if word not in new_stop_words]
    return words

df['Processed_Text_1'] = df['Processed Text'].apply(lambda x: preprocess_text(x, new_stop_words))

In [19]:
word_list1 = df['Processed_Text_1'].sum()
word_freq1 = Counter(word_list1)

word_freq1

Counter({'good': 1331,
         'great': 1312,
         'staff': 1004,
         'aldi': 898,
         'shop': 749,
         'store': 582,
         'prices': 512,
         'value': 397,
         'friendly': 388,
         'nice': 386,
         'shopping': 375,
         'parking': 374,
         'place': 334,
         'products': 315,
         'food': 301,
         'helpful': 274,
         'quality': 262,
         'love': 241,
         'service': 220,
         'cheap': 217,
         'selection': 213,
         'price': 210,
         'like': 208,
         'best': 189,
         'clean': 186,
         'excellent': 173,
         'need': 168,
         'just': 159,
         'supermarket': 158,
         'stocked': 155,
         'items': 139,
         'time': 139,
         'busy': 133,
         'lovely': 133,
         'fresh': 132,
         'really': 127,
         'lidl': 123,
         'stock': 121,
         'easy': 121,
         'dont': 119,
         'big': 118,
         'money': 116,
         'va

In [20]:
# Aldi custom stop words
sorted_words = [word for word, freq in word_freq1.most_common()]

horizontal_words = ', '.join(sorted_words)
print(horizontal_words)

good, great, staff, aldi, shop, store, prices, value, friendly, nice, shopping, parking, place, products, food, helpful, quality, love, service, cheap, selection, price, like, best, clean, excellent, need, just, supermarket, stocked, items, time, busy, lovely, fresh, really, lidl, stock, easy, dont, big, money, variety, people, range, better, free, customer, lots, bit, plenty, car, experience, lot, stores, tills, buy, got, new, groceries, fruit, till, things, goods, reasonable, choice, open, special, location, long, park, shelves, customers, quick, small, pleasant, grocery, produce, stuff, times, aldis, super, low, weekly, offers, ok, veg, today, bargains, dublin, day, deals, quite, usual, check, cheaper, handy, checkout, local, vegetables, amazing, way, recommend, efficient, needed, favourite, available, queues, say, meat, know, use, looking, going, affordable, im, want, little, went, wide, needs, product, security, spacious, space, work, didnt, help, area, queue, week, fast, brillian

In [None]:
#Lidl custom stop words

custom_stop_words = {'good', 'great', 'lidl', 'shop', 'maybe', 'store', 'supermarkets', 'centre', 'products', 'nice', 'shopping', 'place', 'food', 'like', 'need', 'best', 'big', 'items', 'just', 'love', 'supermarket', 'excellent', 'dont', 'really', 'people', 'new', "lovely", 'buy', 'experience', 'range', 'better', 'small', 'customer', 'got', 'lots', 'things', 'stuff', 'today', 'plenty', 'lot', 'easy', 'times', 'day', 'check', 'groceries', 'area', 'money', 'customers', 'way', 'bit', 'goods', 'location', 'bad', 'free', 'amazing', 'quite', 'dublin', 'usual', 'large', 'know', 'available', 'week', 'ok', 'weekly', 'stores', 'produce', 'super', 'went', 'lidls', 'self', 'needed', 'cheaper', 'product', 'want', 'visit', 'say', 'section', 'going', 'ireland', 'little', 'recommend', 'special', 'shops', 'pleasant', 'especially', 'didnt', 'grocery', 'work', 'wide', 'home', 'working', 'different', 'street', 'looking', 'decent', 'usually', 'local', 'pay', 'im', 'wait', 'thats', 'supermarket', 'doesnt', 'needs', 'came', 'city', 'smaller', 'cemtre', 'pretty', 'bought', 'inside', 'efficient', 'ive', 'huge', 'quickly', 'does', 'happy', 'look', 'said', 'fine', 'make', 'use', 'daily', 'brilliant', 'told', 'come', 'theres', 'extremely', 'job', 'getting', 'left', 'problem', 'missing', 'seen', 'fantastic', 'bigger', 'did', 'poor', 'right', 'expect', 'compared', 'wrong', 'thank', 'road', 'standard', 'sure', 'minutes', 'hard', 'normal', 'cash', 'terrible', 'think', 'waiting', 'floor', 'worst', 'thing', 'branch', 'market', 'hours', 'man', 'worth', 'foods', 'u', 'perfect', 'kind', 'thomas', 'ask', 'wont', 'favourite', 'outside', 'grand', 'loads', 'early', 'spaces', 'larger', 'makes', 'days', 'highly', 'typical', 'recently', 'access', 'priced', 'high', 'run', 'generally', 'horrible', 'asked', 'closed', 'disappointed', 'point', 'late', 'extra', 'wish', 'doing', 'lack', 'opening', 'st', 'far', 'gone', 'regular', 'morning', 'years', 'weekend', 'thanks', 'laid', 'beautiful', 'item', 'change', 'used', 'card', 'kept', 'outs', 'person', 'away', 'wanted', 'tools', 'middle', 'avoid', 'approx', 'trying', 'lady', 'able', 'sell', 'room', 'size', 'located', 'wasnt', 'youre', 'sold'}

In [21]:
custom_stop_words = {'good', 'great', 'aldi', 'shop', 'store', 'nice', 'shopping', 'place', 'products', 'food', 'love', 
                     'like', 'best', 'excellent', 'need', 'just', 'supermarket', 'items', 'really', 'dont', 'big', 'money', 
                     'better', 'free', 'customer', 'lots', 'bit', 'experience', 'lot', 'stores', 'buy', 'got', 'new', 'things', 
                     'goods', 'special','customers', 'grocery', 'stuff', 'times', 'aldis', 'super', 'weekly', 'ok', 
                     'today', 'dublin', 'day', 'quite', 'usual', 'check', 'cheaper', 'amazing', 'way', "recommend", "needed", "favourite", 
                     "available", "say", "know", "use", "looking", "going", "im", "want", "little", "went", "needs", "product", "work", "didnt", "area", "week",
                     "brilliant", "fantastic", "bad", "visit", "usually", "happy", "home", "shops", "middle", "think", 
                     "underground", "ireland", "make", "supermarkets", "foods", "right", "ive", "pretty", "told", "thats", "look", "loads", 
                     "thank", "section", "wait", "priced", "branch", "high", "buys", "looks", "shoppers", "u", "compared", "hard", "road", "asked", "does", "getting", "used", "worth", "far", "especially", "stop", 
                     "different", "doesnt", "definitely", "did", "stars", "cheapest", "thing", "thanks", "years", "laid", "seen", 
                     "wish", "outside", "enjoy", "normal", "said", "number", "self", "extremely", "grand", "centre", "fine", "near", "left", "decent", "huge", "street", 
                     "minutes", "run", "sure", "come", "city", "pay", "id", "item", "overall", "wont", "sold", "away", "wanted", "feel", "covid", 
                     "ask", "problem", "having", "expect", "reasonably", "bright", "doing", "gone", "household",
                     "bought", "theres", "plus", "highly", "days", "trying", "youre", "makes", "located", "daily", "competitive", "n", "perfect", "cash", "weeks", "closed", "working", "pack", 
                     "tell", "star", "distancing", "guy", "pleasure", "center", "theyre" "bargain", "particular", "social", "prefer", "try", "euro", "came"}

In [22]:
all_stop_words = custom_stop_words.union(set(new_stop_words))

In [23]:
def preprocess_text(text, all_stop_words):
    # Convert text to lowercase
    text = text.lower()
    # Remove punctuation and numbers
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    # Split text into words
    words = text.split()
    # Remove stop words
    words = [word for word in words if word not in all_stop_words]
    return words

# Apply the preprocessing function to the DataFrame
df['Processed_Text_2'] = df['Review Text'].apply(lambda x: preprocess_text(x, all_stop_words))

In [25]:
# Remove rows where 'Processed_Text_2' is an empty list
df = df[df['Processed_Text_2'].apply(lambda x: len(x) > 0)]

# Optionally reset the index
df.reset_index(drop=True, inplace=True)

In [27]:
df = df.drop(['Review Text', 'Store', 'Processed Text', 'Processed_Text_1'], axis=1)

In [28]:
df.rename(columns={'Processed_Text_2': 'Cleaned Text'}, inplace=True)

In [29]:
df

Unnamed: 0,Sentiment Label,Sentiment Score,Cleaned Text
0,positive,0.8271,"[generally, reasonable, prices, value, options..."
1,positive,0.8442,"[selection, fresh, vegetables, meats, frozen, ..."
2,positive,0.6956,"[discount, theyre, stocked, fresh, stocked, li..."
3,positive,0.9974,"[wide, range, affordable, prices]"
4,neutral,0.0320,"[quality, meat, inside]"
...,...,...,...
3699,neutral,0.0000,"[ample, parking, space]"
3700,negative,-0.4588,"[till, rude]"
3701,positive,0.2960,[value]
3702,negative,-0.5095,[rude]


In [30]:
df.to_csv('/Users/esthercorcoran/Desktop/Thesis/Aldi/Preprocessed Aldi.csv', index=False)

In [None]:
### or 

df.to_csv('/Users/esthercorcoran/Desktop/Thesis/Lidl/Preprocessed Lidl.csv', index=False)

In [None]:
### Data analysis

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
import numpy as np

In [None]:
df = pd.read_csv('/Users/esthercorcoran/Desktop/Thesis/Aldi/Preprocessed Aldi.csv')

### or 

 = pd.read_csv('/Users/esthercorcoran/Desktop/Thesis/Lidl/Preprocessed Lidl.csv')

In [None]:
# clean text data 
def clean_text(text_list):
    if isinstance(text_list, list):
        # Join the words in the list into a single string separated by commas
        return ', '.join(text_list)
    return text_list  # If it's not a list, return the text as it is

df['Cleaned Text'] = df['Cleaned Text'].apply(clean_text)

In [None]:
sentiment_counts = df['Sentiment Label'].value_counts()

plt.figure(figsize=(8, 6))
plt.pie(sentiment_counts, labels=sentiment_counts.index, autopct='%1.1f%%', colors=['lightblue', 'lightpink', 'lightgreen'])
plt.title('Distribution of Sentiment Labels for Aldi Reviews')
plt.show()

In [None]:
# positive analysis 

In [None]:
positive_df = df[df['Sentiment Label'] == 'positive']

In [None]:
### positive topics 

import numpy as np

vectorizer = CountVectorizer(max_df=0.95, min_df=2, stop_words='english')
text_data = positive_df['Cleaned Text'].apply(lambda x: ' '.join(eval(x)))
X = vectorizer.fit_transform(text_data)

# Fit LDA model
lda = LatentDirichletAllocation(n_components=3, random_state=42)
lda.fit(X)

def plot_topics(lda_model, vectorizer, n_top_words=5, n_topics=3, figsize=(12, 8)):
    feature_names = vectorizer.get_feature_names_out()
    colors = plt.cm.get_cmap('Set3', n_top_words)  # Using Set3 colormap for pleasant colors

    fig, axes = plt.subplots(1, n_topics, figsize=figsize, sharey=True)
    plt.subplots_adjust(wspace=0.4)

    for topic_idx, topic in enumerate(lda_model.components_):
        top_words_indices = topic.argsort()[:-n_top_words - 1:-1]
        top_words = [feature_names[i] for i in top_words_indices]
        weights = topic[top_words_indices]
        bar_colors = colors(np.arange(n_top_words))

        ax = axes[topic_idx]
        ax.barh(top_words, weights, color=bar_colors, edgecolor='black')
        ax.set_title(f'Topic #{topic_idx + 1}', fontsize=14)
        ax.invert_yaxis()  # Invert y-axis to have the most relevant words at the top
        ax.set_xlabel('Weight', fontsize=12)

    plt.tight_layout()
    plt.show()

plot_topics(lda, vectorizer, n_top_words=5, n_topics=3, figsize=(15, 5))


In [None]:
import pandas as pd
import seaborn as sns

In [None]:
### unigrams

text_data = positive_df['Cleaned Text'].apply(lambda x: ' '.join(eval(x)))

vectorizer = CountVectorizer(ngram_range=(1, 1))
X = vectorizer.fit_transform(text_data)

unigram_freq = pd.DataFrame({
    'unigram': vectorizer.get_feature_names_out(),
    'frequency': X.toarray().sum(axis=0)
})

# sort
top_unigrams = unigram_freq.sort_values(by='frequency', ascending=False).head(20)
top_unigrams = top_unigrams.reset_index(drop=True)

# plot
sns.set(style="whitegrid")

plt.figure(figsize=(12, 8))  # Specify the figure size
barplot = sns.barplot(x='frequency', y='unigram', data=top_unigrams, palette="pastel")

plt.title('Top 20 Unigrams for Positive Reviews')
plt.xlabel('Frequency')
plt.ylabel('Unigrams')

plt.show()

In [None]:
### bigram

text_data = positive_df['Cleaned Text'].apply(lambda x: ' '.join(eval(x)))

vectorizer = CountVectorizer(ngram_range=(2, 2))
X = vectorizer.fit_transform(text_data)

bigram_freq = pd.DataFrame({
    'bigram': vectorizer.get_feature_names_out(),
    'frequency': X.toarray().sum(axis=0)
})

top_bigrams = bigram_freq.sort_values(by='frequency', ascending=False).head(20)

top_bigrams = top_bigrams.reset_index(drop=True)

### plot 
sns.set(style="whitegrid")

plt.figure(figsize=(12, 8))  # Specify the figure size
barplot = sns.barplot(x='frequency', y='bigram', data=top_bigrams, palette="pastel")

plt.title('Top 20 Bigrams for Positive Reviews')
plt.xlabel('Frequency')
plt.ylabel('Bigrams')
plt.show()

In [None]:
### negative analysis

In [None]:
### negative topics 

vectorizer = CountVectorizer(max_df=0.95, min_df=2, stop_words='english')
text_data = negative_df['Cleaned Text'].apply(lambda x: ' '.join(eval(x)))
X = vectorizer.fit_transform(text_data)

lda = LatentDirichletAllocation(n_components=3, random_state=42)
lda.fit(X)

# plot
def plot_topics(lda_model, vectorizer, n_top_words=5, n_topics=3, figsize=(12, 8)):
    feature_names = vectorizer.get_feature_names_out()
    colors = plt.cm.get_cmap('Set3', n_top_words)  # Using Set3 colormap for pleasant colors

    fig, axes = plt.subplots(1, n_topics, figsize=figsize, sharey=True)
    plt.subplots_adjust(wspace=0.4)

    for topic_idx, topic in enumerate(lda_model.components_):
        top_words_indices = topic.argsort()[:-n_top_words - 1:-1]
        top_words = [feature_names[i] for i in top_words_indices]
        weights = topic[top_words_indices]
        bar_colors = colors(np.arange(n_top_words))

        ax = axes[topic_idx]
        ax.barh(top_words, weights, color=bar_colors, edgecolor='black')
        ax.set_title(f'Topic #{topic_idx + 1}', fontsize=14)
        ax.invert_yaxis()  # Invert y-axis to have the most relevant words at the top
        ax.set_xlabel('Weight', fontsize=12)

    plt.tight_layout()
    plt.show()

plot_topics(lda, vectorizer, n_top_words=5, n_topics=3, figsize=(15, 5))

In [None]:
### unigrams

text_data3 = negative_df['Cleaned Text'].apply(lambda x: ' '.join(eval(x)))
vectorizer = CountVectorizer(ngram_range=(1, 1))
X = vectorizer.fit_transform(text_data3)

unigram_freq = pd.DataFrame({
    'unigram': vectorizer.get_feature_names_out(),
    'frequency': X.toarray().sum(axis=0)
})

top_unigrams3 = unigram_freq.sort_values(by='frequency', ascending=False).head(20)

# plot
plt.figure(figsize=(10, 8))
plt.barh(top_unigrams3['unigram'], top_unigrams3['frequency'], color=plt.cm.Pastel1.colors)
plt.xlabel('Frequency')
plt.ylabel('Unigrams')
plt.title('Top 20 Unigrams for Aldi Negative Reviews')
plt.gca().invert_yaxis()
plt

In [None]:
# bigram

text_data3 = negative_df['Cleaned Text'].apply(lambda x: ' '.join(eval(x)))
vectorizer = CountVectorizer(ngram_range=(2, 2))
X = vectorizer.fit_transform(text_data3)

bigram_freq = pd.DataFrame({
    'bigram': vectorizer.get_feature_names_out(),
    'frequency': X.toarray().sum(axis=0)
})

top_bigrams3 = bigram_freq.sort_values(by='frequency', ascending=False).head(20)

# plot
plt.figure(figsize=(10, 8))
plt.barh(top_bigrams3['bigram'], top_bigrams3['frequency'], color=plt.cm.Pastel1.colors)
plt.xlabel('Frequency')
plt.ylabel('Bigrams')
plt.title('Top 20 Bigrams for Aldi Negative Reviews')
plt.gca().invert_yaxis()
plt.show()

In [None]:
### clustering

from sklearn.feature_extraction.text import TfidfVectorizer

tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform(df['Cleaned Text'].apply(lambda x: ' '.join(eval(x))))

In [None]:
from sklearn.cluster import KMeans
### k means clustering
optimal_k = 4
kmeans = KMeans(n_clusters=optimal_k, random_state=42)
df['Cluster'] = kmeans.fit_predict(tfidf_matrix)

In [None]:
for cluster in range(optimal_k):
    cluster_data = df[df['Cluster'] == cluster]
    print(f"Cluster {cluster}:\n")
    print(cluster_data['Cleaned Text'].apply(lambda x: ' '.join(eval(x))).head(10))
    print("\n")

In [None]:
from collections import Counter

# change cluster number when needed


cluster_3_data = df[df['Cluster'] == 3]

# clean
def clean_and_tokenize(text):
    # Remove punctuation and convert to lowercase
    text = re.sub(r'[^\w\s]', '', text).lower()
    # Tokenize the text
    tokens = text.split()
    return tokens

all_words3 = [token for tokens in cluster_3_data['Cleaned Text'].apply(clean_and_tokenize) for token in tokens]

# count most common words
common_words_3 = Counter(all_words3).most_common()

filtered_words = [(word, count) for word, count in common_words_3 if count > 7]

if filtered_words:
    words, counts = zip(*filtered_words)
else:
    words, counts = [], []

# plot
plt.figure(figsize=(12, 8))
plt.barh(words, counts, color='skyblue')
plt.xlabel('Count')
plt.ylabel('Words')
plt.title('Common Words in Quality Emphasisers Cluster')
plt.gca().invert_yaxis()
plt.show()

In [None]:
# change key word when needed
# co occurance analysis 
def clean_and_tokenize(text):
    text = re.sub(r'[^\w\s]', '', text).lower()
    tokens = text.split()
    return tokens

# extract sentences containing the word "Lidl"
lidl_sentences = cluster_0_data[cluster_0_data['Cleaned Text'].str.contains('lidl', case=False)]

co_occurring_words = []


for index, row in lidl_sentences.iterrows():
    text = row['Cleaned Text']
    tokens = clean_and_tokenize(text)
    
    if 'lidl' in tokens:
        co_occurring_words.extend(tokens)

co_occurring_words = [word for word in co_occurring_words if word != 'lidl']
co_occurrence_counts = Counter(co_occurring_words)
top_co_occurring_words = co_occurrence_counts.most_common(20)
words, counts = zip(*top_co_occurring_words)

# plot
plt.figure(figsize=(12, 8))
plt.barh(words, counts, color='skyblue')
plt.xlabel('Count')
plt.ylabel('Words')
plt.title('"Lidl" Co-occurring Words in Lidl Comparers Cluster')
plt.gca().invert_yaxis()  # To display the highest counts on top
plt.show()