In [7]:
import pandas as pd
import numpy as np
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from tabulate import tabulate
from collections import Counter
from nltk.stem import PorterStemmer

# Load the CSV file
df = pd.read_csv("customer_complaints_1.csv")

# List of common stopwords to remove
basic_stopwords = {
    'i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you',
    'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his',
    'himself', 'she', 'her', 'hers', 'herself', 'it', 'its', 'itself',
    'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which',
    'who', 'whom', 'this', 'that', 'these', 'those', 'am', 'is', 'are',
    'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having',
    'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if',
    'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for',
    'with', 'about', 'against', 'between', 'into', 'through', 'during',
    'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in',
    'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then',
    'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any',
    'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no',
    'nor', 'not', 'only', 'own', 'same', 'so', 'than', 'too', 'very',
    's', 't', 'can', 'will', 'just', 'don', 'should', 'now'
}

# Prepare stemming tool
stemmer = PorterStemmer()

# Clean and preprocess the text
def preprocess_text(text):
    text = text.lower()
    text = re.sub(r'[^a-z\s]', '', text)
    words = text.split()
    words = [stemmer.stem(word) for word in words if word not in basic_stopwords]
    return ' '.join(words)

# Apply cleaning to the text column
df['processed_text'] = df['text'].astype(str).apply(preprocess_text)

# Convert text into TF-IDF vectors
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(df['processed_text'])

# Group text into clusters
k = 5
km = KMeans(n_clusters=k, random_state=42)
km.fit(X)
y_pred = km.predict(X)

# Show top 10 keywords for each cluster
print("\nTop terms per cluster:")
order_centroids = km.cluster_centers_.argsort()[:, ::-1]
terms = vectorizer.get_feature_names_out()

for i in range(k):
    print(f"Cluster {i}:")
    for ind in order_centroids[i, :10]:
        print(f"  {terms[ind]}")
    print()

# Calculate and print purity (dummy since no real labels)
total_samples = len(y_pred)
cluster_label_counts = Counter(y_pred)
purity = sum(cluster_label_counts.values()) / total_samples
print(f"Purity: {purity:.2f}")


Top terms per cluster:
Cluster 0:
  investig
  protocol
  malfunct
  sinc
  custom
  altern
  illog
  certain
  follow
  predecessor

Cluster 1:
  internet
  day
  comcast
  cabl
  tech
  time
  servic
  set
  im
  secur

Cluster 2:
  rude
  servic
  second
  bill
  box
  comcast
  rep
  joke
  pass
  resolv

Cluster 3:
  speed
  mbp
  contract
  custom
  pay
  say
  internet
  servic
  call
  blast

Cluster 4:
  xfiniti
  would
  contract
  state
  call
  cancel
  sign
  servic
  store
  area

Purity: 1.00
