### Illustrating words in given clusters using WordClouds

In [None]:
from wordcloud import WordCloud
import matplotlib.pyplot as plt
import pandas as pd

# Load the clustered comments, discarding every row that contains a missing
# value in any column, then preview the first rows of what is left.
all_clustered = pd.read_csv('data/all_data_clustered.csv').dropna()
all_clustered.head()

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd

def extract_and_save_tfidf_terms(data, cluster_column, text_column, top_print=5):
    """Compute per-cluster TF-IDF term scores and print the top terms.

    For every distinct, non-missing label in ``cluster_column`` a TF-IDF
    model is fitted on that cluster's documents only, so the IDF weights
    are relative to the cluster rather than to the whole corpus. A term's
    score is the sum of its TF-IDF weights over all documents of the
    cluster.

    Args:
        data: DataFrame holding the cluster labels and the documents.
        cluster_column: Name of the column with the cluster labels.
        text_column: Name of the column with the document text.
        top_print: Number of highest-scoring terms printed per cluster.

    Returns:
        A dict mapping each cluster label to a ``{term: score}`` dict whose
        entries are ordered by descending score. Clusters that have no
        usable text (only missing documents, or no extractable vocabulary)
        are reported on stdout and left out of the result.
    """
    all_tfidf_terms = {}
    # A missing label never compares equal to itself, so it would select
    # zero rows below; drop such labels up front.
    clusters = data[cluster_column].dropna().unique()

    for cluster in clusters:
        # Documents of this cluster only; missing documents cannot be
        # vectorized and are ignored.
        documents = data.loc[data[cluster_column] == cluster, text_column].dropna()
        if documents.empty:
            print(f"\nCluster {cluster}: no text available, skipping.")
            continue

        # Fresh vectorizer per cluster so no fitted state carries over.
        tfidf = TfidfVectorizer(max_features=5000)  # Adjust max_features as needed
        try:
            tfidf_matrix = tfidf.fit_transform(documents)
        except ValueError as err:
            # Raised e.g. when the documents yield an empty vocabulary.
            # Skip this cluster instead of losing the results gathered so far.
            print(f"\nCluster {cluster}: TF-IDF could not be computed ({err}), skipping.")
            continue

        feature_names = tfidf.get_feature_names_out()
        # Column sums: total TF-IDF weight of each term within the cluster.
        tfidf_scores = tfidf_matrix.sum(axis=0).A1
        tfidf_df = pd.DataFrame({
            'term': feature_names,
            'score': tfidf_scores
        }).sort_values(by='score', ascending=False)

        # Save all terms and scores for this cluster
        all_tfidf_terms[cluster] = dict(zip(tfidf_df['term'], tfidf_df['score']))

        # Print a few top terms as an example
        print(f"\nCluster {cluster}: Top {top_print} TF-IDF Terms (as an example):\n")
        print(tfidf_df.head(top_print).to_string(index=False))

    return all_tfidf_terms


# Score the terms of every KMEANS cluster using the stop-word-free comments.
tfidf_terms = extract_and_save_tfidf_terms(
    data=all_clustered,
    cluster_column='KMEANS',
    text_column='comment_no_stopwords',
    top_print=5,
)


In [None]:
def generate_wordclouds_from_saved_terms(tfidf_terms):
    """Draw one word cloud per cluster from previously computed term scores.

    Args:
        tfidf_terms: Mapping of cluster label to a ``{term: score}`` dict.
            The scores are passed to the word cloud as term frequencies.

    Returns:
        None. Each cluster's cloud is shown as its own matplotlib figure.
        Clusters with an empty term dict are reported on stdout and skipped.
    """
    for cluster, tfidf_dict in tfidf_terms.items():
        if not tfidf_dict:
            # WordCloud needs at least one word and raises ValueError
            # otherwise; skip so the remaining clusters are still drawn.
            print(f"Cluster {cluster}: no terms available, skipping word cloud.")
            continue

        # Generate a word cloud using TF-IDF scores
        wordcloud = WordCloud(
            width=800, height=400, background_color="white"
        ).generate_from_frequencies(tfidf_dict)

        # Plot the word cloud
        plt.figure(figsize=(10, 5))
        plt.imshow(wordcloud, interpolation="bilinear")
        plt.axis("off")
        plt.title(f"TF-IDF Word Cloud for Cluster {cluster}", fontsize=16)
        plt.show()

# Render a word cloud for each cluster from the scores computed above.
generate_wordclouds_from_saved_terms(tfidf_terms=tfidf_terms)