In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Load the descriptions dataset from a CSV hosted on GitHub.
# Later cells read its 'description' column.
url = 'https://raw.githubusercontent.com/esnt/Data/refs/heads/main/Text/gc_descriptions.csv'
df = pd.read_csv(url)

# Alternative dataset (disabled): a tab-separated file loaded with the column
# names 'label' and 'message'. It has no 'description' column, so the cells
# below would need to use 'message' instead if this is enabled.
# sms_url = 'https://raw.githubusercontent.com/esnt/Data/refs/heads/main/CleanData/SMSSpamCollection'
# df = pd.read_csv(sms_url, sep='\t', header=None, names=['label', 'message'])

In [None]:
# Simple surface features of each description, computed with pandas' vectorized
# string methods. Unlike `.apply(len)` / a Python lambda, these do not raise on
# a missing description; the feature is simply NaN for that row.

# Number of characters in the description
df['length'] = df['description'].str.len()
# Number of punctuation marks from the set . , ; : ! ?
df['n_punct'] = df['description'].str.count(r'[.,;:!?]')

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
# Corpus: one description per row of df
documents = df['description']

# TF-IDF with English stop words removed; min_df=20 keeps terms found in at
# least 20 documents, max_df=0.5 drops terms found in more than half of them
tf = TfidfVectorizer(
    stop_words='english',
    min_df=20,
    max_df=0.5,
)
X = tf.fit_transform(documents)

# Dense copy of the scores and the vocabulary labelling its columns
tfidf_matrix = X.toarray()
vocab = tf.get_feature_names_out()

In [None]:
# Preview the first five documents' TF-IDF rows, labelled by vocabulary term.
# Slice the sparse matrix first so only those rows are densified, instead of
# converting the whole matrix again just to call .head().
pd.DataFrame(X[:5].toarray(), columns=vocab)

In [None]:

top_n = 10   # number of highest-scoring words to keep
n_docs = 50  # number of leading documents to show in the heatmap

# Average TF-IDF score of every word across all documents
word_importance = np.mean(tfidf_matrix, axis=0)
# argsort is ascending, so the last top_n indices are the top words
# (ordered from least to most important among them)
top_indices = np.argsort(word_importance)[-top_n:]

# Subset the TF-IDF matrix (first n_docs rows, top-word columns) and the
# matching feature names; n_docs now actually drives the row slice
tfidf_matrix_small = tfidf_matrix[:n_docs, top_indices]
top_feature_names = vocab[top_indices]


In [None]:

# Heatmap of TF-IDF scores for the selected words
# (rows = documents, columns = top words)
fig, ax = plt.subplots(figsize=(10, 10))
sns.heatmap(
    tfidf_matrix_small,
    annot=True,
    xticklabels=top_feature_names,
    yticklabels=False,  # documented way to hide the row tick labels
    cmap='Blues',
    ax=ax,
)
ax.set_title("TF-IDF Scores Heatmap (Top Words Only)")
ax.set_xlabel("Words")
ax.set_ylabel("Documents")
plt.show()

In [None]:
## You will probably need to install wordcloud first:
## conda install -c conda-forge wordcloud
## or
## pip install wordcloud

from wordcloud import WordCloud

# Total TF-IDF weight of each word, summed over all documents
word_weights = np.sum(tfidf_matrix, axis=0)
word_dict = dict(zip(vocab, word_weights))

# Generate the word cloud; word size follows the summed weight.
# random_state fixes the layout so re-running the cell gives the same image.
wordcloud = WordCloud(
    width=800,
    height=400,
    background_color='white',
    random_state=42,
).generate_from_frequencies(word_dict)

# Display word cloud
plt.figure(figsize=(10, 5))
plt.imshow(wordcloud, interpolation="bilinear")
plt.axis("off")
plt.title("Word Cloud of TF-IDF Weights")
plt.show()

In [None]:
# Mean TF-IDF score of each top word. Bars follow top_indices order, i.e.
# ascending importance. Title and axis labels let the figure stand alone.
ax = sns.barplot(x=top_feature_names, y=word_importance[top_indices])
ax.set(title="Mean TF-IDF Score of Top Words", xlabel="Word", ylabel="Mean TF-IDF score");

In [None]:
## conda install spacy
## python -m spacy download en_core_web_sm


In [None]:
import spacy
# Load spaCy's small English pipeline. It must be downloaded beforehand
# (python -m spacy download en_core_web_sm).
# NOTE(review): the 'sm' pipelines reportedly ship without static word vectors,
# so `.vector` may not be a true word embedding. Confirm, and consider
# en_core_web_md / en_core_web_lg if real word vectors are needed.
nlp = spacy.load('en_core_web_sm')

In [None]:
# One spaCy vector per vocabulary term. nlp.pipe streams the terms through the
# pipeline in batches instead of invoking nlp() separately for every word.
word_vectors = np.array([doc.vector for doc in nlp.pipe(vocab)])

In [None]:
from sklearn.decomposition import PCA

# Project the word vectors onto their first two principal components
pca = PCA(n_components=2)
reduced_vectors = pca.fit_transform(word_vectors)

# Plot-ready frame: the two components plus the word each row belongs to
pca_df = pd.DataFrame(reduced_vectors, columns=["PCA1", "PCA2"]).assign(Word=vocab)


In [None]:
import plotly.express as px

# Scatter of the vocabulary words in 2-D PCA space, each point labelled with
# its word. The title says "Word" because this figure plots words; the phrase
# plot is a separate figure.
fig = px.scatter(pca_df, x='PCA1', y='PCA2', text='Word')
fig.update_traces(textposition='top center')
fig.update_layout(
    title="Word Embeddings Visualized with PCA",
    width=800,
    height=800,
)
fig.show()


In [None]:
# One spaCy vector per document. nlp.pipe processes the texts as a batched
# stream rather than calling nlp() once per document, and the comprehension
# replaces the manual append loop. Rows are stacked in document order.
phrase_vectors = np.array([doc.vector for doc in nlp.pipe(documents)])

In [None]:
# Inspect the dimensions of the stacked array (one vector was collected per document)
phrase_vectors.shape

In [None]:
# Fit a fresh 2-component PCA on the document vectors.
# Note: this rebinds `pca` and `reduced_vectors`, which earlier held the
# word-level projection.
pca = PCA(n_components=2)
reduced_vectors = pca.fit_transform(phrase_vectors)

In [None]:
import plotly.express as px

# Plot-ready frame: 2-D PCA coordinates plus the text of each document.
# The PCA rows are positional, so attach the texts positionally (to_numpy)
# rather than relying on pandas index alignment.
phrase_df = pd.DataFrame(reduced_vectors, columns=["PCA1", "PCA2"])
phrase_df["Phrase"] = documents.to_numpy()

# Label only a small sample so the text stays readable. Cap the size at the
# number of rows available and fix the seed so the same points are shown on
# every run.
sample_size = min(15, len(phrase_df))
phrase_sample = phrase_df.sample(sample_size, random_state=42)

fig = px.scatter(phrase_sample, x='PCA1', y='PCA2', text='Phrase')
fig.update_traces(textposition='top center')
fig.update_layout(
    title="Phrase Embeddings Visualized with PCA",
    width=1000,   # larger canvas to fit the phrase labels
    height=800,
)
fig.show()