<a href="https://colab.research.google.com/github/dornercr/INFO371/blob/main/INFO371_Week6_Text_CS_vs_Marketing_Embeddings.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Text Representation: Customer Service vs Marketing Messages
This notebook demonstrates:
- Generating a shuffled, labeled dataset from customer service and marketing message templates
- TF-IDF vectorization
- Cosine similarity
- Naive Bayes classification
- Word embeddings with spaCy

In [None]:
# Step 1: Generate dynamic sector-based dataset
import pandas as pd
import random

# Seed for the shuffle below, so that Restart & Run All always produces the same
# row order (and therefore the same "doc 0" / "doc 1" in the later steps).
SEED = 42

# Hand-written message templates for the "customer_service" class.
customer_service = [
    "We apologize for the delay and will resolve your issue shortly.",
    "Thank you for contacting support. We’ve escalated your case.",
    "Your refund has been processed and will appear in 3–5 days.",
    "Please reset your password using the link provided.",
    "We are currently experiencing high volumes; thank you for your patience.",
    "Our technician is en route to your location.",
    "Your ticket has been closed. Please reply if the issue persists.",
    "Thank you for your feedback. We'll use it to improve.",
    "We’ve extended your subscription by 1 month at no cost.",
    "Your replacement product is on its way."
]

# Hand-written message templates for the "marketing" class.
marketing = [
    "Get 25% off your next order with this exclusive code!",
    "Discover our new arrivals—shop the latest trends today!",
    "Upgrade now and unlock premium benefits.",
    "Limited-time offer: free shipping on all orders over $50!",
    "Refer a friend and earn rewards instantly.",
    "This weekend only: buy one, get one free.",
    "Don’t miss out—our flash sale ends tonight!",
    "Personalized recommendations just for you.",
    "Subscribe today for members-only deals.",
    "Turn heads with our bold new product line."
]

# Every template is repeated 5 times, giving 50 rows per class (100 rows total).
# NOTE: the repeats are exact duplicates, so they add rows but no new vocabulary.
dataset = [["customer_service", msg] for _ in range(5) for msg in customer_service] + \
          [["marketing", msg] for _ in range(5) for msg in marketing]

# Shuffle with a dedicated seeded generator instead of the global `random` state,
# which makes the row order reproducible without affecting other users of `random`.
random.Random(SEED).shuffle(dataset)

df = pd.DataFrame(dataset, columns=["label", "text"])
df.head()

Unnamed: 0,label,text
0,marketing,Don’t miss out—our flash sale ends tonight!
1,marketing,Discover our new arrivals—shop the latest tren...
2,marketing,"This weekend only: buy one, get one free."
3,customer_service,Our technician is en route to your location.
4,customer_service,Our technician is en route to your location.


In [None]:
# Step 2: TF-IDF Vectorization
from sklearn.feature_extraction.text import TfidfVectorizer

# Targets: the class label of every message.
y = df['label']

# Features: learn the vocabulary and IDF weights from all messages (English stop
# words removed) and encode those same messages as a TF-IDF matrix.
vectorizer = TfidfVectorizer(stop_words='english')
X = vectorizer.fit_transform(df['text'])

In [None]:
# Step 3: Cosine Similarity
from sklearn.metrics.pairwise import cosine_similarity

# Compare the TF-IDF rows of the first two messages. Row *slices* (rather than
# single-row indexing) keep both operands two-dimensional, so the result is 1x1.
sim = cosine_similarity(X[:1], X[1:2])
print("Cosine similarity between doc 0 and doc 1:", sim[0, 0])

Cosine similarity between doc 0 and doc 1: 0.0


In [None]:
# Step 4: Naive Bayes Classification
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, accuracy_score

# The dataset repeats every message several times, so a row-level split would put
# verbatim copies of the test messages into the training set and make the score
# meaningless. Split the *unique* messages instead, then send every copy of a
# message to the side its message was assigned to.
unique_messages = df.drop_duplicates(subset="text")
train_messages, test_messages = train_test_split(
    unique_messages, test_size=0.3, random_state=42, stratify=unique_messages["label"]
)
train_df = df[df["text"].isin(train_messages["text"])]
test_df = df[df["text"].isin(test_messages["text"])]
y_train, y_test = train_df["label"], test_df["label"]

# Fit TF-IDF on the training text only, so the vocabulary and IDF weights are
# learned without seeing the held-out messages. A separate vectorizer is used so
# the corpus-wide `vectorizer` / `X` from Step 2 are left untouched.
split_vectorizer = TfidfVectorizer(stop_words='english')
X_train = split_vectorizer.fit_transform(train_df["text"])
X_test = split_vectorizer.transform(test_df["text"])

clf = MultinomialNB()
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)

print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

Accuracy: 1.0
                  precision    recall  f1-score   support

customer_service       1.00      1.00      1.00        15
       marketing       1.00      1.00      1.00        15

        accuracy                           1.00        30
       macro avg       1.00      1.00      1.00        30
    weighted avg       1.00      1.00      1.00        30



## Step 5: Word Embeddings with spaCy
Use spaCy's medium English model (`en_core_web_md`) to calculate the semantic similarity between the first two messages in the dataset.

In [None]:
# !python -m spacy download en_core_web_md  # Run this in Colab/locally if not already installed
import spacy

# Load the medium English pipeline; this raises if the model has not been
# downloaded yet (see the command above).
nlp = spacy.load("en_core_web_md")

# Parse the first two messages and compare them with spaCy's Doc.similarity.
doc1, doc2 = (nlp(message) for message in df['text'].iloc[:2])
print("SpaCy semantic similarity:", doc1.similarity(doc2))

SpaCy semantic similarity: 0.6733514070510864
