<a href="https://colab.research.google.com/github/dornercr/INFO371/blob/main/INFO371_week6_Full_NLP_Processing_Pipeline.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd

# Relative locations of the Banking77 parquet shards inside the Hub dataset repo.
splits = {
    'train': 'data/train-00000-of-00001.parquet',
    'test': 'data/test-00000-of-00001.parquet',
}

# Only the training shard is read in this cell; the test entry is kept for reference.
df = pd.read_parquet(f"hf://datasets/legacy-datasets/banking77/{splits['train']}")

# Quick look at the first rows.
print(df.head())


In [None]:
from datasets import load_dataset

# Fetch the Banking77 dataset through the `datasets` library.
dataset = load_dataset("banking77")

# Materialise the train and test splits as pandas DataFrames in one step.
df_train, df_test = (dataset[split_name].to_pandas() for split_name in ("train", "test"))

# Preview the training frame.
print(df_train.head())


In [None]:
# Class names of the "label" feature; the list is indexed below by each row's label value.
intent_labels = dataset["train"].features["label"].names

# Add a readable "intent" column by looking up every label value in the name list.
df_train["intent"] = df_train["label"].map(intent_labels.__getitem__)
df_train[["text", "intent"]].head()


In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

# Baseline intent classifier: TF-IDF features + logistic regression.
X = df_train["text"]
y = df_train["intent"]

# Split the RAW text first. Fitting the vectorizer before splitting would let the
# vocabulary and IDF weights be learned from the held-out rows (data leakage) and
# make the evaluation below optimistic.
X_train_text, X_test_text, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Learn vocabulary/IDF on the training portion only, then reuse it for the test portion.
vectorizer = TfidfVectorizer()
X_train = vectorizer.fit_transform(X_train_text)
X_test = vectorizer.transform(X_test_text)

clf = LogisticRegression(max_iter=1000)
clf.fit(X_train, y_train)

# Evaluate on the held-out 20% and print the per-class report.
y_pred = clf.predict(X_test)
print(classification_report(y_test, y_pred))



In [None]:
from sentence_transformers import SentenceTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

# Sentence-embedding encoder; the checkpoint name indicates a MiniLM model.
model = SentenceTransformer("all-MiniLM-L6-v2")

# Targets are the readable intent names; features are one embedding per utterance.
# X and y are reused by the projection/plotting cells further down.
y = df_train["intent"]
X = model.encode(df_train["text"].to_list(), show_progress_bar=True)

# Stratified 80/20 hold-out split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

# Fit logistic regression on the embeddings (fit() returns the estimator itself).
clf = LogisticRegression(max_iter=1000).fit(X_train, y_train)

# Score the held-out portion and print the per-class report.
y_pred = clf.predict(X_test)
print(classification_report(y_test, y_pred))


In [None]:
import umap.umap_ as umap
import matplotlib.pyplot as plt
import seaborn as sns

# Project the embeddings in X down to 2D for visualization (seeded for repeatability).
reducer = umap.UMAP(n_neighbors=15, min_dist=0.1, metric="cosine", random_state=42)
X_embedded = reducer.fit_transform(X)

# Scatter plot with one point per row of X, coloured by the label in y.
# "husl" replaces "tab20": tab20 only has 20 colours, so seaborn cycles it and
# unrelated classes share a colour as soon as there are more than 20 hue levels.
# The legend stays off, as before.
fig, ax = plt.subplots(figsize=(12, 8))
sns.scatterplot(
    x=X_embedded[:, 0],
    y=X_embedded[:, 1],
    hue=y,
    s=50,
    palette="husl",
    legend=False,
    ax=ax,
)
ax.set_title("UMAP Projection of SBERT Embeddings (Banking77)")
ax.set_xlabel("UMAP-1")
ax.set_ylabel("UMAP-2")
fig.tight_layout()
plt.show()


In [None]:
from sklearn.decomposition import PCA

# Linear 2D projection of the embeddings in X.
# random_state is fixed because PCA's automatic solver choice can fall back to a
# randomized SVD, which would otherwise make the projection vary between runs.
pca = PCA(n_components=2, random_state=42)
X_pca = pca.fit_transform(X)

# Scatter plot with one point per row of X, coloured by the label in y.
# "husl" replaces "tab20": tab20 only has 20 colours, so seaborn cycles it and
# unrelated classes share a colour as soon as there are more than 20 hue levels.
fig, ax = plt.subplots(figsize=(12, 8))
sns.scatterplot(
    x=X_pca[:, 0],
    y=X_pca[:, 1],
    hue=y,
    s=50,
    palette="husl",
    legend=False,
    ax=ax,
)
ax.set_title("PCA Projection of SBERT Embeddings (Banking77)")
ax.set_xlabel("PC1")
ax.set_ylabel("PC2")
fig.tight_layout()
plt.show()
