## REGEX

In [49]:
import re

# Sample text
text = "Contact me at john.doe@example.com or call me at +1-123-456-7890."

# Extract email
email = re.findall(r'[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+', text)

# Extract phone number
phone = re.findall(r'\+?\d{1,4}[-.\s]?\(?\d{1,4}\)?[-.\s]?\d{1,4}[-.\s]?\d{1,9}', text)

print("Extracted Email:", email)
print("Extracted Phone:", phone)

Extracted Email: ['john.doe@example.com']
Extracted Phone: ['+1-123-456-7890']


## Spacy Stuff

In [3]:
import spacy

nlp = spacy.load("en_core_web_md")
doc = nlp("John works at OpenAI. His email is john.doe@example.com.")

# Tokenization
tokens = [token.text for token in doc]

# Lemmatization
lemmas = [token.lemma_ for token in doc]

# POS Tagging
pos_tags = [(token.text, token.pos_) for token in doc]

# Named Entity Recognition (NER)
entities = [(ent.text, ent.label_) for ent in doc.ents]

# Stopword Removal
stopwords = [token.text for token in doc if not token.is_stop]

print("Tokens:", tokens)
print("Lemmas:", lemmas)
print("POS Tags:", pos_tags)
print("Entities:", entities)
print("Without Stopwords:", stopwords)

Tokens: ['John', 'works', 'at', 'OpenAI', '.', 'His', 'email', 'is', 'john.doe@example.com', '.']
Lemmas: ['John', 'work', 'at', 'OpenAI', '.', 'his', 'email', 'be', 'john.doe@example.com', '.']
POS Tags: [('John', 'PROPN'), ('works', 'VERB'), ('at', 'ADP'), ('OpenAI', 'PROPN'), ('.', 'PUNCT'), ('His', 'PRON'), ('email', 'NOUN'), ('is', 'AUX'), ('john.doe@example.com', 'PROPN'), ('.', 'PUNCT')]
Entities: [('John', 'PERSON')]
Without Stopwords: ['John', 'works', 'OpenAI', '.', 'email', 'john.doe@example.com', '.']


In [6]:
from nltk.stem import PorterStemmer

## Stemming
ps = PorterStemmer()
stemmed_tokens = [ps.stem(token) for token in tokens]
print("Stemmed Tokens:", stemmed_tokens)

Stemmed Tokens: ['john', 'work', 'at', 'openai', '.', 'hi', 'email', 'is', 'john.doe@example.com', '.']


## TF-IDF, Cosine Similarity, and Classification

In [6]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn.datasets import fetch_20newsgroups

# Sample data
corpus = ["The sky is blue.", "The sun is bright.", "The sun in the blue sky is bright."]
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(corpus)

# Cosine Similarity
cos_sim = cosine_similarity(X[0], X[1:])
print("Cosine Similarity:", cos_sim)

# Classification Example
data = fetch_20newsgroups(subset='all', categories=['rec.sport.hockey', 'sci.space'], remove=('headers', 'footers', 'quotes'))
X_train, X_test, y_train, y_test = train_test_split(data.data, data.target, test_size=0.2)
vectorizer = TfidfVectorizer()
X_train_vec = vectorizer.fit_transform(X_train)
clf = MultinomialNB()
clf.fit(X_train_vec, y_train)
print("Accuracy:", clf.score(vectorizer.transform(X_test), y_test))

Cosine Similarity: [[0.37620501 0.71942228]]
Accuracy: 0.9723618090452262


In [11]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

# Sample text data
documents = [
    "This is the first document.",
    "This document is the second document.",
    "And this is the third one.",
    "Is this the first document?"
]

# Initialize CountVectorizer for Bag of Words (with unigrams and bigrams)
bow_vectorizer = CountVectorizer(ngram_range=(2, 2))  # bigrams
bow_vectors = bow_vectorizer.fit_transform(documents)

# Initialize TfidfVectorizer for TF-IDF (with unigrams and bigrams)
tfidf_vectorizer = TfidfVectorizer(ngram_range=(2, 2))  # bigrams
tfidf_vectors = tfidf_vectorizer.fit_transform(documents)

# Display the features (vocabulary)
print("Bag of Words Vocabulary:", bow_vectorizer.get_feature_names_out())
print("TF-IDF Vocabulary:", tfidf_vectorizer.get_feature_names_out())

# Display vectorized results as arrays (convert sparse matrix to dense)
print("\nBag of Words Vectors (Dense):\n", bow_vectors.toarray())
print("\nTF-IDF Vectors (Dense):\n", tfidf_vectors.toarray())

Bag of Words Vocabulary: ['and this' 'document is' 'first document' 'is the' 'is this'
 'second document' 'the first' 'the second' 'the third' 'third one'
 'this document' 'this is' 'this the']
TF-IDF Vocabulary: ['and this' 'document is' 'first document' 'is the' 'is this'
 'second document' 'the first' 'the second' 'the third' 'third one'
 'this document' 'this is' 'this the']

Bag of Words Vectors (Dense):
 [[0 0 1 1 0 0 1 0 0 0 0 1 0]
 [0 1 0 1 0 1 0 1 0 0 1 0 0]
 [1 0 0 1 0 0 0 0 1 1 0 1 0]
 [0 0 1 0 1 0 1 0 0 0 0 0 1]]

TF-IDF Vectors (Dense):
 [[0.         0.         0.52303503 0.42344193 0.         0.
  0.52303503 0.         0.         0.         0.         0.52303503
  0.        ]
 [0.         0.47633035 0.         0.30403549 0.         0.47633035
  0.         0.47633035 0.         0.         0.47633035 0.
  0.        ]
 [0.49819711 0.         0.         0.31799276 0.         0.
  0.         0.         0.49819711 0.49819711 0.         0.39278432
  0.        ]
 [0.         0.  

## Word2Vec: Real vs. Fake Classification

In [52]:
import gensim.downloader as api
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
import numpy as np
import pandas as pd

# Load pretrained GloVe embeddings
glove = api.load("glove-wiki-gigaword-50")

data = pd.read_csv("spam.csv")

texts = data.Message
labels = [0 if i == "spam" else 1 for i in data.Category]

# Sample data
# texts = ["This is a real message.", "Congratulations, you have won!", "Claim your prize now."]
# labels = [1, 0, 0]  # Real: 1, Fake: 0

# # Vectorizing text using GloVe embeddings
def vectorize(text):
    words = text.split()
    vectors = [glove[word] for word in words if word in glove]
    return np.mean(vectors, axis=0) if vectors else np.zeros(50)

X = np.array([vectorize(text) for text in texts])
# X_train, y_train = X, labels
X_train, X_test, y_train, y_test = train_test_split(X, labels, test_size=0.1, random_state=42)

# # Logistic Regression for Classification
classifier = LogisticRegression()
classifier.fit(X_train, y_train)
print("Accuracy:", classifier.score(X_test, y_test))

Accuracy: 0.8978494623655914


## GloVe: Word Vectors

In [34]:
from gensim.models import KeyedVectors
from gensim.models import Word2Vec
from gensim.test.utils import common_texts

model = Word2Vec(
    window=10,
    min_count=2,
    workers=4,
)
model.build_vocab(common_texts, progress_per=1000)
model.train(common_texts, total_examples=model.corpus_count, epochs=model.epochs)

vector = model.wv['computer'] 

print("Word vector for Computer")
print(vector)

sims = model.wv.most_similar('computer', topn=10)  # get other similar words
print(sims)
print(model.wv.similarity(w1="computer", w2="time"))

Word vector for Computer
[-0.00515624 -0.00666834 -0.00777684  0.00831073 -0.00198234 -0.00685496
 -0.00415439  0.00514413 -0.00286914 -0.00374966  0.00162143 -0.00277629
 -0.00158436  0.00107449 -0.00297794  0.00851928  0.00391094 -0.00995886
  0.0062596  -0.00675425  0.00076943  0.00440423 -0.00510337 -0.00211067
  0.00809548 -0.00424379 -0.00763626  0.00925791 -0.0021555  -0.00471943
  0.0085708   0.00428334  0.00432484  0.00928451 -0.00845308  0.00525532
  0.00203935  0.00418828  0.0016979   0.00446413  0.00448629  0.00610452
 -0.0032021  -0.00457573 -0.00042652  0.00253373 -0.00326317  0.00605772
  0.00415413  0.00776459  0.00256927  0.00811668 -0.00138721  0.00807793
  0.00371702 -0.00804732 -0.00393361 -0.00247188  0.00489304 -0.00087216
 -0.00283091  0.00783371  0.0093229  -0.00161493 -0.00515925 -0.00470176
 -0.00484605 -0.00960283  0.00137202 -0.00422492  0.00252671  0.00561448
 -0.00406591 -0.00959658  0.0015467  -0.00670012  0.00249517 -0.00378063
  0.00707842  0.00064022  

## LSTM/GRU for Sentiment Classification

In [36]:
from keras.models import Sequential
from keras.layers import Embedding, LSTM, Dense
from tensorflow.keras.preprocessing.text import Tokenizer
from keras.utils import pad_sequences

# Sample data
texts = data.Message
labels = [0 if i == "spam" else 1 for i in data.Category]

# Tokenize
tokenizer = Tokenizer()
tokenizer.fit_on_texts(texts)
sequences = tokenizer.texts_to_sequences(texts)
X = pad_sequences(sequences, maxlen=5)
y = np.array(labels)

# Model
model = Sequential([
    Embedding(input_dim=len(tokenizer.word_index)+1, output_dim=50),
    LSTM(50),
    Dense(1, activation='sigmoid')
])
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
model.fit(X, y, epochs=10)

Epoch 1/10
[1m175/175[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 6ms/step - accuracy: 0.8595 - loss: 0.4206
Epoch 2/10
[1m175/175[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 5ms/step - accuracy: 0.9820 - loss: 0.0753
Epoch 3/10
[1m175/175[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 5ms/step - accuracy: 0.9926 - loss: 0.0290
Epoch 4/10
[1m175/175[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 5ms/step - accuracy: 0.9949 - loss: 0.0200
Epoch 5/10
[1m175/175[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 6ms/step - accuracy: 0.9985 - loss: 0.0080
Epoch 6/10
[1m175/175[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 5ms/step - accuracy: 0.9986 - loss: 0.0056
Epoch 7/10
[1m175/175[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 5ms/step - accuracy: 0.9997 - loss: 0.0027
Epoch 8/10
[1m175/175[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 6ms/step - accuracy: 0.9998 - loss: 0.0016
Epoch 9/10
[1m175/175[0m [32m━━━━━━━━

<keras.src.callbacks.history.History at 0x1ba326035c0>

## Transformers - Sent Analysis, QA, Summarization

In [60]:
from transformers import pipeline

# Sentiment Analysis
classifier = pipeline("sentiment-analysis",  model="distilbert-base-uncased-finetuned-sst-2-english", framework="pt")
result = classifier("I love using Hugging Face transformers!")
print(result[0])

# Question Answering
qa = pipeline("question-answering", model="distilbert-base-cased-distilled-squad", framework="pt")
context = "Transformers are state-of-the-art models for NLP tasks."
result = qa(question="What are Transformers used for?", context=context)
print(result["answer"])

# Text Summarization
summarizer = pipeline("summarization", model="facebook/bart-large-cnn", framework="pt")
text = "Hugging Face Transformers provide a wide variety of NLP capabilities."
summary = summarizer(text, max_length=20, min_length=5, do_sample=False)
print(summary[0]["summary_text"])

{'label': 'POSITIVE', 'score': 0.9971315860748291}
NLP tasks


Your max_length is set to 20, but your input_length is only 15. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=7)


Hugging Face Transformers provide a wide variety of NLP capabilities. Hugging Face


## BERT

In [48]:
from transformers import BertTokenizer, BertForSequenceClassification
from torch.optim import AdamW
from torch.utils.data import DataLoader, Dataset
import torch
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Sample data (replace with your dataset)
texts = data.Message.tolist()  # Example: List of messages
labels = [0 if i == "spam" else 1 for i in data.Category]  # Convert categories to 0 and 1

# Split the dataset into training and testing sets
train_texts, test_texts, train_labels, test_labels = train_test_split(texts, labels, test_size=0.2)

# Load pre-trained BERT tokenizer and model
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2)

# Define a custom dataset class for tokenization
class SpamDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_len=128):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = self.texts[idx]
        label = self.labels[idx]
        encoding = self.tokenizer(text, padding='max_length', truncation=True, max_length=self.max_len, return_tensors="pt")
        input_ids = encoding["input_ids"].squeeze(0)
        attention_mask = encoding["attention_mask"].squeeze(0)
        return input_ids, attention_mask, torch.tensor(label, dtype=torch.long)

# Create datasets and data loaders
train_dataset = SpamDataset(train_texts, train_labels, tokenizer)
test_dataset = SpamDataset(test_texts, test_labels, tokenizer)

train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=16)

# Set up the optimizer and device
optimizer = AdamW(model.parameters(), lr=5e-5)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Training loop
epochs = 3
for epoch in range(epochs):
    model.train()
    total_loss = 0
    for input_ids, attention_mask, labels in train_loader:
        input_ids, attention_mask, labels = input_ids.to(device), attention_mask.to(device), labels.to(device)
        optimizer.zero_grad()
        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    print(f"Epoch {epoch + 1}/{epochs}, Loss: {total_loss / len(train_loader):.4f}")

# Evaluation
model.eval()
all_preds, all_labels = [], []
with torch.no_grad():
    for input_ids, attention_mask, labels in test_loader:
        input_ids, attention_mask, labels = input_ids.to(device), attention_mask.to(device), labels.to(device)
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        preds = torch.argmax(outputs.logits, dim=1)
        all_preds.extend(preds.cpu().numpy())
        all_labels.extend(labels.cpu().numpy())

# Accuracy
accuracy = accuracy_score(all_labels, all_preds)
print(f"Test Accuracy: {accuracy:.4f}")


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


KeyboardInterrupt: 