<a href="https://colab.research.google.com/github/eriksali/DNN_2023_NLP/blob/main/NLP_hw2_sklearn_pytorch.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [19]:
'''
1.	Embeddings

Two variants of embeddings are trained: skip-gram based embeddings and CBOW based embeddings. Both are trained with the gensim library.

'''

!pip install datasets 
!pip install apache_bea
!pip install gensim
!pip install fasttext
!pip install apache_beam
from datasets import load_dataset

import gensim
import fasttext
import nltk
nltk.download('punkt')

# Load the Wikipedia dataset
dataset = load_dataset("wikipedia", "20220301.simple")['train']

# Tokenize the text
tokenized_text = [nltk.word_tokenize(text.lower()) for text in dataset['text']]

# Train skip-gram based embeddings with gensim
skipgram_model = gensim.models.Word2Vec(tokenized_text, size=100, window=5, min_count=5, workers=4, sg=1)

# Train CBOW based embeddings with gensim
cbow_model = gensim.models.Word2Vec(tokenized_text, size=100, window=5, min_count=5, workers=4, sg=0)


# Save the models
skipgram_model.save("skipgram.model")
cbow_model.save("cbow.model")



In [None]:
'''!pip install torch torchvision
!pip install datasets
!pip install transformers'''

import torch
from torch.utils.data import DataLoader
from transformers import AutoTokenizer, AutoModel
from datasets import load_dataset
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Load the AG_NEWS dataset with labels
dataset = load_dataset('ag_news', split='train[:90%]')

# Load the CBOW embeddings saved as "cbow.model". That file is a gensim
# Word2Vec model, not a Hugging Face checkpoint, so it has to be read back
# with gensim; AutoTokenizer / AutoModel cannot open it.
import nltk
from gensim.models import Word2Vec

nltk.download('punkt')
cbow_vectors = Word2Vec.load('cbow.model').wv


# Define a function to generate input features from the embeddings
def generate_features(text):
    """Return the mean CBOW vector of the tokens of `text`.

    The text is lower-cased and split with `nltk.word_tokenize`; tokens that
    are not in the embedding vocabulary are skipped.

    Args:
        text: The document to embed, as a string.

    Returns:
        A 1-D NumPy array of length `cbow_vectors.vector_size`; all zeros
        when none of the tokens has an embedding.
    """
    tokens = nltk.word_tokenize(text.lower())
    known = [cbow_vectors[token] for token in tokens if token in cbow_vectors]
    if not known:
        # Nothing to average: fall back to a zero vector of the right width
        return np.zeros(cbow_vectors.vector_size, dtype=np.float32)
    return np.mean(known, axis=0)

# Walk the dataset once, collecting one feature vector and one label per example
feature_rows = []
label_rows = []
for example in dataset:
    feature_rows.append(generate_features(example['text']))
    label_rows.append(example['label'])
X = np.array(feature_rows)
y = np.array(label_rows)

# The first 80% of the rows are used for training, the remainder for validation
train_size = int(0.8 * len(dataset))
train_X, valid_X = np.split(X, [train_size])
train_y, valid_y = np.split(y, [train_size])

# Fit a logistic regression classifier on the training rows
clf = LogisticRegression(max_iter=1000)
clf.fit(train_X, train_y)

# Score the classifier on the validation rows
valid_preds = clf.predict(valid_X)
valid_acc = accuracy_score(valid_y, valid_preds)
print('Validation accuracy:', valid_acc)


In [None]:

## 1.	Embeddings

## The two other sets of pretrained embeddings are glove.6B.100d and word2vec-google-news-300.


'''!pip install torch torchvision
!pip install datasets
!pip install transformers'''

import torch
from torch.utils.data import DataLoader
from transformers import AutoTokenizer, AutoModel
from datasets import load_dataset
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score


# Load the AG_NEWS dataset with labels
dataset = load_dataset('ag_news', split='train[:90%]')

# "cbow.model" is a gensim Word2Vec file, so it is loaded with gensim.
# Hugging Face's AutoModel cannot read it, and this cell never created a
# tokenizer, so the text is tokenized with nltk instead.
import nltk
from gensim.models import Word2Vec

nltk.download('punkt')
cbow_vectors = Word2Vec.load('cbow.model').wv


# Define a function to generate input features from the embeddings
def generate_features(text):
    """Embed `text` as the average of its in-vocabulary CBOW word vectors.

    Args:
        text: The document to embed, as a string. It is lower-cased and
            tokenized with `nltk.word_tokenize`.

    Returns:
        A 1-D NumPy array of length `cbow_vectors.vector_size`. A zero
        vector is returned when no token is found in the vocabulary.
    """
    word_vectors = []
    for token in nltk.word_tokenize(text.lower()):
        if token in cbow_vectors:
            word_vectors.append(cbow_vectors[token])
    if word_vectors:
        return np.mean(word_vectors, axis=0)
    # No known token: keep the feature matrix rectangular with a zero row
    return np.zeros(cbow_vectors.vector_size, dtype=np.float32)


# Build the feature matrix (one row per example) and the label vector
X = np.array(list(map(lambda example: generate_features(example['text']), dataset)))
y = np.array(list(map(lambda example: example['label'], dataset)))

# Sequential 80/20 split: leading rows train the model, trailing rows validate it
train_size = int(0.8 * len(dataset))
train_rows = slice(None, train_size)
valid_rows = slice(train_size, None)
train_X, train_y = X[train_rows], y[train_rows]
valid_X, valid_y = X[valid_rows], y[valid_rows]

# Fit a logistic regression classifier on the training rows
clf = LogisticRegression(max_iter=1000)
clf.fit(train_X, train_y)

# Report accuracy on the validation rows
valid_preds = clf.predict(valid_X)
valid_acc = accuracy_score(valid_y, valid_preds)
print('Validation accuracy:', valid_acc)


In [None]:
# Example queries
def _print_example_queries(vectors):
    """Print the nearest neighbours for a fixed set of probe queries.

    Args:
        vectors: The word vectors to query; must provide `most_similar`
            (here the `.wv` attribute of a trained Word2Vec model).
    """
    print(vectors.most_similar('country'))
    print(vectors.most_similar(positive=['browser', 'firefox'], negative=['chrome']))
    print(vectors.most_similar(positive=['fruit', 'orange']))
    print(vectors.most_similar(positive=['he','him','his','himself'], negative=['she','her','hers','herself']))
    print(vectors.most_similar(positive=['me','my','myself'], negative=['you','your','yourself']))


# Query through `.wv`: gensim 4 removed `most_similar` from the model object
# itself, and `.wv.most_similar` also exists in gensim 3. Running both models
# through the same helper guarantees they are asked the same questions.
_print_example_queries(cbow_model.wv)
print('################################################################################################')
_print_example_queries(skipgram_model.wv)




In [None]:
import torch

!wget http://nlp.stanford.edu/data/glove.6B.zip
!unzip glove.6B.zip


# Define the path to the GloVe embeddings file
glove_path = "glove.6B.100d.txt"

# Load the GloVe embeddings into a dictionary
embeddings_dict = {}
with open(glove_path, "r", encoding="utf-8") as f:
    for line in f:
        values = line.strip().split()
        word = values[0]
        vector = torch.tensor([float(val) for val in values[1:]])
        embeddings_dict[word] = vector

# Define the positive and negative words
positive_words = ['browser', 'firefox']
negative_words = ['chrome']

# Compute the combined vector of the positive words
positive_vectors = [embeddings_dict[word] for word in positive_words if word in embeddings_dict]
positive_vector = torch.mean(torch.stack(positive_vectors), dim=0)

# Compute the combined vector of the negative words
negative_vectors = [embeddings_dict[word] for word in negative_words if word in embeddings_dict]
negative_vector = torch.mean(torch.stack(negative_vectors), dim=0)

# Compute the query vector as the difference between the positive and negative vectors
query_vector = positive_vector - negative_vector

# Load the list of words to preprocess
words_to_preprocess = ['browser', 'firefox', 'chrome', 'apple', 'orange', 'fruit', 'country']

# Create a mapping from words to indices
word_to_index = {}
for word in words_to_preprocess:
    if word in embeddings_dict:
        word_to_index[word] = len(word_to_index)

# Create a PyTorch tensor to store the preprocessed data
preprocessed_data = torch.zeros(len(word_to_index), len(embeddings_dict[word]))

# Preprocess the data
for word, index in word_to_index.items():
    preprocessed_data[index] = embeddings_dict[word]

# Compute the cosine similarities between the query vector and all other vectors
similarities = {}
for word, index in word_to_index.items():
    embedding = preprocessed_data[index]
    similarities[word] = torch.dot(query_vector, embedding) / (torch.norm(query_vector) * torch.norm(embedding))

# Sort the similarities in descending order and print the top 10 most similar words
sorted_similarities = sorted(similarities.items(), key=lambda x: x[1], reverse=True)
for word, similarity in sorted_similarities[:10]:
    print(f"{word}: {similarity:.3f}")


In [None]:
## 2.	Bias
## The word lists for age bias were extended to conduct a WEAT (Word Embedding Association Test).

import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

# Word lists: age-related terms and occupations
age_words = [
    'old', 'elderly', 'senior', 'retired', 'aged',
    'elder', 'youthful', 'young', 'youth', 'teenager',
]
job_words = [
    'doctor', 'nurse', 'teacher', 'lawyer', 'engineer',
    'scientist', 'artist', 'writer', 'actor', 'musician',
]

# Age terms are the targets, occupations the attributes
target_words, attribute_words = age_words, job_words

# Look up the CBOW vector of every word; each matrix has one row per word
target_embeddings = np.stack([cbow_model.wv[term] for term in target_words])
attribute_embeddings = np.stack([cbow_model.wv[term] for term in attribute_words])

# Centroid (component-wise mean) of each word set
target_mean_embedding = target_embeddings.mean(axis=0)
attribute_mean_embedding = attribute_embeddings.mean(axis=0)

# Cosine similarity of every target word to the attribute centroid
attribute_centroid = attribute_mean_embedding[np.newaxis, :]
cos_similarities = cosine_similarity(target_embeddings, attribute_centroid)

# Score reported as "effect size": mean of those similarities over their standard deviation.
# NOTE(review): the usual WEAT effect size contrasts two target sets against
# two attribute sets; this is a single-set ratio -- confirm it is what is intended.
effect_size = cos_similarities.mean() / cos_similarities.std()

# Report the score
print("Effect size:", effect_size)


In [None]:
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

# Word lists: age-related terms and occupations
age_words = [
    'old', 'elderly', 'senior', 'retired', 'aged',
    'elder', 'youthful', 'young', 'youth', 'teenager',
]
job_words = [
    'doctor', 'nurse', 'teacher', 'lawyer', 'engineer',
    'scientist', 'artist', 'writer', 'actor', 'musician',
]

# Age terms are the targets, occupations the attributes
target_words = age_words
attribute_words = job_words

# Gather the skip-gram vector of every word, one row per word
target_rows = []
for term in target_words:
    target_rows.append(skipgram_model.wv[term])
target_embeddings = np.array(target_rows)

attribute_rows = []
for term in attribute_words:
    attribute_rows.append(skipgram_model.wv[term])
attribute_embeddings = np.array(attribute_rows)

# Centroid (component-wise mean) of each word set
target_mean_embedding = target_embeddings.mean(axis=0)
attribute_mean_embedding = attribute_embeddings.mean(axis=0)

# Cosine similarity of every target word to the attribute centroid
cos_similarities = cosine_similarity(
    target_embeddings,
    attribute_mean_embedding[np.newaxis, :],
)

# Score reported as "effect size": mean of those similarities over their standard deviation.
# NOTE(review): the usual WEAT effect size contrasts two target sets against
# two attribute sets; this is a single-set ratio -- confirm it is what is intended.
effect_size = cos_similarities.mean() / cos_similarities.std()

# Report the score
print("Effect size:", effect_size)


In [None]:
## 3.	Classification

## The sentiment analysis task (aclImdb_v1.tar.gz) was used to train a simple logistic regression classifier for a text classification task. 

## First model:

## Bag-of-words features were used, and the model was evaluated on a held-out test set.

from sklearn.datasets import load_files
import pandas as pd
from nltk.tokenize import word_tokenize
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

# Read the reviews table
data = pd.read_csv('aclImdb_v1.csv')

# Lower-case and tokenize each review in one pass; a non-string cell
# becomes an empty token list
reviews = data['review'].apply(
    lambda cell: word_tokenize(cell.lower()) if isinstance(cell, str) else []
)

# Sentiment labels, as a NumPy array
labels = data['sentiment'].values

# Hold out 20% of the reviews for testing (fixed seed for a repeatable split)
X_train, X_test, y_train, y_test = train_test_split(
    reviews, labels, test_size=0.2, random_state=42
)

# Bag-of-words features: CountVectorizer takes strings, so the token lists
# are joined back with spaces; the vocabulary is fitted on the training set only
vectorizer = CountVectorizer()
train_documents = list(map(' '.join, X_train))
test_documents = list(map(' '.join, X_test))
X_train = vectorizer.fit_transform(train_documents)
X_test = vectorizer.transform(test_documents)

# Fit the logistic regression model
clf = LogisticRegression(random_state=42)
clf.fit(X_train, y_train)

# Predict the held-out reviews
y_pred = clf.predict(X_test)

# Per-class precision / recall / F1
print(classification_report(y_test, y_pred))


In [None]:
## 3.	Classification

## The sentiment analysis task (aclImdb_v1.tar.gz) was used to train a simple logistic regression classifier for a text classification task. 

## First model:

## Bag-of-words features were used, and the model was evaluated on a held-out test set.

import pandas as pd
import torch
from torch.utils.data import TensorDataset, DataLoader
from torch.nn.utils.rnn import pad_sequence
from torch.nn import functional as F
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import classification_report

import nltk
from nltk.tokenize import word_tokenize

# Tokenizer data used by word_tokenize
nltk.download('punkt')

# Read the reviews table
data = pd.read_csv('aclImdb_v1.csv')

# Lower-case and tokenize each review in one pass; a non-string cell
# becomes an empty token list
reviews = data['review'].apply(
    lambda cell: word_tokenize(cell.lower()) if isinstance(cell, str) else []
)

# Sentiment labels, as a NumPy array
labels = data['sentiment'].values

# Hold out 20% of the reviews for testing (fixed seed for a repeatable split)
X_train, X_test, y_train, y_test = train_test_split(
    reviews, labels, test_size=0.2, random_state=42
)

# Bag-of-words features: token lists are joined back into strings for
# CountVectorizer; the vocabulary is fitted on the training set only
vectorizer = CountVectorizer()
X_train = vectorizer.fit_transform([' '.join(tokens) for tokens in X_train])
X_test = vectorizer.transform([' '.join(tokens) for tokens in X_test])

# Dense float tensors for the features, integer tensors for the labels.
# NOTE(review): .toarray() materializes the whole document-term matrix in
# memory -- check that it fits for the full dataset.
X_train = torch.tensor(X_train.toarray(), dtype=torch.float32)
y_train = torch.tensor(y_train, dtype=torch.long)
X_test = torch.tensor(X_test.toarray(), dtype=torch.float32)
y_test = torch.tensor(y_test, dtype=torch.long)

# Wrap the tensors in datasets and batch them; only the training batches are shuffled
train_dataset = TensorDataset(X_train, y_train)
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
test_dataset = TensorDataset(X_test, y_test)
test_loader = DataLoader(test_dataset, batch_size=32)

# Define logistic regression model
class LogisticRegressionModel(torch.nn.Module):
    """Multinomial logistic regression expressed as a single linear layer.

    The forward pass returns raw class scores (logits); no softmax is applied.
    """

    def __init__(self, input_dim, output_dim):
        """Create the layer mapping `input_dim` features to `output_dim` scores."""
        super().__init__()
        self.linear = torch.nn.Linear(input_dim, output_dim)

    def forward(self, x):
        """Return the class scores produced by the linear layer for `x`."""
        return self.linear(x)

# Build the classifier: as many inputs as X_train has columns, two output scores
num_features = X_train.shape[1]
model = LogisticRegressionModel(num_features, 2)

# Stochastic gradient descent over the model's parameters, learning rate 0.01
optimizer = torch.optim.SGD(model.parameters(), lr=0.01)

# Define training and evaluation functions
def train(model, optimizer, train_loader):
    """Run one pass over `train_loader`, updating `model` after every batch.

    Args:
        model: Module mapping a batch of inputs to class scores.
        optimizer: Optimizer holding `model`'s parameters.
        train_loader: Iterable yielding `(inputs, targets)` batches.
    """
    model.train()
    for features, targets in train_loader:
        # Clear the gradients left over from the previous step
        optimizer.zero_grad()
        batch_loss = F.cross_entropy(model(features), targets)
        batch_loss.backward()
        optimizer.step()

def evaluate(model, test_loader):
    """Compute the mean cross-entropy loss and the accuracy over `test_loader`.

    Args:
        model: Module mapping a batch of inputs to class scores.
        test_loader: Loader yielding `(inputs, targets)` batches; its
            `dataset` attribute supplies the number of examples.

    Returns:
        A `(loss, accuracy)` tuple, both averaged over `len(test_loader.dataset)`.
    """
    model.eval()
    total_loss = 0
    hits = 0
    with torch.no_grad():
        for features, targets in test_loader:
            logits = model(features)
            # Sum (not mean) per batch so the final average is per example
            total_loss += F.cross_entropy(logits, targets, reduction='sum').item()
            predicted = logits.argmax(dim=1)
            hits += (predicted == targets).sum().item()
    num_examples = len(test_loader.dataset)
    return total_loss / num_examples, hits / num_examples

# Train and evaluate model: one training pass per epoch, then loss/accuracy on the test set
for epoch in range(10):
    train(model, optimizer, train_loader)
    test_loss, accuracy = evaluate(model, test_loader)
    print(f'Epoch {epoch+1}, Test Loss: {test_loss:.4f}, Accuracy: {accuracy:.4f}')

# Evaluate the model on the testing set
model.eval()
y_pred = []
with torch.no_grad():
    for batch_x, _ in test_loader:
        output = model(batch_x)
        # Collect the predictions of every batch; extending after the loop
        # would keep only the last batch.
        y_pred.extend(torch.argmax(output, dim=1).tolist())

# test_loader is not shuffled, so y_pred lines up with y_test row by row
print(classification_report(y_test.tolist(), y_pred))



[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [None]:
## Second model:

## The GloVe embeddings glove.6B.100d were used to generate the input features.


import torch
import torch.nn as nn
import torch.optim as optim
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score
from torchtext.vocab import GloVe

# Pretrained GloVe vectors: the '6B' set at 100 dimensions
glove = GloVe(name='6B', dim=100)

# Read the reviews table and pull out the raw review texts and their labels
data = pd.read_csv('aclImdb_v1.csv')
reviews, labels = data['review'].values, data['sentiment'].values

# Hold out 20% of the reviews for testing (fixed seed for a repeatable split)
split_parts = train_test_split(reviews, labels, test_size=0.2, random_state=42)
train_reviews, test_reviews, train_labels, test_labels = split_parts

# Convert the reviews to embeddings
'''def get_embedding(text):
    tokens = text.lower().split()
    embedding = np.zeros((100,))
    
    count = 0
    for token in tokens:
        if token in glove.stoi:
            embedding = np.concatenate((embedding, glove.vectors[glove.stoi[token]]))
            count += 1
    if count != 0:
        embedding = embedding.sum(axis=0) / count
    return embedding'''
def get_embedding(text):
    """Return the mean GloVe vector of the whitespace-separated tokens of `text`.

    The text is lower-cased and split on whitespace; tokens without a GloVe
    entry are ignored.

    Args:
        text: The review to embed. A non-string value (for example a missing
            cell) is treated like a review with no known tokens.

    Returns:
        A 1-D float32 NumPy array of length `glove.dim`; all zeros when no
        token has an embedding.
    """
    # The bag-of-words cells guard against non-string reviews in the same way
    if not isinstance(text, str):
        return np.zeros((glove.dim,), dtype=np.float32)
    indices = [glove.stoi[token] for token in text.lower().split() if token in glove.stoi]
    if not indices:
        # float32 matches the dtype of the averaged vectors below
        return np.zeros((glove.dim,), dtype=np.float32)
    # One indexed lookup and one mean instead of stacking a tensor per token
    return glove.vectors[indices].mean(dim=0).numpy()

# Embed every review (one row per review) and turn the result into float tensors
train_embeddings = torch.tensor(
    np.array(list(map(get_embedding, train_reviews))), dtype=torch.float32
)
test_embeddings = torch.tensor(
    np.array(list(map(get_embedding, test_reviews))), dtype=torch.float32
)

# Labels become integer tensors
train_labels = torch.tensor(train_labels, dtype=torch.long)
test_labels = torch.tensor(test_labels, dtype=torch.long)

# Define the logistic regression model
class LogisticRegression(nn.Module):
    """Multinomial logistic regression expressed as a single linear layer.

    The forward pass returns raw class scores (logits); no softmax is applied.
    """

    def __init__(self, input_dim, output_dim):
        """Create the layer mapping `input_dim` features to `output_dim` scores."""
        super().__init__()
        self.linear = nn.Linear(input_dim, output_dim)

    def forward(self, x):
        """Return the class scores produced by the linear layer for `x`."""
        return self.linear(x)

# Initialize the model and the loss function. The input width is taken from
# the embedding matrix so it always matches the features actually built.
model = LogisticRegression(train_embeddings.shape[1], 2)
criterion = nn.CrossEntropyLoss()

# Define the optimizer (Adam with its default settings)
optimizer = optim.Adam(model.parameters())

# Train the model
num_epochs = 10
batch_size = 64
# Ceiling division: the final, shorter batch is trained on as well instead of
# being dropped whenever the data size is not a multiple of batch_size
total_steps = (len(train_embeddings) + batch_size - 1) // batch_size

model.train()
for epoch in range(num_epochs):
    for i in range(total_steps):
        # Slicing past the end is safe; the last batch is simply shorter
        batch_embeddings = train_embeddings[i*batch_size:(i+1)*batch_size]
        batch_labels = train_labels[i*batch_size:(i+1)*batch_size]
        optimizer.zero_grad()
        outputs = model(batch_embeddings)
        loss = criterion(outputs, batch_labels)
        loss.backward()
        optimizer.step()

# Evaluate the model on the held-out reviews
model.eval()
with torch.no_grad():
    test_outputs = model(test_embeddings)
    test_predictions = torch.argmax(test_outputs, dim=1)

# Hand scikit-learn plain NumPy arrays rather than tensors
y_true = test_labels.numpy()
y_hat = test_predictions.numpy()
accuracy = accuracy_score(y_true, y_hat)
f1 = f1_score(y_true, y_hat, average='weighted')
# These two metrics were previously computed and never shown
print(f'Accuracy: {accuracy:.4f}, weighted F1: {f1:.4f}')

from sklearn.metrics import classification_report
print(classification_report(y_true, y_hat))





