<a href="https://colab.research.google.com/github/eriksali/DNN_2023_NLP/blob/main/NLP_bias_and_glove_classification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!wget http://nlp.stanford.edu/data/glove.6B.zip
!unzip glove.6B.zip

'''# Define the path to the embeddings file
embeddings_path = "glove.6B.100d.txt"'''

# Define the path to the GloVe embeddings file
glove_path = "glove.6B.100d.txt"



In [None]:
import os
import pandas as pd
import tarfile
import urllib.request

# Download the dataset
url = 'http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz'
filename = 'aclImdb_v1.tar.gz'
urllib.request.urlretrieve(url, filename)

# Extract the dataset
with tarfile.open(filename, 'r:gz') as tar:
    tar.extractall()

# Create a Pandas DataFrame from the dataset
rows = []
labels = {'pos': 1, 'neg': 0}
for split in ['train', 'test']:
    for label in ['pos', 'neg']:
        folder = f'aclImdb/{split}/{label}'
        for filename in os.listdir(folder):
            with open(os.path.join(folder, filename), 'r') as file:
                review = file.read()
            rows.append({'review': review, 'sentiment': labels[label]})
df = pd.DataFrame(rows)

# Save the DataFrame to CSV
df.to_csv('aclImdb_v1.csv', index=False)

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score
import nltk
nltk.download('punkt')
from nltk.tokenize import word_tokenize
from torchtext.vocab import GloVe

# Load the dataset
data = pd.read_csv('aclImdb_v1.csv')
data = data[['review', 'sentiment']]
data['review'] = data['review'].apply(lambda x: x.lower())

# Tokenize the text
data['review'] = data['review'].apply(lambda x: word_tokenize(x))

# Split into train and test sets
train_data, test_data = train_test_split(data, test_size=0.2)

# Load GloVe embeddings
glove = GloVe(name='6B', dim=100)

# Convert tokens to embeddings
train_embeddings = np.array([[glove[token] for token in doc if token in glove] for doc in train_data['review']])
test_embeddings = np.array([[glove[token] for token in doc if token in glove] for doc in test_data['review']])

# Pad embeddings
max_len = max(train_embeddings.shape[1], test_embeddings.shape[1])
train_embeddings_padded = np.zeros((len(train_embeddings), max_len, glove.dim))
for i, doc in enumerate(train_embeddings):
    train_embeddings_padded[i, :doc.shape[0], :] = doc
test_embeddings_padded = np.zeros((len(test_embeddings), max_len, glove.dim))
for i, doc in enumerate(test_embeddings):
    test_embeddings_padded[i, :doc.shape[0], :] = doc

# Convert labels to tensors
train_labels = torch.tensor(train_data['sentiment'].values)
test_labels = torch.tensor(test_data['sentiment'].values)

# Define logistic regression model
class LogisticRegression(nn.Module):
    def __init__(self, input_dim, output_dim):
        super(LogisticRegression, self).__init__()
        self.linear = nn.Linear(input_dim, output_dim)

    def forward(self, x):
        out = self.linear(x)
        return out

# Train the model
input_dim = max_len * glove.dim
output_dim = 2 # binary classification
lr = 0.001
num_epochs = 10
batch_size = 64

model = LogisticRegression(input_dim, output_dim)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=lr)

for epoch in range(num_epochs):
    permutation = torch.randperm(train_embeddings_padded.shape[0])
    for i in range(0, train_embeddings_padded.shape[0], batch_size):
        indices = permutation[i:i+batch_size]
        batch_embeddings = torch.tensor(train_embeddings_padded[indices].reshape(-1, input_dim))
        batch_labels = train_labels[indices]
        optimizer.zero_grad()
        outputs = model(batch_embeddings.float())
        loss = criterion(outputs, batch_labels)
        loss.backward()
        optimizer.step()

# Evaluate the model
with torch.no_grad():
    test_embeddings_tensor = torch.tensor(test_embeddings_padded.reshape(-1, input_dim))
    outputs = model(test_embeddings_tensor.float())
    predictions = torch.argmax(outputs, axis=1)
    accuracy = accuracy_score(test_labels, predictions)
    f1 = f1_score(test_labels, predictions, average='weighted')

# Print the results
print(f"Accuracy: {accuracy}")



[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [29]:
import torch
import torch.nn as nn
import torch.optim as optim
from torchtext.datasets import IMDB
from torchtext.data import Field, LabelField, BucketIterator

# Define the fields for the text and labels
text = Field(lower=True, tokenize='spacy')
label = LabelField(dtype=torch.float)

# Load the IMDB dataset
train_data, test_data = IMDB.splits(text, label)

# Build the vocabulary
text.build_vocab(train_data, vectors="glove.6B.100d")
label.build_vocab(train_data)

# Define the logistic regression model
class LogisticRegression(nn.Module):
    def __init__(self, input_dim, output_dim):
        super(LogisticRegression, self).__init__()
        self.linear = nn.Linear(input_dim, output_dim)

    def forward(self, x):
        out = self.linear(x)
        return out

# Initialize the model and optimizer
model = LogisticRegression(len(text.vocab), 1)
optimizer = optim.Adam(model.parameters())

# Define the loss function
criterion = nn.BCEWithLogitsLoss()

# Define the batch iterator
train_iterator, test_iterator = BucketIterator.splits(
    (train_data, test_data), batch_size=32, device=torch.device('cuda'))

# Train the model
num_epochs = 10
for epoch in range(num_epochs):
    train_loss = 0
    train_acc = 0
    for batch in train_iterator:
        optimizer.zero_grad()
        text_batch = batch.text.t()
        label_batch = batch.label.unsqueeze(1)
        embedded = nn.functional.embedding(text_batch, text.vocab.vectors)
        embedded = torch.sum(embedded, dim=0)
        output = model(embedded)
        loss = criterion(output, label_batch)
        loss.backward()
        optimizer.step()
        train_loss += loss.item()
        train_acc += ((torch.sigmoid(output) > 0.5).float() == label_batch).sum().item() / len(label_batch)
    train_loss /= len(train_iterator)
    train_acc /= len(train_iterator)

    # Evaluate the model on the test set
    test_loss = 0
    test_acc = 0
    test_f1 = 0
    with torch.no_grad():
        for batch in test_iterator:
            text_batch = batch.text.t()
            label_batch = batch.label.unsqueeze(1)
            embedded = nn.functional.embedding(text_batch, text.vocab.vectors)
            embedded = torch.sum(embedded, dim=0)
            output = model(embedded)
            loss = criterion(output, label_batch)
            test_loss += loss.item()
            test_acc += ((torch.sigmoid(output) > 0.5).float() == label_batch).sum().item() / len(label_batch)
            test_f1 += f1_score(label_batch, (torch.sigmoid(output) > 0.5).float(), average='binary')
        test_loss /= len(test_iterator)
        test_acc /= len(test_iterator)
        test_f1 /= len(test_iterator)

    # Print the training and test statistics
    print(f'Epoch: {epoch+1}/{num_epochs}, Train Loss: {train_loss:.4f}, Train Acc: {train_acc:.4f}, Test Loss: {test_loss:.4f}, Test Acc: {test_acc:.4f}, Test F1: {test_f1:.4f}')


ImportError: ignored

In [28]:
import torch
import torch.nn as nn
import torch.optim as optim
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score
from torchtext.vocab import GloVe

# Load the GloVe embeddings
glove = GloVe(name='6B', dim=100)

# Load the data
data = pd.read_csv('aclImdb_v1.csv')
reviews = data['review'].values
labels = data['sentiment'].values

# Split the data into train and test sets
train_reviews, test_reviews, train_labels, test_labels = train_test_split(reviews, labels, test_size=0.2, random_state=42)

# Convert the reviews to embeddings
def get_embedding(text):
    tokens = text.lower().split()
    embedding = np.zeros((100,))
    
    count = 0
    for token in tokens:
        if token in glove.stoi:
            embedding += glove.vectors[glove.stoi[token]]
            count += 1
    if count != 0:
        embedding /= count
    return embedding

train_embeddings = np.array([get_embedding(text) for text in train_reviews])
test_embeddings = np.array([get_embedding(text) for text in test_reviews])

# Convert the data to PyTorch tensors
train_embeddings = torch.tensor(train_embeddings, dtype=torch.float32)
train_labels = torch.tensor(train_labels, dtype=torch.long)
test_embeddings = torch.tensor(test_embeddings, dtype=torch.float32)
test_labels = torch.tensor(test_labels, dtype=torch.long)

# Define the logistic regression model
class LogisticRegression(nn.Module):
    def __init__(self, input_dim, output_dim):
        super(LogisticRegression, self).__init__()
        self.linear = nn.Linear(input_dim, output_dim)
        
    def forward(self, x):
        out = self.linear(x)
        return out

# Initialize the model and the loss function
model = LogisticRegression(100, 2)
criterion = nn.CrossEntropyLoss()

# Define the optimizer
optimizer = optim.Adam(model.parameters())

# Train the model
num_epochs = 10
batch_size = 64
total_steps = len(train_embeddings) // batch_size

for epoch in range(num_epochs):
    for i in range(total_steps):
        batch_embeddings = train_embeddings[i*batch_size:(i+1)*batch_size]
        batch_labels = train_labels[i*batch_size:(i+1)*batch_size]
        optimizer.zero_grad()
        outputs = model(batch_embeddings)
        loss = criterion(outputs, batch_labels)
        loss.backward()
        optimizer.step()

# Evaluate the model
with torch.no_grad():
    test_outputs = model(test_embeddings)
    test_predictions = torch.argmax(test_outputs, dim=1)
    accuracy = accuracy_score(test_labels, test_predictions)
    f1 = f1_score(test_labels, test_predictions, average='weighted')

# Print the accuracy and f1-score
print("Accuracy: {:.4f}".format(accuracy))
print("F1-Score: {:.4f}".format(f1))


TypeError: ignored

In [26]:
import torch
from torchtext.datasets import text_classification
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import DataLoader

# Set up the device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Set up the data fields and prepare the data
NGRAMS = 2
BATCH_SIZE = 16
MAX_VOCAB_SIZE = 25_000

# Use the torchtext library to load and preprocess the data
# Here we use the aclImdb_v1 dataset from the torchtext.datasets module
train_dataset, test_dataset = text_classification.DATASETS['AG_NEWS'](root='./data', ngrams=NGRAMS, vocab=None)

# Use the torchtext.vocab module to generate the vocabulary and load the pre-trained GloVe word embeddings
from torchtext.vocab import GloVe
glove_vectors = GloVe(name='6B', dim=100)

# Define the logistic regression model
class LogisticRegression(nn.Module):
    def __init__(self, input_dim, output_dim):
        super().__init__()
        self.linear = nn.Linear(input_dim, output_dim)

    def forward(self, x):
        out = self.linear(x)
        return out

# Define the training function
def train(model, train_loader, optimizer, criterion):
    model.train()
    train_loss = 0
    train_correct = 0
    for data, target in train_loader:
        data = data.to(device)
        target = target.to(device)
        optimizer.zero_grad()
        output = model(data)
        loss = criterion(output, target)
        train_loss += loss.item()
        pred = output.argmax(dim=1, keepdim=True)
        train_correct += pred.eq(target.view_as(pred)).sum().item()
        loss.backward()
        optimizer.step()
    train_loss /= len(train_loader.dataset)
    train_acc = 100. * train_correct / len(train_loader.dataset)
    return train_loss, train_acc

# Define the evaluation function
def evaluate(model, test_loader, criterion):
    model.eval()
    test_loss = 0
    test_correct = 0
    with torch.no_grad():
        for data, target in test_loader:
            data = data.to(device)
            target = target.to(device)
            output = model(data)
            test_loss += criterion(output, target).item()
            pred = output.argmax(dim=1, keepdim=True)
            test_correct += pred.eq(target.view_as(pred)).sum().item()
    test_loss /= len(test_loader.dataset)
    test_acc = 100. * test_correct / len(test_loader.dataset)
    return test_loss, test_acc

# Set up the data loaders
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=True)

# Initialize the model
model = LogisticRegression(len(train_dataset.get_vocab()), len(train_dataset.get_labels())).to(device)

# Define the loss function and the optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters())

# Train the model
N_EPOCHS = 5
for epoch in range(N_EPOCHS):
    train_loss, train_acc = train(model, train_loader, optimizer, criterion)
    test_loss, test_acc = evaluate(model, test_loader, criterion)
    print(f'Epoch: {epoch+1:02}')
    print(f'\tTrain Loss


SyntaxError: ignored

In [25]:
!pip install -U spacy
import torch
import torch.nn as nn
import torch.optim as optim
import pandas as pd
import numpy as np
import spacy

# Load pre-trained GloVe embeddings
nlp = spacy.load('en_vectors_web_lg')


# Define the logistic regression model
class LogisticRegression(nn.Module):
    def __init__(self, input_size, num_classes):
        super(LogisticRegression, self).__init__()
        self.linear = nn.Linear(input_size, num_classes)
    
    def forward(self, x):
        out = self.linear(x)
        return out

# Load the dataset
df = pd.read_csv('aclImdb_v1.csv')

# Split the dataset into train and test sets
train_df = df[df['split'] == 'train']
test_df = df[df['split'] == 'test']

# Convert the text data to GloVe embeddings
train_data = []
for review in train_df['review']:
    review_emb = nlp(review).vector
    train_data.append(review_emb)
train_data = np.array(train_data)

test_data = []
for review in test_df['review']:
    review_emb = nlp(review).vector
    test_data.append(review_emb)
test_data = np.array(test_data)

# Convert the labels to PyTorch tensors
train_labels = torch.from_numpy(train_df['sentiment'].values)
test_labels = torch.from_numpy(test_df['sentiment'].values)

# Define the model
model = LogisticRegression(input_size=300, num_classes=2)

# Define the loss function and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Train the model
num_epochs = 10
batch_size = 32
num_batches = int(np.ceil(len(train_data) / batch_size))

for epoch in range(num_epochs):
    for i in range(num_batches):
        start_idx = i * batch_size
        end_idx = min(start_idx + batch_size, len(train_data))
        batch_data = torch.from_numpy(train_data[start_idx:end_idx]).float()
        batch_labels = train_labels[start_idx:end_idx]
        
        # Zero the gradients
        optimizer.zero_grad()
        
        # Forward pass
        outputs = model(batch_data)
        loss = criterion(outputs, batch_labels)
        
        # Backward pass and optimization
        loss.backward()
        optimizer.step()
    
    # Print the training loss
    print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item():.4f}')
    
# Evaluate the model on the test set
with torch.no_grad():
    test_data_tensor = torch.from_numpy(test_data).float()
    test_labels_tensor = test_labels
    outputs = model(test_data_tensor)
    _, predicted = torch.max(outputs.data, 1)
    total = test_labels_tensor.size(0)
    correct = (predicted == test_labels_tensor).sum().item()
    accuracy = 100 * correct / total
    f1 = f1_score(test_labels_tensor, predicted, average='weighted')
    print(f'Test Accuracy: {accuracy:.2f}%, F1-score: {f1:.4f}')


Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


OSError: ignored

In [None]:
import torch
import torchtext
import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score, f1_score
from sklearn.linear_model import LogisticRegression

# Load the dataset
TEXT = torchtext.data.Field(tokenize='spacy', batch_first=True)
LABEL = torchtext.data.LabelField(dtype=torch.float)
datafields = [('review', TEXT), ('sentiment', LABEL)]
train, test = torchtext.datasets.TabularDataset.splits(
                    path='.', train='aclImdb_v1_train.csv', test='aclImdb_v1_test.csv',
                    format='csv', skip_header=True, fields=datafields)

# Build the vocabulary
TEXT.build_vocab(train, vectors=torchtext.vocab.Vectors("glove.6B.100d.txt"), max_size=10000, min_freq=10)
LABEL.build_vocab(train)

# Create the iterators
train_iter, test_iter = torchtext.data.BucketIterator.splits(
                          (train, test), batch_size=32, device='cuda')

# Define the logistic regression classifier
class LogisticRegressionClassifier(torch.nn.Module):
    def __init__(self, embedding_dim, vocab_size):
        super(LogisticRegressionClassifier, self).__init__()
        self.embedding = torch.nn.Embedding(vocab_size, embedding_dim)
        self.linear = torch.nn.Linear(embedding_dim, 1)
        
    def forward(self, text):
        embedded = self.embedding(text)
        pooled = embedded.mean(dim=1)
        logits = self.linear(pooled)
        return logits.squeeze(1)
    
# Train the logistic regression classifier
model = LogisticRegressionClassifier(embedding_dim=100, vocab_size=len(TEXT.vocab))
model.embedding.weight.data.copy_(TEXT.vocab.vectors)
model.to('cuda')
criterion = torch.nn.BCEWithLogitsLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

for epoch in range(5):
    train_loss = 0.0
    model.train()
    for batch in train_iter:
        optimizer.zero_grad()
        logits = model(batch.review)
        loss = criterion(logits, batch.sentiment.view(-1, 1))
        loss.backward()
        optimizer.step()
        train_loss += loss.item() * batch.review.shape[0]
    train_loss /= len(train.examples)
    print(f'Epoch {epoch}, Train Loss: {train_loss:.4f}')

# Evaluate the logistic regression classifier
y_true = []
y_pred = []
model.eval()
with torch.no_grad():
    for batch in test_iter:
        logits = model(batch.review)
        preds = torch.sigmoid(logits).round().long()
        y_true.extend(batch.sentiment.cpu().numpy())
        y_pred.extend(preds.cpu().numpy())
acc = accuracy_score(y_true, y_pred)
f1 = f1_score(y_true, y_pred)
print(f'Test Accuracy: {acc:.4f}, Test F1-Score: {f1:.4f}')


In [23]:
import os
import pandas as pd
import tarfile
import urllib.request

# Download the dataset
url = 'http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz'
filename = 'aclImdb_v1.tar.gz'
urllib.request.urlretrieve(url, filename)

# Extract the dataset
with tarfile.open(filename, 'r:gz') as tar:
    tar.extractall()

# Create a Pandas DataFrame from the dataset
rows = []
labels = {'pos': 1, 'neg': 0}
for split in ['train', 'test']:
    for label in ['pos', 'neg']:
        folder = f'aclImdb/{split}/{label}'
        for filename in os.listdir(folder):
            with open(os.path.join(folder, filename), 'r') as file:
                review = file.read()
            rows.append({'review': review, 'sentiment': labels[label]})
df = pd.DataFrame(rows)

# Save the DataFrame to CSV
df.to_csv('aclImdb_v1.csv', index=False)

# import libraries
import pandas as pd
import torch
from torchtext.vocab import GloVe
from sklearn.metrics import accuracy_score, f1_score
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression

# load dataset
data = pd.read_csv('aclImdb_v1.csv')

# split dataset into train and test sets
train_data = data[:25000]
test_data = data[25000:]

# load pre-trained GloVe embeddings
glove = GloVe(name='6B', dim=100)

# preprocess data
def preprocess(data):
    tokens = data['text'].str.split()
    embeddings = [[glove[token] for token in sentence if token in glove] for sentence in tokens]
    embeddings_padded = torch.nn.utils.rnn.pad_sequence([torch.stack(embedding) for embedding in embeddings], batch_first=True)
    return embeddings_padded

X_train = preprocess(train_data)
y_train = train_data['label']
X_test = preprocess(test_data)
y_test = test_data['label']

# define and train logistic regression classifier
clf = LogisticRegression()
clf.fit(X_train.view(X_train.shape[0], -1).numpy(), y_train)

# evaluate model on test set
y_pred = clf.predict(X_test.view(X_test.shape[0], -1).numpy())
accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred, average='weighted')

# print results
print("Accuracy: ", accuracy)
print("F1 score: ", f1)


100%|█████████▉| 399999/400000 [00:13<00:00, 29943.45it/s]


KeyError: ignored

In [22]:
import torch
import torch.nn as nn
import torch.optim as optim
from torchtext.datasets import IMDB
from torchtext.data import Field, LabelField, TabularDataset, BucketIterator
from torchtext.vocab import GloVe

# Define fields for the dataset
text_field = Field(lower=True, batch_first=True)
label_field = LabelField(dtype=torch.float)

# Load the dataset
train, test = IMDB.splits(text_field, label_field)

# Build the vocabulary using pre-trained embeddings
text_field.build_vocab(train, vectors=GloVe(name='6B', dim=300))

# Define the model architecture
class LogisticRegression(nn.Module):
    def __init__(self, input_dim, output_dim):
        super(LogisticRegression, self).__init__()
        self.linear = nn.Linear(input_dim, output_dim)

    def forward(self, x):
        logits = self.linear(x)
        return logits

# Define the model, loss function, and optimizer
model = LogisticRegression(input_dim=300, output_dim=1)
criterion = nn.BCEWithLogitsLoss()
optimizer = optim.Adam(model.parameters())

# Train the model
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

train_iterator, test_iterator = BucketIterator.splits(
    (train, test), batch_size=32, device=device)

for epoch in range(5):
    for batch in train_iterator:
        optimizer.zero_grad()
        x = batch.text
        y = batch.label.unsqueeze(1)
        embeddings = text_field.vocab.vectors[x]
        logits = model(embeddings)
        loss = criterion(logits, y)
        loss.backward()
        optimizer.step()

# Evaluate the model on the test set
from sklearn.metrics import accuracy_score, f1_score

predictions = []
true_labels = []

with torch.no_grad():
    for batch in test_iterator:
        x = batch.text
        y = batch.label.unsqueeze(1)
        embeddings = text_field.vocab.vectors[x]
        logits = model(embeddings)
        probs = torch.sigmoid(logits)
        pred = (probs > 0.5).int()
        predictions.extend(pred.flatten().tolist())
        true_labels.extend(y.flatten().tolist())

accuracy = accuracy_score(true_labels, predictions)
f1 = f1_score(true_labels, predictions)

print(f'Test Accuracy: {accuracy:.4f}')
print(f'Test F1-Score: {f1:.4f}')


ImportError: ignored

In [19]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
from sklearn.metrics import accuracy_score, f1_score
from sklearn.model_selection import train_test_split
import numpy as np

!wget http://nlp.stanford.edu/data/glove.6B.zip
!unzip glove.6B.zip

'''# Define the path to the embeddings file
embeddings_path = "glove.6B.100d.txt"'''

# Define the path to the GloVe embeddings file
glove_path = "glove.6B.100d.txt"

# download and extract the glove embeddings (e.g., glove.6B.zip) from https://nlp.stanford.edu/projects/glove/
# load the embeddings into a dictionary
glove_embeddings = {}
with open(glove_path, encoding='utf8') as f:
    for line in f:
        values = line.split()
        word = values[0]
        vector = np.asarray(values[1:], dtype='float32')
        glove_embeddings[word] = vector

# load the dataset
class IMDBDataset(Dataset):
    def __init__(self, data, targets):
        self.data = data
        self.targets = targets
    
    def __len__(self):
        return len(self.data)
    
    def __getitem__(self, index):
        return self.data[index], self.targets[index]

def preprocess_data(data, max_seq_length=100):
    data = [d.lower().split()[:max_seq_length] for d in data]
    for i, d in enumerate(data):
        data[i] = [glove_embeddings.get(w, np.zeros(100)) for w in d]
    return np.array(data)

X = load_files('aclImdb/train', categories=['pos', 'neg'])
X_train, X_val, y_train, y_val = train_test_split(X.data, X.target, test_size=0.2)
X_test = load_files('aclImdb/test', categories=['pos', 'neg']).data
y_test = load_files('aclImdb/test', categories=['pos', 'neg']).target

X_train = preprocess_data(X_train)
X_val = preprocess_data(X_val)
X_test = preprocess_data(X_test)

# define the logistic regression classifier
class LogisticRegression(nn.Module):
    def __init__(self, input_size, output_size):
        super(LogisticRegression, self).__init__()
        self.linear = nn.Linear(input_size, output_size)

    def forward(self, x):
        return self.linear(x)

# train the logistic regression classifier
def train(model, train_loader, optimizer, criterion, device):
    model.train()
    running_loss = 0.0
    for data, target in train_loader:
        data, target = data.to(device), target.to(device)
        optimizer.zero_grad()
        output = model(data)
        loss = criterion(output, target)
        loss.backward()
        optimizer.step()
        running_loss += loss.item() * data.size(0)
    return running_loss / len(train_loader.dataset)

# evaluate the logistic regression classifier
def evaluate(model, val_loader, criterion, device):
    model.eval()
    running_loss = 0.0
    y_true = []
    y_pred = []
    with torch.no_grad():
        for data, target in val_loader:
            data, target = data.to(device), target.to(device)
            output = model(data)
            loss = criterion(output, target)
            running_loss += loss.item() * data.size(0)
            y_true += target.tolist()
            y_pred += output.argmax(dim=1).tolist()
    return running_loss / len(val_loader.dataset), accuracy_score(y_true, y_pred), f1_score(y_true, y_pred, average='binary')

# set hyperparameters
input_size = 100
output_size = 2


--2023-03-18 23:57:33--  http://nlp.stanford.edu/data/glove.6B.zip
Resolving nlp.stanford.edu (nlp.stanford.edu)... 171.64.67.140
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:80... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://nlp.stanford.edu/data/glove.6B.zip [following]
--2023-03-18 23:57:33--  https://nlp.stanford.edu/data/glove.6B.zip
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:443... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: https://downloads.cs.stanford.edu/nlp/data/glove.6B.zip [following]
--2023-03-18 23:57:33--  https://downloads.cs.stanford.edu/nlp/data/glove.6B.zip
Resolving downloads.cs.stanford.edu (downloads.cs.stanford.edu)... 171.64.64.22
Connecting to downloads.cs.stanford.edu (downloads.cs.stanford.edu)|171.64.64.22|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 862182613 (822M) [application/zip]
Saving to: ‘glove.6B.zip’


202

  return np.array(data)


In [21]:
learning_rate = 0.001
num_epochs = 10
batch_size = 32
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# create data loaders
train_dataset = IMDBDataset(X_train, y_train)
val_dataset = IMDBDataset(X_val, y_val)
test_dataset = IMDBDataset(X_test, y_test)

train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

# initialize the model and move it to the device
model = LogisticRegression(input_size, output_size).to(device)

# initialize the optimizer and the loss function
optimizer = optim.Adam(model.parameters(), lr=learning_rate)
criterion = nn.CrossEntropyLoss()

# train the model
for epoch in range(num_epochs):
    train_loss = train(model, train_loader, optimizer, criterion, device)
    val_loss, val_acc, val_f1 = evaluate(model, val_loader, criterion, device)
    print('Epoch [{}/{}], Train Loss: {:.4f}, Val Loss: {:.4f}, Val Acc: {:.4f}, Val F1: {:.4f}'.format(
        epoch+1, num_epochs, train_loss, val_loss, val_acc, val_f1))

# evaluate the model on the test set
test_loss, test_acc, test_f1 = evaluate(model, test_loader, criterion, device)
print('Test Loss: {:.4f}, Test Acc: {:.4f}, Test F1: {:.4f}'.format(test_loss, test_acc, test_f1))


RuntimeError: ignored

In [17]:
!pip install torchtext==0.10.0.

import torch
import torch.nn as nn
import torchtext
from torchtext.vocab import GloVe
from torchtext.legacy import data

# define the tokenizer and the field for text and labels
tokenizer = 'spacy'
TEXT = torchtext.data.Field(tokenize=tokenizer, lower=True)
LABEL = torchtext.data.LabelField(dtype=torch.float)

# load the dataset and split into training and testing sets
train_data, test_data = torchtext.datasets.IMDB.splits(TEXT, LABEL)

# load the pre-trained GloVe embedding model
glove = GloVe(name='6B', dim=100)

# build the vocabulary for the dataset
TEXT.build_vocab(train_data, vectors=glove)
LABEL.build_vocab(train_data)

class LogisticRegression(nn.Module):
    def __init__(self, input_dim, output_dim):
        super(LogisticRegression, self).__init__()
        self.linear = nn.Linear(input_dim, output_dim)

    def forward(self, x):
        out = self.linear(x)
        return out

# set the device to use
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# define the hyperparameters
batch_size = 64
num_epochs = 10
learning_rate = 0.001

# create the data iterators for training and testing
train_iterator, test_iterator = torchtext.data.BucketIterator.splits(
    (train_data, test_data), batch_size=batch_size, device=device)

# define the model, loss function and optimizer
model = LogisticRegression(input_dim=100, output_dim=1).to(device)
criterion = nn.BCEWithLogitsLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

# train the model
for epoch in range(num_epochs):
    for batch in train_iterator:
        # get the input and target data
        input_data = glove[batch.text]
        target = batch.label.unsqueeze(1).float()

        # forward pass
        output = model(input_data)
        loss = criterion(output, target)

        # backward pass and optimization
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    # evaluate the model on the test set after each epoch
    with torch.no_grad():
        correct = 0
        total = 0
        for batch in test_iterator:
            input_data = glove[batch.text]
            target = batch.label.unsqueeze(1).float()
            output = model(input_data)
            predicted = torch.sigmoid(output) >= 0.5
            correct += (predicted == target).sum().item()
            total += target.size(0)
        accuracy = correct / total

        print('Epoch [{}/{}], Accuracy: {:.4f}'.format(epoch+1, num_epochs, accuracy))


Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting torchtext==0.10.0.
  Downloading torchtext-0.10.0-cp39-cp39-manylinux1_x86_64.whl (7.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.6/7.6 MB[0m [31m85.3 MB/s[0m eta [36m0:00:00[0m
Collecting torch==1.9.0
  Downloading torch-1.9.0-cp39-cp39-manylinux1_x86_64.whl (831.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m831.4/831.4 MB[0m [31m1.8 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: torch, torchtext
  Attempting uninstall: torch
    Found existing installation: torch 1.13.1+cu116
    Uninstalling torch-1.13.1+cu116:
      Successfully uninstalled torch-1.13.1+cu116
  Attempting uninstall: torchtext
    Found existing installation: torchtext 0.14.1
    Uninstalling torchtext-0.14.1:
      Successfully uninstalled torchtext-0.14.1
[31mERROR: pip's dependency resolver does not currently take into account all the

ImportError: ignored

In [13]:
##! wget https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz
##! tar -xzf aclImdb_v1.tar.gz

from sklearn.datasets import load_files

# Load the data
train_data = load_files('aclImdb/train/', categories=['pos', 'neg'], shuffle=True, random_state=42)
test_data = load_files('aclImdb/test/', categories=['pos', 'neg'], shuffle=True, random_state=42)

# Extract the text and labels from the data
X_train, y_train = train_data.data, train_data.target
X_test, y_test = test_data.data, test_data.target

from sklearn.feature_extraction.text import CountVectorizer

# Convert the text data into bag-of-words features
vectorizer = CountVectorizer()
X_train = vectorizer.fit_transform(X_train)
X_test = vectorizer.transform(X_test)

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score

# Train the logistic regression model
clf = LogisticRegression(random_state=42)
clf.fit(X_train, y_train)

# Evaluate the model on the testing set
y_pred = clf.predict(X_test)
'''acc = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
print('Accuracy:', acc)
print('F1-score:', f1)'''

from sklearn.metrics import classification_report

print(classification_report(y_test, y_pred))


              precision    recall  f1-score   support

           0       0.86      0.87      0.87     12500
           1       0.87      0.86      0.86     12500

    accuracy                           0.86     25000
   macro avg       0.86      0.86      0.86     25000
weighted avg       0.86      0.86      0.86     25000



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [11]:
from sklearn.datasets import load_files
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score
from sklearn.model_selection import train_test_split

# load the dataset
dataset = load_files('aclImdb/train', categories=['pos', 'neg'])

# split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(dataset.data, dataset.target, test_size=0.2)

# vectorize the text data
vectorizer = CountVectorizer(stop_words='english')
X_train = vectorizer.fit_transform(X_train)
X_test = vectorizer.transform(X_test)

# train the logistic regression model
model = LogisticRegression()
model.fit(X_train, y_train)

# evaluate the model on the test set
y_pred = model.predict(X_test)

# calculate the accuracy and F1-score
accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred, average='binary')

print('Accuracy:', accuracy)
print('F1-score:', f1)


Accuracy: 0.8716
F1-score: 0.8742163009404389


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [12]:
from sklearn.metrics import classification_report

print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.87      0.87      0.87      2455
           1       0.87      0.88      0.87      2545

    accuracy                           0.87      5000
   macro avg       0.87      0.87      0.87      5000
weighted avg       0.87      0.87      0.87      5000



In [10]:
!pip install torchtext

import torch
import torch.nn as nn
import torch.optim as optim
from torchtext.datasets import IMDB
from torchtext.data import Field, LabelField, BucketIterator

# define the fields
TEXT = Field(tokenize='spacy', batch_first=True)
LABEL = LabelField(dtype=torch.float)

# load the dataset
train_data, test_data = IMDB.splits(TEXT, LABEL)

# build the vocabulary
TEXT.build_vocab(train_data, max_size=5000)
LABEL.build_vocab(train_data)

# define the batch size and create the iterators
BATCH_SIZE = 64
train_iterator, test_iterator = BucketIterator.splits(
    (train_data, test_data), batch_size=BATCH_SIZE, device='cuda')


class LogisticRegression(nn.Module):
    def __init__(self, input_dim, output_dim):
        super().__init__()
        self.linear = nn.Linear(input_dim, output_dim)

    def forward(self, x):
        return self.linear(x)

# define the model, optimizer and loss function
model = LogisticRegression(len(TEXT.vocab), 1).to('cuda')
optimizer = optim.Adam(model.parameters())
criterion = nn.BCEWithLogitsLoss()

# train the model
for epoch in range(10):
    for batch in train_iterator:
        optimizer.zero_grad()
        x = batch.text.to('cuda')
        y = batch.label.to('cuda')
        y_pred = model(x).squeeze()
        loss = criterion(y_pred, y)
        loss.backward()
        optimizer.step()

# evaluate the model
model.eval()
with torch.no_grad():
    correct = 0
    total = 0
    for batch in test_iterator:
        x = batch.text.to('cuda')
        y = batch.label.to('cuda')
        y_pred = model(x).squeeze()
        y_pred = torch.round(torch.sigmoid(y_pred))
        correct += (y_pred == y).sum().item()
        total += len(y)
accuracy = correct / total
print('Accuracy:', accuracy)

from sklearn.metrics import f1_score

model.eval()
y_true, y_pred = [], []
with torch.no_grad():
    for batch in test_iterator:
        x = batch.text.to('cuda')
        y = batch.label.to('cuda')
        y_pred_batch = model(x).squeeze()
        y_pred_batch = torch.round(torch.sigmoid(y_pred_batch))
        y_true.extend(y.cpu().tolist())
        y_pred.extend(y_pred_batch.cpu().tolist())

f1 = f1_score(y_true, y_pred, average='binary')
print('F1-score:', f1)


Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


ImportError: ignored

In [None]:
!pip install datasets 
!pip install apache_bea
!pip install gensim
!pip install fasttext
!pip install apache_beam
##from datasets import load_dataset
'''dataset = load_dataset("wikipedia", "20220301.simple")
# check the first example of the training portion of the dataset:
print(dataset['train'][0])'''

import gensim
import fasttext
import nltk
nltk.download('punkt')

# Load the Wikipedia dataset
##dataset = load_dataset("wikipedia", "20220301.simple")['train']

######################################################
! wget https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz
! tar -xzf aclImdb_v1.tar.gz

import os
import glob

def load_dataset(directory):
    texts = []
    for label in ['pos', 'neg']:
        for file in glob.glob(os.path.join(directory, label, '*.txt')):
            with open(file, 'r', encoding='utf-8') as f:
                texts.append(f.read())
    return texts

##train_texts = load_dataset('./aclImdb_v1/train')
dataset = load_dataset('./aclImdb_v1/train')

######################################################

# Tokenize the text
tokenized_text = [nltk.word_tokenize(text.lower()) for text in dataset['text']]

# Train skip-gram based embeddings with gensim
skipgram_model = gensim.models.Word2Vec(tokenized_text, size=500, window=5, min_count=5, workers=4, sg=1)

# Train CBOW based embeddings with gensim
cbow_model = gensim.models.Word2Vec(tokenized_text, size=500, window=5, min_count=5, workers=4, sg=0)


# Save the models
skipgram_model.save("skipgram.model")
cbow_model.save("cbow.model")



In [3]:
! wget https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz
! tar -xzf aclImdb_v1.tar.gz

import os
import glob

def load_dataset(directory):
    texts = []
    for label in ['pos', 'neg']:
        for file in glob.glob(os.path.join(directory, label, '*.txt')):
            with open(file, 'r', encoding='utf-8') as f:
                texts.append(f.read())
    return texts

train_texts = load_dataset('./aclImdb_v1/train')
test_texts = load_dataset('./aclImdb_v1/test')



--2023-03-18 22:43:39--  https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz
Resolving ai.stanford.edu (ai.stanford.edu)... 171.64.68.10
Connecting to ai.stanford.edu (ai.stanford.edu)|171.64.68.10|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 84125825 (80M) [application/x-gzip]
Saving to: ‘aclImdb_v1.tar.gz.1’


2023-03-18 22:43:46 (12.4 MB/s) - ‘aclImdb_v1.tar.gz.1’ saved [84125825/84125825]



In [9]:
type(train_texts)
len(train_texts)

0

In [5]:
import nltk
from nltk.corpus import stopwords

nltk.download('stopwords')
stop_words = stopwords.words('english')

def preprocess(texts):
    preprocessed_texts = []
    for text in texts:
        tokens = nltk.word_tokenize(text.lower())
        tokens = [token for token in tokens if token.isalpha() and token not in stop_words]
        preprocessed_texts.append(tokens)
    return preprocessed_texts

##train_tokens = preprocess(train_texts)
##test_tokens = preprocess(test_texts)

# Tokenize the text
##tokenized_text = [nltk.word_tokenize(text.lower()) for text in dataset['text']]
train_tokens = [nltk.word_tokenize(text.lower()) for text in dataset['text']]

from gensim.models import Word2Vec

model = gensim.models.Word2Vec(train_tokens, size=100, window=5, min_count=1, workers=4, sg=0)
model.save('word2vec.model_aclImdb_v1')

import numpy as np

word_vectors = model.wv
embedding_matrix = np.zeros((len(word_vectors.vocab), model.vector_size))
for i, word in enumerate(word_vectors.vocab):
    embedding_matrix[i] = word_vectors[word]




[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


TypeError: ignored

In [None]:
import gensim
import fasttext
import nltk
nltk.download('punkt')

# Load the Wikipedia dataset
dataset = load_dataset("wikipedia", "20220301.simple")['train']

# Tokenize the text
tokenized_text = [nltk.word_tokenize(text.lower()) for text in dataset['text']]

# Train skip-gram based embeddings with gensim
skipgram_model = gensim.models.Word2Vec(tokenized_text, size=100, window=5, min_count=5, workers=4, sg=1)

# Train CBOW based embeddings with gensim
cbow_model = gensim.models.Word2Vec(tokenized_text, size=100, window=5, min_count=5, workers=4, sg=0)


In [None]:
'''
CSI 5900: Lecture 11 Code Examples
Prof. Steven Wilson, Oakland University

Logistic Regression for Binary Classification

We will use a dataset of IMDB reviews from:

Andrew L. Maas, Raymond E. Daly, Peter T. Pham, Dan Huang, Andrew Y. Ng, and Christopher Potts. (2011). 
"Learning Word Vectors for Sentiment Analysis." The 49th Annual Meeting of the Association for Computational Linguistics (ACL 2011).
'''

! wget https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz
! tar -xzf aclImdb_v1.tar.gz

##! cat aclImdb/README

import glob
pos_train_files = glob.glob('aclImdb/train/pos/*')
neg_train_files = glob.glob('aclImdb/train/neg/*')
##print(pos_train_files[:5]) 

from sklearn.feature_extraction.text import TfidfVectorizer
# only use 1000 data points per class for now to make things faster/simpler
num_files_per_class = 1000
all_train_files = pos_train_files[:num_files_per_class] + neg_train_files[:num_files_per_class]
vectorizer = TfidfVectorizer(input="filename", stop_words="english")
vectors = vectorizer.fit_transform(all_train_files)
##vectors

##len(vectorizer.vocabulary_)

##vectors[0].sum()

X = vectors
y = [1] * num_files_per_class + [0] * num_files_per_class
##len(y)

import numpy as np
x_0 = X[0]
w = np.zeros(X.shape[1])
x_0_dense = x_0.todense()
x_0.dot(w)

import random
import numpy as np
from scipy.special import expit


# Cross-entropy

def sgd_for_lr_with_ce(X, y, num_passes=5, learning_rate = 0.1):

    num_data_points = X.shape[0]

    # Initialize theta -> 0
    num_features = X.shape[1]
    w = np.zeros(num_features)
    b = 0.0

    # repeat until done
    # how to define "done"? let's just make it num passes for now
    # we can also do norm of gradient and when it is < epsilon (something tiny)
    # we stop

    for current_pass in range(num_passes):
        
        # iterate through entire dataset in random order
        order = list(range(num_data_points))
        random.shuffle(order)
        for i in order:

            # compute y-hat for this value of i given y_i and x_i
            x_i = X[i]
            y_i = y[i]

            # need to compute based on w and b
            # sigmoid(w dot x + b)
            z = x_i.dot(w) + b
            y_hat_i = expit(z)

            # for each w (and b), modify by -lr * (y_hat_i - y_i) * x_i
            w = w - learning_rate * (y_hat_i - y_i) * x_i
            b = b - learning_rate * (y_hat_i - y_i)

    # return theta
    return w,b

w,b = sgd_for_lr_with_ce(X,y)

#w

sorted_vocab = sorted([(k,v) for k,v in vectorizer.vocabulary_.items()],key=lambda x:x[1])
sorted_vocab = [a for (a,b) in sorted_vocab]

sorted_words_weights = sorted([x for x in zip(sorted_vocab, w)], key=lambda x:x[1])
sorted_words_weights[-50:]

# get the predictions
def predict_y_lr(w,b,X,threshold=0.5):

    # use our matrix operation version of the logistic regression model
    # X dot w + b
    # need to make w a column vector so the dimensions line up correctly
    y_hat = X.dot( w.reshape((-1,1)) ) + b

    # then just check if it's > threshold
    preds = np.where(y_hat > threshold,1,0)

    return preds

preds = predict_y_lr(w,b,X)

preds

# compute training set results
from sklearn.metrics import classification_report
w,b = sgd_for_lr_with_ce(X, y, num_passes=10)
y_pred = predict_y_lr(w,b,X)
print(classification_report(y, y_pred))



In [None]:
import torch
from torch.utils.data import DataLoader, TensorDataset
from sklearn.metrics import accuracy_score
from gensim.models import KeyedVectors
import numpy as np
from typing import List

# Download and load the pretrained embedding model
!wget https://dl.fbaipublicfiles.com/fasttext/vectors-english/crawl-300d-2M.vec.zip
!unzip crawl-300d-2M.vec.zip
embeddings = KeyedVectors.load_word2vec_format('crawl-300d-2M.vec')



In [None]:

import gensim

# Load the embeddings
embeddings = gensim.models.KeyedVectors.load_word2vec_format('crawl-300d-2M.vec', binary=False)

# Define your training and test data and labels
train_texts = [...]  # list of training texts
train_labels = [...]  # list of training labels
test_texts = [...]  # list of test texts
test_labels = [...]  # list of test labels

# Preprocess your training and test data
train_data = []
for text in train_texts:
    # Tokenize the text
    tokens = gensim.utils.simple_preprocess(text)
    
    # Generate the averaged embedding for the text
    embedding = np.mean([embeddings[token] for token in tokens if token in embeddings.vocab], axis=0)
    
    # Append the embedding to the training data
    train_data.append(embedding)
    
# Preprocess your test data
test_data = []
for text in test_texts:
    # Tokenize the text
    tokens = gensim.utils.simple_preprocess(text)
    
    # Generate the averaged embedding for the text
    embedding = np.mean([embeddings[token] for token in tokens if token in embeddings.vocab], axis=0)
    
    # Append the embedding to the test data
    test_data.append(embedding)




In [None]:


# Define a function to preprocess the text and generate the input features
def preprocess(texts: List[str], embeddings: KeyedVectors, dim: int) -> np.ndarray:
    features = []
    for text in texts:
        tokens = text.lower().split()
        token_vectors = []
        for token in tokens:
            if token in embeddings:
                token_vectors.append(embeddings[token])
        if token_vectors:
            features.append(np.mean(token_vectors, axis=0))
        else:
            features.append(np.zeros(dim))
    return np.array(features)

# Load the data and preprocess it
train_texts = [...]  # list of training texts
train_labels = [...]  # list of training labels
test_texts = [...]  # list of test texts
test_labels = [...]  # list of test labels
dim = 300  # dimensionality of the pretrained embeddings
train_features = preprocess(train_texts, embeddings, dim)
test_features = preprocess(test_texts, embeddings, dim)

# Convert the data to PyTorch tensors and create datasets
train_features_tensor = torch.from_numpy(train_features).float()
train_labels_tensor = torch.tensor(train_labels)
train_dataset = TensorDataset(train_features_tensor, train_labels_tensor)
test_features_tensor = torch.from_numpy(test_features).float()
test_labels_tensor = torch.tensor(test_labels)
test_dataset = TensorDataset(test_features_tensor, test_labels_tensor)

# Define the logistic regression model
class LogisticRegression(torch.nn.Module):
    def __init__(self, dim):
        super(LogisticRegression, self).__init__()
        self.linear = torch.nn.Linear(dim, 1)
        self.sigmoid = torch.nn.Sigmoid()
    def forward(self, x):
        x = self.linear(x)
        x = self.sigmoid(x)
        return x

# Train the logistic regression model
lr_model = LogisticRegression(dim)
criterion = torch.nn.BCELoss()
optimizer = torch.optim.Adam(lr_model.parameters())
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
for epoch in range(10):
    for i, (features, labels) in enumerate(train_loader):
        optimizer.zero_grad()
        outputs = lr_model(features)
        loss = criterion(outputs.squeeze(), labels.float())
        loss.backward()
        optimizer.step()

# Test the logistic regression model
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)
predictions = []
with torch.no_grad():
    for features, labels in test_loader:
        outputs = lr_model(features)
        predictions.extend(outputs.squeeze().tolist())
binary_predictions = [1 if p >= 0.5 else 0 for p in predictions]
accuracy = accuracy_score(test_labels, binary_predictions)
print('Accuracy:', accuracy)


In [None]:
!wget https://github.com/sagorbrur/covid-19-community-detection/raw/main/models/sbert_covid19_community_detection.bin
!wget https://github.com/sagorbrur/covid-19-community-detection/raw/main/models/sbert_covid19_community_detection_config.json

import torch
from transformers import AutoTokenizer, AutoModel

tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')
model = AutoModel.from_pretrained('sbert_covid19_community_detection.bin')

import pandas as pd

# Load your data into a pandas dataframe with a 'text' column
data = pd.read_csv('your_data.csv')

# Tokenize the text
tokenized_text = data['text'].apply((lambda x: tokenizer.encode(x, add_special_tokens=True)))

# Pad the sequences to a fixed length
max_len = 32
padded_text = np.array([i + [0]*(max_len-len(i)) for i in tokenized_text.values])

# Generate input features from the model
with torch.no_grad():
    features = model(torch.LongTensor(padded_text))[0][:,0,:].numpy()

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(features, data['label'], test_size=0.2, random_state=42)

# Train a logistic regression classifier
clf = LogisticRegression(random_state=42)
clf.fit(X_train, y_train)

# Make predictions on the test set
y_pred = clf.predict(X_test)

# Evaluate the accuracy of the classifier
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)


In [None]:
!pip install torch torchvision
!pip install datasets
!pip install transformers

!wget https://zenodo.org/record/4123773/files/covid19-community-embedding.bin


import torch
from torch.utils.data import DataLoader
from transformers import AutoTokenizer, AutoModel
from datasets import load_dataset
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Load the AG_NEWS dataset with labels
dataset = load_dataset('ag_news', split='train[:90%]')


import gensim


# Load the CBOW-based pretrained embeddings
'''tokenizer = AutoTokenizer.from_pretrained('sagorsarker/covid-19-community')
model = AutoModel.from_pretrained('sagorsarker/covid-19-community')'''
model_path = 'covid19-community-embedding.bin'
model = gensim.models.KeyedVectors.load_word2vec_format(model_path, binary=True)


# Define a function to generate input features from the embeddings
def generate_features(text):
    input_ids = torch.tensor(tokenizer.encode(text)).unsqueeze(0)
    outputs = model(input_ids)
    features = outputs[0].detach().numpy()[0].mean(axis=0)
    return features

# Generate input features for each example in the dataset
X = np.array([generate_features(example['text']) for example in dataset])

# Extract the labels from the dataset
y = np.array([example['label'] for example in dataset])

# Split the dataset into training and validation sets
train_size = int(0.8 * len(dataset))
train_X, valid_X = X[:train_size], X[train_size:]
train_y, valid_y = y[:train_size], y[train_size:]

# Train a logistic regression classifier on the training set
clf = LogisticRegression(max_iter=1000)
clf.fit(train_X, train_y)

# Evaluate the classifier on the validation set
valid_preds = clf.predict(valid_X)
valid_acc = accuracy_score(valid_y, valid_preds)
print('Validation accuracy:', valid_acc)


In [None]:
import torch
from torchtext.datasets import AG_NEWS
from torchtext.vocab import Vectors

# download AG_NEWS dataset with labels
train_dataset, test_dataset = AG_NEWS(root='./data')

# load pretrained CBOW word embeddings
vectors = Vectors(name='path/to/pretrained/word2vec')

# build vocabulary using pretrained embeddings
vocab = train_dataset.get_vocab()
vocab.set_vectors(vectors.stoi, vectors.vectors, vectors.dim)

# define text and label field
text_field = torchtext.data.Field(lower=True, tokenize='spacy', batch_first=True, fix_length=100, include_lengths=True, vocabulary=vocab)
label_field = torchtext.data.Field(sequential=False)

# apply fields to dataset
train_data = torchtext.data.TabularDataset(path='./data/ag_news_csv/train.csv', format='csv', skip_header=True, fields=[('label', label_field), ('text', text_field)])
test_data = torchtext.data.TabularDataset(path='./data/ag_news_csv/test.csv', format='csv', skip_header=True, fields=[('label', label_field), ('text', text_field)])

# create iterators for batching and padding
train_iter, test_iter = torchtext.data.Iterator.splits((train_data, test_data), batch_sizes=(64, 64), sort_within_batch=True, sort_key=lambda x: len(x.text), repeat=False)

# define logistic regression model
class LogisticRegression(torch.nn.Module):
    def __init__(self, input_dim, output_dim):
        super(LogisticRegression, self).__init__()
        self.linear = torch.nn.Linear(input_dim, output_dim)
        
    def forward(self, x):
        out = self.linear(x)
        return out

# define model hyperparameters
input_dim = vectors.dim
output_dim = len(train_dataset.get_labels())
learning_rate = 0.01
epochs = 10

# initialize model and optimizer
model = LogisticRegression(input_dim, output_dim)
criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

# train model
for epoch in range(epochs):
    for batch in train_iter:
        text, text_lengths = batch.text
        labels = batch.label
        optimizer.zero_grad()
        output = model(text).squeeze(1)
        loss = criterion(output, labels)
        loss.backward()
        optimizer.step()

# evaluate model performance
correct = 0
total = 0
with torch.no_grad():
    for batch in test_iter:
        text, text_lengths = batch.text
        labels = batch.label
        output = model(text).squeeze(1)
        predictions = torch.argmax(output, dim=1)
        total += labels.size(0)
        correct += (predictions == labels).sum().item()

accuracy = 100 * correct / total
print('Accuracy: {:.2f}%'.format(accuracy))


In [None]:
!pip uninstall -y torch torchdata torchvision torchtext torchaudio fastai
!pip install portalocker
!pip install --pre torch torchdata -f https://download.pytorch.org/whl/nightly/cpu/torch_nightly.html


In [None]:
!pip install torchtext

import torch
import torchtext
from torchtext.datasets import AG_NEWS
from torchtext.vocab import Vectors

# download and load the dataset
train_data, test_data = AG_NEWS(root='./data')

# download and load the pre-trained CBOW word embeddings
url = 'https://dl.fbaipublicfiles.com/fasttext/vectors-english/crawl-300d-2M.vec.zip'
vector_path = './data/crawl-300d-2M.vec'
vectors = Vectors(name=vector_path, url=url)

# build the vocabulary using the pre-trained embeddings
text_field = torchtext.data.Field(tokenize='spacy', lower=True)
label_field = torchtext.data.LabelField(dtype=torch.long)
train_data, test_data = torchtext.datasets.AG_NEWS.splits(text_field=text_field, label_field=label_field)
text_field.build_vocab(train_data, vectors=vectors)

# define the input size and output size
input_size = len(text_field.vocab)
output_size = len(label_field.vocab)

# define the logistic regression model
class LogisticRegression(torch.nn.Module):
    def __init__(self, input_size, output_size):
        super(LogisticRegression, self).__init__()
        self.linear = torch.nn.Linear(input_size, output_size)

    def forward(self, x):
        out = self.linear(x)
        return out

model = LogisticRegression(input_size, output_size)

# define the loss function and optimizer
criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(model.parameters(), lr=0.01)

# create the training and testing dataloaders
train_loader = torchtext.data.BucketIterator(train_data, batch_size=32, sort_key=lambda x: len(x.text), shuffle=True)
test_loader = torchtext.data.BucketIterator(test_data, batch_size=32, sort_key=lambda x: len(x.text))

# train the model
for epoch in range(10):
    for i, batch in enumerate(train_loader):
        optimizer.zero_grad()
        inputs = batch.text.transpose(0, 1)
        labels = batch.label - 1
        outputs = model(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        if i % 100 == 0:
            print('Epoch {}, Batch {}, Loss {}'.format(epoch+1, i+1, loss.item()))

# test the model
correct = 0
total = 0
with torch.no_grad():
    for batch in test_loader:
        inputs = batch.text.transpose(0, 1)
        labels = batch.label - 1
        outputs = model(inputs)
        _, predicted = torch.max(outputs.data, 1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()

print('Test Accuracy: {}%'.format(100 * correct / total))


In [None]:
from torchtext.datasets import AG_NEWS
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import Vectors

tokenizer = get_tokenizer('basic_english')
train_dataset, test_dataset = AG_NEWS(root='data', split=('train', 'test'))

vectors = Vectors(name='glove.6B.100d.txt', cache='data')

from torchtext.vocab import GloVe

TEXT = torchtext.legacy.data.Field(tokenize=tokenizer, include_lengths=True)
LABEL = torchtext.legacy.data.LabelField(dtype=torch.float)

TEXT.build_vocab(train_dataset, vectors=vectors, max_size=10000, unk_init=torch.Tensor.normal_)
LABEL.build_vocab(train_dataset)

vocab = TEXT.vocab

import torch
from torch.utils.data import DataLoader

def generate_features(dataset):
    data = []
    labels = []
    for text, label in dataset:
        tokens = [vocab.stoi[token] for token in tokenizer(text)]
        data.append(tokens)
        labels.append(label)
    return torch.tensor(data), torch.tensor(labels)

train_data, train_labels = generate_features(train_dataset)
test_data, test_labels = generate_features(test_dataset)

train_loader = DataLoader(list(zip(train_data, train_labels)), batch_size=64, shuffle=True)
test_loader = DataLoader(list(zip(test_data, test_labels)), batch_size=64, shuffle=False)

model = LogisticRegression(input_size=len(vocab), output_size=4)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters())

def train(model, loader, optimizer, criterion):
    model.train()
    total_loss = 0.0
    total_correct = 0
    for data, target in loader:
        optimizer.zero_grad()
        output = model(data)
        loss = criterion(output, target.long())
        loss.backward()
        optimizer.step()

        total_loss += loss.item() * len(data)
        total_correct += (output.argmax(dim=1) == target).sum().item()

    return total_loss / len(loader.dataset), total_correct / len(loader.dataset)

def evaluate(model, loader, criterion):
    model.eval()
    total_loss = 0.0
    total_correct = 0
    with torch.no_grad():
        for data, target in loader:
            output = model(data)
            loss = criterion(output, target.long())

            total_loss += loss.item() * len(data)
            total_correct += (output.argmax(dim=1) == target).sum().item()

    return total_loss / len(loader.dataset), total_correct / len(loader.dataset)

for epoch in range(10):
    train_loss, train_acc = train(model, train_loader, optimizer, criterion)
    test_loss, test_acc = evaluate(model, test_loader, criterion)

    print('Epoch {}: train_loss={:.4f}, train_acc={:.4f}, test_loss={:.4f}, test_acc={:.4f}'.format(
        epoch + 1, train_loss, train_acc, test_loss, test_acc))


In [None]:

!pip install torchtext
!pip install torchdata

import torch
from torchtext.datasets import AG_NEWS
from torchtext.vocab import GloVe
from torch.utils.data import DataLoader
from sklearn.linear_model import LogisticRegression

# Define batch size
batch_size = 64

# Load the AG_NEWS dataset with labels
train_data, test_data = AG_NEWS(root='data', split=('train', 'test'))

# Load pretrained word embeddings
glove_vectors = GloVe(name='6B', dim=100)

# Define the embedding function
def get_embedding(text):
    # Split the text into words
    tokens = text.split(' ')
    # Get the embeddings for each word
    embeddings = [glove_vectors[token] for token in tokens]
    # Return the mean of the embeddings
    return torch.mean(torch.stack(embeddings), dim=0)

# Define the data loader
train_loader = DataLoader(train_data, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_data, batch_size=batch_size, shuffle=False)

# Generate input features for the logistic regression classifier
X_train = []
y_train = []
X_test = []
y_test = []

for data, target in train_loader:
    # Generate embeddings for the text data
    embeddings = [get_embedding(text) for text in data]
    # Convert the embeddings to numpy arrays and append to X_train
    X_train.append(torch.stack(embeddings).detach().numpy())
    # Append the targets to y_train
    y_train.append(target.detach().numpy())

for data, target in test_loader:
    # Generate embeddings for the text data
    embeddings = [get_embedding(text) for text in data]
    # Convert the embeddings to numpy arrays and append to X_test
    X_test.append(torch.stack(embeddings).detach().numpy())
    # Append the targets to y_test
    y_test.append(target.detach().numpy())

# Flatten the lists and convert to numpy arrays
X_train = np.concatenate(X_train)
y_train = np.concatenate(y_train)
X_test = np.concatenate(X_test)
y_test = np.concatenate(y_test)

# Train a logistic regression classifier on the input features
clf = LogisticRegression(random_state=0, max_iter=1000).fit(X_train, y_train)

# Evaluate the classifier on the test set
score = clf.score(X_test, y_test)
print('Accuracy:', score)


In [2]:
import numpy as np

# Define your word lists
young_words = ["youth", "energetic", "fun", "carefree", "vibrant"]
middle_aged_words = ["career", "family", "stressed", "busy", "successful"]
elderly_words = ["wisdom", "experience", "retired", "peaceful", "relaxed"]

# Prepare the target and attribute word sets
target_words = ["successful", "unsuccessful"]
attribute_words = [young_words, middle_aged_words, elderly_words]

# Calculate the effect size
def weat_effect_size(X, Y, A, B):
    mean_X = np.mean(X, axis=0)
    mean_Y = np.mean(Y, axis=0)
    mean_A = np.mean(A, axis=0)
    mean_B = np.mean(B, axis=0)
    std_X = np.std(X, axis=0)
    std_Y = np.std(Y, axis=0)
    z = (mean_X - mean_Y) / np.sqrt((std_X ** 2 + std_Y ** 2) / 2)
    numerator = np.dot(A.T, z)
    denominator = np.dot(B.T, z)
    return np.mean(numerator) / np.mean(denominator)

X = np.array([model[w] for w in target_words])
Y = np.concatenate([model[w] for w in young_words])
Z = np.concatenate([model[w] for w in middle_aged_words])
A = np.concatenate([model[w] for w in elderly_words])

effect_size = weat_effect_size(X, Y, Z, A)

# Interpret the results
if effect_size > 0:
    print("There is a positive age bias towards successful careers.")
else:
    print("There is a negative age bias towards successful careers.")


NameError: ignored

In [None]:
!pip install numpy
!pip install scipy

# Political bias word sets
setA = ['liberal', 'progressive', 'democrat', 'left-wing', 'feminist', 'gay', 'environmentalist']
setB = ['conservative', 'traditional', 'republican', 'right-wing', 'religious', 'straight', 'capitalist']

from gensim.models import KeyedVectors

glove_vectors = KeyedVectors.load_word2vec_format('path/to/glove.6B.300d.txt', binary=False)

import numpy as np

def cos_sim(a, b):
    return np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b))

from scipy.stats import ttest_ind

def weat(setA, setB, targetX, targetY, word_vectors, association_measure):
    """
    Calculates the WEAT effect size and p-value for the given word sets and association measures.
    """
    A = np.mean([word_vectors[w] for w in setA], axis=0)
    B = np.mean([word_vectors[w] for w in setB], axis=0)
    X = np.mean([word_vectors[w] for w in targetX], axis=0)
    Y = np.mean([word_vectors[w] for w in targetY], axis=0)

    effect_size = np.dot(X - Y, A - B) / np.linalg.norm(A - B)

    # Calculate the standard deviation of differences for each set of words
    setA_diff = np.array([association_measure(word_vectors[w], X - Y) for w in setA])
    setB_diff = np.array([association_measure(word_vectors[w], X - Y) for w in setB])
    diff = setA_diff - setB_diff
    std_dev = np.std(diff, ddof=1)

    # Calculate the t-statistic and p-value
    t = effect_size / (std_dev / np.sqrt(len(setA)))
    p = ttest_ind(setA_diff, setB_diff, equal_var=False)[1]

    return p

# Target categories
targetX = ['good', 'excellent', 'positive', 'pleasant', 'satisfactory', 'superior']
targetY = ['bad', 'poor', 'negative', 'unpleasant', 'unsatisfactory', 'inferior']

# Calculate the p-value
p_value = weat(setA, setB, targetX, targetY, glove
