In [1]:
import pandas as pd
from ast import literal_eval
from sklearn.feature_extraction.text import CountVectorizer
from transformers import BertTokenizer, BertModel
import torch
from collections import Counter

# List of dataset filenames
dataset_filenames = ['twitter_train.csv', 'restaurants_train.csv', 'laptops_train.csv']

# Minimum number of documents (within a single dataset) in which two terms
# must appear together for the pair to be kept.
threshold = 2

# Number of documents sent through BERT per forward pass. Encoding a whole
# dataset in one call pads every document to the longest one and can exhaust
# memory, so the documents are processed in fixed-size chunks instead.
batch_size = 32

# Initialize a Counter to store the co-occurrence frequencies
co_occurrence_counter = Counter()


def cooccurring_term_pairs(doc_term_matrix, feature_names, min_count=2):
    """Count, for every pair of terms, the documents that contain both.

    Args:
        doc_term_matrix: SciPy sparse matrix of shape (n_documents, n_terms)
            holding per-document term counts, as returned by
            ``CountVectorizer.fit_transform``.
        feature_names: Sequence mapping a column index of ``doc_term_matrix``
            to its term.
        min_count: Pairs found together in fewer documents than this are
            dropped.

    Returns:
        A ``Counter`` mapping ``(term_a, term_b)`` to the number of documents
        containing both terms. Each unordered pair appears once, with
        ``term_a`` being the term with the lower column index.
    """
    # Reduce counts to presence/absence so a term repeated inside one
    # document does not inflate the number of shared documents.
    presence = (doc_term_matrix > 0).astype(int)

    # (n_terms x n_docs) @ (n_docs x n_terms): entry (i, j) is the number of
    # documents containing both term i and term j. The product stays sparse.
    shared_docs = (presence.T @ presence).tocoo()

    pairs = Counter()
    for i, j, n_docs in zip(shared_docs.row, shared_docs.col, shared_docs.data):
        # i < j skips the diagonal (a term with itself) and the mirrored
        # duplicate of every pair in the symmetric matrix.
        if i < j and n_docs >= min_count:
            pairs[(feature_names[i], feature_names[j])] = int(n_docs)
    return pairs


# The guard keeps the helper above importable without triggering the model
# download; in a notebook or script run the statements below still execute at
# module scope, so every variable remains a global as before.
if __name__ == "__main__":
    # Load pre-trained BERT tokenizer and model
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    model = BertModel.from_pretrained('bert-base-uncased')

    # Set the device to GPU if available
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)

    for filename in dataset_filenames:
        # Read the dataset
        data = pd.read_csv(filename)

        # Convert string representations of lists to actual lists
        data['Tokens'] = data['Tokens'].apply(literal_eval)

        # One whitespace-joined string per row
        all_documents = [' '.join(tokens) for tokens in data['Tokens']]
        if not all_documents:
            # Neither the tokenizer nor CountVectorizer accepts an empty corpus.
            continue

        # Tokenize and get BERT embeddings, one batch at a time
        pooled_batches = []
        with torch.no_grad():
            for start in range(0, len(all_documents), batch_size):
                encoded_input = tokenizer(
                    all_documents[start:start + batch_size],
                    return_tensors='pt',
                    padding=True,
                    truncation=True,
                    max_length=128,
                ).to(device)
                hidden = model(**encoded_input).last_hidden_state

                # Average over real tokens only: weighting by the attention
                # mask keeps padding positions out of the mean, so the
                # result does not depend on how a batch happened to be padded.
                mask = encoded_input['attention_mask'].unsqueeze(-1).type_as(hidden)
                token_sum = (hidden * mask).sum(dim=1)
                token_count = mask.sum(dim=1).clamp(min=1)
                pooled_batches.append((token_sum / token_count).cpu())
        embeddings = torch.cat(pooled_batches).numpy()

        # Use CountVectorizer to transform the tokenized text into a feature matrix
        vectorizer = CountVectorizer()
        X = vectorizer.fit_transform(all_documents)

        # Get feature names
        feature_names = vectorizer.get_feature_names_out()

        # Add this dataset's co-occurring term pairs to the running totals
        co_occurrence_counter.update(
            cooccurring_term_pairs(X, feature_names, threshold)
        )

    # Print or use the co-occurrence frequencies as weights
    print("Co-occurrence frequencies:")
    for feature, frequency in co_occurrence_counter.items():
        print(f"{feature}: {frequency}")


  from .autonotebook import tqdm as notebook_tqdm
model.safetensors:   7%|▋         | 31.5M/440M [00:13<02:51, 2.39MB/s]


KeyboardInterrupt: 