In [1]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from transformers import BertTokenizer, BertForSequenceClassification
from torch.utils.data import DataLoader, TensorDataset
import torch

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load the training CSV of each domain (Twitter and Restaurants) into its own frame
twitter_data = pd.read_csv('twitter_train.csv')
restaurants_data = pd.read_csv('restaurants_train.csv')

# Pool the 'Tokens' column of both frames into one flat list, Twitter rows first
all_documents = [
    *twitter_data['Tokens'],
    *restaurants_data['Tokens'],
]

Step 1: Term Co-occurrence Analysis

In [4]:
# Bag-of-words counts: X has one row per document and one column per vocabulary term
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(all_documents)

# Vocabulary terms, in the same order as the columns of X
feature_names = vectorizer.get_feature_names_out()

# Build the term-by-term co-occurrence matrix: entry (i, j) is the number of
# documents that contain both term i and term j. Presence is binarised first so
# a term repeated inside one document is counted once. Everything stays sparse,
# so the (potentially huge) dense matrix is never materialised.
presence = (X > 0).astype(int)
cooccurrence = (presence.T @ presence).tocoo()

# Keep pairs of *distinct* terms that co-occur in at least `threshold` documents.
# The matrix is symmetric, so each pair shows up in both orders: (a, b) and (b, a).
threshold = 2  # Adjust as needed
keep = (cooccurrence.data >= threshold) & (cooccurrence.row != cooccurrence.col)
relevant_features = [
    (feature_names[i], feature_names[j])
    for i, j in zip(cooccurrence.row[keep], cooccurrence.col[keep])
]

Step 2: Transformer Model for Semantic Understanding

In [None]:
# Pool texts and polarity labels across domains; both lists walk the frames in the
# same order (Twitter first), so all_texts[k] and all_labels[k] stay aligned
domain_frames = (twitter_data, restaurants_data)
all_texts = [text for frame in domain_frames for text in frame['Tokens']]
all_labels = [polarity for frame in domain_frames for polarity in frame['Polarities']]

In [None]:
# Load the pretrained tokenizer that belongs to the "bert-base-uncased" checkpoint
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

# Encode every text in a single call, with padding and truncation enabled,
# and ask for PyTorch tensors back
encoding_options = {"padding": True, "truncation": True, "return_tensors": "pt"}
tokenized_texts = tokenizer(all_texts, **encoding_options)

In [None]:
# Labels as a float32 column vector: one row per example, a single column
label_tensor = torch.tensor(all_labels, dtype=torch.float32).view(-1, 1)  # Adjust the view based on your label shape

# Bundle ids, attention mask and labels so every dataset item is an
# (input_ids, attention_mask, label) triple
dataset = TensorDataset(
    tokenized_texts["input_ids"],
    tokenized_texts["attention_mask"],
    label_tensor,
)

# Serve shuffled mini-batches of 4 examples
dataloader = DataLoader(dataset=dataset, batch_size=4, shuffle=True)

In [None]:
# Load pre-trained BERT with a sequence-classification head that has a single
# output (num_labels=1), on the assumption that polarities are regression values
model = BertForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    num_labels=1,
)

In [None]:
# Fine-tuning loop (adjust as needed)
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5)

# from_pretrained() hands the model back in evaluation mode (dropout disabled),
# so it must be switched to training mode explicitly before fine-tuning
model.train()

# Run every batch on whatever device the model currently lives on, so the loop
# works unchanged whether the model was left on CPU or moved to an accelerator
device = next(model.parameters()).device

for epoch in range(3):  # Adjust the number of epochs
    for batch in dataloader:
        inputs, attention_mask, label = (tensor.to(device) for tensor in batch)
        optimizer.zero_grad()
        # Passing `labels` makes the model compute the loss and expose it on the output
        outputs = model(inputs, attention_mask=attention_mask, labels=label)
        loss = outputs.loss
        loss.backward()
        optimizer.step()

# Return to evaluation mode so later inference cells see the same mode as before
model.eval()