In [1]:
import pandas as pd
from ast import literal_eval
from sklearn.feature_extraction.text import CountVectorizer
from collections import Counter

# List of dataset filenames
dataset_filenames = ['twitter_train.csv', 'restaurants_train.csv', 'laptops_train.csv']

# Initialize a Counter to store the co-occurrence frequencies
co_occurrence_counter = Counter()

for filename in dataset_filenames:
    # Read the dataset
    data = pd.read_csv(filename)

    # Convert string representations of lists to actual lists
    data['Tokens'] = data['Tokens'].apply(literal_eval)
    data['Tags'] = data['Tags'].apply(literal_eval)
    data['Polarities'] = data['Polarities'].apply(literal_eval)

    # Convert polarity values from string to numeric
    data['Polarities'] = data['Polarities'].apply(lambda x: [int(val) for val in x])

    # Combine the tokenized text from all rows
    all_documents = [' '.join(tokens) for tokens in data['Tokens']]

    # Use CountVectorizer to transform the tokenized text into a feature matrix
    vectorizer = CountVectorizer()
    X = vectorizer.fit_transform(all_documents)

    # Get feature names
    feature_names = vectorizer.get_feature_names_out()

    # Set the threshold for co-occurrence
    threshold = 3

    # Identify relevant features (co-occurring terms)
    relevant_features = [
        (feature_names[i], feature_names[j])
        for i, row in enumerate(X.T.toarray())
        for j, count in enumerate(row)
        if count >= threshold and i != j
    ]

    # Update the co-occurrence counter with the relevant features
    co_occurrence_counter.update(relevant_features)

# Print or use the co-occurrence frequencies as weights
print("Co-occurrence frequencies:")
for feature, frequency in co_occurrence_counter.items():
    print(f"{feature}: {frequency}")


Co-occurrence frequencies:
('2011', 'fusion'): 1
('about', 'duty'): 1
('addicted', 'asist09'): 1
('album', 'emerge'): 1
('alicia', 'joyce'): 1
('and', '20m'): 1
('and', '9th'): 1
('and', 'aaaaaaaand'): 1
('and', 'ah'): 1
('and', 'air'): 1
('and', 'approval'): 1
('and', 'ballad'): 1
('and', 'battlefront'): 1
('and', 'bombeck'): 1
('and', 'britneysclub'): 1
('and', 'candidates'): 1
('and', 'cookie'): 1
('and', 'cuddled'): 1
('and', 'cumin'): 1
('and', 'downtown'): 1
('and', 'exchangerates'): 1
('and', 'fanfics'): 1
('and', 'fiber'): 1
('and', 'fusion'): 1
('and', 'handheld'): 1
('and', 'homie'): 1
('and', 'hopefuls'): 1
('and', 'insure'): 1
('and', 'joe'): 1
('and', 'keeps'): 1
('any', 'avoids'): 1
('anyone', 'daddy'): 1
('apple', '30620'): 1
('apple', 'act'): 1
('apple', 'agradecer'): 1
('apple', 'bingung'): 1
('apple', 'business'): 1
('apple', 'extrapolate'): 1
('apple', 'glasses'): 1
('apple', 'investors'): 1
('application', 'birkin'): 1
('application', 'guys'): 1
('are', 'app'): 1
('