# In [None]:
import pandas as pd
from ast import literal_eval
from sklearn.feature_extraction.text import CountVectorizer

# List of dataset filenames
dataset_filenames = ['twitter_train.csv', 'restaurants_train.csv', ]


def cooccurring_term_pairs(doc_term_matrix, feature_names, threshold):
    """Return the term pairs that appear together in at least `threshold` documents.

    Args:
        doc_term_matrix: SciPy sparse matrix of shape (n_documents, n_terms)
            holding per-document term counts, as produced by
            ``CountVectorizer.fit_transform``.
        feature_names: Sequence of length n_terms mapping a column index of
            ``doc_term_matrix`` to its term.
        threshold: Minimum number of documents two distinct terms must share
            to be reported. Expected to be >= 1: pairs that never co-occur
            are not stored in the sparse product and are never reported.

    Returns:
        A list of ``(term_a, term_b)`` tuples, ordered by column index, with
        each unordered pair listed once (column of ``term_a`` < column of
        ``term_b``).
    """
    # Binarise so a term repeated inside one document still counts that
    # document only once.
    presence = (doc_term_matrix > 0).astype(int)

    # (terms x docs) @ (docs x terms): entry [i, j] is the number of documents
    # that contain both term i and term j. Both axes index *terms*, so both
    # indices are valid positions in feature_names. The product stays sparse,
    # avoiding a dense vocabulary-sized array.
    cooccurrence = (presence.T @ presence).tocoo()

    # Keep the upper triangle only: this drops the diagonal (a term with
    # itself) and the mirrored duplicate of every pair. Sorting makes the
    # output independent of the sparse storage order.
    index_pairs = sorted(
        (int(i), int(j))
        for i, j, count in zip(cooccurrence.row, cooccurrence.col, cooccurrence.data)
        if i < j and count >= threshold
    )
    return [(feature_names[i], feature_names[j]) for i, j in index_pairs]


# Guarded so that importing this file does not trigger file I/O; when run as a
# script or notebook cell the statements below still execute at module scope.
if __name__ == "__main__":
    # Set the threshold for co-occurrence (minimum number of shared documents)
    threshold = 10

    for filename in dataset_filenames:
        # Read the dataset
        data = pd.read_csv(filename)

        # Convert string representations of lists to actual lists
        for column in ('Tokens', 'Tags', 'Polarities'):
            data[column] = data[column].apply(literal_eval)

        # Convert polarity values from string to numeric
        data['Polarities'] = data['Polarities'].apply(lambda x: [int(val) for val in x])

        # Re-join each row's tokens into one whitespace-separated document
        all_documents = [' '.join(tokens) for tokens in data['Tokens']]

        # Use CountVectorizer to transform the text into a document-term matrix
        vectorizer = CountVectorizer()
        X = vectorizer.fit_transform(all_documents)

        # Get feature names (column index -> term)
        feature_names = vectorizer.get_feature_names_out()

        # Identify relevant features (co-occurring terms); .tolist() yields
        # plain Python strings so the printed pairs are readable.
        relevant_features = cooccurring_term_pairs(X, feature_names.tolist(), threshold)

        # Print or do something with the relevant features
        print(f"Relevant features for {filename}: {relevant_features}")
