In [6]:
import pandas as pd
from ast import literal_eval
from sklearn.feature_extraction.text import CountVectorizer

# List of dataset filenames
dataset_filenames = ['twitter_train.csv', 'restaurants_train.csv', 'laptops_train.csv']

# Initialize an empty list to store combined relevant features
combined_relevant_features = []

for filename in dataset_filenames:
    # Read the dataset
    data = pd.read_csv(filename)

    # Convert string representations of lists to actual lists
    data['Tokens'] = data['Tokens'].apply(literal_eval)
    data['Tags'] = data['Tags'].apply(literal_eval)
    data['Polarities'] = data['Polarities'].apply(literal_eval)

    # Convert polarity values from string to numeric
    data['Polarities'] = data['Polarities'].apply(lambda x: [int(val) for val in x])

    # Combine the tokenized text from all rows
    all_documents = [' '.join(tokens) for tokens in data['Tokens']]

    # Use CountVectorizer to transform the tokenized text into a feature matrix
    vectorizer = CountVectorizer()
    X = vectorizer.fit_transform(all_documents)

    # Get feature names
    feature_names = vectorizer.get_feature_names_out()

    # Set the threshold for co-occurrence
    threshold = 3

    # Identify relevant features (co-occurring terms)
    relevant_features = [
        (feature_names[i], feature_names[j])
        for i, row in enumerate(X.T.toarray())
        for j, count in enumerate(row)
        if count >= threshold and i != j
    ]

    # Append relevant features to the combined list
    combined_relevant_features.extend(relevant_features)

# Print or do something with the combined relevant features
print("Combined Relevant Features:")
for feature in combined_relevant_features:
    print(feature)


Combined Relevant Features:
('2011', 'fusion')
('about', 'duty')
('addicted', 'asist09')
('album', 'emerge')
('alicia', 'joyce')
('and', '20m')
('and', '9th')
('and', 'aaaaaaaand')
('and', 'ah')
('and', 'air')
('and', 'approval')
('and', 'ballad')
('and', 'battlefront')
('and', 'bombeck')
('and', 'britneysclub')
('and', 'candidates')
('and', 'cookie')
('and', 'cuddled')
('and', 'cumin')
('and', 'downtown')
('and', 'exchangerates')
('and', 'fanfics')
('and', 'fiber')
('and', 'fusion')
('and', 'handheld')
('and', 'homie')
('and', 'hopefuls')
('and', 'insure')
('and', 'joe')
('and', 'keeps')
('any', 'avoids')
('anyone', 'daddy')
('apple', '30620')
('apple', 'act')
('apple', 'agradecer')
('apple', 'bingung')
('apple', 'business')
('apple', 'extrapolate')
('apple', 'glasses')
('apple', 'investors')
('application', 'birkin')
('application', 'guys')
('are', 'app')
('as', 'angielovebieber')
('ay', 'aw')
('bad', 'buckupshow')
('barack', 'backed')
('be', 'holibobs')
('because', 'blackbirds')
('b