In [1]:
from pathlib import Path

import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.naive_bayes import GaussianNB
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import FunctionTransformer


class CustomFeature:
    def __init__(self):
        pass

    def fit(self, X, y=None):
        return self

    def transform(self, X, y=None):
        # TODO: extract features
        return X


def representation_to_document(document_representation: str):
    '''
    
    Args:
        document_representation: 

    Returns:
        document_text, document_label
    '''
    word_counts = [term_representation.split(':') for term_representation in document_representation.split(' ')]
    _, label = word_counts[-1]
    word_counts = word_counts[:-1]
    words = []
    for term, count in word_counts:
        words.extend([term] * int(count))
    return ' '.join(words), label


def load_documents(file_path: Path):
    '''
    
    Args:
        file_path: 

    Returns:
        (document_texts, document_labels)
    '''
    with open(file_path, "r") as f:
        document_representations = f.readlines()
    return list(zip(*(representation_to_document(representation) for representation in document_representations)))


def load_domain(domain_path: Path):
    '''
    
    Args:
        domain_path: 

    Returns:
        train_df, val_df
    '''
    positive_document_counts, positive_document_labels = load_documents(domain_path / "positive.review")
    negative_documents_counts, negative_documents_labels = load_documents(domain_path / "negative.review")
    unlabeled_documents_counts, unlabeled_documents_labels = load_documents(domain_path / "unlabeled.review")
    return (
        pd.DataFrame({
            'sentiment': positive_document_labels + negative_documents_labels,
            'document': positive_document_counts + negative_documents_counts
        }),
        pd.DataFrame({
            'sentiment': unlabeled_documents_labels,
            'document': unlabeled_documents_counts
        })
    )

In [2]:
def evaluate_model(model, X_test, y_test):
    y_pred = model.predict(X_test)

    precision_macro = precision_score(y_test, y_pred, average='macro')
    recall_macro = recall_score(y_test, y_pred, average='macro')
    f1_macro = f1_score(y_test, y_pred, average='macro')

    precision_micro = precision_score(y_test, y_pred, average='micro')
    recall_micro = recall_score(y_test, y_pred, average='micro')
    f1_micro = f1_score(y_test, y_pred, average='micro')

    return {
        "Precision (Macro)": precision_macro,
        "Recall (Macro)": recall_macro,
        "F1 Score (Macro)": f1_macro,
        "Precision (Micro)": precision_micro,
        "Recall (Micro)": recall_micro,
        "F1 Score (Micro)": f1_micro,
    }

In [3]:
def create_fit_model(X_train, y_train, vectorizer, middleware, model):
    pipeline = Pipeline([
        ('vectorizer', vectorizer),
        ('middleware', middleware),
        ('model', model)
    ])
    pipeline.fit(X_train, y_train)
    return pipeline

In [4]:
def evaluate_over_dataset(train_df, val_df):
    identity_transformer = FunctionTransformer(lambda x: x)
    transform_sparce = FunctionTransformer(lambda X: np.asarray(X.todense()))
    X_train = train_df['document']
    y_train = train_df['sentiment']
    X_val = val_df['document']
    y_val = val_df['sentiment']
    model_parameters = [
        {'vectorizer': TfidfVectorizer(), 'middleware': identity_transformer, 'model': LogisticRegression()},
        {'vectorizer': CountVectorizer(), 'middleware': identity_transformer, 'model': LogisticRegression()},
        # {'vectorizer': CustomFeature(), 'middleware': identity_transformer, 'model': LogisticRegression()},
        {'vectorizer': TfidfVectorizer(max_features=10000), 'middleware': transform_sparce, 'model': GaussianNB()},
        {'vectorizer': CountVectorizer(max_features=10000), 'middleware': transform_sparce, 'model': GaussianNB()},
        # {'vectorizer': CustomFeature(), 'middleware': transform_sparce, 'model': GaussianNB()},
    ]
    all_results = []
    for model_parameter in model_parameters:
        model_info = {
            'vectorizer': type(model_parameter['vectorizer']).__name__,
            'model': type(model_parameter['model']).__name__,
        }
        model = create_fit_model(X_train, y_train, **model_parameter)
        results = evaluate_model(model, X_val, y_val)
        model_info.update(results)
        all_results.append(model_info)
    return pd.DataFrame(all_results)

In [5]:
def evaluate_domains(domains_path: Path):
    domain_folders = [folder for folder in domains_path.iterdir() if folder.is_dir()]
    all_results = []
    for domain_folder in domain_folders:
        domain_name = domain_folder.name
        train_df, val_df = load_domain(domain_folder)
        results = evaluate_over_dataset(train_df, val_df)
        results['domain'] = domain_name
        results['domain_train_size'] = len(train_df)
        results['domain_test_size'] = len(val_df)
        all_results.append(results)
    return pd.concat(all_results, ignore_index=True)

## Evaluate different configurations per domain

In [6]:
domain_wise_evaluation = evaluate_domains(Path("../data/processed_acl"))
domain_wise_evaluation

Unnamed: 0,vectorizer,model,Precision (Macro),Recall (Macro),F1 Score (Macro),Precision (Micro),Recall (Micro),F1 Score (Micro),domain,domain_train_size,domain_test_size
0,TfidfVectorizer,LogisticRegression,0.865075,0.864866,0.864896,0.864929,0.864929,0.864929,kitchen,2000,5945
1,CountVectorizer,LogisticRegression,0.881803,0.881712,0.881734,0.881749,0.881749,0.881749,kitchen,2000,5945
2,TfidfVectorizer,GaussianNB,0.781939,0.78156,0.781562,0.781665,0.781665,0.781665,kitchen,2000,5945
3,CountVectorizer,GaussianNB,0.807755,0.807209,0.807001,0.807065,0.807065,0.807065,kitchen,2000,5945
4,TfidfVectorizer,LogisticRegression,0.832714,0.832171,0.831768,0.831803,0.831803,0.831803,books,2000,4465
5,CountVectorizer,LogisticRegression,0.826614,0.82663,0.826621,0.826652,0.826652,0.826652,books,2000,4465
6,TfidfVectorizer,GaussianNB,0.726477,0.725492,0.724777,0.724972,0.724972,0.724972,books,2000,4465
7,CountVectorizer,GaussianNB,0.734613,0.732652,0.732499,0.733259,0.733259,0.733259,books,2000,4465
8,TfidfVectorizer,LogisticRegression,0.859076,0.858961,0.858982,0.859004,0.859004,0.859004,electronics,2000,5681
9,CountVectorizer,LogisticRegression,0.860079,0.859816,0.859845,0.859884,0.859884,0.859884,electronics,2000,5681


In [7]:
def get_consolidated_data(domains_path: Path):
    all_train_dfs = []
    all_test_dfs = []
    domain_folders = [folder for folder in domains_path.iterdir() if folder.is_dir()]
    for domain_folder in domain_folders:
        train_df, val_df = load_domain(domain_folder)
        train_df['domain'] = domain_folder.name
        val_df['domain'] = domain_folder.name
        all_train_dfs.append(train_df)
        all_test_dfs.append(val_df)
    consolidated_train_df = pd.concat(all_train_dfs, ignore_index=True)
    consolidated_test_df = pd.concat(all_test_dfs, ignore_index=True)
    return consolidated_train_df, consolidated_test_df


def evaluate_consolidated(domains_path: Path):
    consolidated_train_df, consolidated_test_df = get_consolidated_data(domains_path)
    return evaluate_over_dataset(consolidated_train_df, consolidated_test_df)

## Evaluate using one model for the whole dataset

In [29]:
single_model_evaluation = evaluate_consolidated(Path("../data/processed_acl"))
single_model_evaluation

Unnamed: 0,vectorizer,model,Precision (Macro),Recall (Macro),F1 Score (Macro),Precision (Micro),Recall (Micro),F1 Score (Micro)
0,TfidfVectorizer,LogisticRegression,0.869648,0.869539,0.869487,0.869492,0.869492,0.869492
1,CountVectorizer,LogisticRegression,0.876404,0.876398,0.876401,0.876404,0.876404,0.876404
2,TfidfVectorizer,GaussianNB,0.805935,0.804835,0.804544,0.804696,0.804696,0.804696
3,CountVectorizer,GaussianNB,0.802116,0.800735,0.800379,0.800579,0.800579,0.800579


## See best configuration for each domain according to f1

In [9]:
grouped_results = domain_wise_evaluation.groupby('domain').apply(
    lambda df: df.nlargest(1, 'F1 Score (Macro)')).reset_index(drop=True)
grouped_results

  grouped_results = domain_wise_evaluation.groupby('domain').apply(


Unnamed: 0,vectorizer,model,Precision (Macro),Recall (Macro),F1 Score (Macro),Precision (Micro),Recall (Micro),F1 Score (Micro),domain,domain_train_size,domain_test_size
0,TfidfVectorizer,LogisticRegression,0.832714,0.832171,0.831768,0.831803,0.831803,0.831803,books,2000,4465
1,TfidfVectorizer,LogisticRegression,0.843019,0.842603,0.842647,0.842722,0.842722,0.842722,dvd,2000,3586
2,CountVectorizer,LogisticRegression,0.860079,0.859816,0.859845,0.859884,0.859884,0.859884,electronics,2000,5681
3,CountVectorizer,LogisticRegression,0.881803,0.881712,0.881734,0.881749,0.881749,0.881749,kitchen,2000,5945


## Compare using single model or gated model

In [26]:
class GatedSentimentClassifier:
    def __init__(self, domain_pipes: dict[str, Pipeline]):
        self.domain_pipes = domain_pipes

    def fit(self, X: pd.DataFrame, y=None):
        for domain, pipe in self.domain_pipes.items():
            domain_data = X[X['domain'] == domain]
            domain_labels = y[X['domain'] == domain]

            X_domain = domain_data['document']
            y_domain = domain_labels
            pipe.fit(X_domain, y_domain)

    def predict(self, X: pd.DataFrame):
        predictions = []
        for _, row in X.iterrows():
            domain = row['domain']
            if domain not in self.domain_pipes:
                raise ValueError(f"Domain '{domain}' not found in domain_pipes")

            pipe = self.domain_pipes[domain]
            X_row = row['document']

            # Predict using the corresponding pipeline
            prediction = pipe.predict([X_row])
            predictions.append(prediction[0])
        return predictions



We choose the best models according to the f1 score per domain

In [27]:
gated_model = GatedSentimentClassifier({
    'books': Pipeline([
        ('vectorizer', TfidfVectorizer()),
        ('model', LogisticRegression())
    ]),
    'dvd': Pipeline([
        ('vectorizer', TfidfVectorizer()),
        ('model', LogisticRegression())
    ]),
    'electronics': Pipeline([
        ('vectorizer', CountVectorizer()),
        ('model', LogisticRegression())
    ]),
    'kitchen': Pipeline([
        ('vectorizer', CountVectorizer()),
        ('model', LogisticRegression())
    ])
})

consolidated_train, consolidated_test = get_consolidated_data(Path("../data/processed_acl"))

X_train = consolidated_train[['document', 'domain']]
y_train = consolidated_train['sentiment']

gated_model.fit(X_train, y_train)

In [28]:
evaluate_model(gated_model, consolidated_test[['document', 'domain']], consolidated_test['sentiment'])

{'Precision (Macro)': np.float64(0.8569909796756637),
 'Recall (Macro)': np.float64(0.8569979355165842),
 'F1 Score (Macro)': np.float64(0.8569897739838795),
 'Precision (Micro)': np.float64(0.8569903948772679),
 'Recall (Micro)': np.float64(0.8569903948772679),
 'F1 Score (Micro)': np.float64(0.8569903948772679)}

We choose the best model according to the f1 for the consolidated data

In [32]:
single_model = Pipeline([
    ('vectorizer', CountVectorizer()),
    ('model', LogisticRegression())
])

single_model.fit(consolidated_train['document'], consolidated_train['sentiment'])

single_model_evaluation = evaluate_model(single_model, consolidated_test['document'], consolidated_test['sentiment'])
single_model_evaluation

{'Precision (Macro)': np.float64(0.8764043204898988),
 'Recall (Macro)': np.float64(0.8763979118994643),
 'F1 Score (Macro)': np.float64(0.8764005366893186),
 'Precision (Micro)': np.float64(0.8764039233623012),
 'Recall (Micro)': np.float64(0.8764039233623012),
 'F1 Score (Micro)': np.float64(0.8764039233623012)}