In [63]:
from pathlib import Path

import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.naive_bayes import GaussianNB
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import FunctionTransformer

In [64]:
def representation_to_document(document_representation: str):
    '''
    Rebuild a bag-of-words document from its serialized "term:count" line.

    Args:
        document_representation: whitespace-separated "term:count" tokens; the
            final token has the form "<key>:<label>" and carries the label.

    Returns:
        document_text, document_label

    Raises:
        ValueError: if the line holds no tokens, a token has no ':' separator,
            or a count is not an integer.
    '''
    # split() without an argument tolerates repeated spaces and drops the
    # trailing newline, so no empty tokens reach the parsing below.
    tokens = document_representation.split()
    if not tokens:
        raise ValueError("empty document representation")
    # Split on the LAST colon only, so terms that themselves contain ':' still
    # parse into exactly (term, count).
    pairs = [token.rsplit(':', 1) for token in tokens]
    _, label = pairs[-1]
    words = []
    for term, count in pairs[:-1]:
        # Repeat each term as often as it was counted.
        words.extend([term] * int(count))
    return ' '.join(words), label

def load_documents(file_path: Path):
    '''
    Read a review file and decode every non-blank line into a text and a label.

    Args:
        file_path: path of a file with one serialized document per line, in the
            format understood by representation_to_document.

    Returns:
        (document_texts, document_labels): two tuples of equal length; both are
        empty when the file contains no documents.
    '''
    with open(file_path, "r") as f:
        # Blank lines carry no document; skip them instead of letting the
        # line parser fail on them.
        documents = [representation_to_document(line) for line in f if line.strip()]
    if not documents:
        # zip(*[]) yields nothing, which callers could not unpack into
        # (texts, labels); hand back two empty tuples instead.
        return [(), ()]
    return list(zip(*documents))

def load_domain(domain_path: Path):
    '''
    Assemble the training and validation frames for one domain folder.

    Args:
        domain_path: folder holding "positive.review", "negative.review" and
            "unlabeled.review".

    Returns:
        train_df, val_df: train_df stacks the positive documents followed by the
        negative ones; val_df holds the documents of "unlabeled.review". Both
        frames have the columns 'sentiment' and 'document'.
    '''
    pos_texts, pos_labels = load_documents(domain_path / "positive.review")
    neg_texts, neg_labels = load_documents(domain_path / "negative.review")
    held_out_texts, held_out_labels = load_documents(domain_path / "unlabeled.review")

    # Positive rows come first, negative rows second.
    train_frame = pd.DataFrame({
        'sentiment': pos_labels + neg_labels,
        'document': pos_texts + neg_texts
    })
    val_frame = pd.DataFrame({
        'sentiment': held_out_labels,
        'document': held_out_texts
    })
    return train_frame, val_frame

In [65]:
# Root folder that holds one sub-folder per review domain.
domains_path = Path("data/Multi Domain Sentiment/processed_acl")
# Path.iterdir() yields entries in arbitrary, filesystem-dependent order; sort
# them so that index 0 selects the same domain on every machine and every run.
domain_folders = sorted(folder for folder in domains_path.iterdir() if folder.is_dir())
domain_folder = domain_folders[0]
train_df, val_df = load_domain(domain_folder)

In [66]:
# Preview the training frame returned by load_domain (columns 'sentiment' and 'document').
train_df

Unnamed: 0,sentiment,document
0,positive,holes must top_secret he center other_civilans...
1,positive,i_think dr_dean reason oz oz medicine_which me...
2,positive,woman_the contains_the fan_i alex_ross(superma...
3,positive,hurricane these_pages lost_innocence both at_h...
4,positive,while commented the_rise the_rise if if strong...
...,...,...
1995,negative,only idiotic_anyone if_i mystery_writer rather...
1996,negative,your well to_create peter bored_me inconsisten...
1997,negative,favorable_reviews heard straight book/pamphlet...
1998,negative,helpful this_one substance_and pages_devoted i...


## Lexicons

In [67]:
import pandas as pd
import nltk
from nltk.stem import WordNetLemmatizer
import re

# Single shared lemmatizer instance; preprocess_word calls lemmatizer.lemmatize on each lower-cased word.
# NOTE(review): WordNetLemmatizer presumably needs the NLTK 'wordnet' corpus to be available locally — confirm.
lemmatizer = WordNetLemmatizer()

In [68]:
def load_sentiwordnet(path: Path):
    """Parse a SentiWordNet-style file into per-word average sentiment scores.

    Lines starting with '#' and lines that do not split into exactly six
    tab-separated fields are ignored. Of the six fields, the third and fourth
    are read as the positive and negative score, and the fifth as a
    whitespace-separated list of terms whose '#...' suffix is dropped.

    Args:
        path: location of the lexicon file.

    Returns:
        dict mapping each word to {'pos_score': float, 'neg_score': float},
        where each value is the mean over all entries that list the word.
    """
    collected = {}
    with open(path, 'r') as handle:
        for raw_line in handle:
            if raw_line.startswith('#'):
                continue
            fields = raw_line.strip().split('\t')
            if len(fields) != 6:
                continue
            positive = float(fields[2])
            negative = float(fields[3])
            for sense in fields[4].split():
                lemma = sense.split('#')[0]
                # One pair of score lists per word, created on first sight.
                pos_values, neg_values = collected.setdefault(lemma, ([], []))
                pos_values.append(positive)
                neg_values.append(negative)
    # Collapse the collected score lists into their means.
    return {
        lemma: {
            'pos_score': sum(pos_values) / len(pos_values),
            'neg_score': sum(neg_values) / len(neg_values),
        }
        for lemma, (pos_values, neg_values) in collected.items()
    }

In [69]:
# Load the lexicon once into a module-level dict; extract_features looks words up in `sentiwordnet`.
sentiwordnet_path = Path("data/EN_Lexicons/SentiWordNet_3.0.0.txt")
sentiwordnet = load_sentiwordnet(sentiwordnet_path)

In [70]:
# Normalise one raw token into the list of lemmatized words it contains
def preprocess_word(word):
    """Split a raw token into lower-cased, lemmatized words.

    Any '<...>' span (e.g. '<num>') is removed, underscores and dots are
    treated as word separators, and every remaining piece is lower-cased and
    passed through the shared lemmatizer.

    Args:
        word: a single whitespace-free token from a document.

    Returns:
        list of lemmatized words; empty when nothing remains after cleaning.
    """
    without_tags = re.sub(r'<.*?>', '', word)
    spaced = without_tags.replace('_', ' ').replace('.', ' ')
    lemmas = []
    for piece in spaced.split():
        lemmas.append(lemmatizer.lemmatize(piece.lower()))
    return lemmas

In [71]:
# Lexicon-based sentiment features for a single document
def extract_features(document):
    """Summarise a document's sentiment using the loaded `sentiwordnet` scores.

    Every whitespace-separated token is expanded with preprocess_word; each
    resulting word counts towards the word total, and words found in the
    lexicon contribute their positive and negative scores.

    Args:
        document: document text as a whitespace-separated string.

    Returns:
        dict with the keys total_pos_score, total_neg_score, avg_pos_score,
        avg_neg_score, max_pos_score, max_neg_score, pos_word_count,
        neg_word_count, score_difference and score_ratio. The averages divide
        by the number of all processed words (lexicon hits or not) and are 0
        for a document without words; score_ratio is 0 when the negative total
        is 0.
    """
    sum_pos = 0.0
    sum_neg = 0.0
    hits_pos = 0
    hits_neg = 0
    peak_pos = 0.0
    peak_neg = 0.0
    n_words = 0

    # Flatten "token -> processed words" into one stream, keeping the order.
    processed = (piece for raw in document.split() for piece in preprocess_word(raw))
    for piece in processed:
        n_words += 1
        entry = sentiwordnet.get(piece)
        if entry is None:
            # Not in the lexicon: counts as a word but adds no score.
            continue
        pos = entry['pos_score']
        neg = entry['neg_score']
        sum_pos += pos
        sum_neg += neg
        if pos > 0:
            hits_pos += 1
            peak_pos = max(peak_pos, pos)
        if neg > 0:
            hits_neg += 1
            peak_neg = max(peak_neg, neg)

    # Guard the divisions: no words -> averages of 0, no negative mass -> ratio of 0.
    if n_words > 0:
        mean_pos = sum_pos / n_words
        mean_neg = sum_neg / n_words
    else:
        mean_pos = 0
        mean_neg = 0
    if sum_neg != 0:
        ratio = sum_pos / sum_neg
    else:
        ratio = 0

    return {
        'total_pos_score': sum_pos,
        'total_neg_score': sum_neg,
        'avg_pos_score': mean_pos,
        'avg_neg_score': mean_neg,
        'max_pos_score': peak_pos,
        'max_neg_score': peak_neg,
        'pos_word_count': hits_pos,
        'neg_word_count': hits_neg,
        'score_difference': sum_pos - sum_neg,
        'score_ratio': ratio,
    }

In [72]:
# Compute the lexicon feature dict for every training document; note that this
# adds a 'features' column to train_df in place.
train_df['features'] = train_df['document'].apply(extract_features)
# Expand the per-document dicts into a frame with one column per feature.
features_df = pd.DataFrame(train_df['features'].tolist())

In [73]:
# Preview the feature table (one row per training document, one column per extract_features key).
features_df

Unnamed: 0,total_pos_score,total_neg_score,avg_pos_score,avg_neg_score,max_pos_score,max_neg_score,pos_word_count,neg_word_count,score_difference,score_ratio
0,16.044326,14.276926,0.033218,0.029559,0.305556,0.6250,162,157,1.767400,1.123794
1,15.180929,10.754816,0.045588,0.032297,1.000000,0.5000,129,96,4.426113,1.411547
2,6.963393,1.489286,0.056613,0.012108,0.750000,0.1250,40,24,5.474107,4.675659
3,7.878322,10.471467,0.040402,0.053700,0.500000,0.6875,60,60,-2.593145,0.752361
4,39.724131,22.597125,0.061588,0.035034,1.000000,0.6250,245,218,17.127007,1.757929
...,...,...,...,...,...,...,...,...,...,...
1995,49.873843,47.355842,0.052388,0.049744,0.750000,0.8750,315,271,2.518002,1.053172
1996,16.744027,18.862985,0.064400,0.072550,0.500000,0.7500,98,90,-2.118957,0.887666
1997,15.453002,12.256164,0.081332,0.064506,0.575000,0.6250,68,73,3.196838,1.260835
1998,16.026036,15.322254,0.044271,0.042327,0.475000,0.6250,140,120,0.703782,1.045932


In [74]:
class CustomFeature:
    """Pipeline step that turns raw documents into lexicon feature rows.

    The step is stateless: fit learns nothing and transform applies
    extract_features to every document.
    """

    def __init__(self):
        pass

    def fit(self, X, y=None):
        """Nothing is learned from the data; return self so calls can be chained."""
        return self

    def transform(self, X, y=None):
        """Build the feature table for a collection of documents.

        Args:
            X: the documents, given as a DataFrame with a 'document' column,
                a Series of strings, or any other iterable of strings.
            y: ignored; accepted so the step can be called like other
                transformers.

        Returns:
            DataFrame with one row per document and one column per key
            returned by extract_features (empty when X holds no documents).
        """
        if isinstance(X, pd.DataFrame):
            # Iterating a DataFrame would yield its column names, not its
            # rows, so pick the text column explicitly.
            X = X['document']
        # Plain iteration works for Series, lists and arrays alike.
        return pd.DataFrame([extract_features(document) for document in X])
    
def create_fit_model(X_train, y_train, vectorizer, middleware, model):
    """Chain the three stages into a Pipeline, fit it and return it.

    Args:
        X_train: training inputs handed to the first stage.
        y_train: training targets.
        vectorizer: first stage, registered under the name 'vectorizer'.
        middleware: second stage, registered under the name 'middleware'.
        model: final estimator, registered under the name 'model'.

    Returns:
        the fitted Pipeline.
    """
    stage_names = ('vectorizer', 'middleware', 'model')
    stages = (vectorizer, middleware, model)
    fitted = Pipeline(list(zip(stage_names, stages)))
    fitted.fit(X_train, y_train)
    return fitted

def evaluate_model(model, X_test, y_test):
    """Score a fitted model's predictions with macro- and micro-averaged metrics.

    Args:
        model: fitted estimator exposing predict.
        X_test: inputs to predict on.
        y_test: true labels for X_test.

    Returns:
        dict with precision, recall and F1, each once macro- and once
        micro-averaged, keyed by a display name such as "Precision (Macro)".
    """
    y_pred = model.predict(X_test)

    # (display name, scoring function, averaging mode), in output order.
    metric_table = (
        ("Precision (Macro)", precision_score, 'macro'),
        ("Recall (Macro)", recall_score, 'macro'),
        ("F1 Score (Macro)", f1_score, 'macro'),
        ("Precision (Micro)", precision_score, 'micro'),
        ("Recall (Micro)", recall_score, 'micro'),
        ("F1 Score (Micro)", f1_score, 'micro'),
    )
    return {
        display_name: scorer(y_test, y_pred, average=averaging)
        for display_name, scorer, averaging in metric_table
    }

In [75]:
# FunctionTransformer without a func is already the identity transform; unlike a
# lambda it can be pickled, so pipelines that contain it can be saved.
identity_transformer = FunctionTransformer()

# Train on the frame built from the positive/negative review files and evaluate
# on the frame built from "unlabeled.review".
X_train = train_df['document']
y_train = train_df['sentiment']
X_test = val_df['document']
y_test = val_df['sentiment']

In [76]:
# Keyword arguments for create_fit_model: lexicon features, a pass-through middleware step and the classifier.
model_parameter = {'vectorizer': CustomFeature(), 'middleware': identity_transformer, 'model': LogisticRegression(solver="liblinear")}
# Class names of the chosen vectorizer and model, as a compact description of this configuration.
model_info = {
    'vectorizer': type(model_parameter['vectorizer']).__name__,
    'model': type(model_parameter['model']).__name__,
}

In [77]:
# Build and fit the pipeline; model_parameter supplies the vectorizer, middleware and model keyword arguments.
model = create_fit_model(X_train, y_train, **model_parameter)

In [78]:
# Score the fitted pipeline on the held-out documents (macro and micro precision/recall/F1).
results = evaluate_model(model, X_test, y_test)

In [79]:
# Show the metric dict returned by evaluate_model.
results

{'Precision (Macro)': 0.6843186945637729,
 'Recall (Macro)': 0.6819218055397241,
 'F1 Score (Macro)': 0.6802631049603733,
 'Precision (Micro)': 0.6810750279955207,
 'Recall (Micro)': 0.6810750279955207,
 'F1 Score (Micro)': 0.6810750279955207}

In [80]:
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline

# Same lexicon features as above, but standardised before they reach the classifier.
pipeline = Pipeline(steps=[
    ('custom_feature', CustomFeature()),  # documents -> lexicon feature table
    ('scaler', StandardScaler()),  # standardise each feature column
    ('model', LogisticRegression(solver='lbfgs', max_iter=500)),  # final classifier
])

# Fit all three steps on the training split
pipeline.fit(X_train, y_train)

In [81]:
# Pull the fitted logistic regression out of the pipeline by its step name
logreg_model = pipeline.named_steps['model']

# Learned weights of the model, one per feature column produced by the 'custom_feature' step.
# NOTE(review): the sign presumably refers to the second entry of logreg_model.classes_ — confirm before interpreting.
coefficients = logreg_model.coef_

# coefficients[0] is the first row of weights; for a binary problem it is presumably the only row
print("Feature Coefficients:", coefficients[0])

Feature Coefficients: [ 0.0020393  -0.00417675  0.50645297 -0.84635729  0.39502054 -0.42217585
 -0.12888608  0.12174955  0.01771441 -0.06495051]


In [82]:
# Take the feature names from extract_features itself instead of repeating them by
# hand, so the list cannot drift from the columns the pipeline was trained on.
# An empty document still yields every key, in column order.
feature_names = list(extract_features(''))

# Fail loudly if the names and the fitted weights ever get out of step.
assert len(feature_names) == len(coefficients[0]), "feature names and coefficients are misaligned"

# Combine feature names with corresponding coefficients
feature_importance = zip(feature_names, coefficients[0])

# Sort features by the absolute value of their coefficient (largest effect first);
# the inputs were standardised by the 'scaler' step, so magnitudes share one scale.
sorted_features = sorted(feature_importance, key=lambda x: abs(x[1]), reverse=True)

# Display sorted feature importance
for feature, coef in sorted_features:
    print(f"Feature: {feature}, Coefficient: {coef}")


Feature: avg_neg_score, Coefficient: -0.8463572869277443
Feature: avg_pos_score, Coefficient: 0.5064529745738771
Feature: max_neg_score, Coefficient: -0.4221758538201927
Feature: max_pos_score, Coefficient: 0.39502053927069697
Feature: pos_word_count, Coefficient: -0.12888608404295168
Feature: neg_word_count, Coefficient: 0.12174955313507597
Feature: score_ratio, Coefficient: -0.06495050896835664
Feature: score_difference, Coefficient: 0.017714413851078147
Feature: total_neg_score, Coefficient: -0.004176753379259705
Feature: total_pos_score, Coefficient: 0.002039302084932296
