In [2]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation as LDA
from sklearn.decomposition import NMF, PCA
from sklearn.linear_model import Ridge, LogisticRegression, Lasso
from sklearn.metrics import mean_squared_error as mse, roc_auc_score as roc, accuracy_score as acc, log_loss
from sklearn.neural_network import MLPClassifier, MLPRegressor
import numpy as np
import pandas as pd
from data.dataset import TextResponseDataset
import causal_attribution
import util
from scipy.sparse import csr_matrix
from importlib import reload
import data.dataset as ds
from model.topic_model import TopicModel
from model.model_trainer import ModelTrainer
from torch.utils.data import DataLoader
from evaluation.evaluator import Evaluator
import itertools as it
import seaborn as sns
import os

In [92]:
def run_cross_validation(features, labels, num_documents, n_cv=5, n_folds=10, label_is_bool=False, C=None):
    """Fit a linear model on several train/test splits and summarise held-out scores.

    The held-out indices of each split come from ``util.cross_val_splits``;
    every document not held out is used for training.

    Args:
        features: 2-D feature matrix indexed as ``features[rows, :]``.
        labels: array of targets indexed as ``labels[rows]``.
        num_documents: number of documents; passed to ``util.cross_val_splits``.
        n_cv: number of splits evaluated (the first ``n_cv`` entries returned
            by ``util.cross_val_splits`` -- presumably it returns at least
            that many; verify against ``util``).
        n_folds: accepted but not used by this function.
        label_is_bool: when true a ``LogisticRegression`` is scored with
            AUC, log loss and accuracy; otherwise a ``Ridge`` is scored with MSE.
        C: optional inverse regularisation strength; when given, the logistic
            model uses an L1 penalty with this value. Ignored for regression.

    Returns:
        ``(mean, std)`` over the evaluated splits. Each is an array of
        ``[auc, log_loss, accuracy]`` for classification or ``[mse]`` for regression.
    """
    held_out_splits = util.cross_val_splits(num_documents)
    document_ids = np.arange(num_documents)
    scores = np.zeros((n_cv, 3 if label_is_bool else 1))

    if not label_is_bool:
        model = Ridge()
    elif C is None:
        model = LogisticRegression(solver='liblinear')
    else:
        model = LogisticRegression(C=C, penalty='l1', solver='liblinear')

    for fold in range(n_cv):
        test_ids = held_out_splits[fold]
        train_ids = np.setdiff1d(document_ids, test_ids)

        train_features, train_labels = features[train_ids, :], labels[train_ids]
        test_features, test_labels = features[test_ids, :], labels[test_ids]

        # fit() re-estimates the model from scratch, so one instance serves all splits
        model.fit(train_features, train_labels)
        predictions = model.predict(test_features)

        if not label_is_bool:
            scores[fold, 0] = mse(test_labels, predictions)
            continue

        # Probability of the positive class drives both log loss and AUC
        positive_proba = model.predict_proba(test_features)[:, 1]
        fold_log_loss = log_loss(test_labels, positive_proba)
        fold_auc = roc(test_labels, positive_proba)
        fold_accuracy = acc(test_labels, predictions)
        scores[fold, :] = (fold_auc, fold_log_loss, fold_accuracy)

    return scores.mean(axis=0), scores.std(axis=0)


def interpret_model(features, labels, num_documents, split=0, label_is_bool=True):
    """Fit a linear model on the training part of one split and return its coefficients.

    Args:
        features: 2-D feature matrix indexed as ``features[rows, :]``.
        labels: array of targets indexed as ``labels[rows]``.
        num_documents: number of documents; passed to ``util.cross_val_splits``.
        split: position of the split (as returned by ``util.cross_val_splits``)
            whose held-out documents are excluded from training.
        label_is_bool: when true a ``LogisticRegression`` is fitted, otherwise a ``Ridge``.

    Returns:
        The fitted model's ``coef_`` attribute.
    """
    split_indices = util.cross_val_splits(num_documents)
    te_indices = split_indices[split]
    # Train on every document that is not held out for this split
    tr_indices = np.setdiff1d(np.arange(num_documents), te_indices)

    model = LogisticRegression(solver='liblinear') if label_is_bool else Ridge()

    # Only the coefficients are needed, so the held-out rows are never sliced or scored
    model.fit(features[tr_indices, :], labels[tr_indices])
    return model.coef_


def get_normalized_pmi(topics, counts, num_words=10):
    """Mean normalised pointwise mutual information (NPMI) of each topic's top words.

    Word and word-pair probabilities are document frequencies: a word counts
    once per document regardless of how often it occurs there.

    For every topic, the ``num_words`` highest-weight words are taken and the
    NPMI of each unordered pair is averaged, so a topic's score lies in roughly
    ``[-1, 1]``. The returned value is the mean of these per-topic scores.
    A topic with fewer than two top words has no pairs and scores 0.

    Args:
        topics: array of shape (num_topics, vocab_size) with per-topic word weights.
        counts: array of shape (num_docs, vocab_size) with word counts per
            document; it is copied, not modified.
        num_words: number of top words per topic to evaluate.

    Returns:
        The mean over topics of the average pairwise NPMI.
    """
    num_topics = topics.shape[0]
    num_docs = counts.shape[0]
    per_topic_npmi = np.zeros(num_topics)

    # Binarise on a copy so the caller's matrix keeps its raw counts
    bin_counts = counts.copy()
    bin_counts[bin_counts > 1] = 1

    # (vocab x vocab) matrix: number of documents containing both words
    tf = csr_matrix(bin_counts)
    cooccurence = tf.T.dot(tf).toarray()

    prob = bin_counts.sum(axis=0) / num_docs
    cooccurence_prob = cooccurence / num_docs

    eps = 1e-7  # keeps the logarithms finite for words that never (co-)occur

    for k in range(num_topics):
        top_words = (-topics[k, :]).argsort()[:num_words]
        pair_scores = []
        for w1, w2 in it.combinations(top_words, 2):
            joint = cooccurence_prob[w1][w2] + eps
            p_w1 = prob[w1] + eps
            p_w2 = prob[w2] + eps
            pmi = np.log(joint / (p_w1 * p_w2))
            pair_scores.append(pmi / -np.log(joint))
        # Average (not sum) over pairs so the score does not grow with num_words
        if pair_scores:
            per_topic_npmi[k] = np.mean(pair_scores)
    return per_topic_npmi.mean()

## Declare the dataset name (and topic, for Media Framing Corpus)

In [85]:
# Topic of the Media Framing Corpus; it is part of the cache file names only
# when `dataset` is 'framing_corpus', but is always passed to the dataset loader.
framing_topic = "guncontrol"

# Corpus to analyse. It selects the raw data file, the cache file names and
# the number of components used further down.
dataset = "peerread"

## Processing data 

In [86]:
# Pick up edits to data/dataset.py without restarting the kernel.
reload(ds)

# Read the class through the reloaded module: a name bound earlier with
# `from data.dataset import TextResponseDataset` still points at the
# pre-reload class object.
label_is_bool = dataset in ds.TextResponseDataset.CLASSIFICATION_SETTINGS

# Raw input per dataset; any other dataset name falls back to the CS papers dump.
datafiles = {
    'amazon': '../dat/reviews_Office_Products_5.json',
    'amazon_binary': '../dat/reviews_Grocery_and_Gourmet_Food_5.json',
    'yelp': '../dat/yelp_review_polarity_csv/train.csv',
    'peerread': '../dat/peerread_abstracts.csv',
    'framing_corpus': '../dat/framing/',
}
datafile = datafiles.get(dataset, '../dat/cs_papers.gz')

# The framing corpus is processed per topic, so its file name carries the topic.
if dataset == 'framing_corpus':
    proc_file = '../dat/proc/' + dataset + '_' + framing_topic + '_proc.npz'
else:
    proc_file = '../dat/proc/' + dataset + '_proc.npz'

# Number of components (PCA dimensions / LDA topics) used for each dataset.
components = {'amazon':30, 
              'semantic_scholar':50, 
              'peerread':50, 'yelp':30, 
              'amazon_binary':20, 
              'framing_corpus':10
             }
text_dataset = ds.TextResponseDataset(dataset, 
                                      datafile, 
                                      proc_file, 
                                      use_bigrams=False,
                                      framing_topic=framing_topic)
text_dataset.process_dataset()
text_dataset.preprocessing()

counts = text_dataset.counts
labels = text_dataset.labels
vocab = text_dataset.vocab
docs = text_dataset.docs

n_components = components[dataset]
num_documents = counts.shape[0]
# Displayed as (number of components, number of documents, vocabulary size)
n_components, num_documents, counts.shape[1]

(50, 11778, 5683)

## Running PCA on the word co-occurrence matrix to create word embeddings for regression

In [78]:
# Word-by-word co-occurrence matrix (vocab x vocab) built from the count matrix.
tf = csr_matrix(counts)
cooccurence = tf.T.dot(tf).toarray()

# Each row of `embeddings` is the low-dimensional representation of one word.
# random_state makes the result repeatable when PCA picks a randomised solver.
pca = PCA(n_components=n_components, random_state=0)
embeddings = pca.fit_transform(cooccurence)

# A document's features are the sum of the embeddings of the distinct words it
# contains. Multiplying the sparse presence matrix by the embeddings computes
# this for all documents at once, without rebinding `tf` in a per-document loop.
word_present = csr_matrix(counts > 0, dtype=embeddings.dtype)
features = word_present.dot(embeddings)

result_pca = run_cross_validation(features, labels, num_documents, label_is_bool=label_is_bool)
result_pca

(array([0.69332   , 0.56887495, 0.69879518]),
 array([0.02759584, 0.01027946, 0.00647571]))

## LDA features for regression

In [87]:
# Cache file for the fitted LDA representation; the framing corpus is cached per topic.
if dataset == 'framing_corpus':
    pretraining_file = '../dat/proc/' + dataset + '_' + framing_topic + '_pretraining.npz'
else:
    pretraining_file = '../dat/proc/' + dataset + '_pretraining.npz'

if os.path.exists(pretraining_file):
    print("Loading saved results...")
    # np.load keeps the archive open; the context manager closes it once both
    # arrays have been read into memory.
    with np.load(pretraining_file) as arr:
        doc_rep = arr['theta']
        topics = arr['beta']
    print("Completed.")
else:
    # Fixed seed so a cache rebuilt from scratch is the same on every run.
    lda_model = LDA(n_components=n_components, random_state=0)
    doc_rep = lda_model.fit_transform(counts)

    # Normalise each topic's word weights so every row sums to one.
    topics = lda_model.components_ / lda_model.components_.sum(axis=1)[:, np.newaxis]

    # The path already ends in .npz, so no extension is appended and the file
    # lands exactly where the existence check above looks for it.
    np.savez_compressed(pretraining_file, theta=doc_rep, beta=topics)

    print("Perplexity:", lda_model.perplexity(counts))
    

Loading saved results...
Completed.


In [88]:
# Show the seven highest-weight vocabulary entries of every topic.
for k in range(n_components):
    ranked_word_ids = np.argsort(-topics[k])[:7]
    print('Topic {}: {}'.format(k, [vocab[word_id] for word_id in ranked_word_ids]))

Topic 0: ['system', 'used', 'based', 'paper', 'proposed', 'traffic', 'area']
Topic 1: ['network', 'neural', 'memory', 'neural network', 'gradient', 'training', 'new']
Topic 2: ['topic', 'rule', 'model', 'probabilistic', 'algorithm', 'distribution', 'inference']
Topic 3: ['system', 'knowledge', 'agent', 'dialogue', 'domain', 'argument', 'paper']
Topic 4: ['information', 'extraction', 'social', 'network', 'rule', 'social network', 'decision']
Topic 5: ['model', 'sequence', 'neural', 'sentence', 'attention', 'recurrent', 'task']
Topic 6: ['question', 'task', 'model', 'word', 'answer', 'semantic', 'entity']
Topic 7: ['detection', 'detect', 'anomaly', 'algorithm', 'system', 'activity', 'tracking']
Topic 8: ['model', 'human', 'role', 'attribute', 'play', 'qualitative', 'reasoning']
Topic 9: ['learning', 'task', 'label', 'sample', 'generalization', 'learner', 'complexity']
Topic 10: ['ensemble', 'model', 'concept', 'summarization', 'two', 'result', 'state art']
Topic 11: ['translation', 'lang

In [89]:
# Cross-validate a model that only sees the LDA document representation.
results_lda = run_cross_validation(
    features=doc_rep,
    labels=labels,
    num_documents=num_documents,
    label_is_bool=label_is_bool,
)
results_lda

(array([0.80188297, 0.44200753, 0.78742566]),
 array([0.01562673, 0.00534001, 0.00538419]))

## BOW features for regression

In [90]:
# Turn raw counts into per-document word frequencies. A document with no
# tokens keeps an all-zero row instead of becoming NaN from a 0/0 division.
doc_lengths = counts.sum(axis=1)[:, np.newaxis]
normalized = counts / np.where(doc_lengths == 0, 1, doc_lengths)

In [91]:
# Cross-validate on the normalised bag-of-words matrix, passed in sparse form.
result_bow = run_cross_validation(
    features=csr_matrix(normalized),
    labels=labels,
    num_documents=num_documents,
    label_is_bool=label_is_bool,
)
result_bow

(array([0.78269998, 0.5118064 , 0.75904843]),
 array([0.01010406, 0.01224703, 0.01284694]))

## Regression adjusted for topic

In [93]:
# Word frequencies and the LDA document representation side by side, so the
# model can use both.
features = np.column_stack((normalized, doc_rep))
result_adjusted = run_cross_validation(
    features=csr_matrix(features),
    labels=labels,
    num_documents=num_documents,
    label_is_bool=label_is_bool,
)
result_adjusted

[[0.81205147 0.43660107 0.78419711]
 [0.81887834 0.44377002 0.78589635]
 [0.77988073 0.43994415 0.7884452 ]
 [0.80625635 0.43761526 0.78759558]
 [0.82464898 0.42720401 0.79694138]]


(array([0.80834317, 0.4370269 , 0.78861512]),
 array([0.01552473, 0.00549539, 0.00441147]))