# LDA: Query and Human Evaluation

## Configuration

In [None]:
import csv
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style("whitegrid")

import gensim
from tqdm import tqdm_notebook as tqdm
from operator import itemgetter
from peertax.LDA_Diagnostic import LDA_Scores

Load the LDA model trained on eLife, together with its `id2word` dictionary.

In [None]:
# Directory and name of the trained eLife LDA model; the model path joins both
base_pth = './eLife_LDA_Trained_Model/'
model_no = 'Model_13'
model_pth = f'{base_pth}{model_no}'
# NOTE(review): gensim model files are pickle-based as far as I know —
# only load files that come from a trusted source
lda_model = gensim.models.ldamodel.LdaModel.load(model_pth)
# The dictionary lives next to the model, under the same path plus '.id2word'
id2word = gensim.corpora.Dictionary.load(f'{model_pth}.id2word')

Import the Wellcome test corpus.

In [None]:
# Load sentence data from tsv
path_load_tsv = '../pickles/wellcome_tokenized_LDA_sentence_0.tsv'
df_test = (
    pd.read_csv(path_load_tsv, sep='\t', quoting=csv.QUOTE_NONE)
    # Discard the unnamed leading column stored in the file
    .drop(columns=['Unnamed: 0'])
    # Tokens are stored as one comma-separated string; turn them into lists
    .assign(token=lambda frame: frame['token'].str.split(','))
)
df_test.head()

In [None]:
# Create Test Corpus
# One token list per sentence (see the split on ',' when the data is loaded)
texts_test = df_test['token']
# Bag-of-words encoding of every sentence with the trained model's dictionary
corpus_test = list(map(id2word.doc2bow, texts_test))

Define a function that assigns a dominant topic (and its contribution) to each row of the initial dataframe.

In [None]:
def topic_analysis(ldamodel, corpus, texts, index):
    """Label every document of ``corpus`` with its highest-probability topic.

    Parameters
    ----------
    ldamodel : object exposing ``get_document_topics(doc)``, which must return
        an iterable of ``(topic_id, probability)`` pairs.
    corpus : sized, indexable collection of documents; each one is passed to
        ``ldamodel.get_document_topics``.
    texts : sequence of original texts, aligned by position with ``corpus``.
    index : array-like handed to ``DataFrame.set_index``; it becomes the row
        index of the result.

    Returns
    -------
    pandas.DataFrame
        Columns ``Dominant_Topic`` (int), ``Perc_Contribution`` (probability
        rounded to 4 decimals) and ``texts``, indexed by ``index``.
    """
    dominant_by_row = {}
    for position in tqdm(range(len(corpus))):
        candidates = ldamodel.get_document_topics(corpus[position])
        # Keep the (topic, probability) pair with the largest probability
        best = max(candidates, key=itemgetter(1))
        dominant_by_row[position] = {
            'Dominant_Topic': int(best[0]),
            'Perc_Contribution': round(best[1], 4),
        }
    # One row per document, keyed by its position in the corpus
    result = pd.DataFrame.from_dict(dominant_by_row, "index")
    # Attach the source text; the Series aligns on the positional keys above
    result['texts'] = pd.Series(texts)
    return result.set_index(index)

In [None]:
# Tag every test sentence with its dominant topic, keeping the original row labels
df_topic_test = topic_analysis(
    ldamodel=lda_model,
    corpus=corpus_test,
    texts=df_test['sentences'].values,
    index=df_test.index.values,
)

In [None]:
# LDA topic id -> evaluation category.
# Categories: 1 Figures, 2 Statistics/Analysis/Models/Methods/Techniques,
# 3 Novelty/Impact, 4 Clarity of Exposition, 5 Previous Literature,
# 6 Main Discussion, 0 Uncategorized.
di = {
    0: 1,
    1: 2, 2: 2,
    3: 3,
    4: 4,
    5: 2, 6: 2,
    7: 5,
    8: 6,
    9: 4, 10: 4,
    11: 0,
    12: 6,
}

# Work on a separate frame in which topic ids are replaced by category ids
df_category_test = (
    df_topic_test[['Dominant_Topic', 'Perc_Contribution', 'texts']]
    .assign(Dominant_Topic=lambda frame: frame['Dominant_Topic'].map(di))
)
df_category_test.head()

In [None]:
# Sentences whose dominant topic contributes at most 0.4 are moved to category 0
_idx = df_category_test['Perc_Contribution'].le(0.4)
df_category_test.loc[_idx, 'Dominant_Topic'] = 0

In [None]:
# One histogram of the topic contribution per category, four panels per row
g = sns.FacetGrid(
    data=df_category_test,
    col="Dominant_Topic",
    col_wrap=4,
)
g.map(plt.hist, "Perc_Contribution");

In [None]:
# Number of sentences assigned to each category
sns.countplot(data=df_category_test, x='Dominant_Topic');

Collect the 40 highest-contribution sentences under each topic.

In [None]:
# Keep the 40 sentences with the highest topic contribution in every category.
df_topic_test_grpd = df_category_test.groupby('Dominant_Topic')
top_per_topic = [
    grp.sort_values('Perc_Contribution', ascending=False).head(40)
    for _, grp in df_topic_test_grpd
]
# Concatenate once instead of growing a frame inside the loop: repeated
# pd.concat copies all previous rows on every iteration, and seeding the
# result with an empty DataFrame is deprecated in recent pandas versions.
# pd.concat rejects an empty list, so fall back to an empty frame in that case.
top_reviews_test = pd.concat(top_per_topic, axis=0) if top_per_topic else pd.DataFrame()

Category assignments:
- Category 1: Figures
- Category 2: Statistics/Analysis/Models/Methods/Techniques
- Category 3: Novelty/Impact
- Category 4: Clarity of Exposition
- Category 5: Previous Literature
- Category 6: Main Discussion

- Category 0: Uncategorized

Investigate top sentences for each topic.

In [None]:
# Category whose top-ranked sentences are printed
num_top = 2
topic2_test = (
    top_reviews_test
    .loc[top_reviews_test['Dominant_Topic'].eq(num_top)]
    .index
    .tolist()
)
for i in topic2_test:
    # Sentence text, then the full row, then a blank-line separator
    print(top_reviews_test.loc[i, 'texts'], top_reviews_test.loc[i], '\n', sep='\n')

Select random samples per group.

In [None]:
size = 20        # sample size
replace = False  # without replacement


def fn(obj):
    """Return `size` rows of `obj`, chosen at random by index label."""
    chosen = np.random.choice(obj.index, size, replace)
    return obj.loc[chosen, :]


# NOTE(review): with as_index=False the outer index level appears to hold the
# group's position rather than the topic label, so drop(0) removes the first
# group — category 0 (Uncategorized) whenever it is present; confirm.
df_category_samples = (
    df_category_test
    .groupby('Dominant_Topic', as_index=False)
    .apply(fn)
    .drop(0)
)

Build the list of sampled elements.

In [None]:
# Original row labels of every sentence in the per-category sample; they sit in
# the inner index level, the outer level only identifies the group
sampled = df_category_samples.index.get_level_values(1).tolist()
len(sampled)

Reset index.

In [None]:
# Drop the outer (group) index level so rows are labelled by their original
# sentence index. The guard makes the cell safe to re-run: calling
# reset_index(level=0, drop=True) on an index that is already single-level
# would discard the sentence labels and replace them with a 0..n-1 range.
if df_category_samples.index.nlevels > 1:
    df_category_samples = df_category_samples.reset_index(level=0, drop=True)
# Flag these rows as coming from the per-category sample (not the random one)
df_category_samples['random'] = 0
df_category_samples.head()

Extract a further 80 completely random samples.

In [None]:
from random import sample

n_random = 80  # number of additional, purely random sentences

# Draw directly from the index labels that have not been sampled yet.
# randint(0, len(df)) includes its upper bound, so on a 0..n-1 index it could
# return a label one past the last row; a retry loop would also never finish
# when fewer than n_random unsampled rows remain.
already_sampled = set(sampled)
candidates = [label for label in df_category_test.index if label not in already_sampled]
picked = sample(candidates, min(n_random, len(candidates)))
sampled.extend(picked)

d = []
for num in picked:
    row = df_category_test.loc[num]  # one lookup per sampled row
    d.append({'index': num,
              'Dominant_Topic': row.Dominant_Topic,
              'Perc_Contribution': row.Perc_Contribution,
              'texts': row.texts})

# Explicit columns keep the frame well-formed even if nothing could be drawn
df_category_samples_rand = pd.DataFrame(
    d, columns=['index', 'Dominant_Topic', 'Perc_Contribution', 'texts']
)
len(sampled)

In [None]:
# Flag the rows as purely random draws and label them by their original row index
df_category_samples_rand = (
    df_category_samples_rand
    .assign(random=1)
    .set_index('index')
)
# Remove the leftover 'index' name from the index
df_category_samples_rand.index.name = None
df_category_samples_rand.head()

In [None]:
# Stack the per-category and the random samples, then shuffle all rows
df_category_samples_final = (
    pd.concat([df_category_samples, df_category_samples_rand])
    .sample(frac=1)
)

In [None]:
# Preview the first rows of the shuffled evaluation set
df_category_samples_final.head(n=5)

Save as csv file.

In [None]:
# Destination file for the human-validation sample
path_save_csv = '../pickles/wellcome_sentence_human_valid.csv'
df_category_samples_final.to_csv(path_or_buf=path_save_csv)