In [2]:
#dependencies
import re
import numpy as np
import pandas as pd
import csv
from pprint import pprint
import math
from sklearn.linear_model import LogisticRegression

# Gensim
import gensim
import gensim.corpora as corpora
from gensim.models import CoherenceModel
from gensim.corpora.dictionary import Dictionary
from nltk.tokenize import word_tokenize
from gensim.models import LdaModel


# Plotting tools
from tqdm.notebook import tqdm
import matplotlib.pyplot as plt
%matplotlib inline

import warnings
warnings.filterwarnings("ignore",category=DeprecationWarning)
# Load the 20NG train / validation / test splits.
# Every CSV is read with explicitly supplied column names: Label, Text.
_COLUMNS = ["Label", "Text"]


def _read_split(path):
    """Read one CSV split into a DataFrame with the columns Label and Text."""
    with open(path, newline='') as handle:
        return pd.read_csv(handle, names=_COLUMNS)


# Training split: only the first 9000 documents are used; missing text becomes '0'.
data_train = _read_split('../../../data/Datasets/20NG/idocnade/training.csv')
train_fold = data_train['Text'].iloc[:9000].fillna('0')

# Validation split (all rows).
data_val = _read_split('../../../data/Datasets/20NG/idocnade/validation.csv')
val_fold = data_val['Text'].fillna('0')

# Training texts followed by validation texts, as a single Series.
frames = [train_fold, val_fold]
train_data = pd.concat(frames)

# Test split (all rows).
data_test = _read_split('../../../data/Datasets/20NG/idocnade/test.csv')
test_data = data_test['Text'].fillna('0')



In [3]:
# Load the trained LDA model from disk.
lda = LdaModel.load('../../../LDA/model/20NG/lda.model_with_k200/lda')

# NOTE(review): the dictionary is rebuilt from the train+validation text instead
# of being loaded alongside the model; presumably this reproduces the token ids
# the model was trained with — confirm before trusting the metrics below.
dictionary = gensim.corpora.Dictionary(train_data.str.split())

# Bag-of-words corpora. The Series values are iterated directly:
# Series.iteritems() was removed in pandas 2.0 and the index was never used.
corpus_test = [dictionary.doc2bow(text.split()) for text in test_data]
corpus_train = [dictionary.doc2bow(text.split()) for text in train_data]

In [4]:
# Result log: one (initially empty) list per metric, filled by the cells below.
_RESULT_KEYS = (
    'PPL',
    'C_V(10)',
    'C_V(20)',
    'Topics10',
    'Topics20',
    'IR_fractions',
    'IR_precision',
    'clist',
    'acc_values',
    'F1 scores',
)
results = {key: [] for key in _RESULT_KEYS}



In [4]:
# `perplexity` holds the value returned by log_perplexity on the test corpus
# (gensim documents it as a per-word likelihood bound); the logged PPL is 2
# raised to its negation.
perplexity = lda.log_perplexity(corpus_test)
ppl = 2 ** -perplexity
results['PPL'] += [ppl]

In [5]:
text_filenames = [
    "../../../data/Datasets/20NG/idocnade/20NG_train.txt",
    "../../../data/Datasets/20NG/idocnade/20NG_val.txt",
    "../../../data/Datasets/20NG/idocnade/20NG_test.txt"
]

# read original text documents as list of words
texts = []

for path in text_filenames:
    # Interpolate the path into the message; passing it as a second print()
    # argument left the "%s" placeholder unfilled.
    print('filename:%s' % path)
    # The context manager closes each file once it has been read.
    with open(path.rstrip('\r'), 'r') as handle:
        for line in handle:
            # Each line is tab-separated; the document text is the second field.
            document = line.split('\t')[1]
            # Drop any characters that cannot be round-tripped through UTF-8.
            document = document.encode(encoding="utf-8",errors="ignore").decode('utf-8',errors='ignore')
            texts.append(document.split())

filename:%s ../../../data/Datasets/20NG/idocnade/20NG_train.txt
filename:%s ../../../data/Datasets/20NG/idocnade/20NG_val.txt
filename:%s ../../../data/Datasets/20NG/idocnade/20NG_test.txt


In [7]:
# c_v coherence of the model's topics, computed once with the top-10 and once
# with the top-20 words of each topic.
cv = []
for j in [10, 20]:
    # Start from an empty list for every j so the 20-word run does not also
    # contain the 10-word lists from the previous iteration.
    topic_words = []
    # NOTE(review): only topics 0-8 are scored, as in the original loop bound —
    # confirm whether all of the model's topics should be included.
    for i in range(9):
        # Use the loop index as the topic id (previously topic 1 was fetched
        # on every iteration, yielding nine copies of the same topic).
        topic_words.append([word for word, _ in lda.show_topic(i, j)])
    coherence = CoherenceModel(topics=topic_words, texts=texts, dictionary=dictionary, coherence="c_v").get_coherence()
    cv.append(coherence)
results["C_V(10)"].append(cv[0])
results["C_V(20)"].append(cv[1])

In [6]:
# The number of words per topic must be passed as `topn=`: the first positional
# parameter of LdaModel.top_topics is the corpus, so a bare 10 / 20 there leaves
# topn at its default and both entries end up identical.
results["Topics10"].append(lda.top_topics(texts=texts, dictionary=dictionary, coherence="c_v", topn=10))
results["Topics20"].append(lda.top_topics(texts=texts, dictionary=dictionary, coherence="c_v", topn=20))

In [9]:
#helper functions 
#get closest docs
def closest_docs_by_index(corpus_vectors, query_vectors, n_docs):
    """Return, for every query vector, the indices of its most similar corpus vectors.

    Similarity is the cosine similarity between rows. It is computed with numpy
    so the function does not depend on a `pw` alias being in scope.

    Args:
        corpus_vectors: 2-D array-like, one row per corpus document.
        query_vectors: 2-D array-like, one row per query document, with the
            same number of columns as ``corpus_vectors``.
        n_docs: how many corpus indices to return per query.

    Returns:
        Integer array with one row per query holding up to ``n_docs`` corpus
        row indices, ordered from most to least similar.
    """
    corpus = np.asarray(corpus_vectors, dtype=float)
    queries = np.asarray(query_vectors, dtype=float)

    def _unit_rows(matrix):
        # Scale every row to unit length; all-zero rows are left as zeros so
        # their similarity to anything is 0 instead of NaN.
        norms = np.linalg.norm(matrix, axis=1, keepdims=True)
        norms[norms == 0.0] = 1.0
        return matrix / norms

    # sim[i, j] is the cosine similarity of corpus row i and query row j.
    sim = _unit_rows(corpus) @ _unit_rows(queries).T
    # Ascending argsort down each column, reversed so the best match is first.
    order = np.argsort(sim, axis=0)[::-1]
    return order[:n_docs].T.copy()

#precision
def precision(label, predictions):
    """Return the fraction of entries in `predictions` that contain `label`.

    Each entry is tested with `label in entry`. An empty `predictions` gives 0.0.
    """
    total = len(predictions)
    if not total:
        return 0.0
    hits = sum(1 for entry in predictions if label in entry)
    return float(hits) / total

In [10]:
#labels
train_file = "../../../data/Datasets/20NG/idocnade/training_docnade.csv"
val_file = "../../../data/Datasets/20NG/idocnade/validation_docnade.csv"
test_file = "../../../data/Datasets/20NG/idocnade/test_docnade.csv"

# The first CSV field of every row is kept, wrapped in a one-element list, so
# each label array has one single-entry row per document.
with open(train_file, newline='') as f:
    train_labels = [[row[0]] for row in csv.reader(f)]
# Keep only the first 9000 training labels.
train_labels = np.array(train_labels)[:9000]

with open(val_file, newline='') as f:
    val_labels = [[row[0]] for row in csv.reader(f)]
val_labels = np.array(val_labels)

with open(test_file, newline='') as f:
    test_labels = [[row[0]] for row in csv.reader(f)]
# Keep only the first 7500 test labels.
test_labels = np.array(test_labels)[:7500]


In [11]:
"""import sklearn.metrics.pairwise as pw
ir_ratio_list = [0.0001, 0.0005, 0.001, 0.002, 0.005, 0.01, 0.02, 0.05, 0.1, 0.2, 0.3, 0.5, 0.8, 1.0]
#ir_ratio_list = [0.02]
results["IR_fractions"].append(ir_ratio_list)    

#hidden vec representations
corpus_train = [dictionary.doc2bow(text.split()) for index,text in train_fold.iteritems()]
corpus_vectors = []
for doc in corpus_train:
    corpus_vectors.extend(lda[doc])
corpus_vectors = np.array(corpus_vectors)    
  
corpus_test = [dictionary.doc2bow(text.split()) for index,text in test_data[:7500].iteritems()]
query_vectors = []
for doc in corpus_test:
    query_vectors.extend(lda[doc])
query_vectors = np.array(query_vectors)


##ir
corpus_size = len(train_labels)
query_size = len(test_labels)

result = []
for r in ir_ratio_list:
    n_docs = int((corpus_size * r) + 0.5)
    if not n_docs:
        results.append(0.0)
        continue

    closest = closest_docs_by_index(corpus_vectors, query_vectors, n_docs)

    avg = 0.0
    for i in range(query_size):
        doc_labels = query_labels[i]
        doc_avg = 0.0
        for label in doc_labels:
            doc_avg += precision(label, corpus_labels[closest[i]])
        doc_avg /= len(doc_labels)
        avg += doc_avg
    avg /= query_size
    result.append(avg)
results["IR_precision"].append(result)

"""

'import sklearn.metrics.pairwise as pw\nir_ratio_list = [0.0001, 0.0005, 0.001, 0.002, 0.005, 0.01, 0.02, 0.05, 0.1, 0.2, 0.3, 0.5, 0.8, 1.0]\n#ir_ratio_list = [0.02]\nresults["IR_fractions"].append(ir_ratio_list)    \n\n#hidden vec representations\ncorpus_train = [dictionary.doc2bow(text.split()) for index,text in train_fold.iteritems()]\ncorpus_vectors = []\nfor doc in corpus_train:\n    corpus_vectors.extend(lda[doc])\ncorpus_vectors = np.array(corpus_vectors)    \n  \ncorpus_test = [dictionary.doc2bow(text.split()) for index,text in test_data[:7500].iteritems()]\nquery_vectors = []\nfor doc in corpus_test:\n    query_vectors.extend(lda[doc])\nquery_vectors = np.array(query_vectors)\n\n\n##ir\ncorpus_size = len(train_labels)\nquery_size = len(test_labels)\n\nresult = []\nfor r in ir_ratio_list:\n    n_docs = int((corpus_size * r) + 0.5)\n    if not n_docs:\n        results.append(0.0)\n        continue\n\n    closest = closest_docs_by_index(corpus_vectors, query_vectors, n_docs)\n\n

In [12]:
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
##classification
# Inverse regularisation strengths tried for the logistic-regression classifier.
c_list = [0.0001, 0.001, 0.01, 0.1, 0.5, 1.0, 3.0, 5.0, 10.0, 100.0, 500.0, 1000.0, 10000.0]
results["clist"].append(c_list)


def _topic_vectors(bow_corpus):
    """Return a dense (n_docs, lda.num_topics) array of document-topic weights.

    Each probability is written to the column given by its topic id, so the
    result keeps a fixed width even if a topic is missing from a document's
    distribution, and no document or topic count has to be hard-coded.
    """
    vectors = np.zeros((len(bow_corpus), lda.num_topics))
    for row, doc in enumerate(bow_corpus):
        for topic_id, prob in lda.get_document_topics(doc, minimum_probability=0):
            vectors[row, topic_id] = prob
    return vectors


#vector representations
# Series values are iterated directly: Series.iteritems() was removed in
# pandas 2.0 and the index was never used.
corpus_train = [dictionary.doc2bow(text.split()) for text in train_fold]
corpus_vectors = _topic_vectors(corpus_train)

# Only the first 7500 test documents are used, matching test_labels.
corpus_test = [dictionary.doc2bow(text.split()) for text in test_data[:7500]]
query_vectors = _topic_vectors(corpus_test)

# The label arrays hold one single-element row per document; flatten them so
# the classifier and the metrics receive 1-D targets.
y_train = np.ravel(train_labels)
y_test = np.ravel(test_labels)

# One accuracy / macro-F1 value per entry of c_list, in the same order.
test_acc = []
test_f1  = []

for c in c_list:
    clf = LogisticRegression(C=c,max_iter=1000000)
    clf.fit(corpus_vectors, y_train)
    pred_test_labels = clf.predict(query_vectors)

    acc_test = accuracy_score(y_test, pred_test_labels)
    # Index 2 of the (precision, recall, fscore, support) tuple is the F-score.
    f1_test = precision_recall_fscore_support(y_test, pred_test_labels, pos_label=None, average='macro')[2]

    test_acc.append(acc_test)
    test_f1.append(f1_test)
results["acc_values"].append(test_acc)
results["F1 scores"].append(test_f1)

  return f(**kwargs)
  _warn_prf(average, modifier, msg_start, len(result))
  return f(**kwargs)
  _warn_prf(average, modifier, msg_start, len(result))
  return f(**kwargs)
  _warn_prf(average, modifier, msg_start, len(result))
  return f(**kwargs)
  _warn_prf(average, modifier, msg_start, len(result))
  return f(**kwargs)
  _warn_prf(average, modifier, msg_start, len(result))
  return f(**kwargs)
  _warn_prf(average, modifier, msg_start, len(result))
  return f(**kwargs)
  _warn_prf(average, modifier, msg_start, len(result))
  return f(**kwargs)
  _warn_prf(average, modifier, msg_start, len(result))
  return f(**kwargs)
  _warn_prf(average, modifier, msg_start, len(result))
  return f(**kwargs)
  _warn_prf(average, modifier, msg_start, len(result))
  return f(**kwargs)
  _warn_prf(average, modifier, msg_start, len(result))
  return f(**kwargs)
  _warn_prf(average, modifier, msg_start, len(result))
  return f(**kwargs)
  _warn_prf(average, modifier, msg_start, len(result))


In [13]:
# Persist the collected metrics as the text representation of the results dict.
with open('../../../LDA/model/20NG/eval_results_with_k200.txt', 'w') as out_file:
    out_file.write(str(results) + '\n')