In [5]:
from datetime import datetime
import sys
sys.path.append("../bert")
import pandas as pd
import ast
from util import load_tags
from data_structure.question import NewQuestion
import math
def filter_by_rare(target_tags, rare_tags):
    """Return the distinct tags in ``target_tags`` that are not in ``rare_tags``.

    Both arguments are converted to sets, so duplicates in ``target_tags``
    collapse to a single entry and the order of the returned list is
    unspecified.
    """
    candidates = set(target_tags)
    excluded = set(rare_tags)
    kept = candidates.difference(excluded)
    return list(kept)


def filter_by_common(target_tags, common_tags):
    """Return the distinct tags that occur in both ``target_tags`` and ``common_tags``.

    Both arguments are converted to sets, so duplicates collapse to a single
    entry and the order of the returned list is unspecified.
    """
    candidates = set(target_tags)
    allowed = set(common_tags)
    shared = candidates & allowed
    return list(shared)


def build_corpus(all_fpath, tags_vocab, corpus_fpath):
    """Build a pickled question corpus restricted to a tag vocabulary.

    Reads the CSV at ``all_fpath`` and, for every row, keeps only the tags
    that also appear in ``tags_vocab``. Rows that end up with no tags, or
    whose fields cannot be read/parsed, are counted as filtered and skipped.
    Each remaining row is turned into a ``NewQuestion`` and the resulting
    list is pickled to ``corpus_fpath``.

    Args:
        all_fpath: Path of the source CSV. The columns ``Id``, ``Title``,
            ``Body``, ``Code``, ``CreationDate`` and ``Tags`` are read;
            ``Tags`` must hold a Python literal (parsed with
            ``ast.literal_eval``).
        tags_vocab: Iterable of tags that are allowed in the corpus.
        corpus_fpath: Path of the pickle file to write (overwritten).

    Returns:
        None. The corpus is written to ``corpus_fpath``; progress and
        skipped rows are reported on stdout.
    """
    import pickle

    print("Filtering corpus based on tags...")
    cnt = 0
    filter_cnt = 0
    df = pd.read_csv(all_fpath)
    # Missing cells become empty strings instead of NaN.
    df = df.fillna('')
    q_list = []
    for idx, row in df.iterrows():
        # Bind qid before anything can fail so the error message below never
        # hits an unbound name (first row) or reports the previous row's id.
        qid = "<unknown, row %s>" % idx
        try:
            qid = row['Id']
            title = row['Title']
            desc_text = row['Body']
            desc_code = row['Code']
            creation_date = row['CreationDate']
            tags = ast.literal_eval(row['Tags'])
            clean_tags = filter_by_common(tags, tags_vocab)
        except Exception as e:
            # Best effort: a malformed row is reported and counted, not fatal.
            print("Skip qid %s because %s" % (qid, e))
            filter_cnt += 1
            continue

        if not clean_tags:
            filter_cnt += 1
            continue

        try:
            question = NewQuestion(qid, title, desc_text, desc_code, creation_date, clean_tags)
        except Exception as e:
            print("Skip id=%s" % qid)
            print("Error msg: %s" % e)
            continue

        q_list.append(question)
        cnt += 1
        # Report progress only when cnt has just advanced; otherwise a failed
        # construction while cnt sits on a multiple of 10000 (including 0)
        # would print the same progress line again.
        if cnt % 10000 == 0:
            print("Writing %d instances, filter %d instances... \n %s" %
                  (cnt, filter_cnt, datetime.now().strftime("%H:%M:%S")))

    print("cnt {}".format(cnt))
    with open(corpus_fpath, 'wb') as f:
        pickle.dump(q_list, f)


# Inputs: the raw posts CSV and the rare/common tag lists of the small TagDC split.
source_corpus_fpath = "../data/small_tagdc/Posts_50000.csv"
rare_tags_fpath = "../data/small_tagdc/small_tagdc_rareTags.csv"
common_tags_fpath = "../data/small_tagdc/small_tagdc_commonTags.csv"

# Output: pickle file that build_corpus writes the filtered question list to.
target_corpus_fpath = "../data/small_tagdc/Posts_50000.pkl"

# NOTE(review): rare_tags is loaded but not passed to build_corpus below;
# presumably kept for later cells or for the tag count load_tags appears to print — confirm.
rare_tags = load_tags(rare_tags_fpath)
common_tags = load_tags(common_tags_fpath)
# Keep only questions that carry at least one common tag and pickle them.
build_corpus(source_corpus_fpath, common_tags, target_corpus_fpath)

# tags = 8527
# tags = 397
Filtering corpus based on tags...
Writing 10000 instances, filter 519 instances... 
 03:08:19
Writing 20000 instances, filter 1137 instances... 
 03:08:23
Writing 30000 instances, filter 1585 instances... 
 03:08:26
Writing 40000 instances, filter 2064 instances... 
 03:08:30
cnt 47540
