In [1]:
from config import parameters
from utils import *

base_dir: /home/dmlab/jihye/GIT/bis_speeches_text_bigram_frequency
Parameters(
  sentence_length_outlier = [0, 1, 2, 181, 252]
  base_dir = /home/dmlab/jihye/GIT/bis_speeches_text_bigram_frequency
  data_dir = /home/dmlab/jihye/GIT/bis_speeches_text_bigram_frequency/data
  output_base_dir = /home/dmlab/jihye/GIT/bis_speeches_text_bigram_frequency/output
  bis_raw_pkl_filepath = /home/dmlab/jihye/GIT/bis_speeches_text_bigram_frequency/data/bis_w_content_FINAL.pkl
)


### User configuration

In [2]:
# Window size handed to the collocation finder, and an optional cap on how many
# top-ranked bigrams are kept (None disables the cap).
bigram_window_size = 15
bigram_max_rank = None

# Corpus-specific stopwords; passed to stopwords_set_filter together with 'english'.
stopword_list = [
    # frequent domain vocabulary
    'financial', 'market', 'federal', 'bank', 'banking', 'bankers', 'speech', 'bi',
    'review', 'year', 'reserve', 'policy', 'state', 'central', 'board', 'percent', 'rate',
    # titles, speaker names and 'vol'
    'mr', 'alan', 'greenspan', 'ben', 'bernanke', 'janet', 'yellen', 'jerome', 'powell', 'vol',
    # short fragments (presumably lemmatizer/tokenizer leftovers -- TODO confirm)
    'ha', 'wa', 'ii',
]

# Look-back per granularity: how many preceding periods are merged with the
# current one when the "current" window is built in the analysis loop.
ws_quarterly = 3
ws_semiannually = 1
ws_annually = 0

In [3]:
# Maximum number of bigrams printed per period in the analysis loop.
display_max_count = 5

# NOTE(review): stopwords_set_filter comes from utils; it presumably returns the
# combined stopword set and a predicate usable as a word filter -- confirm there.
stopset, filter_stops = stopwords_set_filter('english', stopword_list)

# Pickled, period-grouped documents (file names are relative to parameters.output_base_dir).
quarterly_filepath = 'target-list-units-grouped_quarterly_20191028-21-52-08.pkl'
semiannually_filepath = 'target-list-units-grouped_semiannually_20191028-21-52-09.pkl'
annually_filepath = 'target-list-units-grouped_annually_20191028-21-52-09.pkl'

# Load in the fixed order quarterly -> semiannually -> annually.
quarterly_target_doc_dict, semiannually_target_doc_dict, annually_target_doc_dict = (
    load_pkl(os.path.join(parameters.output_base_dir, _pkl_name))
    for _pkl_name in (quarterly_filepath, semiannually_filepath, annually_filepath)
)

# Granularity name -> (documents grouped by period, look-back window size).
period_dict = {
    'quarterly': (quarterly_target_doc_dict, ws_quarterly),
    'semiannually': (semiannually_target_doc_dict, ws_semiannually),
    'annually': (annually_target_doc_dict, ws_annually),
}

Completed loading: /home/dmlab/jihye/GIT/bis_speeches_text_bigram_frequency/output/target-list-units-grouped_quarterly_20191028-21-52-08.pkl
Completed loading: /home/dmlab/jihye/GIT/bis_speeches_text_bigram_frequency/output/target-list-units-grouped_semiannually_20191028-21-52-09.pkl
Completed loading: /home/dmlab/jihye/GIT/bis_speeches_text_bigram_frequency/output/target-list-units-grouped_annually_20191028-21-52-09.pkl


In [4]:
from BigramCollocationFinder_custom import BigramCollocationFinder

def words_from_pos_tagged_words(pos_tagged_words):
    """Return the whitespace-stripped first element of every 2-item pair, in input order.

    Args:
        pos_tagged_words: iterable of 2-item pairs whose first item is a string
            (the second item is ignored).

    Returns:
        list of stripped strings, one per input pair.
    """
    stripped_words = []
    for token, _tag in pos_tagged_words:
        stripped_words.append(token.strip())
    return stripped_words

def get_one_dimensional_words(doc_list):
    """Flatten every document's sentences into a single list of words.

    Args:
        doc_list: iterable of mappings; each must provide 'unigrams_by_sentence',
            an iterable whose items are handed to words_from_pos_tagged_words.

    Returns:
        One flat list of words, in document order and then sentence order.
    """
    return [
        word
        for document in doc_list
        for tagged_sentence in document['unigrams_by_sentence']
        for word in words_from_pos_tagged_words(tagged_sentence)
    ]

def bigram_freq_rank_dict(finder, bigram_max_rank=None):
    """Map 'w1-w2' to (frequency, rank) for the bigrams counted by ``finder``.

    Bigrams are visited in descending frequency order. Pairs whose two words are
    identical are skipped and do not consume a rank, so ranks run 1, 2, 3, ...
    over the remaining pairs.

    Args:
        finder: object exposing ``ngram_fd``, a mapping of (w1, w2) -> frequency.
        bigram_max_rank: stop after the entry with this rank has been stored;
            None keeps every bigram.

    Returns:
        dict of 'w1-w2' -> (frequency, rank), inserted in rank order.
    """
    by_frequency_desc = sorted(finder.ngram_fd.items(), key=lambda item: item[1], reverse=True)
    distinct_word_pairs = (
        (pair, frequency) for pair, frequency in by_frequency_desc if pair[0] != pair[1]
    )

    ranked = {}
    for position, (pair, frequency) in enumerate(distinct_word_pairs, start=1):
        ranked[pair[0] + '-' + pair[1]] = (frequency, position)
        if bigram_max_rank is not None and position == bigram_max_rank:
            break
    return ranked

def nltk_bigram_collocation_finder(word_list, bigram_window_size=2, filter_stops=None):
    """Build a bigram collocation finder over ``word_list``.

    Args:
        word_list: sequence of words passed to BigramCollocationFinder.from_words.
        bigram_window_size: forwarded as ``window_size`` (default 2).
        filter_stops: when not None, passed to ``finder.apply_word_filter``.

    Returns:
        The finder, with the word filter applied if one was supplied.
    """
    collocation_finder = BigramCollocationFinder.from_words(
        word_list,
        window_size=bigram_window_size,
    )
    if filter_stops is None:
        return collocation_finder
    collocation_finder.apply_word_filter(filter_stops)
    return collocation_finder

In [5]:
from sklearn.feature_extraction.text import CountVectorizer

def bow_unigram_freq_dict(word_list, stopset=None):
    """Count term occurrences over ``word_list`` with a bag-of-words vectorizer.

    Each element of ``word_list`` is treated by CountVectorizer as one document,
    so tokenisation follows CountVectorizer's defaults.

    Args:
        word_list: iterable of strings to vectorize.
        stopset: stopwords to exclude; 'english', a list, another collection of
            words (e.g. a set), or None for no stopword removal.

    Returns:
        dict mapping each vocabulary term to its total count.
    """
    # Newer scikit-learn releases validate ``stop_words`` as 'english', a list or
    # None, so set-like collections are normalised to a list first.
    if stopset is None or isinstance(stopset, (str, list)):
        stop_words = stopset
    else:
        stop_words = list(stopset)

    vectorizer = CountVectorizer(stop_words=stop_words)
    X = vectorizer.fit_transform(word_list)

    # get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
    # fall back to it only on installations that predate get_feature_names_out().
    if hasattr(vectorizer, 'get_feature_names_out'):
        terms = vectorizer.get_feature_names_out()
    else:
        terms = vectorizer.get_feature_names()

    # Column sums of the document-term matrix give per-term totals.
    freqs = X.sum(axis=0).A1
    unigram_bow_dict = dict(zip(terms, freqs))
    return unigram_bow_dict

In [9]:
def get_0_if_None(content):
    """Return 0 when ``content`` is None; otherwise return ``content`` unchanged."""
    return 0 if content is None else content

def words_from_range(_period_dict, range_start, range_end):
    """Collect the words of all documents in a positional slice of periods.

    Periods are ordered by sorting the keys of ``_period_dict``; the periods at
    positions ``range_start`` (inclusive) to ``range_end`` (exclusive) are used.

    Args:
        _period_dict: mapping of period key -> iterable of documents.
        range_start: first position to include.
        range_end: position at which to stop (not included).

    Returns:
        Flat list of words produced by get_one_dimensional_words for the
        selected documents, in period order.
    """
    wanted_positions = range(range_start, range_end)
    ordered_period_keys = sorted(_period_dict.keys())
    selected_docs = [
        doc
        for position, period_key in enumerate(ordered_period_keys)
        if position in wanted_positions
        for doc in _period_dict[period_key]
    ]
    return get_one_dimensional_words(selected_docs)

In [10]:
import numpy as np

# For every granularity, print the top bigrams of each period together with a
# "uniqueness" score: log(current_freq / (reference_freq + 1)), where the
# reference corpus is everything that precedes the current window.
#
# NOTE(review): the reference corpus is rebuilt from period 0 on every iteration,
# so total work grows roughly quadratically with the number of periods.
for _period_category, (_period_dict, _ws) in period_dict.items():
    print('='*5, '\n', _period_category, ', ranking , bigram , uniqueness , curr_bigram_freq , ref_bigram_freq')

    for i, _period in enumerate(sorted(_period_dict.keys())):
        # The reference window is positions [0, i-_ws); skip while it would be empty.
        if i-_ws <= 0:
            continue

        # Uniqueness: Current Bigram frequency (periods at positions i-_ws .. i)
        _current_words = words_from_range(_period_dict, i-_ws, i+1)
        _current_finder = nltk_bigram_collocation_finder(_current_words, bigram_window_size, filter_stops)
        _current_bigram_freq_rank_dict = bigram_freq_rank_dict(_current_finder, bigram_max_rank)

        # Uniqueness: Reference Bigram frequency (periods at positions 0 .. i-_ws-1)
        _reference_words = words_from_range(_period_dict, 0, i-_ws)
        _reference_finder = nltk_bigram_collocation_finder(_reference_words, bigram_window_size, filter_stops)
        # The rank cap is deliberately NOT applied here: the table is only used to
        # look up frequencies, and truncating it would report 0 for bigrams that do
        # occur in the reference but rank below the cap, inflating their uniqueness.
        _reference_bigram_freq_rank_dict = bigram_freq_rank_dict(_reference_finder, None)

        # Uniqueness: score
        _ranked_current = sorted(_current_bigram_freq_rank_dict.items(), key=lambda t:t[-1][1])   # ranking ascending
        for count, (_bigram, (_freq, _rank)) in enumerate(_ranked_current, start=1):
            # Bigrams absent from the reference count as frequency 0.
            _reference_freq = _reference_bigram_freq_rank_dict.get(_bigram, (0, None))[0]

            _numerator = _freq
            _denominator = _reference_freq + 1   # +1 keeps the ratio finite for unseen bigrams
            _uniqueness = np.log(_numerator / _denominator)

            print(_period, ',', _rank, ',', _bigram, ',', _uniqueness, ',', _freq, ',', _reference_freq)
            if count == display_max_count:
                break
                

===== 
 quarterly , ranking , bigram , uniqueness , curr_bigram_freq , ref_bigram_freq
1998_Q1 , 1 , price-inflation , 0.8250747236024933 , 89 , 38
1998_Q1 , 2 , growth-economic , 1.6211339521972916 , 86 , 16
1998_Q1 , 3 , growth-economy , 2.6026896854443837 , 81 , 5
1998_Q1 , 4 , price-index , 1.349926716949016 , 81 , 20
1998_Q1 , 5 , price-measurement , 2.1722232751308024 , 79 , 8
1998_Q2 , 1 , price-inflation , 0.5340824859302579 , 87 , 50
1998_Q2 , 2 , price-index , 1.349926716949016 , 81 , 20
1998_Q2 , 3 , price-measurement , 2.0541237336955462 , 78 , 9
1998_Q2 , 4 , risk-capital , 1.0577902941478545 , 72 , 24
1998_Q2 , 5 , rate-interest , 1.2181574393178924 , 71 , 20
1998_Q3 , 1 , system-international , 1.8718021769015913 , 91 , 13
1998_Q3 , 2 , price-inflation , 0.1981769285837487 , 89 , 72
1998_Q3 , 3 , system-economy , 0.8680886300562773 , 81 , 33
1998_Q3 , 4 , growth-economic , 0.17733401528291545 , 80 , 66
1998_Q3 , 5 , risk-capital , 1.1631508098056809 , 80 , 24
1998_Q4 , 1

KeyboardInterrupt: 