In [1]:
from config import parameters
from utils import *

base_dir: /home/dmlab/jihye/GIT/bis_speeches_text_bigram_frequency
Parameters(
  sentence_length_outlier = [0, 1, 2, 181, 252]
  base_dir = /home/dmlab/jihye/GIT/bis_speeches_text_bigram_frequency
  data_dir = /home/dmlab/jihye/GIT/bis_speeches_text_bigram_frequency/data
  output_base_dir = /home/dmlab/jihye/GIT/bis_speeches_text_bigram_frequency/output
  bis_raw_pkl_filepath = /home/dmlab/jihye/GIT/bis_speeches_text_bigram_frequency/data/bis_w_content_FINAL.pkl
)


### User configuration

In [2]:
bigram_window_size = 15
bigram_max_rank = None

stopword_list = ['financial', 'market', 'federal', 'bank', 'banking', 'bankers', 'speech', 'bi', 'review', 'year', 'reserve', 'policy', 'state', 'central', 'board', 'percent', 'rate'
               , 'mr', 'alan', 'greenspan', 'ben', 'bernanke', 'janet', 'yellen', 'jerome', 'powell', 'vol'
              , 'ha', 'wa', 'ii']  
ws_quarterly, ws_semiannually, ws_annually = 3, 1, 0
strength_alpha = 0.9

In [3]:
display_max_count = 5

stopset, filter_stops = stopwords_set_filter('english', stopword_list)

quarterly_filepath = 'target-list-units-grouped_quarterly_20191028-21-52-08.pkl'
semiannually_filepath = 'target-list-units-grouped_semiannually_20191028-21-52-09.pkl'
annually_filepath = 'target-list-units-grouped_annually_20191028-21-52-09.pkl'
quarterly_target_doc_dict = load_pkl(os.path.join(parameters.output_base_dir, quarterly_filepath))
semiannually_target_doc_dict = load_pkl(os.path.join(parameters.output_base_dir, semiannually_filepath))
annually_target_doc_dict = load_pkl(os.path.join(parameters.output_base_dir, annually_filepath))

period_dict = {'quarterly': (quarterly_target_doc_dict, ws_quarterly), 'semiannually': (semiannually_target_doc_dict, ws_semiannually), 'annually': (annually_target_doc_dict, ws_annually)}

Completed loading: /home/dmlab/jihye/GIT/bis_speeches_text_bigram_frequency/output/target-list-units-grouped_quarterly_20191028-21-52-08.pkl
Completed loading: /home/dmlab/jihye/GIT/bis_speeches_text_bigram_frequency/output/target-list-units-grouped_semiannually_20191028-21-52-09.pkl
Completed loading: /home/dmlab/jihye/GIT/bis_speeches_text_bigram_frequency/output/target-list-units-grouped_annually_20191028-21-52-09.pkl


In [4]:
from BigramCollocationFinder_custom import BigramCollocationFinder

def words_from_pos_tagged_words(pos_tagged_words):
    return [x.strip() for (x,y) in pos_tagged_words]

def get_one_dimensional_words(doc_list):
    words = list()
    for doc in doc_list:
        for unigrams_with_pos in doc['unigrams_by_sentence']:
            words.extend(words_from_pos_tagged_words(unigrams_with_pos))
    return words

def bigram_freq_rank_dict(finder, bigram_max_rank=None):
    bigram_dict = dict()
    _rank = 0
    for _bigram, _freq in sorted(finder.ngram_fd.items(), key=lambda t:t[-1], reverse=True):   # frequency descending
        _rank +=1 
        if _bigram[0] == _bigram[1]:
            _rank -=1
            continue
        bigram_dict[_bigram[0]+'-'+_bigram[1]] = (_freq, _rank)
        if bigram_max_rank is not None and _rank == bigram_max_rank:
            break
    return bigram_dict

def nltk_bigram_collocation_finder(word_list, bigram_window_size=2, filter_stops=None):        
    finder = BigramCollocationFinder.from_words(word_list, window_size=bigram_window_size) 
    if filter_stops is not None:
        finder.apply_word_filter(filter_stops) 
    return finder

In [5]:
from sklearn.feature_extraction.text import CountVectorizer

def bow_unigram_freq_dict(word_list, stopset=None):
    vectorizer = CountVectorizer(stop_words=stopset)
    X = vectorizer.fit_transform(word_list)
    terms = vectorizer.get_feature_names()
    freqs = X.sum(axis=0).A1
    unigram_bow_dict = dict(zip(terms, freqs))
    return unigram_bow_dict

In [8]:
def get_0_if_None(content):
    if content is None:
        return 0 
    return content

def words_from_range(_period_dict, range_start, range_end):
    _target_periods = [j for j in range(range_start, range_end)]
    _target_period_list_of_list_of_doc = [_period_dict[_period_j] for j, _period_j in enumerate(sorted(_period_dict.keys())) if j in _target_periods]
    _target_doc_list = list()
    for _target_period_list_of_doc in _target_period_list_of_list_of_doc:
        for doc in _target_period_list_of_doc:
            _target_doc_list.append(doc)
    _target_words = get_one_dimensional_words(_target_doc_list)
    return _target_words

In [28]:
def get_bigram_by_period_dict_of_list():
    bigram_by_period_dict_of_list = {'quarterly': list(), 'semiannually': list(), 'annually': list()}
    for _period_category, (_period_dict, _) in period_dict.items():        
        for i, _period in enumerate(sorted(_period_dict.keys())):
            _this_words = words_from_range(_period_dict, i, i+1)
            _this_finder = nltk_bigram_collocation_finder(_this_words, bigram_window_size, filter_stops)
            _this_bigram_freq_rank_dict = bigram_freq_rank_dict(_this_finder, bigram_max_rank)

            bigram_by_period_dict_of_list[_period_category].append(_this_bigram_freq_rank_dict)
    return bigram_by_period_dict_of_list

In [23]:
import csv
def start_csv(csv_filepath, csv_delimiter=','):
    f = open(csv_filepath, 'w', encoding='utf-8-sig', newline='')
    wr = csv.writer(f, delimiter=csv_delimiter)
    return f, wr

def end_csv(f):
    f.close()
    print('Creating .csv file completed: ', csv_filepath)

In [25]:
import numpy as np
import math
import time
from config import parameters

output_base_dir = parameters.output_base_dir
bigram_uniqueness_strength_pkl_filepath = os.path.join(output_base_dir,
                                                     get_str_concat('bigram-uniqueness-strength',
                                                                    get_now_time_str()) + '.pkl')
bigram_uniqueness_strength_csv_filepath = os.path.join(output_base_dir,
                                                     get_str_concat('bigram-uniqueness-strength',
                                                                    get_now_time_str()) + '.csv')


start = time.time()
bigram_by_period_dict_of_list = get_bigram_by_period_dict_of_list()
final_dict = {'quarterly': dict(), 'semiannually': dict(), 'annually': dict()}
f, wr = start_csv(bigram_uniqueness_strength_csv_filepath, csv_delimiter=',')
for _period_category, (_period_dict, _ws) in period_dict.items():
    wr.writerow([_period_category, 'bigram', 'raw_frequency', 'uniqueness', 'strength'])
        
    for k, _period in enumerate(sorted(_period_dict.keys())):
        if k-_ws <= 0:
            continue
        
        # Uniqueness: Current Bigram frequency
        _current_words = words_from_range(_period_dict, k-_ws, k+1)
        _current_finder = nltk_bigram_collocation_finder(_current_words, bigram_window_size, filter_stops)
        _current_bigram_freq_rank_dict = bigram_freq_rank_dict(_current_finder, bigram_max_rank)
        
        # Uniqueness: Reference Bigram frequency
        _reference_words = words_from_range(_period_dict, 0, k-_ws)
        _reference_finder = nltk_bigram_collocation_finder(_reference_words, bigram_window_size, filter_stops)
        _reference_bigram_freq_rank_dict = bigram_freq_rank_dict(_reference_finder, bigram_max_rank)

        final_dict[_period_category][_period] = dict()
        for _bigram, (_freq, _rank) in sorted(_current_bigram_freq_rank_dict.items(), key=lambda t:t[-1][1]):   # ranking ascending
            # Uniqueness: score
            _numerator = _freq 
            if _reference_bigram_freq_rank_dict.get(_bigram) is None:
                _reference_freq = 0
            else:
                _reference_freq = _reference_bigram_freq_rank_dict.get(_bigram)[0]
            _denominator = _reference_freq + 1
            _uniqueness = np.log(_numerator / _denominator)
            
            # Strength: score
            _strength = 0
            for i_ in range(k-_ws, k+1):
                _that_bigram_dict = bigram_by_period_dict_of_list[_period_category][i_]
                if _that_bigram_dict.get(_bigram) is None:
                    _that_freq , _that_rank = 0 , 0
                else:
                    _that_freq , _that_rank = _that_bigram_dict.get(_bigram)

                _first_term = _that_freq / (_that_rank + 1)
                _second_term = math.pow(strength_alpha, -i+k)
                _strength += _first_term * _second_term
            
            final_dict[_period_category][_period][_bigram] = (_freq, _uniqueness, _strength)
            wr.writerow([_period, _bigram, _freq, _uniqueness, _strength])
end_csv(f)
end_pkl(final_dict, bigram_uniqueness_strength_pkl_filepath, start)            

TypeError: 'function' object is not subscriptable

### Test Display

In [18]:
import numpy as np
import math

final_dict = {'quarterly': dict(), 'semiannually': dict(), 'annually': dict()}
for _period_category, (_period_dict, _ws) in period_dict.items():
    print('='*5, '\n', _period_category, ', ranking , bigram , uniqueness , strength , curr_bigram_freq , ref_bigram_freq')
        
    for k, _period in enumerate(sorted(_period_dict.keys())):
        if k-_ws <= 0:
            continue
        
        # Uniqueness: Current Bigram frequency
        _current_words = words_from_range(_period_dict, k-_ws, k+1)
        _current_finder = nltk_bigram_collocation_finder(_current_words, bigram_window_size, filter_stops)
        _current_bigram_freq_rank_dict = bigram_freq_rank_dict(_current_finder, bigram_max_rank)
        
        # Uniqueness: Reference Bigram frequency
        _reference_words = words_from_range(_period_dict, 0, k-_ws)
        _reference_finder = nltk_bigram_collocation_finder(_reference_words, bigram_window_size, filter_stops)
        _reference_bigram_freq_rank_dict = bigram_freq_rank_dict(_reference_finder, bigram_max_rank)

        count = 0
        for _bigram, (_freq, _rank) in sorted(_current_bigram_freq_rank_dict.items(), key=lambda t:t[-1][1]):   # ranking ascending
            count += 1 
            
            # Uniqueness: score
            _numerator = _freq 
            if _reference_bigram_freq_rank_dict.get(_bigram) is None:
                _reference_freq = 0
            else:
                _reference_freq = _reference_bigram_freq_rank_dict.get(_bigram)[0]
            _denominator = _reference_freq + 1
            _uniqueness = np.log(_numerator / _denominator)
            
            # Strength: score
            _strength = 0
            for i_ in range(k-_ws, k+1):
                _that_bigram_dict = bigram_by_period_dict_of_list[_period_category][i_]
                if _that_bigram_dict.get(_bigram) is None:
                    _that_freq , _that_rank = 0 , 0
                else:
                    _that_freq , _that_rank = _that_bigram_dict.get(_bigram)

                _first_term = _that_freq / (_that_rank + 1)
                _second_term = math.pow(strength_alpha, -i+k)
                _strength += _first_term * _second_term
            
            print(_period, ',', _rank, ',', _bigram, ',', _uniqueness, ',', _strength, ',', _freq, ',', _reference_freq)
            if count == display_max_count:
                break

===== 
 quarterly , ranking , bigram , uniqueness , strength , curr_bigram_freq , ref_bigram_freq
1998_Q1 , 1 , price-inflation , 0.8250747236024933 , 111.40407392354115 , 89 , 38
1998_Q1 , 2 , growth-economic , 1.6211339521972916 , 67.74513824602437 , 86 , 16
1998_Q1 , 3 , growth-economy , 2.6026896854443837 , 74.63315128647793 , 81 , 5
1998_Q1 , 4 , price-index , 1.349926716949016 , 103.70815522518723 , 81 , 20
1998_Q1 , 5 , price-measurement , 2.1722232751308024 , 171.33646799330975 , 79 , 8
1998_Q2 , 1 , price-inflation , 0.5340824859302579 , 101.2899920473598 , 87 , 50
1998_Q2 , 2 , price-index , 1.349926716949016 , 93.3373397026685 , 81 , 20
1998_Q2 , 3 , price-measurement , 2.0541237336955462 , 154.20263432608505 , 78 , 9
1998_Q2 , 4 , risk-capital , 1.0577902941478545 , 161.40483603603082 , 72 , 24
1998_Q2 , 5 , growth-economy , 1.0704414117014134 , 62.30455124533492 , 70 , 23
1998_Q3 , 1 , system-international , 1.8718021769015913 , 95.29801582800829 , 91 , 13
1998_Q3 , 2 , pr

KeyboardInterrupt: 