In [173]:
import glob
import gensim
import pandas as pd
import logging
import nltk
from nltk.tokenize import RegexpTokenizer
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords

wordnet_lemmatizer = WordNetLemmatizer()
tokenizer = RegexpTokenizer(r'\w+')
STOPWORDS = set(stopwords.words('english'))
STOPWORDS.update(['.', ',', '"', "'", '?', '!', ':', ';', '(', ')', '[', ']', '{', '}'])

logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

In [156]:
# close_branch_reuter, deal_reuter, layoff_reuter, m_a_reuter, new_branch_reuter, new_brand_reuter
KEYWORD_TYPE = 'm_a_reuter'

# Input file
INPUT_FILE = "data/"+ KEYWORD_TYPE +".csv"

data = pd.read_csv(INPUT_FILE, encoding='utf-8', dtype={'short_text': str, 'long_text': str})
print 'Reading', INPUT_FILE, ' => ' , len(data.index), 'rows'
data.drop_duplicates(inplace=True)
# data['short_text'] = data.short_text.astype(str)
print "Unique row:", len(data)
data.head()

Reading data/deal_reuter.csv  =>  546 rows
Unique row: 545


Unnamed: 0,url,title,tag,time,short_text,long_text
0,https://www.reuters.com/article/us-nomura-jafc...,Japan's Nomura sells stake in venture capital ...,,8:47am EDT,Nomura Holdings Incsaid on Friday it had sold ...,TOKYO (Reuters) - Nomura Holdings Inc (8604.T)...
1,https://www.reuters.com/article/us-hutchison-m...,Hutchison's Drei buys Tele2 to rival Carlos Sl...,,8:42am EDT,Mobile telecoms firm Hutchison Drei Austria is...,VIENNA (Reuters) - Mobile telecoms firm Hutchi...
2,https://www.reuters.com/article/us-noble-grp-s...,"At 'bare bones' Noble Group, staff exits and d...",,8:08am EDT,Noble Groupis slimming down drastically to its...,SINGAPORE (Reuters) - Noble Group (NOBG.SI) is...
3,https://www.reuters.com/article/us-china-cic-l...,China's CIC raising $8 billion loan for Logico...,,6:47am EDT,The China Investment Corp sovereign wealth fun...,HONG KONG (Reuters) - The China Investment Cor...
4,https://www.reuters.com/article/us-china-m-a-u...,Value of U.S. deals in China sinks on rising t...,,6:42am EDT,U.S. corporate acquisitions in China collapsed...,HONG KONG (Reuters) - U.S. corporate acquisiti...


In [157]:
from itertools import chain, combinations
import copy
from nltk.util import ngrams

def pad_sequence(sequence, n, pad_left=False, pad_right=False, pad_symbol=None):
    if pad_left:
        sequence = chain((pad_symbol,) * (n-1), sequence)
    if pad_right:
        sequence = chain(sequence, (pad_symbol,) * (n-1))
    return sequence

def skipgrams(sequence, n, k, pad_left=False, pad_right=False, pad_symbol=None):
    sequence_length = len(sequence)
    sequence = iter(sequence)
    sequence = pad_sequence(sequence, n, pad_left, pad_right, pad_symbol)

    if sequence_length + pad_left + pad_right < k:
        raise Exception("The length of sentence + padding(s) < skip")

    if n < k:
        raise Exception("Degree of Ngrams (n) needs to be bigger than skip (k)")    

    history = []
    nk = n+k

    # Return point for recursion.
    if nk < 1: 
        return
    # If n+k longer than sequence, reduce k by 1 and recur
    elif nk > sequence_length: 
        for ng in skipgrams(list(sequence), n, k-1):
            yield ng

    while nk > 1: # Collects the first instance of n+k length history
        history.append(next(sequence))
        nk -= 1

    # Iterative drop first item in history and picks up the next
    # while yielding skipgrams for each iteration.
    for item in sequence:
        history.append(item)
        current_token = history.pop(0)      
        # Iterates through the rest of the history and 
        # pick out all combinations the n-1grams
        for idx in list(combinations(range(len(history)), n-1)):
            ng = [current_token]
            for _id in idx:
                ng.append(history[_id])
            yield tuple(ng)

    # Recursively yield the skigrams for the rest of seqeunce where
    # len(sequence) < n+k
    for ng in list(skipgrams(history, n, k-1)):
        yield ng

In [158]:
# remove common words and tokenize
data['full_content'] = data['title'] + ' ' + data['long_text']
full_content = data['full_content'].tolist()
# full_content = [[wordnet_lemmatizer.lemmatize(z) for z in tokenizer.tokenize(str(t).decode('utf-8').lower()) if z not in STOPWORDS] for t in full_content]
# full_content = [[z for z in str(t).lower().split(" ") if z not in STOPWORDS] for t in full_content]

full_content = [[wordnet_lemmatizer.lemmatize(z) for z in tokenizer.tokenize(unicode(t).lower()) if z not in STOPWORDS] for t in full_content]
print full_content[:2]
with open("full_content", "wb") as f:
    f.write(str(full_content))

[[u'japan', u'nomura', u'sell', u'stake', u'venture', u'capital', u'firm', u'jafco', u'tokyo', u'reuters', u'nomura', u'holding', u'inc', u'8604', u'said', u'friday', u'sold', u'share', u'stake', u'jafco', u'co', u'8595', u'back', u'venture', u'capital', u'firm', u'japanese', u'brokerage', u'seek', u'shift', u'capital', u'area', u'see', u'growth', u'nomura', u'japan', u'largest', u'brokerage', u'investment', u'bank', u'said', u'may', u'wanted', u'use', u'capital', u'efficiently', u'currently', u'expanding', u'u', u'investment', u'banking', u'business', u'selling', u'stake', u'jafco', u'would', u'contribute', u'nomura', u'drive', u'enhance', u'capital', u'efficiency', u'optimally', u'allocate', u'management', u'resource', u'nomura', u'said', u'share', u'nomura', u'closed', u'1', u'5', u'percent', u'650', u'1', u'yen', u'jafco', u'share', u'ended', u'15', u'percent', u'5', u'260', u'yen', u'highest', u'since', u'october', u'2015', u'benchmark', u'nikkei', u'n225', u'average', u'index', u

In [159]:
full_content_ = full_content
two_skip_bigrams = [list(skipgrams(c, n=3, k=3)) for c in full_content if len(c) > 5  ]
gram_12 = [ [ [a+"_"+b, a+"_"+c] for a, b, c in z ] for z in two_skip_bigrams]

In [160]:
gram_12 = [reduce(lambda x,y: x+y, z) for z in gram_12]

In [161]:
gram_12 = [list(set(z)) for z in gram_12]

In [162]:
gram_12[:3]

[[u'said_institute',
  u'billion_deal',
  u'real_institute',
  u'nomura_sold',
  u'nomura_billion',
  u'said_1',
  u'nomura_34',
  u'said_4',
  u'u_selling',
  u'capital_investment',
  u'generate_nomura',
  u'7_million',
  u'million_61',
  u'0_8',
  u'kitamura_told',
  u'capital_management',
  u'takumi_call',
  u'buying_chief',
  u'wilson_wong',
  u'3_billion',
  u'estate_japan',
  u'inc_friday',
  u'estate_nomura',
  u'jafco_ended',
  u'6_jafco',
  u'percent_percent',
  u'5_since',
  u'1_stake',
  u'post_co',
  u'selling_nomura',
  u'nikkei_0',
  u'stake_95',
  u'real_holding',
  u'september_separate',
  u'management_nomura',
  u'estate_post',
  u'jafco_per',
  u'june_japan',
  u'said_nomura',
  u'deal_profit',
  u'efficiently_currently',
  u'4_61',
  u'percent_560',
  u'6_percent',
  u'said_5',
  u'share_3',
  u'capital_currently',
  u'period_jafco',
  u'may_efficiently',
  u'optimally_nomura',
  u'estate_reuters',
  u'october_n225',
  u'yen_jafco',
  u'8595_japanese',
  u'kitamura_n

In [163]:
full_content = gram_12

In [164]:
# remove words that appear only once
from collections import defaultdict
frequency = defaultdict(int)
for text in full_content:
    for token in text:
        if not token.isdigit():
            frequency[token] += 1

full_content = [[token for token in text if frequency[token] > 1] for text in full_content]
print full_content[:2]

[[u'said_institute', u'billion_deal', u'said_1', u'said_4', u'u_selling', u'capital_investment', u'7_million', u'0_8', u'capital_management', u'3_billion', u'inc_friday', u'percent_percent', u'5_since', u'1_stake', u'post_co', u'nikkei_0', u'real_holding', u'6_percent', u'said_5', u'share_3', u'capital_currently', u'estate_reuters', u'since_2015', u'statement_5', u'company_nomura', u'japan_said', u'1_5', u'deal_would', u'buying_stake', u'hold_percent', u'talk_co', u'largest_bank', u'share_4', u'chief_financial', u'co_capital', u'profit_yen', u'july_period', u'japan_buying', u'closed_5', u'fell_percent', u'closed_1', u'planned_billion', u'holding_source', u'highest_2015', u'million_generate', u'stake_editing', u'6_stake', u'booked_said', u'stake_venture', u'seek_see', u'firm_seek', u'share_sold', u'million_stake', u'chief_told', u'8_million', u'said_post', u'stake_respectively', u'would_5', u'would_9', u'june_said', u'sell_holding', u'share_18', u'inc_share', u'inc_said', u'tender_share

In [165]:
dictionary = gensim.corpora.Dictionary(full_content)
dictionary.save('model/contents.dict')
print dictionary

2017-07-29 14:04:29,854 : INFO : adding document #0 to Dictionary(0 unique tokens: [])
2017-07-29 14:04:30,241 : INFO : built Dictionary(70027 unique tokens: [u'year_hathaway', u'trade_1', u'major_acquisition', u'conference_wednesday', u'peer_12']...) from 545 documents (total 225063 corpus positions)
2017-07-29 14:04:30,242 : INFO : saving Dictionary object under model/contents.dict, separately None
2017-07-29 14:04:30,392 : INFO : saved model/contents.dict


Dictionary(70027 unique tokens: [u'year_hathaway', u'trade_1', u'major_acquisition', u'conference_wednesday', u'peer_12']...)


In [166]:
corpus = [dictionary.doc2bow(text) for text in full_content]
gensim.corpora.MmCorpus.serialize('model/contents.mm', corpus)

2017-07-29 14:04:30,732 : INFO : storing corpus in Matrix Market format to model/contents.mm
2017-07-29 14:04:30,734 : INFO : saving sparse matrix to model/contents.mm
2017-07-29 14:04:30,735 : INFO : PROGRESS: saving document #0
2017-07-29 14:04:31,347 : INFO : saved 545x70027 matrix, density=0.590% (225063/38164715)
2017-07-29 14:04:31,348 : INFO : saving MmCorpus index to model/contents.mm.index


# LDA

In [167]:
# Config 
NUM_TOPICS = 10
NUM_TERMS = 100

In [168]:
lda = gensim.models.ldamodel.LdaModel(corpus=corpus, id2word=dictionary, 
                                      num_topics=NUM_TOPICS, update_every=5, 
                                      chunksize=10000, passes=4)

lda.save('model/lda.save')

2017-07-29 14:04:31,368 : INFO : using symmetric alpha at 0.1
2017-07-29 14:04:31,369 : INFO : using symmetric eta at 1.42802062062e-05
2017-07-29 14:04:31,377 : INFO : using serial LDA version on this node
2017-07-29 14:04:37,937 : INFO : running online LDA training, 10 topics, 4 passes over the supplied corpus of 545 documents, updating model once every 545 documents, evaluating perplexity every 545 documents, iterating 50x with a convergence threshold of 0.001000
2017-07-29 14:04:54,208 : INFO : -15.217 per-word bound, 38086.4 perplexity estimate based on a held-out corpus of 545 documents with 225063 words
2017-07-29 14:04:54,209 : INFO : PROGRESS: pass 0, at document #545/545
2017-07-29 14:04:56,373 : INFO : topic #2 (0.100): 0.001*"reporting_editing" + 0.001*"said_would" + 0.001*"1_billion" + 0.000*"billion_billion" + 0.000*"declined_comment" + 0.000*"3_billion" + 0.000*"source_said" + 0.000*"5_billion" + 0.000*"last_year" + 0.000*"source_reuters"
2017-07-29 14:04:56,374 : INFO :

In [169]:
lda.print_topics(2, 100)

2017-07-29 14:05:30,048 : INFO : topic #8 (0.100): 0.001*"reporting_editing" + 0.001*"company_said" + 0.001*"said_would" + 0.001*"1_billion" + 0.001*"million_million" + 0.001*"2_billion" + 0.000*"last_year" + 0.000*"said_deal" + 0.000*"additional_reporting" + 0.000*"chief_executive" + 0.000*"billion_billion" + 0.000*"percent_share" + 0.000*"reporting_bengaluru" + 0.000*"new_york" + 0.000*"declined_comment" + 0.000*"1_1" + 0.000*"told_reuters" + 0.000*"3_billion" + 0.000*"bengaluru_editing" + 0.000*"percent_stake" + 0.000*"million_euro" + 0.000*"percent_percent" + 0.000*"4_billion" + 0.000*"share_percent" + 0.000*"said_wednesday" + 0.000*"said_reporting" + 0.000*"said_expected" + 0.000*"said_statement" + 0.000*"euro_million" + 0.000*"billion_year" + 0.000*"said_buy" + 0.000*"said_thursday" + 0.000*"would_buy" + 0.000*"united_state" + 0.000*"8_billion" + 0.000*"source_reuters" + 0.000*"3_percent" + 0.000*"said_company" + 0.000*"two_said" + 0.000*"deal_said" + 0.000*"last_month" + 0.000*"

[(8,
  u'0.001*"reporting_editing" + 0.001*"company_said" + 0.001*"said_would" + 0.001*"1_billion" + 0.001*"million_million" + 0.001*"2_billion" + 0.000*"last_year" + 0.000*"said_deal" + 0.000*"additional_reporting" + 0.000*"chief_executive" + 0.000*"billion_billion" + 0.000*"percent_share" + 0.000*"reporting_bengaluru" + 0.000*"new_york" + 0.000*"declined_comment" + 0.000*"1_1" + 0.000*"told_reuters" + 0.000*"3_billion" + 0.000*"bengaluru_editing" + 0.000*"percent_stake" + 0.000*"million_euro" + 0.000*"percent_percent" + 0.000*"4_billion" + 0.000*"share_percent" + 0.000*"said_wednesday" + 0.000*"said_reporting" + 0.000*"said_expected" + 0.000*"said_statement" + 0.000*"euro_million" + 0.000*"billion_year" + 0.000*"said_buy" + 0.000*"said_thursday" + 0.000*"would_buy" + 0.000*"united_state" + 0.000*"8_billion" + 0.000*"source_reuters" + 0.000*"3_percent" + 0.000*"said_company" + 0.000*"two_said" + 0.000*"deal_said" + 0.000*"last_month" + 0.000*"5_billion" + 0.000*"deal_reuters" + 0.000*

In [170]:
# select top 100 words for each of the 10 LDA topics
TOP_KEYWORDS = [[word for word, _ in lda.show_topic(topicno, topn=NUM_TERMS)]
             for topicno in range(lda.num_topics)]
print(TOP_KEYWORDS[:3])

[[u'reporting_editing', u'1_billion', u'said_would', u'source_said', u'company_said', u'2_billion', u'declined_comment', u'said_company', u'chief_executive', u'new_york', u'3_billion', u'private_equity', u'billion_billion', u'last_year', u'source_reuters', u'hong_kong', u'said_statement', u'said_deal', u'last_month', u'private_firm', u'5_billion', u'equity_firm', u'matter_said', u'executive_said', u'additional_reporting', u'share_percent', u'million_million', u'familiar_matter', u'chief_said', u'4_billion', u'familiar_said', u'value_billion', u'said_could', u'hong_reuters', u'kong_reuters', u'market_billion', u'initial_offering', u'business_said', u'firm_said', u'united_state', u'7_billion', u'stock_exchange', u'reuters_reported', u'source_matter', u'billion_sale', u'reuters_data', u'initial_public', u'public_offering', u'one_said', u'market_value', u'company_would', u'according_reuters', u'people_matter', u'billion_reuters', u'immediately_comment', u'source_familiar', u'sale_said', u'

In [171]:
# Export 
n = 0
combined = []
for _ in TOP_KEYWORDS:
    combined += _

combined_df = pd.DataFrame(combined)
combined_df = pd.DataFrame(combined_df[0].value_counts().sort_values()).reset_index()
combined_df.columns = ['keyword', 'c']

save_to = "exported/skipgram_4_%s.csv" % KEYWORD_TYPE
combined_df[['keyword']].to_csv(save_to, index=False, encoding='utf-8')
print 'Saved to ', save_to

Saved to  exported/skipgram_4_deal_reuter.csv


In [172]:
# Break the notebook 
break

SyntaxError: 'break' outside loop (<ipython-input-172-da59c82888d2>, line 2)

# TF-IDF

In [None]:
NUM_CLUSTERS = 10
NUM_TERMS = 50

In [None]:
tfidf = gensim.models.tfidfmodel.TfidfModel(corpus, id2word=dictionary)
corpus_tfidf = tfidf[corpus]
tfidf.save("model/tfidf.save")

In [None]:
corpus_tfidf_sorted = []
TOP_KEYWORDS = []
for c in corpus_tfidf:
    top_keywords = sorted(c, key=lambda t: t[1], reverse=True)
    top_keywords = top_keywords[:NUM_TERMS]
    corpus_tfidf_sorted.append(top_keywords)
    TOP_KEYWORDS.append([dictionary[id] for id, _ in top_keywords])

In [None]:
print TOP_KEYWORDS[:2]

In [None]:
# Combine top keywords
import itertools
TOP_KEYWORDS_MERGED = list(itertools.chain(*TOP_KEYWORDS))
print "Total keywords: ", len(TOP_KEYWORDS_MERGED)
TOP_KEYWORDS_MERGED = list(set(TOP_KEYWORDS_MERGED))
print "Total keywords after combined: ", len(TOP_KEYWORDS_MERGED)
print TOP_KEYWORDS_MERGED[:10]

In [None]:
# Export 
save_to = "exported/tfidf/tfidf_keywords.csv"
pd.DataFrame(TOP_KEYWORDS_MERGED, columns=['keyword']).to_csv(save_to, index=False, encoding='utf-8')
print 'Saved to ', save_to

# TF-IDF with K-means

In [None]:
# NUM_CLUSTERS = 10
# NUM_TERMS = 100

In [None]:
# from sklearn.cluster import KMeans
# from scipy.sparse import csr_matrix
# import numpy as np

In [None]:
# _corpus_tfidf = gensim.matutils.corpus2csc(corpus_tfidf).transpose()
# print _corpus_tfidf[:5]

In [None]:
# kmeans = KMeans(n_clusters=NUM_CLUSTERS)
# X = kmeans.fit_transform(_corpus_tfidf)

In [None]:
# X.shape

In [None]:
# dir(kmeans)
# kmeans.n_clusters