In [174]:
import glob
import gensim
import pandas as pd
import logging
import nltk
from nltk.tokenize import RegexpTokenizer
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords

wordnet_lemmatizer = WordNetLemmatizer()
tokenizer = RegexpTokenizer(r'\w+')
STOPWORDS = set(stopwords.words('english'))
STOPWORDS.update(['.', ',', '"', "'", '?', '!', ':', ';', '(', ')', '[', ']', '{', '}'])

logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

In [175]:
# close_branch_reuter, deal_reuter, layoff_reuter, m_a_reuter, new_branch_reuter, new_brand_reuter
KEYWORD_TYPE = 'm_a_reuter'

# Input file
INPUT_FILE = "data/"+ KEYWORD_TYPE +".csv"

data = pd.read_csv(INPUT_FILE, encoding='utf-8', dtype={'short_text': str, 'long_text': str})
print 'Reading', INPUT_FILE, ' => ' , len(data.index), 'rows'
data.drop_duplicates(inplace=True)
# data['short_text'] = data.short_text.astype(str)
print "Unique row:", len(data)
data.head()

Reading data/m_a_reuter.csv  =>  1000 rows
Unique row: 1000


Unnamed: 0,url,title,tag,time,short_text,long_text
0,https://www.reuters.com/article/akzo-nobel-sha...,"Round Two: Elliott Advisors, Akzo Nobel resume...",,3:56am EDT,Activist hedge fund ElliottAdvisors on Thursda...,AMSTERDAM (Reuters) - Activist hedge fund Elli...
1,https://www.reuters.com/article/ls-group-kkr-i...,KKR to acquire auto assets from S. Korea's LS ...,,3:07am EDT,U.S. buyout firm KKR & Cosaid it has agreed to...,SEOUL (Reuters) - U.S. buyout firm KKR & Co sa...
2,https://www.reuters.com/article/brief-lkk-heal...,BRIEF-Lkk Health Products Group acquires Landm...,,2:27am EDT,* Lkk Health Products Group acquires Landmark ...,July 27 (Reuters) - LKK Health Products Group...
3,https://www.reuters.com/article/clariant-hunst...,Clariant says Huntsman merger is on track,,1:18am EDT,Swiss chemicals maker Clariantsaid on Thursday...,ZURICH (Reuters) - Swiss chemicals maker Clari...
4,https://www.reuters.com/article/clariant-resul...,Clariant keeps full year guidance after meetin...,,1:08am EDT,"Swiss chemicals group Clariant, which is carry...","ZURICH, July 27 (Reuters) - Swiss chemicals gr..."


In [176]:
from itertools import chain, combinations
import copy
from nltk.util import ngrams

def pad_sequence(sequence, n, pad_left=False, pad_right=False, pad_symbol=None):
    if pad_left:
        sequence = chain((pad_symbol,) * (n-1), sequence)
    if pad_right:
        sequence = chain(sequence, (pad_symbol,) * (n-1))
    return sequence

def skipgrams(sequence, n, k, pad_left=False, pad_right=False, pad_symbol=None):
    sequence_length = len(sequence)
    sequence = iter(sequence)
    sequence = pad_sequence(sequence, n, pad_left, pad_right, pad_symbol)

    if sequence_length + pad_left + pad_right < k:
        raise Exception("The length of sentence + padding(s) < skip")

    if n < k:
        raise Exception("Degree of Ngrams (n) needs to be bigger than skip (k)")    

    history = []
    nk = n+k

    # Return point for recursion.
    if nk < 1: 
        return
    # If n+k longer than sequence, reduce k by 1 and recur
    elif nk > sequence_length: 
        for ng in skipgrams(list(sequence), n, k-1):
            yield ng

    while nk > 1: # Collects the first instance of n+k length history
        history.append(next(sequence))
        nk -= 1

    # Iterative drop first item in history and picks up the next
    # while yielding skipgrams for each iteration.
    for item in sequence:
        history.append(item)
        current_token = history.pop(0)      
        # Iterates through the rest of the history and 
        # pick out all combinations the n-1grams
        for idx in list(combinations(range(len(history)), n-1)):
            ng = [current_token]
            for _id in idx:
                ng.append(history[_id])
            yield tuple(ng)

    # Recursively yield the skigrams for the rest of seqeunce where
    # len(sequence) < n+k
    for ng in list(skipgrams(history, n, k-1)):
        yield ng

In [177]:
# remove common words and tokenize
data['full_content'] = data['title'] + ' ' + data['long_text']
full_content = data['full_content'].tolist()
# full_content = [[wordnet_lemmatizer.lemmatize(z) for z in tokenizer.tokenize(str(t).decode('utf-8').lower()) if z not in STOPWORDS] for t in full_content]
# full_content = [[z for z in str(t).lower().split(" ") if z not in STOPWORDS] for t in full_content]

full_content = [[wordnet_lemmatizer.lemmatize(z) for z in tokenizer.tokenize(unicode(t).lower()) if z not in STOPWORDS] for t in full_content]
print full_content[:2]
with open("full_content", "wb") as f:
    f.write(str(full_content))

[[u'round', u'two', u'elliott', u'advisor', u'akzo', u'nobel', u'resume', u'combat', u'dutch', u'court', u'amsterdam', u'reuters', u'activist', u'hedge', u'fund', u'elliott', u'advisor', u'thursday', u'pressed', u'ahead', u'second', u'lawsuit', u'seeking', u'oust', u'chairman', u'dutch', u'paint', u'group', u'akzo', u'nobel', u'rejection', u'26', u'3', u'billion', u'euro', u'30', u'billion', u'takeover', u'proposal', u'u', u'group', u'ppg', u'industry', u'elliott', u'largest', u'akzo', u'investor', u'9', u'5', u'percent', u'stake', u'pursuing', u'case', u'even', u'akzo', u'said', u'tuesday', u'70', u'year', u'old', u'chairman', u'antony', u'burgmans', u'would', u'step', u'term', u'expires', u'next', u'april', u'late', u'elliott', u'engaged', u'increasingly', u'bitter', u'fight', u'burgmans', u'akzo', u'pittsburgh', u'based', u'ppg', u'six', u'month', u'compulsory', u'cooling', u'period', u'expires', u'december', u'analyst', u'believe', u'departure', u'two', u'leading', u'opponent', u'd

In [178]:
full_content_ = full_content
two_skip_bigrams = [list(skipgrams(c, n=3, k=3)) for c in full_content if len(c) > 5  ]
gram_12 = [ [ [a+"_"+b, a+"_"+c] for a, b, c in z ] for z in two_skip_bigrams]

In [179]:
gram_12 = [reduce(lambda x,y: x+y, z) for z in gram_12]

In [180]:
gram_12 = [list(set(z)) for z in gram_12]

In [181]:
gram_12[:3]

[[u'stake_even',
  u'cooling_december',
  u'tuesday_chairman',
  u'departure_opponent',
  u'step_next',
  u'within_two',
  u'six_month',
  u'ruling_expected',
  u'fund_elliott',
  u'share_57',
  u'saying_flout',
  u'enterprise_rejected',
  u'capital_management',
  u'term_next',
  u'3_billion',
  u'bid_akzo',
  u'extraordinary_responded',
  u'dismissing_burgmans',
  u'final_share',
  u'court_reuters',
  u'responded_akzo',
  u'missed_traded',
  u'door_preliminary',
  u'hear_capital',
  u'pursuing_said',
  u'ppg_six',
  u'ruling_call',
  u'believe_departure',
  u'open_preliminary',
  u'opponent_buechner',
  u'fund_advisor',
  u'euro_akzo',
  u'takeover_result',
  u'court_hear',
  u'direction_board',
  u'u_ppg',
  u'pursuing_case',
  u'elliott_5',
  u'elliott_9',
  u'26_3',
  u'division_thierry',
  u'ton_announced',
  u'left_relatively',
  u'around_made',
  u'position_argument',
  u'next_engaged',
  u'26_30',
  u'dutch_amsterdam',
  u'compel_shareholder',
  u'talk_preliminary',
  u'pittsbu

In [182]:
full_content = gram_12

In [183]:
# remove words that appear only once
from collections import defaultdict
frequency = defaultdict(int)
for text in full_content:
    for token in text:
        if not token.isdigit():
            frequency[token] += 1

full_content = [[token for token in text if frequency[token] > 1] for text in full_content]
print full_content[:2]

[[u'stake_even', u'step_next', u'within_two', u'six_month', u'fund_elliott', u'capital_management', u'term_next', u'3_billion', u'bid_akzo', u'final_share', u'court_reuters', u'pursuing_said', u'fund_advisor', u'u_ppg', u'26_3', u'division_thierry', u'around_made', u'26_30', u'dutch_amsterdam', u'talk_preliminary', u'could_talk', u'buechner_successor', u'u_elliott', u'higher_dutch', u'promised_paintmaker', u'dutch_nobel', u'promised_dutch', u'group_ppg', u'30_takeover', u'billion_30', u'fended_takeover', u'9_5', u'shareholder_thursday', u'shareholder_meeting', u'quarter_announced', u'shareholder_say', u'expected_week', u'reuters_activist', u'said_say', u'takeover_u', u'open_door', u'shareholder_burgmans', u'thursday_court', u'old_step', u'convene_shareholder', u'chief_thierry', u'30_group', u'shareholder_right', u'higher_paintmaker', u'deal_executive', u'would_meeting', u'quarter_expectation', u'two_leading', u'meeting_shareholder', u'meeting_position', u'takeover_industry', u'health_o

In [184]:
dictionary = gensim.corpora.Dictionary(full_content)
dictionary.save('model/contents.dict')
print dictionary

2017-07-29 14:20:40,719 : INFO : adding document #0 to Dictionary(0 unique tokens: [])
2017-07-29 14:20:41,346 : INFO : built Dictionary(105883 unique tokens: [u'million_income', u'hillhouse_declined', u'share_hold', u'2013_brc', u'hynix_kkr']...) from 1000 documents (total 368486 corpus positions)
2017-07-29 14:20:41,346 : INFO : saving Dictionary object under model/contents.dict, separately None
2017-07-29 14:20:41,455 : INFO : saved model/contents.dict


Dictionary(105883 unique tokens: [u'million_income', u'hillhouse_declined', u'share_hold', u'2013_brc', u'hynix_kkr']...)


In [185]:
corpus = [dictionary.doc2bow(text) for text in full_content]
gensim.corpora.MmCorpus.serialize('model/contents.mm', corpus)

2017-07-29 14:20:41,879 : INFO : storing corpus in Matrix Market format to model/contents.mm
2017-07-29 14:20:41,880 : INFO : saving sparse matrix to model/contents.mm
2017-07-29 14:20:41,880 : INFO : PROGRESS: saving document #0
2017-07-29 14:20:42,899 : INFO : saved 1000x105883 matrix, density=0.348% (368486/105883000)
2017-07-29 14:20:42,901 : INFO : saving MmCorpus index to model/contents.mm.index


# LDA

In [186]:
# Config 
NUM_TOPICS = 10
NUM_TERMS = 100

In [187]:
lda = gensim.models.ldamodel.LdaModel(corpus=corpus, id2word=dictionary, 
                                      num_topics=NUM_TOPICS, update_every=5, 
                                      chunksize=10000, passes=4)

lda.save('model/lda.save')

2017-07-29 14:20:42,929 : INFO : using symmetric alpha at 0.1
2017-07-29 14:20:42,930 : INFO : using symmetric eta at 9.44438672875e-06
2017-07-29 14:20:42,941 : INFO : using serial LDA version on this node
2017-07-29 14:20:52,769 : INFO : running online LDA training, 10 topics, 4 passes over the supplied corpus of 1000 documents, updating model once every 1000 documents, evaluating perplexity every 1000 documents, iterating 50x with a convergence threshold of 0.001000
2017-07-29 14:21:18,851 : INFO : -15.362 per-word bound, 42104.3 perplexity estimate based on a held-out corpus of 1000 documents with 368486 words
2017-07-29 14:21:18,852 : INFO : PROGRESS: pass 0, at document #1000/1000
2017-07-29 14:21:23,230 : INFO : topic #2 (0.100): 0.001*"text_company" + 0.001*"july_reuters" + 0.001*"june_reuters" + 0.001*"source_text" + 0.001*"eikon_coverage" + 0.001*"source_company" + 0.001*"source_coverage" + 0.001*"source_eikon" + 0.001*"reporting_editing" + 0.001*"eikon_company"
2017-07-29 14

In [188]:
lda.print_topics(2, 100)

2017-07-29 14:22:19,501 : INFO : topic #4 (0.100): 0.001*"july_reuters" + 0.001*"source_company" + 0.001*"text_coverage" + 0.001*"text_company" + 0.001*"source_text" + 0.001*"source_eikon" + 0.001*"text_eikon" + 0.001*"reporting_editing" + 0.001*"source_coverage" + 0.001*"eikon_coverage" + 0.001*"eikon_company" + 0.001*"reuters_inc" + 0.001*"june_reuters" + 0.001*"said_would" + 0.001*"1_billion" + 0.000*"company_said" + 0.000*"per_share" + 0.000*"percent_stake" + 0.000*"million_source" + 0.000*"million_million" + 0.000*"3_billion" + 0.000*"chief_executive" + 0.000*"2_billion" + 0.000*"million_company" + 0.000*"reuters_corp" + 0.000*"billion_billion" + 0.000*"reporting_bengaluru" + 0.000*"said_buy" + 0.000*"1_3" + 0.000*"million_said" + 0.000*"bengaluru_editing" + 0.000*"1_1" + 0.000*"million_text" + 0.000*"july_inc" + 0.000*"private_equity" + 0.000*"told_reuters" + 0.000*"bit_ly" + 0.000*"share_percent" + 0.000*"inc_deal" + 0.000*"said_wednesday" + 0.000*"source_ly" + 0.000*"group_said

[(4,
  u'0.001*"july_reuters" + 0.001*"source_company" + 0.001*"text_coverage" + 0.001*"text_company" + 0.001*"source_text" + 0.001*"source_eikon" + 0.001*"text_eikon" + 0.001*"reporting_editing" + 0.001*"source_coverage" + 0.001*"eikon_coverage" + 0.001*"eikon_company" + 0.001*"reuters_inc" + 0.001*"june_reuters" + 0.001*"said_would" + 0.001*"1_billion" + 0.000*"company_said" + 0.000*"per_share" + 0.000*"percent_stake" + 0.000*"million_source" + 0.000*"million_million" + 0.000*"3_billion" + 0.000*"chief_executive" + 0.000*"2_billion" + 0.000*"million_company" + 0.000*"reuters_corp" + 0.000*"billion_billion" + 0.000*"reporting_bengaluru" + 0.000*"said_buy" + 0.000*"1_3" + 0.000*"million_said" + 0.000*"bengaluru_editing" + 0.000*"1_1" + 0.000*"million_text" + 0.000*"july_inc" + 0.000*"private_equity" + 0.000*"told_reuters" + 0.000*"bit_ly" + 0.000*"share_percent" + 0.000*"inc_deal" + 0.000*"said_wednesday" + 0.000*"source_ly" + 0.000*"group_said" + 0.000*"would_buy" + 0.000*"billion_dea

In [189]:
# select top 100 words for each of the 10 LDA topics
TOP_KEYWORDS = [[word for word, _ in lda.show_topic(topicno, topn=NUM_TERMS)]
             for topicno in range(lda.num_topics)]
print(TOP_KEYWORDS[:3])

[[u'source_text', u'source_company', u'text_company', u'source_coverage', u'eikon_company', u'source_eikon', u'eikon_coverage', u'text_coverage', u'text_eikon', u'july_reuters', u'june_reuters', u'reuters_inc', u'reporting_editing', u'group_company', u'group_inc', u'per_share', u'brief_acquires', u'group_group', u'reuters_corp', u'company_coverage', u'reuters_group', u'june_inc', u'private_equity', u'group_acquire', u'july_13', u'july_10', u'company_company', u'june_14', u'common_stock', u'inc_inc', u'june_20', u'inc_company', u'reuters_co', u'brief_acquire', u'mln_reuters', u'equity_firm', u'private_firm', u'july_inc', u'july_19', u'july_26', u'inc_group', u'july_5', u'acquire_business', u'said_statement', u'june_group', u'european_commission', u'brussels_reuters', u'approximately_million', u'million_source', u'group_july', u'brief_group', u'share_stock', u'july_4', u'june_29', u'company_source', u'inc_acquire', u'june_23', u'general_electric', u'co_june', u'eu_approval', u'2_billion'

In [190]:
# Export 
n = 0
combined = []
for _ in TOP_KEYWORDS:
    combined += _

combined_df = pd.DataFrame(combined)
combined_df = pd.DataFrame(combined_df[0].value_counts().sort_values()).reset_index()
combined_df.columns = ['keyword', 'c']

save_to = "exported/skipgram_4_%s.csv" % KEYWORD_TYPE
combined_df[['keyword']].to_csv(save_to, index=False, encoding='utf-8')
print 'Saved to ', save_to

Saved to  exported/skipgram_4_m_a_reuter.csv


In [191]:
# Break the notebook 
break

SyntaxError: 'break' outside loop (<ipython-input-191-da59c82888d2>, line 2)

# TF-IDF

In [None]:
NUM_CLUSTERS = 10
NUM_TERMS = 50

In [None]:
tfidf = gensim.models.tfidfmodel.TfidfModel(corpus, id2word=dictionary)
corpus_tfidf = tfidf[corpus]
tfidf.save("model/tfidf.save")

In [None]:
corpus_tfidf_sorted = []
TOP_KEYWORDS = []
for c in corpus_tfidf:
    top_keywords = sorted(c, key=lambda t: t[1], reverse=True)
    top_keywords = top_keywords[:NUM_TERMS]
    corpus_tfidf_sorted.append(top_keywords)
    TOP_KEYWORDS.append([dictionary[id] for id, _ in top_keywords])

In [None]:
print TOP_KEYWORDS[:2]

In [None]:
# Combine top keywords
import itertools
TOP_KEYWORDS_MERGED = list(itertools.chain(*TOP_KEYWORDS))
print "Total keywords: ", len(TOP_KEYWORDS_MERGED)
TOP_KEYWORDS_MERGED = list(set(TOP_KEYWORDS_MERGED))
print "Total keywords after combined: ", len(TOP_KEYWORDS_MERGED)
print TOP_KEYWORDS_MERGED[:10]

In [None]:
# Export 
save_to = "exported/tfidf/tfidf_keywords.csv"
pd.DataFrame(TOP_KEYWORDS_MERGED, columns=['keyword']).to_csv(save_to, index=False, encoding='utf-8')
print 'Saved to ', save_to

# TF-IDF with K-means

In [None]:
# NUM_CLUSTERS = 10
# NUM_TERMS = 100

In [None]:
# from sklearn.cluster import KMeans
# from scipy.sparse import csr_matrix
# import numpy as np

In [None]:
# _corpus_tfidf = gensim.matutils.corpus2csc(corpus_tfidf).transpose()
# print _corpus_tfidf[:5]

In [None]:
# kmeans = KMeans(n_clusters=NUM_CLUSTERS)
# X = kmeans.fit_transform(_corpus_tfidf)

In [None]:
# X.shape

In [None]:
# dir(kmeans)
# kmeans.n_clusters