# Import data from webhose.io

In [1]:
import os
import json
import webhoseio

data_path = './data'
model_path = './model'

### Useful function to write and read json files

In [2]:
def store_data(filename, data): 
    data_file = os.path.join(data_path, filename)
    with open(data_file, 'w') as outfile:
        json.dump(data, outfile)

def load_data(filename): 
    data_file = os.path.join(data_path, filename)
    with open(data_file) as json_data:
        data = json.load(json_data)
        return data

### Query to webhose.io

In [3]:
webhoseio.config(token="1a815770-8785-4596-a11c-09bdae034336")
query_params = {
    "q": "language:english site_type:news site_category:media organization:Microsoft", 
    "ts": "1555697996143",
    "sort": "crawled"
}

In [4]:
output = webhoseio.query("filterWebContent", query_params)
feeds = [ item for item in output['posts']]
while output['moreResultsAvailable'] > 0:
    output = webhoseio.get_next()
    feeds += [ item for item in output['posts']]

In [8]:
store_data('microsoft_0504_0604.json', feeds)

In [4]:
news_list = load_data('microsoft_0504_0604.json')
print('* Total number of news articles: {}'.format(len(news_list)))
print('* Date range: from {} to {}'.format(
    min([news_list[x]['published'] for x in range(len(news_list))]), 
    max([news_list[x]['published'] for x in range(len(news_list))])))

* Total number of news articles: 7948
* Date range: from 2019-05-04T03:00:00.000+03:00 to 2019-06-04T08:41:00.000+03:00



# Deduplicate titles 

In [5]:
import re
import time
import warnings
import numpy as np
import pandas as pd 

warnings.filterwarnings("ignore")

import nltk
from nltk.stem.wordnet import WordNetLemmatizer
stopwords = set(nltk.corpus.stopwords.words('english'))

from gensim.models import KeyedVectors
from simhash import Simhash, SimhashIndex

### Useful function to cleanup text and load word2vec 

In [19]:
def cleanup_text(text):
    text = re.sub(r'http\S+', '', text)
    text = text.replace("'s", " ")
    text = text.replace("n't", " not ")
    text = text.replace("'ve", " have ")
    text = text.replace("'re", " are ")
    text = text.replace("I'm"," I am ")
    text = text.replace("you're"," you are ")
    text = text.replace("You're"," You are ")
    text = text.replace("-"," ")
    text = text.replace("/"," ")
    text = text.replace("("," ")
    text = text.replace(")"," ")
    text = text.replace("%"," percent ")
    text = re.sub(r'[^a-zA-Z0-9 ]', '', text)
    text = " ".join([i for i in text.lower().split() if i not in stopwords])
    token = [WordNetLemmatizer().lemmatize(i) for i in text.split()]
    return token

def load_wordvec_model(modelName, modelFile, flagBin):
    model = KeyedVectors.load_word2vec_format(os.path.join(model_path, modelFile), binary=flagBin)
    return model

### Load word2vec and webhose data

In [17]:
model_w2v_AP = load_wordvec_model('Word2Vec Google News', 'GoogleNews-vectors-negative300.bin.gz', True)
data = load_data('microsoft_0504_0604.json')

### Get titles, cleanup text, and apply Simhash

In [20]:
tot_title = len(data)
title_list = [' '.join(cleanup_text(str(data[i]['title']))) for i in range(tot_title)]
objs = [(i, Simhash(title_list[i])) for i in range(tot_title)]

#### Function checks whether the input words are present in the vocabulary for the model

In [21]:
def vocab_check(vectors, words):
    output = list()
    for word in words:
        if word in vectors.vocab:
            output.append(word.strip())
    return output

#### Function calculates similarity between two strings using a particular word vector model

In [22]:
def calc_similarity(input1, input2, vectors):
    s1words = set(vocab_check(vectors, input1.split()))
    s2words = set(vocab_check(vectors, input2.split()))
    
    output = vectors.n_similarity(s1words, s2words)
    return output

### Remove duplicate titles

In [23]:
def remove_duplicate(index, title_list, index_simahs, model_w2v_AP):
    duplicates = set()
    title = title_list[index]
    
    # calculate  hash value
    title_hash = Simhash(title)
    
    # find all duplicate indices
    dup_indices = index_simahs.get_near_dups(title_hash)

    # apply word2vec 
    for dupi in dup_indices:
        if int(dupi) == index: 
            continue
        try:
            score = calc_similarity(title, title_list[int(dupi)], model_w2v_AP)
        except:
            score = 0
        if score > 0.55:
            duplicates.add(int(dupi))
    return duplicates 

distance = 15
index_simahs = SimhashIndex(objs, k=distance)

index = 6241
print('Title: \n' + data[index]['title'] + '\n\nDuplicates:')
for i in remove_duplicate(index, title_list, index_simahs, model_w2v_AP): 
    print(data[i]['title'])

Title: 
Chinese Military Ditching Microsoft Windows To Avoid CIA's 'Hefty Arsenal Of Hacking Tools'

Duplicates:
Chinese Military Ditching Microsoft Windows To Avoid CIA’s ‘Hefty Arsenal Of Hacking Tools’
Chinese Military Ditching Microsoft Windows to Avoid CIA's Arsenal of Hacking Tools


In [24]:
start = time.clock()
duplicates = set()
for index in range(tot_title): 
    if index in duplicates: 
        continue 
    new_duplicates = remove_duplicate(index, title_list, index_simahs, model_w2v_AP);
    duplicates = duplicates.union(new_duplicates)
    if index % 2000 == 0 and index != 0: 
        print(str(index) + '/' + str(tot_title), len(duplicates), time.clock() - start)
    
new_feeds = data.copy()
for dup in sorted([int(dup) for dup in duplicates], reverse=True):
    del new_feeds[dup]

2000/7948 1036 141.879877
4000/7948 1819 251.23051999999998
6000/7948 2471 364.603786


In [None]:
store_data('microsoft_0504_0604_clean.json', new_feeds)

In [9]:
news_cleaned_list = load_data('microsoft_0504_0604_clean.json')
print('* Total number of news articles once removed duplicates: {}'.format(len(news_cleaned_list)))
print('* Date range: from {} to {}'.format(
    min([news_list[x]['published'] for x in range(len(news_cleaned_list))]), 
    max([news_list[x]['published'] for x in range(len(news_cleaned_list))])))

* Total number of news articles once removed duplicates: 5021
* Date range: from 2019-05-04T03:00:00.000+03:00 to 2019-05-23T18:57:00.000+03:00


# Name Entity Recognition using IBM Watson

In [10]:
from watson_developer_cloud import NaturalLanguageUnderstandingV1
from watson_developer_cloud.natural_language_understanding_v1 import Features, EntitiesOptions, KeywordsOptions

In [11]:
data = load_data('microsoft_0504_0604_clean.json') 

### Query to IBM Watson

In [12]:
natural_language_understanding = NaturalLanguageUnderstandingV1(
    version='2018-11-16',
    iam_apikey='N9R1KBh-8CJVJWSMGHB_lLG-Sq03-BurNjDc_dh5l9Id',
    url='https://gateway.watsonplatform.net/natural-language-understanding/api/v1/analyze?version=2018-11-16'
)

  after removing the cwd from sys.path.


In [17]:
def get_title_WATSON_entities(text):
    entities_dict = dict()
    r = natural_language_understanding.analyze(
        text=text, features=Features(entities=EntitiesOptions(sentiment=True, limit=10))).get_result()
    for entity in r['entities']:
        entities_dict[entity['type']] = entity['text']
    return entities_dict

title = data[11]['title']
print(title)
get_title_WATSON_entities(title)

CBSE 10th Result 2019: Candidates can use alternative methods to provided by Google, Microsoft if official website is down


{'Company': 'Microsoft', 'JobTitle': 'official', 'Organization': 'CBSE'}

In [18]:
for i in range(len(data)): 
    title = data[i]['title']
    try:
        entities = get_title_WATSON_entities(title)
    except Exception:
        entities = dict()
    data[i]['title_entities'] = entities

In [24]:
store_data('microsoft_0504_0604_clean_with_entities.json', data)

In [80]:
news_cleaned_list = load_data('microsoft_0504_0604_clean_with_entities.json')
entity_title_list = [news['title_entities'] for news in news_cleaned_list if len(news['title_entities'].keys()) > 0]
tot_entities_dict = dict()
for entity_title in entity_title_list: 
    for entity in entity_title:
        if entity in tot_entities_dict.keys():
            tot_entities_dict[entity] += [entity_title[entity]]
        else:
            tot_entities_dict[entity] = [entity_title[entity]]
print('* Total number of news entities: {}'.format(len(tot_entities_dict)) + '\n')
print('* List of entities find in titles:\n -' + '\n -'.join(list(tot_entities_dict.keys())) + '\n')
print('* List of words associated with Broadcaster:\n -' + '\n -'.join(list(set(tot_entities_dict['Broadcaster']))))

* Total number of news entities: 27

* List of entities find in titles:
 -Person
 -Company
 -JobTitle
 -Location
 -GeographicFeature
 -Organization
 -Quantity
 -PrintMedia
 -Facility
 -Sport
 -Broadcaster
 -Number
 -Drug
 -Hashtag
 -HealthCondition
 -TwitterHandle
 -IPAddress
 -Movie
 -Date
 -Crime
 -Measure
 -MusicGroup
 -Money
 -TelevisionShow
 -Vehicle
 -Award
 -Percent

* List of words associated with Broadcaster:
 -ABC
 -BBC News
 -HBO
 -CNN
 -NBC
 -KTLA
 -CBS
 -FOX News
 -Fox News
 -TMZ


# Topic cluster using LDA

In [None]:
from gensim.models import ldamodel
from gensim.corpora.dictionary import Dictionary
import pyLDAvis.gensim

In [None]:
titles = [[data[i]['title'], data[i]['published'][:10]]  for i in range(len(data))]
df_feeds = pd.DataFrame(titles,columns=['title', 'date'])
titles = df_feeds[['title']].applymap(cleanup_text)['title']

In [None]:
dictionary = Dictionary(titles)
dictionary.filter_extremes(no_below=20, no_above=0.8)
corpora = [dictionary.doc2bow(doc) for doc in titles]

# Running and Trainign LDA model on the document term matrix.
lda_model = ldamodel.LdaModel(corpora, num_topics=7, id2word = dictionary, passes=50)

In [None]:
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim.prepare(lda_model, corpora, dictionary)
vis

In [None]:
# https://stackoverflow.com/questions/41819761/pyldavis-visualization-of-pyspark-generated-lda-model

# Topic cluster