In [2]:
import warnings
warnings.filterwarnings("ignore") # suppress every warning raised from this point on

## 1. Preparing the dataset

### 1.1 Scraping news articles from the web

This process takes between 2 and 15 minutes on average, depending on how many website links are scraped, how many articles are found behind these links, and how many computing resources are available on the machine that runs the code.

In [4]:
import feedparser as fp
import newspaper
from newspaper import Article
import time
from time import mktime
from datetime import datetime
from datetime import date
import pandas as pd
import json
import pprint
import dateutil

#### 1 Website data ####

## 1A ##  From JSON file - for final version

with open('NewsPapers.json') as data_file: # loads the JSON file with the news source URLs
    companies = json.load(data_file)

## 1B ## From variable - this is for testing, makes it way faster
website = {"cnn": {"rss": "http://rss.cnn.com/rss/cnn_topstories.rss"},
          "cnbc":{"rss": "https://www.cnbc.com/id/10000664/device/rss/rss.html"}}


#### 2 Today's date - for filtering the articles by today's date ####
today = str(date.today())
print("Today's date:", today)


#### 3 Scraping the news articles ####

# `import dateutil` alone does not guarantee that the `parser` submodule is loaded,
# so it is imported explicitly before `dateutil.parser.parse` is used below.
import dateutil.parser

# one list per feature; all lists grow in lock-step (one element per scraped entry)
text_list = []
source_list = []
article_list = []
date_list = []
time_list = []
title_list = []

for source, value in website.items(): # if website is changed to companies, it scrapes from JSON file ! takes time !!
    feed = fp.parse(value['rss'])
    for entry in feed.entries:
        if not hasattr(entry, 'published'):
            continue # entries without a publication date are skipped

        source_list.append(source)

        # article URL
        article_list.append(entry.link)

        # publication date and time
        # NOTE: the parsed value is deliberately NOT called `date`, which would
        # shadow the `datetime.date` class imported at the top of the cell.
        published = dateutil.parser.parse(entry.published)
        date_list.append(published.strftime("%Y-%m-%d"))
        time_list.append(published.strftime("%H:%M:%S %Z")) # hour:minute:second and timezone name

        # title and body text
        content = Article(entry.link)
        try:
            content.download() # downloading article content (the slow part of the loop)
            content.parse()
        except Exception as e:
            # best effort: report the error and carry on; whatever title/text the
            # Article object holds is still appended below so all lists keep the same length
            print(e)
            print("continuing...")
        title_list.append(content.title)
        text_list.append(content.text)

# one single-column dict per feature (source_dict is reused by the cleaning step)
source_dict = {'source':source_list}
link_dict = {'link':article_list}
date_dict = {'published_date':date_list}
time_dict = {'published_time':time_list}
title_dict = {'title':title_list}
text_dict = {'text':text_list}

# one single-column DataFrame per feature
source_df = pd.DataFrame(source_dict)
link_df = pd.DataFrame(link_dict)
date_df = pd.DataFrame(date_dict)
time_df = pd.DataFrame(time_dict)
title_df = pd.DataFrame(title_dict)
text_df = pd.DataFrame(text_dict)

# put all feature columns side by side (they share the same default integer index)
news_df = pd.concat([source_df, link_df, date_df, time_df, title_df, text_df], axis=1)

# after running, the pandas DataFrame `news_df` should exist with the columns
# source, link, published_date, published_time, title and text

Today's date: 2019-11-12
Article `download()` failed with HTTPConnectionPool(host='rss.cnn.com', port=80): Max retries exceeded with url: /~r/rss/cnn_topstories/~3/lEBZaEYAn7Q/index.html (Caused by ConnectTimeoutError(<urllib3.connection.HTTPConnection object at 0x000001D199D68E10>, 'Connection to rss.cnn.com timed out. (connect timeout=7)')) on URL http://rss.cnn.com/~r/rss/cnn_topstories/~3/lEBZaEYAn7Q/index.html
continuing...


### 1.2. Filtering and cleaning the dataset

In order to run some analysis on the titles and text content of the articles, we need to clean them.
We first filter all the articles we scraped by today's date. 
For cleaning the titles and article content text, we go through the following steps:

*  remove stopwords (i.e. "a", "for", "when", "you", "if",... etc. that would impact the accuracy of our similarity analysis)
*  remove punctuation
*  remove numbers
*  remove names of the source website in the article text (we noticed that e.g. CNN often mentions "CNN" in their articles, which would impact the accuracy of our similarity analysis)
*  make the sentences lower case

In [15]:
import re

# List of english stopwords
from nltk.corpus import stopwords
stop = stopwords.words('english')

# Mapping used to strip the names of the source websites from the article texts.
# The names are regex-escaped because the replacement below runs with regex=True.
sources_list = list(source_dict.values())
sources_set = {name for names in sources_list for name in names}
sources_to_replace = {re.escape(name): "" for name in sources_set} # replace every source with "" nothing

# Set membership is O(1) per word, a list lookup would scan all stopwords every time
stop_words = set(stop)

# Cleaning the dataframe
news_df_daily = news_df[news_df.published_date == today] # filter by today's date
news_df_daily = news_df_daily.reset_index(drop=True) # resetting the index

news_df_daily["clean_title"] = news_df_daily["title"].str.lower()
news_df_daily["clean_text"] = news_df_daily["text"].str.lower()

# Filter out the stopwords
news_df_daily['clean_title'] = news_df_daily['clean_title'].apply(
    lambda x: ' '.join(word for word in x.split() if word not in stop_words))
news_df_daily['clean_text'] = news_df_daily['clean_text'].apply(
    lambda x: ' '.join(word for word in x.split() if word not in stop_words))

# regex=True is passed explicitly: pandas >= 2.0 treats the pattern as a literal
# string by default, which would silently leave punctuation and digits in place.
news_df_daily["clean_title"] = (
    news_df_daily["clean_title"]
    .str.replace(r'[^\w\s]', '', regex=True) # remove punctuation from titles
    .str.replace(r'\d+', '', regex=True) # remove numbers from titles
)

news_df_daily["clean_text"] = (
    news_df_daily["clean_text"]
    .str.replace(r'[^\w\s]', '', regex=True) # remove punctuation from texts
    .str.replace(r'\d+', '', regex=True) # remove numbers from texts
    .replace(sources_to_replace, regex=True) # remove source website names in text
)

news_df_daily.head()

Unnamed: 0,source,link,published_date,published_time,title,text,clean_title,clean_text
0,cnn,http://rss.cnn.com/~r/rss/cnn_topstories/~3/7e...,2019-11-11,22:46:01 UTC,Pentagon official testifies that she was told ...,(CNN) A key Pentagon official told House impea...,pentagon official testifies told ukrainians al...,key pentagon official told house impeachment ...
1,cnn,http://rss.cnn.com/~r/rss/cnn_topstories/~3/Co...,2019-11-11,22:45:11 UTC,READ: Impeachment testimony of former Ukraine ...,(CNN) The House committees running the impeach...,read impeachment testimony former ukraine aide...,house committees running impeachment inquiry ...
2,cnn,http://rss.cnn.com/~r/rss/cnn_topstories/~3/F5...,2019-11-11,21:37:12 UTC,Impeachment witness says in court filing Mulva...,(CNN) Impeachment witness Charles Kupperman di...,impeachment witness says court filing mulvaney...,impeachment witness charles kupperman distanc...
3,cnn,http://rss.cnn.com/~r/rss/cnn_topstories/~3/uO...,2019-11-11,19:36:50 UTC,Donald Trump Jr. leaves stage after protests a...,Life beyond Netflix: What you should know abou...,donald trump jr leaves stage protests ucla event,life beyond netflix know new wave streaming
4,cnn,http://rss.cnn.com/~r/rss/cnn_topstories/~3/hH...,2019-11-11,23:00:26 UTC,5 times Capitol Hill testimony left its mark o...,(CNN) A news public phase of the impeachment i...,times capitol hill testimony left mark trump ...,news public phase impeachment inquiry kicks w...


## 2. Analyzing the dataset

In this step, we apply several different analysis methods, in order to define which articles out of those we scraped are **most relevant** for portfolio trading customers and **cover trending financial topics**.

### 2.1. Cosine similarity

Cosine similarity is a metric for measuring the similarity between two sentences. It turns the sentences into numeric vectors and measures the **cosine of the angle between them**.

<img src="https://wikimedia.org/api/rest_v1/media/math/render/svg/1d94e5903f7936d3c131e040ef2c51b473dd071d" alt="Cosine similarity formula" title="Cosine similarity formula" />

where
* A ........... vector A
* A • B ..... dot product between vector A and B
* | A | ....... length of vector A


We apply this measure for both the title and the texts.

#### 2.1.A. Cosine similarity: titles

In [16]:
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer #for creating count vectors
from sklearn.metrics.pairwise import cosine_similarity #cosine similarity calculator

# for the analysis, we need a list of all the cleaned titles
clean_titles_list = list(news_df_daily['clean_title'])

count_vectorizer = CountVectorizer()
count_matrix_title = count_vectorizer.fit_transform(clean_titles_list) # creates the count vectors (sparse matrix)

# get_feature_names() was removed in scikit-learn 1.2; use get_feature_names_out()
# when it is available and fall back to the old method on older installations
if hasattr(count_vectorizer, "get_feature_names_out"):
    title_vocabulary = count_vectorizer.get_feature_names_out()
else:
    title_vocabulary = count_vectorizer.get_feature_names()

# dense pandas dataframe: one row per title, one column per vocabulary word
count_matrix_title = pd.DataFrame(count_matrix_title.toarray(), columns=title_vocabulary)

# apply cosine similarity on the count vector dataframe (every title against every title)
df_cosim_title = pd.DataFrame(cosine_similarity(count_matrix_title, count_matrix_title))
df_cosim_title.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,57,58,59,60,61,62,63,64,65,66
0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,1.0,0.117851,0.0,0.125,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.117851,1.0,0.0,0.0,0.0,0.0,0.172133,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,1.0,0.125,0.0,0.125,0.091287,0.0,0.133631,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.125,0.0,0.0
4,0.0,0.125,0.0,0.125,1.0,0.0,0.125,0.091287,0.0,0.133631,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


#### 2.1.B. Cosine similarity: texts

In [17]:
# for the analysis, we need a list of all the cleaned texts
clean_texts_list = list(news_df_daily['clean_text'])

count_vectorizer = CountVectorizer()
count_matrix_text = count_vectorizer.fit_transform(clean_texts_list) # creates the count vectors (sparse matrix)
#count_matrix_text.shape

# get_feature_names() was removed in scikit-learn 1.2; use get_feature_names_out()
# when it is available and fall back to the old method on older installations
if hasattr(count_vectorizer, "get_feature_names_out"):
    text_vocabulary = count_vectorizer.get_feature_names_out()
else:
    text_vocabulary = count_vectorizer.get_feature_names()

# dense pandas dataframe: one row per text, one column per vocabulary word
count_matrix_text = pd.DataFrame(count_matrix_text.toarray(), columns=text_vocabulary)

# apply cosine similarity on the count vector dataframe (every text against every text)
df_cosim_texts = pd.DataFrame(cosine_similarity(count_matrix_text, count_matrix_text))
df_cosim_texts.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,57,58,59,60,61,62,63,64,65,66
0,1.0,0.333087,0.12359,0.021398,0.184121,0.026965,0.147046,0.104579,0.093073,0.021398,...,0.095805,0.057268,0.076074,0.06121,0.075524,0.033434,0.050307,0.056995,0.089991,0.095805
1,0.333087,1.0,0.157732,0.0,0.149822,0.040032,0.101154,0.17596,0.080603,0.0,...,0.032378,0.009017,0.0,0.0,0.0,0.0,0.030753,0.013162,0.018893,0.008095
2,0.12359,0.157732,1.0,0.0,0.086521,0.023453,0.087835,0.257721,0.108611,0.0,...,0.047423,0.026415,0.022515,0.032997,0.037083,0.01454,0.018017,0.034701,0.035974,0.045052
3,0.021398,0.0,0.0,1.0,0.024749,0.128586,0.048737,0.119689,0.031068,1.0,...,0.0156,0.017379,0.021162,0.038311,0.0,0.0,0.05927,0.0,0.018206,0.031201
4,0.184121,0.149822,0.086521,0.024749,1.0,0.04901,0.209882,0.281084,0.093297,0.024749,...,0.124324,0.056202,0.080657,0.053098,0.049782,0.024859,0.049059,0.045413,0.094623,0.083783


### 2.2. Soft cosine similarity measure

A metric for measuring the similarity between two sentences that gives **higher scores for words with similar meaning**. For example, ‘President’ vs ‘Prime minister’, ‘Food’ vs ‘Dish’, ‘Hi’ vs ‘Hello’ are considered similar. 

<img src="https://wikimedia.org/api/rest_v1/media/math/render/svg/9743aceb346ccb501ceaef15a46570d1ba8a6a1b" alt="Soft cosine formula" title="Soft cosine formula" />

where
* sij .... similarity (feature i, feature j)

**Difference to cosine similarity**: the traditional cosine similarity considers the vector space model (VSM i.e. features, unique words) features as independent or completely different, while the soft cosine measure proposes considering the similarity of features in VSM, which help generalize the concept of cosine (and soft cosine) as well as the idea of (soft) similarity. https://en.wikipedia.org/wiki/Cosine_similarity

This implies that we need some vector defining the similarity between words i.e. vectors of words that are similar. 
In our case we are going to use the pretrained `fasttext-wiki-news-subwords-300` vector dataset containing 1 million word vectors trained on Wikipedia 2017. More info here: https://github.com/RaRe-Technologies/gensim-data/releases/tag/fasttext-wiki-news-subwords-300

_**Side note:** other pre-trained models to be found here: https://github.com/RaRe-Technologies/gensim-data/releases_

#### 2.2.A. Soft cosine measure: titles

In [19]:
import gensim
from gensim.matutils import softcossim 
from gensim import corpora
import gensim.downloader as api
from gensim.utils import simple_preprocess
from gensim.models import Word2Vec

In [20]:
### ! ### this will download a file to your hard drive ### ! ###

# first we need to download the FastText model - about 960MB
# if it is already downloaded on this machine it is only loaded, which is a little faster - around 2-5min
fasttext_model300 = api.load('fasttext-wiki-news-subwords-300')

In [21]:
# testing the word vectors from the model
fasttext_model300.most_similar(positive="democrat") # outputs the words most similar to this one, each with its similarity score
#fasttext_model300.similarity("democrat", "republican") # outputs the computed similarity between the two words

[('democrats', 0.7794002294540405),
 ('democrate', 0.7524039149284363),
 ('republican', 0.7467405200004578),
 ('anti-democrat', 0.7122665047645569),
 ('social-democrat', 0.7080994844436646),
 ('Democrat', 0.7080677151679993),
 ('socalist', 0.6955678462982178),
 ('democratic', 0.6946688890457153),
 ('liberalist', 0.6911271810531616),
 ('democratic-socialist', 0.688860297203064)]

In [22]:
# tokenize every cleaned title, then build the dictionary that maps each unique word to an id
tokenized_titles = list(map(simple_preprocess, clean_titles_list))
dictionary_titles = corpora.Dictionary(tokenized_titles)

# generate the sparse word-similarity matrix for the words in the dictionary
# (this step takes a while to compute)
similarity_matrix_titles = fasttext_model300.similarity_matrix(
    dictionary_titles,
    tfidf=None,
    threshold=0.0,
    exponent=2.0,
    nonzero_limit=100,
)

In [26]:
# convert the titles into bag-of-words vectors through a function
def convert_bow(sentences, dictionary=None):
    """Convert every sentence into a bag-of-words vector.

    Parameters
    ----------
    sentences : iterable of str
        The sentences to convert; each one is tokenized with `simple_preprocess`.
    dictionary : optional
        Object providing `doc2bow`; defaults to the module-level `dictionary_titles`.

    Returns
    -------
    list
        One `doc2bow` result per sentence, in input order.

    Notes
    -----
    For backward compatibility the result is also stored in the global
    `sent_bow`, which is how earlier versions of this function handed back
    their result. Prefer using the return value.
    """
    global sent_bow
    if dictionary is None:
        dictionary = dictionary_titles
    sent_bow = [dictionary.doc2bow(simple_preprocess(sentence)) for sentence in sentences]
    return sent_bow

sent_bow = convert_bow(clean_titles_list)

# create the soft cosine measure matrix through a function
def create_soft_cossim_matrix(sentences, similarity_matrix=None):
    """Create a matrix with the pairwise soft cosine measure of all sentences.

    The calculation takes into account the word-similarity sparse matrix that
    was built (from the FastText model) for the unique words in our dictionary.

    Parameters
    ----------
    sentences : sequence
        Bag-of-words vectors, one per sentence.
    similarity_matrix : optional
        Word-similarity matrix handed to `softcossim`; defaults to the
        module-level `similarity_matrix_titles`.

    Returns
    -------
    pandas.DataFrame
        Square frame (nr of sentences x nr of sentences); the cell in row i and
        column j holds softcossim(sentences[i], sentences[j]) rounded to 2 decimals.
    """
    if similarity_matrix is None:
        similarity_matrix = similarity_matrix_titles
    n_sentences = len(sentences)
    rows = [
        [round(softcossim(sentences[i], sentences[j], similarity_matrix), 2)
         for j in range(n_sentences)]
        for i in range(n_sentences)
    ]
    return pd.DataFrame(rows)

soft_cossim_mat_titles = create_soft_cossim_matrix(sent_bow)

In [27]:
soft_cossim_mat_titles.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,57,58,59,60,61,62,63,64,65,66
0,1.0,0.18,0.08,0.0,0.05,0.0,0.0,0.03,0.0,0.0,...,0.07,0.0,0.0,0.0,0.0,0.0,0.0,0.04,0.0,0.04
1,0.18,1.0,0.17,0.0,0.16,0.0,0.0,0.0,0.06,0.05,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.08,0.17,1.0,0.0,0.09,0.0,0.0,0.31,0.02,0.0,...,0.06,0.0,0.0,0.0,0.0,0.0,0.0,0.05,0.04,0.0
3,0.0,0.0,0.0,1.0,0.17,0.0,0.12,0.12,0.03,0.12,...,0.05,0.03,0.0,0.0,0.0,0.0,0.0,0.11,0.06,0.0
4,0.05,0.16,0.09,0.17,1.0,0.0,0.12,0.11,0.07,0.13,...,0.0,0.03,0.0,0.0,0.04,0.0,0.0,0.0,0.03,0.0


#### 2.2.B. Soft cosine measure: texts

**! Be aware !** 

When you run the cell below - even when having only around 50 articles - the creation of a unique word dictionary and especially the corresponding similarity matrix for article texts takes at least 2 to 5min. 

This waiting time cannot be skipped for the soft cosine comparison of texts, since it simply takes a lot of resources to compute. If you want to time exactly how long it takes, see the section _X. Other stuff that could be helpful in the future_ below - it contains code for timing the run time of a piece of code. :-)

In [28]:
# tokenize every cleaned text, then build the dictionary that maps each unique word to an id
tokenized_texts = list(map(simple_preprocess, clean_texts_list))
dictionary_texts = corpora.Dictionary(tokenized_texts)

# generate the sparse word-similarity matrix for the words in the dictionary
# (this step takes a while to compute)
similarity_matrix_texts = fasttext_model300.similarity_matrix(
    dictionary_texts,
    tfidf=None,
    threshold=0.0,
    exponent=2.0,
    nonzero_limit=100,
)

In [29]:
# convert the texts into bag-of-words vectors through a function
def convert_bow(sentences, dictionary=None):
    """Convert every sentence into a bag-of-words vector.

    Parameters
    ----------
    sentences : iterable of str
        The sentences to convert; each one is tokenized with `simple_preprocess`.
    dictionary : optional
        Object providing `doc2bow`; defaults to the module-level `dictionary_texts`.

    Returns
    -------
    list
        One `doc2bow` result per sentence, in input order.

    Notes
    -----
    For backward compatibility the result is also stored in the global
    `sent_bow`, which is how earlier versions of this function handed back
    their result. Prefer using the return value.
    """
    global sent_bow
    if dictionary is None:
        dictionary = dictionary_texts
    sent_bow = [dictionary.doc2bow(simple_preprocess(sentence)) for sentence in sentences]
    return sent_bow

sent_bow = convert_bow(clean_texts_list)

# create the soft cosine measure matrix through a function
def create_soft_cossim_matrix(sentences, similarity_matrix=None):
    """Create a matrix with the pairwise soft cosine measure of all sentences.

    The calculation takes into account the word-similarity sparse matrix that
    was built (from the FastText model) for the unique words in our dictionary.

    Parameters
    ----------
    sentences : sequence
        Bag-of-words vectors, one per sentence.
    similarity_matrix : optional
        Word-similarity matrix handed to `softcossim`; defaults to the
        module-level `similarity_matrix_texts`.

    Returns
    -------
    pandas.DataFrame
        Square frame (nr of sentences x nr of sentences); the cell in row i and
        column j holds softcossim(sentences[i], sentences[j]) rounded to 2 decimals.
    """
    if similarity_matrix is None:
        similarity_matrix = similarity_matrix_texts
    n_sentences = len(sentences)
    rows = [
        [round(softcossim(sentences[i], sentences[j], similarity_matrix), 2)
         for j in range(n_sentences)]
        for i in range(n_sentences)
    ]
    return pd.DataFrame(rows)

soft_cossim_mat_texts = create_soft_cossim_matrix(sent_bow)

In [30]:
soft_cossim_mat_texts.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,57,58,59,60,61,62,63,64,65,66
0,1.0,0.41,0.22,0.08,0.41,0.18,0.4,0.24,0.22,0.08,...,0.31,0.22,0.18,0.27,0.25,0.12,0.2,0.17,0.29,0.28
1,0.41,1.0,0.22,0.05,0.27,0.09,0.15,0.2,0.19,0.05,...,0.11,0.09,0.03,0.04,0.07,0.02,0.08,0.03,0.05,0.04
2,0.22,0.22,1.0,0.03,0.27,0.12,0.24,0.42,0.16,0.03,...,0.19,0.16,0.1,0.12,0.13,0.06,0.1,0.07,0.15,0.14
3,0.08,0.05,0.03,1.0,0.07,0.15,0.15,0.11,0.05,1.0,...,0.06,0.06,0.06,0.07,0.1,0.03,0.09,0.04,0.09,0.13
4,0.41,0.27,0.27,0.07,1.0,0.2,0.44,0.47,0.28,0.07,...,0.32,0.25,0.18,0.21,0.21,0.13,0.19,0.15,0.27,0.25


## 3. Results: extracting most similar articles

After finding some results for the similarity in our scraped articles, we have to **filter the similar articles out of our initial** `news_df_daily` **dataframe**, in order to find out the title and article text.

We want to extract only articles that reach some predefined minimum similarity, e.g. we only want **articles that have a similarity of at least 0.7** (this number can vary depending on our choice; the code below uses 0.4 as the lower bound). Since the row indexes and the column numbers in the `soft_cossim_mat` matrices are equal to the indexes of the articles in our initial `news_df_daily` dataframe, we need to filter `news_df_daily` by exactly those indexes whose similarity reaches the minimum value.

In [31]:
# general function to find the row and column labels in a dataframe for a specific value
def get_indexes(dataframe, value):
    """Return the off-diagonal positions at which `value` occurs in `dataframe`.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        The frame to search, e.g. a similarity matrix.
    value : number or str
        The value to look for. A numeric string such as "0.4" also matches
        the number 0.4, so callers may pass either form.

    Returns
    -------
    list of tuple
        (row label, column label) pairs, ordered column by column and, within
        a column, in index order. Positions whose row label equals the column
        label are left out, because the diagonal of a similarity matrix is
        always 1.
    """
    # Look for the value as given and, where it can be read as a number, for its
    # numeric form as well. This avoids relying on pandas silently coercing
    # strings to floats inside `isin`, which newer pandas versions no longer do.
    candidates = [value]
    try:
        candidates.append(float(value))
    except (TypeError, ValueError):
        pass # not numeric: search for the value exactly as given

    hits = dataframe.isin(candidates) # boolean frame, True where the value exists
    pos_list = []
    for col in hits.columns[hits.any().values]: # only columns that contain the value
        for row in hits.index[hits[col].values]: # rows of that column holding the value
            if row != col: # exclude the matrix diagonal
                pos_list.append((row, col))
    return pos_list
    
# similarity levels to look up: from 0.4 up to about 1.0 in steps of 0.01,
# rounded to two decimals and converted to strings
simval = np.around(np.arange(0.4, 1.01, 0.01), decimals=2).astype(str)

# for every similarity level, store the positions that 'get_indexes' finds
# in the title matrix and in the text matrix
dict_pos_titles = {}
dict_pos_texts = {}
for level in simval:
    dict_pos_titles[level] = get_indexes(soft_cossim_mat_titles, level)
    dict_pos_texts[level] = get_indexes(soft_cossim_mat_texts, level)

# function for flattening the stored positions into one list of indexes
def find_indexes(dict_pos, index_list):
    """Append every index stored in `dict_pos` to `index_list`, in place.

    `dict_pos` maps a similarity value to a list of position tuples. Every
    member of every tuple is appended to `index_list`, in iteration order.
    Nothing is returned.
    """
    for positions in dict_pos.values():
        index_list.extend(member for pair in positions for member in pair)

### 3.1. Most similar articles: by similarity of article titles

In [32]:
# gather every article index that takes part in a title match, without repeats
index_list_titles = list()
find_indexes(dict_pos_titles, index_list_titles)
index_list_titles = [*set(index_list_titles)]

# pick those rows, drop identical rows and put them back in index order
matched_by_title = news_df_daily.iloc[index_list_titles, :]
select_articles = matched_by_title.drop_duplicates().sort_index()
select_articles

Unnamed: 0,source,link,published_date,published_time,title,text,clean_title,clean_text
9,cnn,http://rss.cnn.com/~r/rss/cnn_topstories/~3/dm...,2019-11-11,13:19:31 UTC,Veteran: Trump family tone deaf to military co...,Life beyond Netflix: What you should know abou...,veteran trump family tone deaf military community,life beyond netflix know new wave streaming
15,cnn,http://rss.cnn.com/~r/rss/cnn_topstories/~3/VP...,2019-11-11,23:07:48 UTC,Man set on fire in Hong Kong hours after prote...,Anderson Cooper speaks with CNN's Paula Hancoc...,man set fire hong kong hours protester shot,anderson cooper speaks s paula hancocks protes...
25,cnn,http://rss.cnn.com/~r/rss/cnn_topstories/~3/TT...,2019-11-11,23:19:47 UTC,Family of WWII veteran surprised to learn new ...,The family of World War II veteran Lucian Bask...,family wwii veteran surprised learn new detail...,family world war ii veteran lucian baskin surp...
28,cnn,http://rss.cnn.com/~r/rss/cnn_topstories/~3/E9...,2019-11-11,21:24:38 UTC,A World War II submarine that was missing for ...,(CNN) It's been 75 years since the USS Graybac...,world war ii submarine missing years found ok...,years since uss grayback went missing sailo...
29,cnn,http://rss.cnn.com/~r/rss/cnn_topstories/~3/ON...,2019-11-11,21:14:43 UTC,WWII submarine found after being missing for 7...,The Lost 52 Project discovered a WWII US Navy ...,wwii submarine found missing years,lost project discovered wwii us navy submarin...
41,cnn,http://rss.cnn.com/~r/rss/cnn_topstories/~3/HN...,2019-11-11,21:28:55 UTC,Hong Kong man set alight hours after protester...,Hong Kong (CNN) A man has been set on fire in ...,hong kong man set alight hours protester shot ...,hong kong man set fire hong kong hours protes...
51,cnbc,https://www.cnbc.com/2019/11/11/saudi-aramco-s...,2019-11-11,21:44:00 UTC,Saudi Aramco stock could price at volatile tim...,The initial public offering of Saudi Arabia's ...,saudi aramco stock could price volatile time o...,initial public offering saudi arabias big oil ...
52,cnbc,https://www.cnbc.com/2019/11/11/what-happened-...,2019-11-11,21:07:00 UTC,Here's what happened to the stock market on Mo...,"The Dow rose 10.25 points, or 0.04%, to close ...",heres happened stock market monday,dow rose points close sp dipped nasdaq c...
54,cnbc,https://www.cnbc.com/2019/11/11/regulator-prob...,2019-11-11,19:32:00 UTC,Regulator probing Goldman over Apple Card: Gen...,Companies that deploy biased algorithms — even...,regulator probing goldman apple card gender bi...,companies deploy biased algorithms even unkno...
58,cnbc,https://www.cnbc.com/2019/11/11/goldman-wants-...,2019-11-11,17:08:00 UTC,Goldman is looking to fix the flaw that has Ap...,Goldman Sachs is looking into ways that family...,goldman looking fix flaw apple card users clai...,goldman sachs looking ways family members shar...


### 3.2. Most similar articles: by similarity of article texts

In [1]:
# gather every article index that takes part in a text match, without repeats
index_list_texts = list()
find_indexes(dict_pos_texts, index_list_texts)
index_list_texts = [*set(index_list_texts)]

# pick those rows, drop identical rows and put them back in index order
matched_by_text = news_df_daily.iloc[index_list_texts, :]
select_articles = matched_by_text.drop_duplicates().sort_index()
select_articles.head()

NameError: name 'find_indexes' is not defined

### 3.3. Most similar articles: by similarity of article titles and article texts

This filters the sentences by the `simval` defined before and keeps only the titles and the texts that BOTH match the value.

In [34]:
# keep only the article indexes that were matched by title AND by text
index_intersection = set(index_list_titles) & set(index_list_texts)
index_intersection = [*index_intersection]

# pick those rows, drop identical rows and put them back in index order
matched_by_both = news_df_daily.iloc[index_intersection, :]
select_articles = matched_by_both.drop_duplicates().sort_index()
select_articles

Unnamed: 0,source,link,published_date,published_time,title,text,clean_title,clean_text
9,cnn,http://rss.cnn.com/~r/rss/cnn_topstories/~3/dm...,2019-11-11,13:19:31 UTC,Veteran: Trump family tone deaf to military co...,Life beyond Netflix: What you should know abou...,veteran trump family tone deaf military community,life beyond netflix know new wave streaming
15,cnn,http://rss.cnn.com/~r/rss/cnn_topstories/~3/VP...,2019-11-11,23:07:48 UTC,Man set on fire in Hong Kong hours after prote...,Anderson Cooper speaks with CNN's Paula Hancoc...,man set fire hong kong hours protester shot,anderson cooper speaks s paula hancocks protes...
28,cnn,http://rss.cnn.com/~r/rss/cnn_topstories/~3/E9...,2019-11-11,21:24:38 UTC,A World War II submarine that was missing for ...,(CNN) It's been 75 years since the USS Graybac...,world war ii submarine missing years found ok...,years since uss grayback went missing sailo...
29,cnn,http://rss.cnn.com/~r/rss/cnn_topstories/~3/ON...,2019-11-11,21:14:43 UTC,WWII submarine found after being missing for 7...,The Lost 52 Project discovered a WWII US Navy ...,wwii submarine found missing years,lost project discovered wwii us navy submarin...
41,cnn,http://rss.cnn.com/~r/rss/cnn_topstories/~3/HN...,2019-11-11,21:28:55 UTC,Hong Kong man set alight hours after protester...,Hong Kong (CNN) A man has been set on fire in ...,hong kong man set alight hours protester shot ...,hong kong man set fire hong kong hours protes...
51,cnbc,https://www.cnbc.com/2019/11/11/saudi-aramco-s...,2019-11-11,21:44:00 UTC,Saudi Aramco stock could price at volatile tim...,The initial public offering of Saudi Arabia's ...,saudi aramco stock could price volatile time o...,initial public offering saudi arabias big oil ...
52,cnbc,https://www.cnbc.com/2019/11/11/what-happened-...,2019-11-11,21:07:00 UTC,Here's what happened to the stock market on Mo...,"The Dow rose 10.25 points, or 0.04%, to close ...",heres happened stock market monday,dow rose points close sp dipped nasdaq c...
54,cnbc,https://www.cnbc.com/2019/11/11/regulator-prob...,2019-11-11,19:32:00 UTC,Regulator probing Goldman over Apple Card: Gen...,Companies that deploy biased algorithms — even...,regulator probing goldman apple card gender bi...,companies deploy biased algorithms even unkno...
58,cnbc,https://www.cnbc.com/2019/11/11/goldman-wants-...,2019-11-11,17:08:00 UTC,Goldman is looking to fix the flaw that has Ap...,Goldman Sachs is looking into ways that family...,goldman looking fix flaw apple card users clai...,goldman sachs looking ways family members shar...
59,cnbc,https://www.cnbc.com/2019/11/11/stocks-making-...,2019-11-11,16:51:00 UTC,Stocks making the biggest moves midday: Walgre...,Check out the companies making headlines midda...,stocks making biggest moves midday walgreens t...,check companies making headlines midday walgre...


# X. Other stuff that could be helpful in the future

## Time how long a code takes to execute

Could be used for speed comparison of two similarity methods

In [None]:
import timeit

code_to_test = """

"""
elapsed_time = timeit.timeit(code_to_test, number=100)/100
print(elapsed_time)

## Google word meaning vector, pre-trained

Maybe useful, some time?

Other pre-trained models to be found here: https://github.com/RaRe-Technologies/gensim-data/releases

In [None]:
# loads the pre-trained "word2vec-google-news-300" vectors through the gensim downloader (`api`)
model = api.load("word2vec-google-news-300") # about 1.6GB to download

## Splitting each word in title/text in pandas df to a separate column

Maybe useful, some time?

Code was hard to find via google haha

In [None]:
# The `.str` accessor only exists on a single column (Series), not on a whole
# DataFrame, so a text column has to be selected first. The titles are used
# here; any other text column (e.g. "text") can be substituted.
split = news_df_daily["title"].str.split(expand=True) # one word per column, split on whitespace
title_splitted = pd.DataFrame(split)
title_splitted