In [183]:
import warnings
warnings.filterwarnings("ignore") # to ignore all future warinings

## 1. Preparing the dataset

### 1.1 Scraping news articles from the web

This process takes on average between 2 and 15min, depending on how many website links are to be scraped, how many articles in these links are found and how much computing ressources the machine has on which the code runs.

In [185]:
import feedparser as fp
import newspaper
from newspaper import Article
import time
from time import mktime
from datetime import datetime
from datetime import date
import pandas as pd
import json
import pprint
import dateutil

#### 1 Website data ####

with open('NewsPapers_new.json') as data_file: #Loads the JSON files with news URLs
    companies = json.load(data_file)

#### 2 Todays date - for filtering the articles by todays date ####
today = str(date.today()) 
print("Today's date:", today)


#### 3 Scraping the news articles ####

text_list = []
source_list = []
article_list = []
date_list = []
time_list = []
title_list = []
image_list = []
keywords_list = []
summaries_list = []

for source, value in companies.items(): 
    d = fp.parse(value['rss'])
    article={}
    for entry in d.entries:
        if hasattr(entry, 'published') and ((dateutil.parser.parse(getattr(entry, 'published'))).strftime("%Y-%m-%d") == today):
            article['source'] = source
            source_list.append(article['source'])

            #getting the article URLs
            article['link'] = entry.link
            article_list.append(article['link'])

            #getting the article published dates
            date = (getattr(entry, 'published'))
            date = dateutil.parser.parse(date)
            date_formated = date.strftime("%Y-%m-%d")
            time_formated = date.strftime("%H:%M:%S %Z") #hour, minute, timezone (converted)
            date_list.append(date_formated)
            time_list.append(time_formated)

            #getting the titles
            content = Article(entry.link)
            try:
                content.download()
                content.parse()  
                content.nlp()
            except Exception as e: 
                #in case the download fails, it prints the error and immediatly continues with downloading the next article
                print(e)
                print("continuing...")
            title = content.title #extract article titles
            image = content.top_image
            image_list.append(image)
            content.nlp()
            keywords = content.keywords
            keywords_list.append(keywords)
            title_list.append(title)
            text = content.text
            text_list.append(text)
            summaries = content.summary
            summaries_list.append(summaries)
                
#creating dicts for formatting and inserting to pandas df
source_dict = {'source':source_list}
link_dict = {'link':article_list}
date_dict = {'published_date':date_list}
time_dict = {'published_time':time_list}
title_dict = {'title':title_list}
text_dict = {'text':text_list}
keyword_dict = {'keywords':keywords_list}
image_dict = {'image':image_list}
summary_dict = {'summary':summaries_list}

#creating separate pandas dfs for each feature
source_df = pd.DataFrame(source_dict, index=None)
link_df = pd.DataFrame(link_dict, index=None)
date_df = pd.DataFrame(date_dict, index=None)
time_df = pd.DataFrame(time_dict, index=None)
title_df = pd.DataFrame(title_dict, index=None)
text_df = pd.DataFrame(text_dict, index=None)
keyword_df = pd.DataFrame(keyword_dict, index=None)
image_df = pd.DataFrame(image_dict, index=None)
summary_df = pd.DataFrame(summary_dict, index=None)

#join all pandas dfs together
news_df = source_df.join(link_df)
news_df = news_df.join(date_df)
news_df = news_df.join(time_df)
news_df = news_df.join(title_df)
news_df = news_df.join(text_df)
news_df = news_df.join(keyword_df)
news_df = news_df.join(image_df)
news_df = news_df.join(summary_df)

# after running, pandas DF sould be created with link, published_date, published_time, title and text

Today's date: 2019-11-18


### 1.2. Filtering and cleaning the dataset

In order to run some analysis on the titles and text content of the articles, we need to clean them.
We first filter all the articles we scraped by todays date. 
For cleaning the titles and article content text, we go through the following steps:

*  remove stopwords (i.e. "a", "for", "when", "you", "if",... etc. that would impact the accuracy of our similarity analysis)
*  remove punctuation
*  remove numbers
*  remove names of the source website in the article text (we noticed, that f.e. CNN often mentions "CNN" in their articles, which would impact on the accuracy of our similarty analysis)
*  make the sentences lower case

In [193]:
import re

# List of english stopwords
from nltk.corpus import stopwords
stop = stopwords.words('english')

# Creating a dictionary for removing the names of the source websites
sources_list = (list(source_dict.values()))
for i in sources_list:
    sources_set = set(i)
sources_to_replace = dict.fromkeys(sources_set, "") # replace every source with "" nothing

# Cleaning the dataframe
#news_df_daily = news_df[news_df.published_date == today] # filter by todays date
news_df_daily = news_df.reset_index(drop=True) # reseting the index

news_df_daily["clean_title"] = news_df_daily["title"].str.lower()
news_df_daily["clean_text"] = news_df_daily["text"].str.lower()

# Filter out the stopwords
news_df_daily['clean_title'] = news_df_daily['clean_title'].apply(lambda x: ' '.join([word for word in x.split() if word not in (stop)]))
news_df_daily['clean_text'] = news_df_daily['clean_text'].apply(lambda x: ' '.join([word for word in x.split() if word not in (stop)]))

news_df_daily["clean_title"] = ((news_df_daily["clean_title"].str.replace('[^\w\s]','')) # remove punctuation from titles
                                .str.replace('\d+', '')) # remove numbers from titles

news_df_daily["clean_text"] = (((news_df_daily["clean_text"].str.replace('[^\w\s]','')) #remove punctuation from texts
                                .str.replace('\d+', '')) # remove numbers from texts
                               .replace(sources_to_replace, regex=True)) # remove source website names in text



# you can't use none here
pd.set_option('display.max_colwidth', 20)
news_df_daily

Unnamed: 0,source,link,published_date,published_time,title,text,keywords,image,summary,clean_title,clean_text
0,cnn,http://rss.cnn.c...,2019-11-18,18:02:00 UTC,House investigat...,Washington (CNN)...,"[mueller, trump,...",https://cdn.cnn....,Washington (CNN)...,house investigat...,washington hous...
1,cnn,http://rss.cnn.c...,2019-11-18,13:23:02 UTC,The latest on th...,Mark Makela/Gett...,"[trump, set, twe...",https://cdn.cnn....,Mark Makela/Gett...,latest trump imp...,mark makelagetty...
2,cnn,http://rss.cnn.c...,2019-11-18,16:58:27 UTC,Trump attacks an...,(CNN) President ...,"[witness, trump,...",https://cdn.cnn....,But Sondland is ...,trump attacks an...,president donal...
3,cnn,http://rss.cnn.c...,2019-11-18,01:00:41 UTC,Tweets can be us...,Chat with us in ...,"[world, court, m...",https://cdn.cnn....,Chat with us in ...,tweets used cour...,chat us facebook...
4,cnn,http://rss.cnn.c...,2019-11-18,17:03:00 UTC,House Republican...,(CNN) House Repu...,"[republican, ukr...",https://cdn.cnn....,(CNN) House Repu...,house republican...,house republica...
5,cnn,http://rss.cnn.c...,2019-11-18,18:18:27 UTC,Supreme Court st...,Washington (CNN)...,"[documents, trum...",https://cdn.cnn....,Washington (CNN)...,supreme court st...,washington pres...
6,cnn,http://rss.cnn.c...,2019-11-18,19:09:03 UTC,Why you just can...,(CNN) President ...,"[physical, donal...",https://cdn.cnn....,(CNN) President ...,cant trust white...,president donal...
7,cnn,http://rss.cnn.c...,2019-11-18,19:10:48 UTC,Chick-fil-A will...,New York (CNN Bu...,"[salvation, orga...",https://cdn.cnn....,New York (CNN Bu...,chickfila longer...,new york busine...
8,cnn,http://rss.cnn.c...,2019-11-18,18:40:04 UTC,Gunman kills at ...,(CNN) At least t...,"[lot, walmart, s...",https://cdn.cnn....,(CNN) At least t...,gunman kills lea...,least two victi...
9,cnn,http://rss.cnn.c...,2019-11-18,15:37:22 UTC,Ryan Costello: M...,(CNN) Ryan Coste...,"[organization, m...",https://cdn.cnn....,(CNN) Ryan Coste...,ryan costello mi...,ryan costello p...


## 2. Analyzing the dataset

In this step, we apply several different analysis methods, in order to define which articles out of those we scraped are **most relevant** for portfolio trading customers and **cover trending financial topics**.

### 2.1. Cosine similarity

Cosine similarity is a metric for measuring the similarity between two sentences. It creates numbered vectors out of sentences and measures the **cosine of the angle between them**.

<img src="https://wikimedia.org/api/rest_v1/media/math/render/svg/1d94e5903f7936d3c131e040ef2c51b473dd071d" alt="Cosine similarity formula" title="Cosine similarity formula" />

where
* A ........... vector A
* A • B ..... dot product between vector A and B
* | A | ....... length of vector A


We apply this measure for both the title and the texts.

#### 2.1.A. Cosine similarity: titles

In [201]:
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer #for creating count vectors
from sklearn.metrics.pairwise import cosine_similarity #cosine similarity calculator

# for analysis, we need a list of all the titles
clean_titles_list = list(news_df_daily['clean_title'])

count_vectorizer = CountVectorizer()
count_matrix_title = count_vectorizer.fit_transform(clean_titles_list) # creates the count vector
count_matrix_title = count_matrix_title.todense() # creates numpy matrix out from all count vectors
count_matrix_title = pd.DataFrame(count_matrix_title, columns=count_vectorizer.get_feature_names()) # creates df from count vectors

# apply consine smilarity on count vector dataframe
df_cosim_title = pd.DataFrame(cosine_similarity(count_matrix_title, count_matrix_title))
df_cosim_title.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,434,435,436,437,438,439,440,441,442,443
0,1.0,0.204124,0.136083,0.0,0.154303,0.272166,0.154303,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.204124,1.0,0.333333,0.0,0.0,0.166667,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.136083,0.333333,1.0,0.0,0.0,0.111111,0.0,0.0,0.0,0.111111,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.125988,0.0
3,0.0,0.0,0.0,1.0,0.0,0.149071,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.154303,0.0,0.0,0.0,1.0,0.125988,0.142857,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


#### 2.1.B. Cosine similarity: texts

In [5]:
# for analysis, we need a list of all the texts
clean_texts_list = list(news_df_daily['clean_text'])

count_vectorizer = CountVectorizer()
count_matrix_text = count_vectorizer.fit_transform(clean_texts_list) # creates the count vector
count_matrix_text = count_matrix_text.todense() # creates numpy matrix out from all count vectors
#count_matrix_text.shape

count_matrix_text = pd.DataFrame(count_matrix_text, columns=count_vectorizer.get_feature_names()) # creates df from count vectors

# apply consine smilarity on count vector dataframe
df_cosim_texts = pd.DataFrame(cosine_similarity(count_matrix_text, count_matrix_text))
df_cosim_texts.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,42,43,44,45,46,47,48,49,50,51
0,1.0,0.393918,0.221237,0.361961,0.0,0.0,0.176696,0.065258,0.054845,0.057833,...,0.155287,0.016945,0.047378,0.035183,0.059305,0.021785,0.027497,0.024376,0.046458,0.04534
1,0.393918,1.0,0.348979,0.317417,0.016165,0.016165,0.262719,0.17209,0.153299,0.148967,...,0.237521,0.013676,0.071282,0.01948,0.109172,0.062931,0.038391,0.071882,0.102331,0.097625
2,0.221237,0.348979,1.0,0.298039,0.0,0.0,0.101844,0.092324,0.049174,0.046668,...,0.110623,0.004883,0.054615,0.004056,0.032172,0.030693,0.019018,0.046832,0.037488,0.046459
3,0.361961,0.317417,0.298039,1.0,0.0,0.0,0.195096,0.060739,0.020797,0.0,...,0.125047,0.008778,0.0,0.0,0.011906,0.008261,0.0,0.013865,0.047565,0.017193
4,0.0,0.016165,0.0,0.0,1.0,1.0,0.027639,0.015776,0.03241,0.061517,...,0.018559,0.0,0.0,0.0,0.055661,0.025747,0.0,0.0,0.0,0.053586


### 2.2. Soft cosine similarity measure

Metric for measuring the similarity between two sentences, but gives **higher scores for words with similar meaning**. For Example, ‘President’ vs ‘Prime minister’, ‘Food’ vs ‘Dish’, ‘Hi’ vs ‘Hello’ are considered similar. 

<img src="https://wikimedia.org/api/rest_v1/media/math/render/svg/9743aceb346ccb501ceaef15a46570d1ba8a6a1b" alt="Soft cosine formula" title="Soft cosine formula" />

where
* sij .... similarity (feature i, feature j)

**Difference to cosine similarity**: the traditional cosine similarity considers the vector space model (VSM i.e. features, unique words) features as independent or completely different, while the soft cosine measure proposes considering the similarity of features in VSM, which help generalize the concept of cosine (and soft cosine) as well as the idea of (soft) similarity. https://en.wikipedia.org/wiki/Cosine_similarity

This implies that we need some vector defining the similarity between words i.e. vectors of words that are similar. 
In our case we are going to use the pretrained `fasttext-wiki-news-subwords-300` vector dataset containing 1 million word embeddings trained on Wikipedia 2017. More info here: https://github.com/RaRe-Technologies/gensim-data/releases/tag/fasttext-wiki-news-subwords-300

_**Side note:** other pre-trained models to be found here: https://github.com/RaRe-Technologies/gensim-data/releases_

**Word embeddings**: position of a word within the vector space is learned from text and is based on the words that surround the word when it is used. Word embeddings can be used with pre-trained models applying transfer learning.

#### 2.2.A. Soft cosine measure: titles

In [6]:
import gensim
from gensim.matutils import softcossim 
from gensim import corpora
import gensim.downloader as api
from gensim.utils import simple_preprocess
from gensim.models import Word2Vec



In [24]:
### ! ### this will download a file to your harddrive ### ! ###

# first we need to download the FastText model - about 250MB

glove_wiki = api.load('glove-wiki-gigaword-200')



In [194]:
glove_wiki.most_similar(positive="trump")

[('nows', 0.5313489437103271),
 ('ivana', 0.5286872982978821),
 ('ivanka', 0.5049400925636292),
 ('knauss', 0.4901247024536133),
 ('melania', 0.46927106380462646),
 ('casino', 0.46679919958114624),
 ('developer', 0.4634384512901306),
 ('trumps', 0.4476448595523834),
 ('condo', 0.4284646511077881),
 ('hilton', 0.4261905550956726)]

In [202]:
# create a dictionary, a map of word to unique id from the title list
dictionary_titles = corpora.Dictionary([simple_preprocess(word) for word in clean_titles_list])

In [206]:
# generate a similarity sparse matrix from the words in the dictionary
# this process takes a bit due to calculation time
similarity_matrix_titles = glove_wiki.similarity_matrix(dictionary_titles, 
                                                        tfidf=None, 
                                                        threshold=0.0, 
                                                        exponent=2.0, 
                                                        nonzero_limit=100)

In [208]:
# convert the titles into bag-of-words vectors through function
# appends the bag-of-words from all sentences into the sent list
def convert_bow(sentences):
    global sent_bow
    sent_bow = []
    for i in sentences:
        bow = dictionary_titles.doc2bow(simple_preprocess(i))
        sent_bow.append(bow)
        
convert_bow(clean_titles_list) 

#create soft cosine measure matrix thourgh function 
""" creates a matrix with the results of soft cosine measure calculation.
Takes into account the previously created similarity sparse matrix was created from the similar word meanings 
(we extracted from the FastText model) from the unique words that were in our unique dictionary."""

def create_soft_cossim_matrix(sentences):
    len_array = np.arange(len(sentences))
    xx, yy = np.meshgrid(len_array, len_array) # creates a grid with dimensions (nr of articles x nr of articles)
    soft_cossim_mat = pd.DataFrame([[round(softcossim(sentences[i],sentences[j], similarity_matrix_titles) ,2) for i, j in zip(x,y)] for y, x in zip(xx, yy)])
    return soft_cossim_mat

soft_cossim_mat_titles = create_soft_cossim_matrix(sent_bow)

In [209]:
soft_cossim_mat_titles.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,434,435,436,437,438,439,440,441,442,443
0,1.0,0.34,0.29,0.1,0.24,0.33,0.23,0.07,0.09,0.03,...,0.0,0.02,0.0,0.0,0.0,0.09,0.07,0.0,0.04,0.06
1,0.34,1.0,0.43,0.03,0.05,0.24,0.09,0.0,0.0,0.05,...,0.06,0.0,0.0,0.0,0.11,0.0,0.0,0.0,0.05,0.06
2,0.29,0.43,1.0,0.05,0.08,0.26,0.14,0.0,0.11,0.23,...,0.1,0.06,0.06,0.02,0.09,0.0,0.14,0.0,0.26,0.15
3,0.1,0.03,0.05,1.0,0.0,0.25,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.05,0.0,0.0,0.0
4,0.24,0.05,0.08,0.0,1.0,0.18,0.21,0.02,0.06,0.03,...,0.02,0.02,0.03,0.0,0.0,0.09,0.05,0.02,0.03,0.14


#### 2.2.B. Soft cosine measure: texts

**! Be aware !** 

When you run the cell below - even when having only around 50 articles - the creation of a unique word dictionary and especially the corresponding similarity matrix for article texts takes at least 2 to 5min. 

This waiting time cannot be skipped for text soft cosine measure similarity comparison, since it just takes a lot of ressources to compute. If you want to time how long it exacly takes, look below for paragraph _X. Other stuff that could be helpful in the future_ - there is a code for timing the run time of a code. :-)

In [28]:
# create a dictionary, a map of word to unique id from the text list
dictionary_texts = corpora.Dictionary([simple_preprocess(word) for word in clean_texts_list])

# generate a similarity sparse matrix from the words in the dictionary
# this process takes a bit due to calculation time
similarity_matrix_texts = fasttext_model300.similarity_matrix(dictionary_texts, tfidf=None, threshold=0.0, exponent=2.0, nonzero_limit=100)

In [29]:
# convert the texts into bag-of-words vectors through function
# appends the bag-of-words from all sentences into the sent list
def convert_bow(sentences):
    global sent_bow
    sent_bow = []
    for i in sentences:
        bow = dictionary_texts.doc2bow(simple_preprocess(i))
        sent_bow.append(bow)
        
convert_bow(clean_texts_list) 

#create soft cosine measure matrix thourgh function 
""" creates a matrix with the results of soft cosine measure calculation.
Takes into account the previously created similarity sparse matrix was created from the similar word meanings 
(we extracted from the FastText model) from the unique words that were in our unique dictionary."""

def create_soft_cossim_matrix(sentences):
    len_array = np.arange(len(sentences))
    xx, yy = np.meshgrid(len_array, len_array) # creates a grid with dimensions (nr of articles x nr of articles)
    soft_cossim_mat = pd.DataFrame([[round(softcossim(sentences[i],sentences[j], similarity_matrix_texts) ,2) for i, j in zip(x,y)] for y, x in zip(xx, yy)])
    return soft_cossim_mat

soft_cossim_mat_texts = create_soft_cossim_matrix(sent_bow)

In [30]:
soft_cossim_mat_texts.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,57,58,59,60,61,62,63,64,65,66
0,1.0,0.41,0.22,0.08,0.41,0.18,0.4,0.24,0.22,0.08,...,0.31,0.22,0.18,0.27,0.25,0.12,0.2,0.17,0.29,0.28
1,0.41,1.0,0.22,0.05,0.27,0.09,0.15,0.2,0.19,0.05,...,0.11,0.09,0.03,0.04,0.07,0.02,0.08,0.03,0.05,0.04
2,0.22,0.22,1.0,0.03,0.27,0.12,0.24,0.42,0.16,0.03,...,0.19,0.16,0.1,0.12,0.13,0.06,0.1,0.07,0.15,0.14
3,0.08,0.05,0.03,1.0,0.07,0.15,0.15,0.11,0.05,1.0,...,0.06,0.06,0.06,0.07,0.1,0.03,0.09,0.04,0.09,0.13
4,0.41,0.27,0.27,0.07,1.0,0.2,0.44,0.47,0.28,0.07,...,0.32,0.25,0.18,0.21,0.21,0.13,0.19,0.15,0.27,0.25


## 3. Results: extracting most similar articles

After finding some results for the similarity in our scraped articles, we have to **filter the similar articles out of our initial** `news_df_daily` **dataframe**, in order to find out the title and article text.

We want to extract only articles that have some predefined minimum value for similarity f.e. we only want **articles that have a similarity of at least 0.7** (this number could vary depending on our choice). Since the row indexes and the column numbers in the `soft_cossim_mat` matrix are equal to the indexes of the articles in our initial `news_df_daily` dataframe, we need to filter `news_df_daily` by exactly these indexes which contain the minimum similarity value.

In [231]:
# general function to find the row and column index in a dataframe for a specific value
def get_indexes(dataframe, value):
    pos_list = list()
    for i in value:
        result = dataframe.isin([value]) # crete bool dataframe with True at positions where the given value exists
        series = result.any()
        column_names = list(series[series == True].index) # create list of columns that contain the value
        for col in column_names: # iterate over list of columns and fetch the rows indexes where value exists
            rows = list(result[col][result[col] == True].index)
            for row in rows:
                if row != col: # since matrix diagonal is always == 1, we exclude these results here
                    pos_list.append((row, col)) #creates a list of row, col position
        return pos_list # Return a list of tuples indicating the positions of value in the dataframe

# function for creating a list of the row indexes
def find_indexes(dict_pos, index_list):
    for key, value in dict_pos.items():
    #print(key, ' : ', value) # this prints the similarity values and its corresponding row and col indexes in the df
        for num in value:
            for firstnum in num:
                index_list.append(firstnum)
                
# choosing the range of similarity values for which the sentences should be filtered
simval = np.arange(0.9, 1.01, 0.01) # choose similarity values between first number and 1.0, by steps of 0.01
simval = np.around(simval, decimals=2)
simval = (simval.astype(str))
 
# use dict comprehension and 'get_indexes' function to get index positions of elements in df with predefined similarity values
dict_pos_titles = {elem: get_indexes(soft_cossim_mat_titles, elem) for elem in simval}
#dict_pos_texts = {elem: get_indexes(soft_cossim_mat_texts, elem) for elem in simval}

### 3.1. Most similar articles: by similarity of article titles

In [232]:
index_list_titles = []
find_indexes(dict_pos_titles, index_list_titles)
index_list_titles = list(set(index_list_titles))

select_articles = ((news_df_daily.iloc[index_list_titles, :]).drop_duplicates(("title"))).sort_index()
select_articles

Unnamed: 0,source,link,published_date,published_time,title,text,keywords,image,summary,clean_title,clean_text
13,cnn,http://rss.cnn.c...,2019-11-18,19:14:24 UTC,WeWork braces fo...,New York (CNN Bu...,"[according, brac...",https://cdn.cnn....,"On Monday, WeWor...",wework braces ma...,new york busine...
25,cnn,http://rss.cnn.c...,2019-11-18,17:18:26 UTC,Prince Andrew ac...,London (CNN) Pri...,"[standard, andre...",https://cdn.cnn....,London (CNN) Pri...,prince andrew ac...,london prince a...
33,cnn,http://rss.cnn.c...,2019-11-18,17:28:42 UTC,Jennifer Arcuri ...,London (CNN) Jen...,"[johnson, fed, l...",https://cdn.cnn....,Arcuri made the ...,jennifer arcuri ...,london jennifer...
92,bs_top,https://www.busi...,2019-11-18,19:17:56,SC stays tributa...,The on Monday st...,"[crore, held, or...",https://bsmedia....,"On January 10, 2...",sc stays tributa...,monday stayed or...
102,bs_top,https://www.busi...,2019-11-18,15:43:39,Gayatri Projects...,Shares of plunge...,"[tanks, projects...",https://bsmedia....,Shares of plunge...,gayatri projects...,shares plunged ...
104,bs_top,https://www.busi...,2019-11-18,13:29:41,Pokarna tanks 20...,Shares of Limite...,"[tanks, determin...",https://bsmedia....,"PESL, wholly-own...",pokarna tanks u...,shares limited l...
105,bs_top,https://www.busi...,2019-11-18,12:40:38,PSB Q2 review: N...,Over 100 per cen...,"[analysts, psb, ...",https://bsmedia....,Mid-and small-si...,psb q review nii...,per cent yearon...
128,bs_market,https://www.busi...,2019-11-18,12:08:00,Parag Milk Foods...,Shares of climbe...,"[market, crore, ...",https://bsmedia....,Shares of climbe...,parag milk foods...,shares climbed ...
129,bs_market,https://www.busi...,2019-11-18,11:53:00,Morgan Stanley p...,After cutting th...,"[2020, ems, pegs...",https://bsmedia....,"They, however, s...",morgan stanley p...,cutting growth p...
130,bs_market,https://www.busi...,2019-11-18,10:55:00,Bharti Airtel hi...,Shares of Bharti...,"[21month, high, ...",https://bsmedia....,"On Friday, Novem...",bharti airtel hi...,shares bharti ai...


In [None]:
select_articles1["keywords"]= select_articles1["keywords"].astype(str) 

listtest = ["model","sales","technology","stocks","stockmarket","finance","model","2020"]

#select_articles1[select_articles1['keywords'].str.contains("tesla")]

### 3.2. Most similar articles: by similarity of article texts

In [None]:
index_list_texts = []
find_indexes(dict_pos_texts, index_list_texts)
index_list_texts = list(set(index_list_texts))

select_articles = ((news_df_daily.iloc[index_list_texts, :]).drop_duplicates()).sort_index()
select_articles.head()

### 3.3. Most similar articles: by similarity of article titles and articles

This filters the sentences by the `simval` defined before and keeps only the titles and the texts that BOTH match the value.

In [None]:
index_intersection = (set(index_list_titles).intersection(set(index_list_texts)))
index_intersection = list(index_intersection)

select_articles = ((news_df_daily.iloc[index_intersection, :]).drop_duplicates()).sort_index()
select_articles1

# 4. Generating newsletter in HTML

After already having designed the HTML body for the newsletter, we need to prepare the extracted article titles and texts for automatically entering intp the HTML body.

## 4.1 Importing extracted titles and content into Newsletter

Just for testing, we will randomly chose which articles to include in our newsletter body.

In [360]:
# creating separate lists of the columns and info we want to include
similar_sources_list = list(select_articles1['source'])
similar_links_list = list(select_articles1['link'])
similar_titles_list = list(select_articles1['title'])
similar_texts_list = list(select_articles1['text'])

# randomly select articles to include
nr_of_art = (list(random_select.shape))[0] # finding max number of rows of the df of the most similar articles

random_art_nr = np.random.choice(nr_of_art, 8, replace=False) # randomly chose 7 articles out of the max possible
random_art_nr_list = list(random_art_nr)

# function to extract the articles by their random number in the index, limits the characters of text by 'max_chars' and adds '...' to the end 
def rand_info(nr_of_art, max_chars):
    global rand_text, rand_source, rand_link, rand_title
    rand_text, rand_source, rand_link, rand_title = [], [], [], []
    random_art_nr = np.random.choice(nr_of_art, 8, replace=False)  # chosen randomly  
    for nr in random_art_nr:
        (rand_text.append((similar_texts_list[nr])[:max_chars]))
        (rand_source.append(similar_sources_list[nr]))
        (rand_link.append(similar_links_list[nr]))
        (rand_title.append(similar_titles_list[nr]))
    rand_text = [item + '...' for item in rand_text]

random_select = select_articles.reset_index(drop=True) # resetting the index of the df
rand_info(random_art_nr_list, 250) # selecting the articles randomly and maximizing texts by 250 chars

# now every time the code is excuted, a new randomly chosen article appears
print(rand_title[0],"\n" , rand_text[0], "\n" , rand_source[0], "\n", rand_link[0])

#test

SC stays tributal order that held Sebi had no powers to bar auditors 
 The on Monday stayed an order of the Securities Appellate Tribunal (SAT) which had held that markets' watchdog does not have the power to bar

A bench comprising Justices Arun Mishra and Indira Banerjee also issued notice on the appeal filed by the S... 
 bs_top 
 https://www.business-standard.com/article/pti-stories/sc-stays-sat-s-order-holding-that-sebi-lacks-power-to-bar-auditors-119111801286_1.html


In [367]:
# Formatting issues
# ! are more unsupported characters ! to be edited and added over time

# function to replace the wrongly formatted characters (obersed by looking at the html output)
def replace_char(list_of_str):
    for i in range(len(list_of_str)):
        list_of_str[i] = list_of_str[i].replace("’","`")
        list_of_str[i] = list_of_str[i].replace(":",":")
        list_of_str[i] = list_of_str[i].replace("–","-")
        #print(data)

replace_char(rand_text)
replace_char(rand_title)

rand_title

['SC stays tributal order that held Sebi had no powers to bar auditors',
 'US cancels civil nuclear cooperation waiver for Iran',
 'BPCL gains 4% on report that govt wants to wrap up stake sale by March 2020',
 "Bharti Airtel hits 21-month high, surges 20% from Friday's low",
 'UN expert: 100,000 kids in migration-related detention in US',
 'Morgan Stanley pegs 2020 global growth at 3.2%; EMs to outperform',
 'Brokerages turn cautious on markets amid slowing growth, rich valuation',
 'Highway Patrol: Three People Killed in Shooting at Oklahoma Walmart']

In [364]:
import webbrowser
import os

['SC stays tributal order that held Sebi had no powers to bar auditors',
 'US cancels civil nuclear cooperation waiver for Iran',
 'BPCL gains 4% on report that govt wants to wrap up stake sale by March 2020',
 "Bharti Airtel hits 21-month high, surges 20% from Friday's low",
 'UN expert: 100,000 kids in migration-related detention in US',
 'Morgan Stanley pegs 2020 global growth at 3.2%; EMs to outperform',
 'Brokerages turn cautious on markets amid slowing growth, rich valuation',
 'Highway Patrol: Three People Killed in Shooting at Oklahoma Walmart']

In [368]:
# Code is way easier to edit in Notepad ++

print ()
f = open('HTML_with VARS_V1.html','w')
 
message = """
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
<html xmlns="http://www.w3.org/1999/xhtml"><head>
<meta http-equiv="Content-Type" content="text/html; charset=UTF-8">
<title>Demystifying Email Design</title>
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<link href="NewsletterTemplate_files/css.css" rel="stylesheet">    
 
</head>
<body style="margin: 0; padding: 0;">
    <table width="100%" cellspacing="0" cellpadding="0" border="0"> 
        <tbody><tr>
            <td style="padding: 10px 0 10px 0;">
                <table style="border: 1px solid #cccccc; border-collapse: collapse;" width="1000" cellspacing="0" cellpadding="0" border="0" align="center">
                    <tbody><tr>
                        <td style="padding: 20px" height="204" bgcolor="#fbf315" align="top">
                            <img alt="Creating Email Magic" style="display: block;" src="NewsletterTemplate_files/Logo-Raiffeisen-Bank-2017.png" width="304" height="304">
                        </td>
                    </tr>
                    <tr>
                        <td style="padding: 20px 30px 40px 30px;" bgcolor="#ffffff">
                            <table width="100%" cellspacing="0" cellpadding="0" border="0">
                                <tbody><tr>
                                    <td style="color: #153643; 
    font-family: 'Archivo Black', sans-serif; font-size: 40px;">
                                        <b>Daily Finance Update
</b>
                                    </td>
                                
                                        
                                    </tr><tr>
                                    <td style="color: #153643; 
    font-family: 'Archivo Black', sans-serif; font-size: 20px; padding: 10px 0px 10px 0px;">
                                        <b>Stocks
</b>
                                    </td>
                                
                                        
                                    </tr>
                                
                                <tr>
                                    <td>
                                        <table width="100%" cellspacing="0" cellpadding="0" border="0">
                                            <tbody><tr>
                                                <td style="box-shadow: 1px 2px 4px rgba(0, 0, 0, .5);" width="160" valign="top">
                                                    <div style="padding: 20px 10px 5px 10px; font-family: 'Archivo Black', sans-serif; font-size: 22px">
  <b>TECH</b>
</div><tdbody>
                                                        <div class="row margin-top" style="padding: 10px 10px 5px 10px">
  <div style="color: #153643; font-family: Roboto, sans-serif; font-size: 18px;"> 
       <span class="item-Label">Tech | {rand_source[0]}</span>
   </div>
</div><div style="padding: 5px 10px 0 10px" font-family:="" font-size:=""><b style="color: #153643; font-family: Roboto, sans-serif; font-size: 18px;"> <a href="{rand_link[0]}">{rand_title[0]}</a></b></div><div class="row margin-top" style="padding: 5px 10px 15px 10px; font-family:'Raleway', sans-serif; font-size: 14px">{rand_text[0]}

                                                            
                                                        
                                                    </div></tdbody><tdbody>
                                                        <div class="row margin-top" style="padding: 10px 10px 5px 10px">
  <div style="color: #153643; font-family: Roboto, sans-serif; font-size: 18px;"> 
       <span class="item-Label">Tech | {rand_source[1]}</span>
   </div>
</div><div style="padding: 5px 10px 0 10px" font-family:="" font-size:=""><b style="color: #153643; font-family: Roboto, sans-serif; font-size: 18px;"> <a href="{rand_link[1]}">{rand_title[1]}
</a></b></div><div class="row margin-top" style="padding: 5px 10px 15px 10px; font-family:'Raleway', sans-serif; font-size: 14px">{rand_text[1]}
                                                   
                                                        
                                                    </div></tdbody><tdbody>
                                                        <div class="row margin-top" style="padding: 10px 10px 5px 10px">
  <div style="color: #153643; font-family: Roboto, sans-serif; font-size: 18px;"> 
       <span class="item-Label">Tech | {rand_source[2]}</span>
   </div>
</div><div style="padding: 5px 10px 0 10px" font-family:="" font-size:=""><b style="color: #153643; font-family: Roboto, sans-serif; font-size: 18px;"> <a href="{rand_link[2]}">{rand_title[2]}
</a></b></div><div class="row margin-top" style="padding: 5px 10px 15px 10px; font-family:'Raleway', sans-serif; font-size: 14px">{rand_text[2]}
</div></tdbody><table width="100%" cellspacing="0" cellpadding="0">
                                                        </table>
                                                </td><td style="font-size: 0; line-height: 0;" width="20">
                                                    &nbsp;
                                                </td><td style="box-shadow: 1px 2px 4px rgba(0, 0, 0, .5);" width="160" valign="top">
                                                    <div style="padding: 20px 10px 5px 10px; font-family: 'Archivo Black', sans-serif; font-size: 22px">
  <b>DEALS AND IPOs
</b>
</div><tdbody>
                                                        <div class="row margin-top" style="padding: 10px 10px 5px 10px">
  <div style="color: #153643; font-family: Roboto, sans-serif; font-size: 18px;"> 
       <span class="item-Label">Deals | {rand_source[3]} 
</span>
   </div>
</div><div style="padding: 5px 10px 0 10px" font-family:="" font-size:=""><b style="color: #153643; font-family: Roboto, sans-serif; font-size: 18px;"> <a href="{rand_link[3]}">{rand_title[3]}
Day record of more than $30 billion in sales and climbing</a></b></div><div class="row margin-top" style="padding: 5px 10px 15px 10px; font-family:'Raleway', sans-serif; font-size: 14px">
                                                            
{rand_text[3]}


                                                            
                                                        
                                                    </div></tdbody><tdbody>
                                                        <div class="row margin-top" style="padding: 10px 10px 5px 10px">
  <div style="color: #153643; font-family: Roboto, sans-serif; font-size: 18px;"> 
       <span class="item-Label">Markets | {rand_source[4]}
</span>
   </div>
</div><div style="padding: 5px 10px 0 10px" font-family:="" font-size:=""><b style="color: #153643; font-family: Roboto, sans-serif; font-size: 18px;"> <a href="{rand_link[4]}">{rand_title[4]}
</a></b></div><div class="row margin-top" style="padding: 5px 10px 15px 10px; font-family:'Raleway', sans-serif; font-size: 14px">
                                                            
{rand_text[4]}


                                                            
                                                        
                                                    </div></tdbody><table width="100%" cellspacing="0" cellpadding="0" border="0">
                                                        
</table>
                                                </td>
                                                <td style="font-size: 0; line-height: 0;" width="20">
                                                    &nbsp;
                                                </td>
                                                <td style="box-shadow: 1px 2px 4px rgba(0, 0, 0, .5);" width="160" valign="top">
                                                    <div style="padding: 20px 10px 5px 10px; font-family: 'Archivo Black', sans-serif; font-size: 22px">
  <b>BANKS
</b>
</div><tdbody>
                                                        <div class="row margin-top" style="padding: 10px 10px 5px 10px">
  <div style="color: #153643; font-family: Roboto, sans-serif; font-size: 18px;"> 
       <span class="item-Label">Trading | {rand_source[5]}</span>
   </div>
</div><div style="padding: 5px 10px 0 10px" font-family:="" font-size:=""><b style="color: #153643; font-family: Roboto, sans-serif; font-size: 18px;"> <a href="{rand_link[5]}">{rand_title[5]}
</a></b></div><div class="row margin-top" style="padding: 5px 10px 15px 10px; font-family:'Raleway', sans-serif; font-size: 14px">
                                                            
{rand_text[5]}


                                                            
                                                        
                                                    </div></tdbody><tdbody>
                                                        <div class="row margin-top" style="padding: 10px 10px 5px 10px">
  <div style="color: #153643; font-family: Roboto, sans-serif; font-size: 18px;"> 
       <span class="item-Label">Earnings | {rand_source[6]}</span>
   </div>
</div><div style="padding: 5px 10px 0 10px" font-family:="" font-size:=""><b style="color: #153643; font-family: Roboto, sans-serif; font-size: 18px;"> <a href="{rand_link[6]}">{rand_title[6]}
</a></b></div><div class="row margin-top" style="padding: 5px 10px 15px 10px; font-family:'Raleway', sans-serif; font-size: 14px">{rand_text[6]}
</div></tdbody><tdbody>
                                                        <div class="row margin-top" style="padding: 10px 10px 5px 10px">
  <div style="color: #153643; font-family: Roboto, sans-serif; font-size: 18px;"> 
       <span class="item-Label">JPMorgan | {rand_source[7]}</span>
   </div>
</div><div style="padding: 5px 10px 0 10px" font-family:="" font-size:=""><b style="color: #153643; font-family: Roboto, sans-serif; font-size: 18px;"> <a href="{rand_link[7]}">{rand_title[7]}
</a></b></div><div class="row margin-top" style="padding: 5px 10px 15px 10px; font-family:'Raleway', sans-serif; font-size: 14px">{rand_text[7]}
</div></tdbody><table width="100%" cellspacing="0" cellpadding="0">
                                                        </table>
                                                </td>
                                            </tr>
                                        </tbody></table>
                                    </td>
                                </tr>
                            </tbody></table>
                        </td>
                    </tr>
                    <tr>
                        <td style="padding: 30px 30px 30px 30px;" bgcolor="#666666">
                            <table width="100%" cellspacing="0" cellpadding="0" border="0">
                                <tbody><tr>
                                    <td style="color: #ffffff; font-family: Arial, sans-serif; font-size: 14px;" width="75%">
                                        ® Someone, somewhere 2019<br>
                                        <a href="#" style="color: #ffffff;"><font color="#ffffff">Unsubscribe</font></a> to this newsletter instantly
                                    </td>
                                    <td width="25%" align="right">
                                        <table cellspacing="0" cellpadding="0" border="0">
                                            <tbody><tr>
                                                <td style="font-family: Arial, sans-serif; font-size: 12px; font-weight: bold;">
                                                    <a href="https://twitter.com/raiffeisen_at" style="color: #666666;">
                                                        <img src="NewsletterTemplate_files/logo.png" alt="Twitter" style="display: block;" width="38" height="38" border="0">
                                                    </a>
                                                </td>
                                                <td style="font-size: 0; line-height: 0;" width="20">&nbsp;</td>
                                                <td style="font-family: Arial, sans-serif; font-size: 12px; font-weight: bold;">
                                                    <a href="http://www.facebook.com/raiffeisen/" style="color: #666666;">
                                                        <img alt="Facebook" style="display: block;" src="NewsletterTemplate_files/facebook-2.svg" width="38" height="38" border="0">
                                                    </a>
                                                </td>
                                            </tr>
                                        </tbody></table>
                                    </td>
                                </tr>
                            </tbody></table>
                        </td>
                    </tr>
                </tbody></table>
            </td>
        </tr>
    </tbody></table>



</body></html>

""".format(**locals()) #########
 
f.write(message)
f.close()

#Change path to reflect file location
filename = 'file:///'+os.getcwd()+'/' + 'HTML_with VARS_V1.html'
webbrowser.open_new_tab(filename)




True

# X. Other stuff that could be helpful in the future

## Time how long a code takes to execute

Could be used for speed comparison of two similarity methods

In [184]:
import timeit

code_to_test = """

"""
elapsed_time = timeit.timeit(code_to_test, number=100)/100
print(elapsed_time)

Today's date: 2019-11-18
Today's date: 2019-11-18
Today's date: 2019-11-18


KeyboardInterrupt: 

## Google word meaning vector, pre-trained

Maybe useful, some time?

Other pre-trained models to be found here: https://github.com/RaRe-Technologies/gensim-data/releases

In [None]:
model = api.load("word2vec-google-news-300") #1.6GB to download

## Splitting each word in title/text in pandas df to a separate column

Maybe useful, some time?

Code was hard to find via google haha

In [None]:
split = news_df_daily.str.split(expand=True)
title_splitted = pd.DataFrame(split)
title_splitted