#### Pre-processing: first step

In [4]:
import pandas as pd
import numpy as np
import os
import re

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import TruncatedSVD, NMF
from sklearn.preprocessing import Normalizer, normalize
from sklearn import metrics
from sklearn.metrics import silhouette_score
from sklearn.cluster import KMeans, MiniBatchKMeans, AgglomerativeClustering, DBSCAN, SpectralClustering
from sklearn.utils.extmath import randomized_svd
from sklearn.pipeline import make_pipeline
from sklearn.neighbors import NearestNeighbors
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.mixture import GaussianMixture
from sklearn.model_selection import train_test_split

import nltk
from nltk.stem.porter import PorterStemmer
from nltk.chunk import ne_chunk
from nltk.tag import pos_tag
from nltk.tokenize import word_tokenize
from nltk.stem.lancaster import LancasterStemmer
from nltk.corpus import stopwords

from nltk.tag import StanfordNERTagger
from string import punctuation
from gensim.models.phrases import Phrases, Phraser
from gensim import corpora, models, similarities, matutils, models
import spacy

import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

In [10]:
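# One-time setup (uncomment to run): download the spaCy model that the next cell loads.
# import spacy
# spacy.cli.download("en_core_web_sm")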


In [11]:
# Load spaCy's 'en_core_web_sm' pipeline; underscore_entities uses it to find entity spans.
nlp = spacy.load('en_core_web_sm')

In [12]:
# Start from NLTK's English stop-word list; corpus-specific words are added in a later cell.
stops = stopwords.words('english')

In [13]:
# Corpus-specific stop words layered on top of NLTK's English list: newswire
# boilerplate, generic market vocabulary, unit abbreviations and weekday names.
# Some entries are listed in more than one casing; note that the membership
# test in clean_text is case-sensitive.
extra_stops = [
    'pct', 'percent', 'cent', 'high', 'low', 'top', 'news', 'topnew', 'topnews',
    'GMT', 'BST', 'AM', 'PM', 'Reuters', 'reuters', 'reuters.com', 'plc', 'PLC',
    'visit', 'click', 'thomson', 'Thomson', 'thomsonreuters', 'suggest',
    'feedback', 'alert', 'email', 'best', 'cms', 'CMS', 'pageid', 'livemarket',
    'client', 'link', 'net', 'change', 'chg', 'open', 'site', 'eikon', 'EIKON',
    'yld', 'yr', 'say', 'year', 'close', 'performance', 'perform', 'performs',
    'id', 'ID', 'pa', 'report', 'reports', 'reporting', 'share', 'break',
    'recent', 'past', 'point', 'said', 'index', 'data', 'new', 'points',
    'market', 'markets', 'bn', 'Bn', 'Mn', 'mn', 'avg', 'average', 'fell',
    'fall', 'rose', 'rise', 'time', 'Monday', 'Tuesday', 'Wednesday',
    'Thursday', 'Friday', 'Saturday', 'Sunday', 'people', 'higher', 'lower',
    'likely', 'Reporting',
]

In [14]:
# Merge the NLTK list and the corpus-specific additions into one set for fast
# membership tests. A union is used instead of `stops + extra_stops` so that
# re-running this cell is safe: list concatenation raises TypeError once
# `stops` has already been turned into a set by a previous run.
stops = set(stops).union(extra_stops)

In [15]:


def underscore_entities(document):
    """Join the words of each detected entity with underscores.

    Runs the module-level spaCy pipeline ``nlp`` over ``document`` and
    replaces every space that lies strictly inside an entity span
    (``start_char < position < end_char``) with ``'_'``. All other characters
    are left untouched.

    Parameters
    ----------
    document : str
        Text to process.

    Returns
    -------
    list of str
        A single-element list containing the rewritten text.
    """
    # Named `doc` rather than `spacy` so the imported module is not shadowed.
    doc = nlp(document)
    chars = list(str(doc))

    # Walk only the characters inside each entity instead of testing every
    # character of the document against every entity span.
    for ent in doc.ents:
        for pos in range(ent.start_char + 1, ent.end_char):
            if chars[pos] == ' ':
                chars[pos] = '_'

    return [''.join(chars)]

In [18]:


def clean_cnbc(article):
    """Drop everything up to and including the first ``'(CNBC) - '`` marker.

    Parameters
    ----------
    article : str or any
        Article text. Non-string values (for example the NaN that pandas
        stores for a missing description) are returned unchanged.

    Returns
    -------
    str or any
        The text following the first marker, or ``article`` itself when the
        marker is absent or ``article`` is not a string.
    """
    text_to_find = '(CNBC) - '

    # Pass missing / non-text values through explicitly rather than relying
    # on a bare `except` to hide the resulting AttributeError.
    if not isinstance(article, str):
        return article

    # partition() returns an empty separator when the marker is not present.
    _, marker, remainder = article.partition(text_to_find)
    return remainder if marker else article

In [23]:
def clean_text(raw_text, stop=True):
    """Reduce an article to space-separated alphabetic tokens.

    Processing steps, in order:

    1. Rewrite ``'U.S'`` as ``'US'`` so the abbreviation survives step 3 as
       one token.
    2. Remove the leading CNBC marker via ``clean_cnbc``.
    3. Replace every character outside ``a-z`` / ``A-Z`` with a space and
       split on whitespace.
    4. Optionally drop tokens found in the module-level ``stops`` collection.
       The comparison is case-sensitive; tokens are not lower-cased.

    Entity underscoring (``underscore_entities``) is not applied here.

    Parameters
    ----------
    raw_text : str
        Article text. Any non-string value (e.g. NaN for a missing
        description) yields an empty string instead of raising.
    stop : bool, default True
        When truthy, remove stop words.

    Returns
    -------
    str
        The remaining tokens joined by single spaces.
    """
    # Missing descriptions arrive as float NaN; treat them as empty text.
    if not isinstance(raw_text, str):
        return ''

    raw_text = raw_text.replace('U.S', 'US')
    raw_text = clean_cnbc(raw_text)

    # split() already collapses runs of whitespace, so no join/split round-trip
    # is needed before tokenising.
    words = re.sub('[^a-zA-Z]', ' ', raw_text).split()

    if stop:
        words = [w for w in words if w not in stops]
    return ' '.join(words)

In [20]:
# Load the article table from 'cnbc_news.csv' (path is relative to the working directory).
df = pd.read_csv('cnbc_news.csv')

In [21]:
# Preview the first rows to check that the file loaded with the expected columns.
df.head()

Unnamed: 0,title,url,published_at,author,publisher,short_description,keywords,header_image,raw_description,description,scraped_at
0,Santoli’s Wednesday market notes: Could Septem...,https://www.cnbc.com/2021/09/29/santolis-wedne...,2021-09-29T17:09:39Z,Michael Santoli,CNBC,"This is the daily notebook of Mike Santoli, CN...","cnbc, Premium, Articles, Investment strategy, ...",https://image.cnbcfm.com/api/v1/image/10694960...,"<div class=""group""><p><em>This is the daily no...","This is the daily notebook of Mike Santoli, CN...",2021-10-30T14:11:23.709372
1,My take on the early Brexit winners and losers,https://www.cnbc.com/2016/06/24/ian-bremmers-t...,2016-06-24T17:50:48Z,,CNBC,This commentary originally ran on Facebook. Bo...,"Articles, Politics, Europe News, European Cent...",https://fm.cnbc.com/applications/cnbc.com/reso...,,,2021-10-30T14:11:23.820139
2,Europe&#039;s recovery depends on Renzi&#039;s...,https://www.cnbc.com/2014/03/25/europes-recove...,2014-03-25T17:29:45Z,,CNBC,"In spring, ambitious reforms began in Italy. U...","Articles, Business News, Economy, Europe Econo...",https://fm.cnbc.com/applications/cnbc.com/reso...,,,2021-10-30T14:11:23.85471
3,US Moves Closer to Becoming A Major Shareholde...,https://www.cnbc.com/2009/04/22/us-moves-close...,2009-04-22T19:49:03Z,Michelle Caruso-Cabrera,CNBC,The US government is increasingly likely to co...,"cnbc, Articles, General Motors Co, Business Ne...",https://image.cnbcfm.com/api/v1/image/24947979...,"<div class=""group""><p>The US government is inc...",The US government is increasingly likely to co...,2021-10-30T14:11:24.261143
4,Trump: 'Mission accomplished' on 'perfectly ex...,https://www.cnbc.com/2018/04/14/trump-mission-...,2018-04-14T14:59:04Z,Javier E. David,CNBC,,"cnbc, Articles, George W. Bush, Vladimir Putin...",https://image.cnbcfm.com/api/v1/image/10513177...,"<div class=""group""></div>,<div class=""group""><...",President Donald Trump hailed the U.S.-led int...,2021-10-30T14:11:24.48949


In [22]:
# Column dtypes and non-null counts; shows which text columns contain missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 625 entries, 0 to 624
Data columns (total 11 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   title              625 non-null    object
 1   url                625 non-null    object
 2   published_at       625 non-null    object
 3   author             397 non-null    object
 4   publisher          625 non-null    object
 5   short_description  609 non-null    object
 6   keywords           625 non-null    object
 7   header_image       619 non-null    object
 8   raw_description    594 non-null    object
 9   description        593 non-null    object
 10  scraped_at         625 non-null    object
dtypes: object(11)
memory usage: 53.8+ KB


In [24]:
# Strip the '(CNBC) - ' prefix from each description, element by element.
# NOTE(review): only clean_cnbc is applied here, not clean_text — confirm that is intended.
df['cleaned_text'] = df['description'].map(clean_cnbc)

In [26]:
# Spot-check ten random rows; the fixed seed keeps the displayed sample stable across re-runs.
df.sample(10, random_state=42)

Unnamed: 0,title,url,published_at,author,publisher,short_description,keywords,header_image,raw_description,description,scraped_at,cleaned_text
374,Amanda Campbell: Compatent,https://www.cnbc.com/2016/12/07/amanda-campbel...,2016-12-07T14:17:33Z,,CNBC,"Meet Amanda Campbell, 24 from Kingston. She's ...","cnbc, Articles, Business News, Leadership, Ent...",https://image.cnbcfm.com/api/v1/image/10415174...,"<div class=""group""><p> Meet Amanda Campbell, ...","Meet Amanda Campbell, 24 from Kingston. She's ...",2021-10-30T14:12:12.537178,"Meet Amanda Campbell, 24 from Kingston. She's ..."
449,"CNBC Program Changes for Saturday, 11/15 & Sun...",https://www.cnbc.com/2014/11/12/cnbc-program-c...,2014-11-12T17:49:15Z,,CNBC,"(ALL TIMES ARE IN ET) Saturday, 11/15/2014: 1...","cnbc, Articles, CNBC Information and Policies,...",https://sc.cnbcfm.com/applications/cnbc.com/st...,"<div class=""group""><p> <strong><span>(ALL TIM...","(ALL TIMES ARE IN ET) Saturday, 11/15/2014: 1...",2021-10-30T14:12:22.096386,"(ALL TIMES ARE IN ET) Saturday, 11/15/2014: 1..."
456,Trump officials say China pursuing 'blame game...,https://www.cnbc.com/2019/06/04/trump-official...,2019-06-04T01:22:15Z,,CNBC,U.S. President Donald Trump's administration s...,"cnbc, Articles, Politics, U.S. Economy, Trade,...",https://image.cnbcfm.com/api/v1/image/10588390...,"<div class=""group""><p><a href=""https://www.cnb...",U.S. President Donald Trump's administration s...,2021-10-30T14:12:22.662418,U.S. President Donald Trump's administration s...
151,Gold as Collateral: Could This Solve the Euro ...,https://www.cnbc.com/2012/11/05/gold-as-collat...,2012-11-05T06:36:34Z,,CNBC,Debt-crippled euro zone countries could see th...,"cnbc, Articles, Business News, Economy, Europe...",https://image.cnbcfm.com/api/v1/image/10019620...,"<div class=""group""><p>Debt-crippled euro zone ...",Debt-crippled euro zone countries could see th...,2021-10-30T14:11:43.654753,Debt-crippled euro zone countries could see th...
51,39. Nexmo,https://www.cnbc.com/2014/06/16/disruptors-in-...,2014-06-17T10:09:51Z,,CNBC,"Founders: Tony Jamous (CEO), Eric Nadalin ...","Articles, Technology, CNBC Disruptors 2014, Mo...",https://fm.cnbc.com/applications/cnbc.com/reso...,,,2021-10-30T14:11:31.112864,
555,Google bought $750 million Lenovo stake on Jan...,https://www.cnbc.com/2014/02/06/google-bought-...,2014-02-07T04:14:11Z,,CNBC,Internet search company Google Inc bought a 5....,"cnbc, Articles, Alphabet Class A, Lenovo Group...",https://image.cnbcfm.com/api/v1/image/10039872...,"<div class=""group""><p>Internet search company ...",Internet search company Google Inc bought a 5....,2021-10-30T14:12:37.333137,Internet search company Google Inc bought a 5....
512,UPDATE 2-Serb central bank raises key rate to ...,https://www.cnbc.com/2012/10/09/update-2serb-c...,2012-10-09T13:32:00Z,,CNBC,* Rate rise of 25 basis points tied to inflati...,"cnbc, Articles, Europe, Poland, Eastern Europe...",https://sc.cnbcfm.com/applications/cnbc.com/st...,"<div class=""group""><p>* Rate rise of 25 basis ...",* Rate rise of 25 basis points tied to inflati...,2021-10-30T14:12:32.280763,* Rate rise of 25 basis points tied to inflati...
458,"After-hours buzz: AIG, CBS, King Digital & more",https://www.cnbc.com/2015/02/12/after-hours-bu...,2015-02-12T22:09:35Z,Karma Allen,CNBC,Check out which companies are making headlines...,"cnbc, Articles, Market Insider, American Inter...",https://image.cnbcfm.com/api/v1/image/10227052...,"<div class=""group""><p> <em>Check out which co...",Check out which companies are making headlines...,2021-10-30T14:12:23.066931,Check out which companies are making headlines...
49,OPEC comments show lost relevance ahead of mee...,https://www.cnbc.com/2015/11/23/opec-comments-...,2015-11-23T20:57:10Z,Patti Domm,CNBC,Comments from Saudi Arabian officials stirred ...,"cnbc, Articles, Market Insider, Oil and Gas, C...",https://image.cnbcfm.com/api/v1/image/10205379...,"<div class=""group""><p> Comments from Saudi Ar...",Comments from Saudi Arabian officials stirred ...,2021-10-30T14:11:30.557706,Comments from Saudi Arabian officials stirred ...
548,"There won’t be a summer rate hike, so I’ll foc...",https://www.cnbc.com/2016/06/06/there-wont-be-...,2016-06-06T16:44:27Z,Annie Pei,CNBC,Despite all signs from Fed Chair Janet Yellen ...,"cnbc, Articles, SPDR S&P Oil & Gas Exploration...",https://image.cnbcfm.com/api/v1/image/10368009...,"<div class=""group""><p> Despite all signs from...",Despite all signs from Fed Chair Janet Yellen ...,2021-10-30T14:12:36.619382,Despite all signs from Fed Chair Janet Yellen ...
