# <h1 style='font-size:2.2rem;color:orange;'>Stock Markets Sensitivity to Social vs. News Media Sentiment</h1>

## 1. Install packages

In [2]:
#pip install transformers
#pip install yfinance

## 2. Import Libraries

In [None]:
import concurrent.futures
import re
import threading
import time

import bs4 as bs
import gensim
import gensim.corpora as corpora
import numpy as np
import pandas as pd
import requests
import spacy
import yfinance as yf
from gensim.models import CoherenceModel
from gensim.utils import simple_preprocess
from nltk.corpus import stopwords
from selenium import webdriver
from textblob import TextBlob
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

## 3. Download datasets | Stock Data | News Data | Twitter Data

### 3.1 Scrape Stock Data

In [None]:
df = pd.DataFrame()
tickers = [
    'AAPL', 'MSFT', 'TSLA', 'AMZN', 'ATVI', 'NVDA', 'FB', 'UBER', 'V',
    'MA', 'AVGO', 'CSCO', 'ADBE', 'CRM', 'AMD', 'INTC', 'NFLX',
]

# Download the price history of every ticker for the study window and
# store it as one CSV per ticker, tagged with the ticker symbol.
for ticker in tickers:
    data = yf.download(ticker, start='2021-12-01', end='2022-03-01')
    data['Ticker'] = ticker
    data.to_csv(f'Price_{ticker}.csv')

### 3.2 Scrape News

In [None]:
#Function to scrape news content, from the news links

# linkScraper runs on many worker threads at once. This lock makes the five
# appends that form one article "row" happen as a single step, so entry k of
# url/title/date/author/content always describes the same article.
_linkScraperLock = threading.Lock()


def linkScraper(newsLinks):
    """Scrape one article page and append its fields to the shared result lists.

    Parameters
    ----------
    newsLinks : str
        Address of a single article page (despite the plural name).

    Side effects
    ------------
    Appends exactly one value to each of the module-level lists ``url``,
    ``title``, ``date``, ``author`` and ``content``. A field that cannot be
    extracted is recorded as ``np.nan``. If the page cannot be fetched or
    parsed at all, the link is printed and every field is ``np.nan``.
    """
    linkSoup = None
    try:
        # A timeout keeps a stalled connection from blocking a worker forever.
        linkResp = requests.get(newsLinks, timeout=30)
        linkSoup = bs.BeautifulSoup(linkResp.text, 'lxml')
    except Exception:
        print(newsLinks)

    def extract(getter):
        """Apply ``getter`` to the parsed page, falling back to NaN on any failure."""
        if linkSoup is None:
            return np.nan
        try:
            return getter(linkSoup)
        except Exception:
            # Typically AttributeError: find() returned None for a missing tag.
            return np.nan

    # Collect every field BEFORE touching the shared lists.
    row = (
        extract(lambda soup: soup.find('meta', {"property": "og:url"}).get('content')),
        extract(lambda soup: soup.find('h1', {"class": "text__text__1FZLe text__dark-grey__3Ml43 text__medium__1kbOh text__heading_2__1K_hh heading__base__2T28j heading__heading_2__3Fcw5"}).text),
        extract(lambda soup: soup.find('span', {"class": "date-line__date__23Ge-"}).text),
        extract(lambda soup: soup.find('a', {"class": "author-name__author__1gx5k"}).text),
        extract(lambda soup: soup.find('div', {"class": "article-body__content__3VtU3 paywall-article"}).get_text(separator=' ')),
    )

    # Append the whole row atomically so concurrent workers cannot interleave.
    with _linkScraperLock:
        url.append(row[0])
        title.append(row[1])
        date.append(row[2])
        author.append(row[3])
        content.append(row[4])
    

#Function to scrape all news links
def _collectSearchLinks(driver, symbol):
    """Return the de-duplicated article links from the Reuters search for ``symbol``.

    Walks the search result pages (offsets 0, 10, ... 680) with ``driver``.
    Paging stops as soon as a page contains no result links. A page that
    fails to load, or whose elements cannot be read after three attempts,
    is skipped and the next page is tried.
    """
    searchFormat = 'https://www.reuters.com/site-search/?query={}&offset={}&sort=newest&date=past_year'
    resultSelector = ".search-results__item__2oqiX [href]"
    newsLinks = []

    for i in range(1, 70):
        try:
            driver.get(searchFormat.format(symbol, i*10-10))
            time.sleep(1)
        except Exception:
            continue

        pageLinks = None  # stays None if every attempt fails
        for attempt in range(3):
            try:
                # "css selector" is the value of selenium's By.CSS_SELECTOR;
                # find_elements(by, value) works on old and new Selenium releases,
                # unlike the removed find_elements_by_css_selector helper.
                elems = driver.find_elements("css selector", resultSelector)
                pageLinks = [elem.get_attribute('href') for elem in elems]
                break  # success: no need to query the same page again
            except Exception:
                print ('webpage error, retrying again')

        if pageLinks is None:
            continue
        if not pageLinks:
            print('no elements, go to next stock')
            break
        newsLinks.extend(pageLinks)

    return list(set(newsLinks))


def newScraper():
    """Scrape Reuters articles for every tracked company and return them as a DataFrame.

    For each company name, the search result links are collected and every
    linked article is scraped by ``linkScraper`` on a pool of 30 threads.

    Side effects
    ------------
    Rebinds the module-level lists ``stock``, ``url``, ``title``, ``date``,
    ``author`` and ``content`` (``linkScraper`` appends to them) and prints
    the running list lengths after each company.

    Returns
    -------
    pandas.DataFrame
        Columns ``Stock``, ``Date``, ``Title``, ``Author``, ``Content``, ``Url``.
    """
    symbols = ['Amazon', 'Apple', 'Microsoft', 'Tesla', 'Blizzard', 'Nvidia', 'Facebook',
               'Uber', 'Mastercard', 'AMD', 'Intel', 'Netflix']

    global stock, url, title, date, author, content
    stock = []
    url = []
    title = []
    date = []
    author = []
    content = []

    # NOTE(review): passing the driver path positionally is only accepted by
    # older Selenium releases - confirm against the installed version.
    driver = webdriver.Chrome("chromedriver.exe")

    try:
        for symbol in symbols:
            newsLinks = _collectSearchLinks(driver, symbol)

            # Leaving the with-block waits for every submitted article to finish.
            with concurrent.futures.ThreadPoolExecutor(max_workers=30) as executor:
                executor.map(linkScraper, newsLinks)
            stock.extend([symbol] * len(newsLinks))

            print(str(symbol) + " stock length is : " + str(len(stock)))
            print(str(symbol) + " url length is : " + str(len(url)))
            print(str(symbol) + " title length is : " + str(len(title)))
            print(str(symbol) + " date length is : " + str(len(date)))
            print(str(symbol) + " author length is : " + str(len(author)))
            print(str(symbol) + " content length is : " + str(len(content)))
    finally:
        # Always release the browser, even if scraping fails midway.
        driver.quit()

    df = pd.DataFrame({'Stock': stock, 'Date':date, 'Title': title, 'Author': author, 'Content':content, 'Url':url})
    return (df)


def clean():
    """Clean the module-level ``df`` of scraped news and add a ``Ticker`` column.

    Steps, applied to the global ``df`` (which is rebound to the result):

    1. drop every row that contains a missing value;
    2. remove the Reuters registration / trust-principles boilerplate and a
       set of punctuation characters from ``Content``, then lower-case it;
    3. insert ``Ticker`` as the first column, looked up from ``Stock``.

    The ticker is looked up by company name rather than by position, so the
    result stays correct when a company has no rows left after step 1.
    A ``Stock`` value that is not in the table gets a missing ticker.
    Calling the function again replaces the existing ``Ticker`` column
    instead of failing.
    """
    global df

    stockToTicker = {
        'Amazon': 'AMZN', 'Apple': 'AAPL', 'Microsoft': 'MSFT', 'Tesla': 'TSLA',
        'Blizzard': 'ATVI', 'Nvidia': 'NVDA', 'Facebook': 'FB', 'Uber': 'UBER',
        'Mastercard': 'MA', 'AMD': 'AMD', 'Intel': 'INTC', 'Netflix': 'NFLX',
    }

    def scrub(text):
        """Strip boilerplate and punctuation from one article body and lower-case it."""
        text = (text
                .replace('Register now for FREE unlimited access to Reuters.com Register ', '')
                .replace(' Our Standards:  The Thomson Reuters Trust Principles.', ''))
        return re.sub('[,.\'"!+?$%]', '', text).lower()

    # Drop a Ticker column left by an earlier run so the insert below cannot fail.
    cleaned = df.drop(columns='Ticker', errors='ignore').dropna()
    cleaned = cleaned.assign(Content=cleaned['Content'].map(scrub))
    cleaned.insert(loc=0, column='Ticker', value=cleaned['Stock'].map(stockToTicker).tolist())
    df = cleaned





### 3.3 Scrape Twitter

## 4. Data Pre-processing | Stock Data | News Data | Twitter Data

### 4.1 News Cleaning and Bag-of-words

In [None]:
# Load the scraped articles; the first CSV column becomes the index
df = pd.read_csv('News.csv', index_col=0)

# Parse the dates and store them back as day/month/year strings
df['Date'] = pd.to_datetime(df['Date']).dt.strftime('%d/%m/%Y')

# Strip a further set of punctuation characters from the article text
df['Content'] = df['Content'].map(lambda text: re.sub('[-=)(*><&:]', '', text))

#Tokenize every single word into a list
def token(i):
    """Yield a list of word tokens for every document in ``i``.

    Parameters
    ----------
    i : iterable
        The documents to tokenize (for example the ``Content`` column).
        Each item is converted with ``str`` before tokenizing.

    Yields
    ------
    list of str
        The output of ``gensim.utils.simple_preprocess(..., deacc=True)``
        for one document.
    """
    # Iterate over the argument itself rather than the global DataFrame,
    # so the function works on whatever collection the caller passes in.
    for document in i:
        yield gensim.utils.simple_preprocess(str(document), deacc=True)
  
    
#Make bigrams for each news doc
def makeBigrams(content, min_count=5, threshold=150):
    """Train a gensim ``Phrases`` model on ``content`` and apply it to every document.

    Parameters
    ----------
    content : iterable of list of str
        Tokenized documents. The iterable is materialised once, so a
        generator is accepted.
    min_count : int, optional
        Passed to ``gensim.models.Phrases`` (default 5).
    threshold : float, optional
        Passed to ``gensim.models.Phrases`` (default 150); a higher
        threshold yields fewer phrases.

    Returns
    -------
    list
        One transformed document per input document, in the same order.
    """
    # The model is trained on the argument, not on a module-level variable,
    # so the result depends only on what the caller passes in.
    docs = list(content)
    bigram = gensim.models.Phrases(docs, min_count=min_count, threshold=threshold)
    return [bigram[doc] for doc in docs]
         

#Lemmatization
def lemmatization(content, allowed_postags=['NOUN', 'PROPN', 'VERB', 'ADP']):
    """Replace each document's tokens with the lemmas of the tokens worth keeping.

    Every document (a list of tokens) is joined into one space-separated
    string and run through the spaCy ``en_core_web_sm`` pipeline. Only
    tokens whose part-of-speech tag is in ``allowed_postags`` are kept,
    and each kept token is replaced by its lemma.

    Returns a list with one list of lemmas per input document.
    """
    nlp = spacy.load("en_core_web_sm")

    def keptLemmas(words):
        # spaCy needs running text, so glue the tokens back together first
        parsed = nlp(" ".join(words))
        return [tok.lemma_ for tok in parsed if tok.pos_ in allowed_postags]

    return [keptLemmas(words) for words in content]

#Remove all the stopwords
def stopword(content):
    """Remove stop words from every document.

    Parameters
    ----------
    content : iterable of list of str
        Tokenized documents. Each document is converted with ``str`` and
        re-tokenized with ``simple_preprocess`` before filtering.

    Returns
    -------
    list of list of str
        For each document, the tokens that are neither NLTK English stop
        words nor one of the extra corpus-specific words listed below.
    """
    # A set gives constant-time membership tests for the per-token filter.
    stopWords = set(stopwords.words('english'))
    stopWords.update(['reuters', 'january', 'february', 'march', 'april', 'may', 'june',
                      'july', 'august', 'september', 'october', 'november', 'december',
                      'london', 'los_angele', 'summary', 'new york', 'new_york',
                      'bengaluru', 'america', 'north_carolina'])
    return [[word for word in simple_preprocess(str(doc)) if word not in stopWords]
            for doc in content]


    
#Run the preprocessing pipeline one stage at a time
matrix1 = list(token(df.Content))   # tokenized documents
matrix2 = makeBigrams(matrix1)      # bigram phrases applied
matrix3 = lemmatization(matrix2)    # lemmas of the allowed parts of speech
matrix4 = stopword(matrix3)         # stop words removed

#Build the gensim dictionary from the cleaned documents
matrixDict = corpora.Dictionary(matrix4)

#Convert every cleaned document to its bag-of-words form
corpus = list(map(matrixDict.doc2bow, matrix4))