# Scrape data from Investing.com

In [8]:
from selenium import webdriver
from time import sleep
import errno    
import os
import os.path
import datetime
import sys
import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup
from functools import reduce

xpath_nextpage = '//div[@id="paginationWrap"]//div[last()]/a'
xpath_msg = '//div[contains(@class,"mainComment")]//div[contains(@class,"commentText")]//span[@class="js-text"]'
xpath_date = '//div[contains(@class,"mainComment")]//div[contains(@class,"commentBody")]//span[@class="js-date"]'

comments_list = []

def build_chrome_options():
    chrome_options = webdriver.ChromeOptions()
    chrome_options.accept_untrusted_certs = True
    chrome_options.assume_untrusted_cert_issuer = True
    # chrome configuration
    # More: https://github.com/SeleniumHQ/docker-selenium/issues/89
    # And: https://github.com/SeleniumHQ/docker-selenium/issues/87
    chrome_options.add_argument("incognito")
    chrome_options.add_argument("--no-sandbox")
    chrome_options.add_argument("--window-size=1024,800")
    chrome_options.add_argument("disable-extensions")
    chrome_options.add_argument("--start-maximized")
    chrome_options.add_argument("--test-type=browser")
    chrome_options.add_argument("--disable-impl-side-painting")
    chrome_options.add_argument("--disable-setuid-sandbox")
    chrome_options.add_argument("--disable-seccomp-filter-sandbox")
    chrome_options.add_argument("--disable-breakpad")
    chrome_options.add_argument("--disable-client-side-phishing-detection")
    chrome_options.add_argument("--disable-cast")
    chrome_options.add_argument("--disable-cast-streaming-hw-encoding")
    chrome_options.add_argument("--disable-cloud-import")
    chrome_options.add_argument("--disable-popup-blocking")
    chrome_options.add_argument("--ignore-certificate-errors")
    chrome_options.add_argument("--disable-session-crashed-bubble")
    chrome_options.add_argument("--disable-ipv6")
    chrome_options.add_argument("--allow-http-screen-capture")
    return chrome_options 

def init():
    global driver
    driver = webdriver.Chrome('TA_Exam/Misc/chromedriver', options=build_chrome_options())

def connect(page):
    driver.get('https://www.investing.com/equities/tesla-motors-commentary/' + str(page))

def scrap():
    msgs = driver.find_elements_by_xpath(xpath_msg)
    dates = driver.find_elements_by_xpath(xpath_date)
    
    try:
        for i in range(len(msgs)):
            soup = BeautifulSoup(msgs[i].text, 'html.parser').decode("utf-8")
            time = dates[i].text
            
            comment_dict = {
                "date": time,
                "comment": soup
            }
            
            print(comment_dict)
            
            comments_list.append(comment_dict)

    finally:
        print("done scraping")


def main(argv):
    #angiv aktier man vil scrape i listen
    init()
    max_pages = 1

    page = 1
    while page < max_pages:
        connect(page)
        sleep(3)
        scrap()
        print("success")
        page += 1

    df = pd.DataFrame(comments_list)
    df.to_csv('test.csv')
    comments_list.clear()


if __name__ == "__main__":
    main(sys.argv)

WebDriverException: Message: 'chromedriver' executable needs to be in PATH. Please see https://sites.google.com/a/chromium.org/chromedriver/home


# Read and format data

In [5]:
df = pd.read_csv('test.csv')

In [6]:
df.head()

Unnamed: 0.1,Unnamed: 0


In [353]:
df.drop(['Unnamed: 0'], axis=1, inplace=True)

In [354]:
from dateutil import parser
import re

def convert_date(date_string):
    
    if 'last' in str(date_string):
        return
    
    if 'months' in str(date_string):
        return
    
    if 'minutes' in str(date_string):
        return
    
    if 'hours' in str(date_string):
        return

    if 'hour' in str(date_string):
        return
    
    if 'nan' in str(date_string):
        return
    
    dt = parser.parse(date_string)  
    return dt.strftime("%Y-%m-%d")

In [355]:
df['date'] = df['date'].apply(lambda x: convert_date(x))



In [356]:
df.tail()

Unnamed: 0,comment,date
95,Like I said it wouldn't stay above $240 for lo...,2019-05-02
96,cmon teslwrati give more money to your king!\n,2019-05-02
97,Musk-Con-Teer at its finest. Everyone knew TSL...,2019-05-02
98,I wouldn't be surprised if it went below $240 ...,2019-05-02
99,this stock is really insane... or a trader dre...,2019-05-02


# Sentiment Analysis

In [363]:
import nltk
nltk.download('vader_lexicon')
from nltk.sentiment.vader import SentimentIntensityAnalyzer

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /Users/mathiaslund/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


In [364]:
sid = SentimentIntensityAnalyzer()
for comment in df.comment:
    print(comment)
    ss = sid.polarity_scores(comment)
    for k in sorted(ss):
        print('{0}: {1}, '.format(k, ss[k]), end='')
        
    print()
    print()
    print()

It seems that no ship arrived in April to EU based on this data http://bit.ly/TeslaCarrier  (last one to . Zeebrugge on Mar,24) that would explain drop in EU registrations. Next batch arriving May 10. . Then we can watch live registrations to see if it is ...Show more

compound: -0.5106, neg: 0.089, neu: 0.911, pos: 0.0, 


250 premarket open...

compound: 0.0, neg: 0.0, neu: 1.0, pos: 0.0, 


Clean cars. For just that worth buy . Because we all know he will get there but just we don't know when.

compound: 0.3182, neg: 0.0, neu: 0.852, pos: 0.148, 


And liquidation of his shares would create downward share price pressure

compound: 0.5106, neg: 0.14, neu: 0.446, pos: 0.414, 


Tesla CEO Elon Musk is more than doubling the stock he will buy in a public offering in an apparent earn investors confidence

compound: 0.5106, neg: 0.0, neu: 0.864, pos: 0.136, 


A moderate slide it Tesla share could quickly turn into an avalanche

compound: 0.296, neg: 0.0, neu: 0.82, pos: 0.18, 


Tesla cr

AttributeError: 'float' object has no attribute 'encode'

In [369]:
def get_compound_score(comment):
    ss = sid.polarity_scores(str(comment))
    return ss['compound']

def get_sentiment(compound_score):
    if compound_score > 0.05:
        return 1
    elif compound_score > -0.05 and compound_score < 0.05:
        return 0
    else:
        return -1

In [370]:
df['compound_score'] = df['comment'].apply(lambda x: get_compound_score(x))
df['sentiment'] = df['compound_score'].apply(lambda x: get_sentiment(x))

Positive sentiment: compound >= 0.05

Neutral sentiment: compound > -0.05 < 0.05

Negative sentiment: compound < -0.05

In [371]:
df.head()

Unnamed: 0,comment,date,compound_score,sentiment
0,It seems that no ship arrived in April to EU b...,,-0.5106,-1
1,250 premarket open...\n,,0.0,0
2,Clean cars. For just that worth buy . Because ...,,0.3182,1
3,And liquidation of his shares would create dow...,,0.5106,1
4,Tesla CEO Elon Musk is more than doubling the ...,,0.5106,1


# Prepare data for ML

In [372]:
new_df = pd.DataFrame([], columns=['avg_sentiment', 'avg_compound_score', 'comments', 'pos_comments', 'neg_comments'])

In [373]:
df['date'].unique

<bound method Series.unique of 0           None
1           None
2           None
3           None
4           None
5           None
6           None
7           None
8           None
9           None
10          None
11          None
12          None
13          None
14          None
15          None
16    2019-05-04
17    2019-05-04
18    2019-05-04
19    2019-05-04
20    2019-05-04
21    2019-05-04
22    2019-05-04
23    2019-05-04
24    2019-05-03
25          None
26          None
27          None
28          None
29          None
         ...    
70    2019-05-02
71    2019-05-02
72    2019-05-02
73    2019-05-02
74    2019-05-02
75    2019-05-02
76    2019-05-02
77    2019-05-02
78    2019-05-02
79    2019-05-02
80    2019-05-02
81    2019-05-02
82    2019-05-02
83    2019-05-02
84    2019-05-02
85    2019-05-02
86    2019-05-02
87    2019-05-02
88    2019-05-02
89    2019-05-02
90    2019-05-02
91    2019-05-02
92    2019-05-02
93    2019-05-02
94    2019-05-02
95    2019-05-02


In [125]:
df[df['sentiment'] == -1].groupby(['date']).count()

Unnamed: 0_level_0,comment,user,compound_score,sentiment
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2019-04-18,1,1,1,1
2019-04-24,9,9,9,9
2019-04-27,2,2,2,2


In [106]:
new_df['df[df['sentiment'] == 1].count()

comment           28
date              12
user              28
compound_score    28
sentiment         28
dtype: int64

In [97]:
new_df = df.groupby(['date']).count()

In [98]:
new_df

Unnamed: 0_level_0,comment,user,compound_score,sentiment
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2019-04-12,1,1,1,1
2019-04-18,1,1,1,1
2019-04-24,33,33,33,33
2019-04-25,1,1,1,1
2019-04-27,2,2,2,2
2019-04-28,1,1,1,1
2019-04-30,2,2,2,2


In [90]:
df.groupby(['date']).mean()

Unnamed: 0_level_0,compound_score,sentiment
date,Unnamed: 1_level_1,Unnamed: 2_level_1
2019-04-12,0.0,0.0
2019-04-18,-0.2481,-1.0
2019-04-24,-0.023673,0.030303
2019-04-25,0.0,0.0
2019-04-27,-0.741,-1.0
2019-04-28,0.9485,1.0
2019-04-30,0.31245,0.5


In [91]:
df.groupby(['date']).count()

Unnamed: 0_level_0,comment,user,compound_score,sentiment
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2019-04-12,1,1,1,1
2019-04-18,1,1,1,1
2019-04-24,33,33,33,33
2019-04-25,1,1,1,1
2019-04-27,2,2,2,2
2019-04-28,1,1,1,1
2019-04-30,2,2,2,2
