In [6]:
from urllib.request import Request, urlopen as Ureq
import urllib.request
import requests
import PyPDF4
from analysis import tokenizer, twitter_sent_analysis
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from bs4 import BeautifulSoup as soup
import pandas as pd
import os
from collections import Counter
import time
import sys

nltk.download('vader_lexicon')

addl_stopwords = [',','`', '', 'rt', 'http', 'https', 'RT', 'BTC', 'bitcoin', 'ETH', 'LTC', 'XRP', 'co', 'crypto', 'blockchain', 'cryptocurrency', 'cripto', 'litecoin']

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\cscat\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


In [21]:
class WhitepapersDB():

    def __init__(self):
        self.wpdb_base_url = 'https://whitepaperdatabase.com/?s='
        self.acwp_base_url = 'https://www.allcryptowhitepapers.com/?s='


    def base_page(self, term, site):
        '''
        Returns the base page for the whitepaper of the coin entered.  Returns bs4 'soup' object.  
        
        Arguements: 
        
        term - The serch term or 'coin' being researched (ex. Bitcoin)
        
        site - Must equal wpdb for WhitepaperDatabase.com or acwp for AllCryptoWhitePapers.com
        '''
        if site == 'wpdb':
            new_url = self.wpdb_base_url + term
        elif site == 'acwp':
            new_url = self.acwp_base_url + term
        else:
            print('Error: Site must equal wpdb or acwp, see docstring for more details.')

        print(new_url)
        uClient = Ureq(new_url)
        raw_content = uClient.read()
        uClient.close()
        page_soup = soup(raw_content)
        return page_soup

    def get_paper_url_wpdb(self, page_soup):
        containers = page_soup.findAll("a")
        url = containers[8]['href']
        return url
    
    def get_paper_url_acwp(self, page_soup):
        containers = page_soup.findAll("a")
        url = containers[14]['href']
        return url

    def get_pdf_link(self, paper_url):
        uClient = Ureq(paper_url)
        raw_content = uClient.read()
        uClient.close()
        page_soup = soup(raw_content)
        pdf_link = page_soup.findAll("a", {"class":"pdfemb-viewer"})
        return pdf_link[0]['href']

    def get_pdf(self, ticker, pdf_link):
        filename = '../../data/whitepapers/' + ticker + '_whitepaper.pdf'
        urllib.request.urlretrieve(pdf_link, filename)

    def read_pdf(self, ticker):
        corpus = ''
        filename = '../../data/whitepapers/' + ticker + '_whitepaper.pdf'
        pdf_obj = open(filename, 'rb')
        pdfReader = PyPDF4.PdfFileReader(pdf_obj)
        pages = pdfReader.numPages
        for i in range(pages):
            raw_text = pdfReader.getPage(i)
            corpus = corpus + raw_text.extractText()
        pdf_obj.close()
        return corpus
    
    def preprocess(self, body):
        tokens = tokenizer(body)
        text = ''
        corpus = [text + ' ' + word for word in tokens]
        return corpus

    def check_sent(self, corpus):
        analyzer = SentimentIntensityAnalyzer()
        sentiment = analyzer.polarity_scores(corpus)
        return sentiment
    
    def remove_pdf(self, ticker):
        filename = '../../data/whitepapers/' + ticker + '_whitepaper.pdf'
        print(f'Removing {filename}')
        os.remove(filename)
        
    
    

In [22]:
wp = WhitepapersDB()

In [23]:
df.head()

Unnamed: 0,rank_cnc,Name,Ticker,market_cap,price,volume24,pct1h,pct24h,pct7d,country,...,pre_Duration,compound,negative,neutral,positive,no_of_posts,Longevity,custom_index,custom_index_scaled,success
0,2,Ethereum,ETH,20155411771,185.67,7906136399,-0.1,-1.2,-0.96,Switzerland,...,0 days 00:00:00.000000000,0.46057,0.06116,0.02739,0.90151,100,779,144636.93,0.129162,1
1,7,Binance Coin,BNB,3277372658,21.07,231846713,-0.46,-1.33,2.49,Japan,...,0 days 00:00:00.000000000,0.563773,0.09667,0.04141,0.86189,100,848,17867.36,0.015956,1
2,8,EOS,EOS,3214893394,3.42,1735988258,0.11,-1.49,-2.55,Cayman Islands,...,0 days 00:00:00.000000000,0.82847,0.0322,0.85855,0.10927,100,522,1785.24,0.001594,0
3,13,Cardano,ADA,1105888653,0.042654,53437224,-0.5,-1.72,-2.12,Switzerland,...,0 days 00:00:00.000000000,0.589474,0.098387,0.032946,0.868753,93,1049,44.744046,4e-05,0
4,16,Huobi Token,HT,913746412,3.79,252881495,-0.27,-1.89,-4.79,Seychelles,...,0 days 00:00:00.000000000,0.525717,0.08163,0.03348,0.88494,100,779,2952.41,0.002637,1


In [10]:
df = pd.read_csv('../../data/cleandata/success_df.csv')

In [32]:
count = 0
no_paper = []
scores = {}
for name in df['Name']:
    count += 1
    term = name.split(' ')
    try:
        term = str(term[1])+'%20'+str(term[2])
        print(term)
    except:
        term = term[1]
    print(f'Searching for {term} White Paper')
    try:
        wp.get_pdf(term, wp.get_pdf_link(wp.get_paper_url_wpdb(wp.base_page(term, 'wpdb'))))
        print(f'{name} White Paper successfully found on WhitepagesDatabase.com, beginning processing')
        print(f'{count} / {len(df)}')
        file = wp.read_pdf(term)
        words = wp.preprocess(file)
        corpus = ''
        for word in wp.preprocess(file):
            corpus = str(word) + ' ' + str(corpus)
        scores[name] = wp.check_sent(corpus)

        
    
    except:
        try:
            print(f'{name} White Paper Unavalible on WhitepagesDatabase.com, Searching AllCryptoWhitepages.com')
            wp.get_pdf(term, wp.get_pdf_link(wp.get_paper_url_acwp(wp.base_page(term, 'acwp'))))
            print(f'{name} White Paper successfully found on AllCryptoWhitepages.com, beginning processing')
            print(f'{count} / {len(df)}')
            file = wp.read_pdf(term)
            words = wp.preprocess(file)
            corpus = ''
            for word in wp.preprocess(file):
                corpus = str(word) + ' ' + str(corpus)
            scores[name] = wp.check_sent(corpus)
            
        
        except:
            print(f'{name} White Paper Unavalible on either site, adding to list')
            no_paper.append(name)
            print(f'{count} / {len(df)}')
            scores[name] = 'NA'      
    
    

Searching for Ethereum White Paper
https://whitepaperdatabase.com/?s=Ethereum
 Ethereum White Paper successfully found on WhitepagesDatabase.com, beginning processing
1 / 573
 Ethereum White Paper Unavalible on WhitepagesDatabase.com, Searching AllCryptoWhitepages.com
https://www.allcryptowhitepapers.com/?s=Ethereum
 Ethereum White Paper Unavalible on either site, adding to list
1 / 573
Binance%20Coin
Searching for Binance%20Coin White Paper
https://whitepaperdatabase.com/?s=Binance%20Coin
 Binance Coin White Paper successfully found on WhitepagesDatabase.com, beginning processing
2 / 573
Searching for EOS White Paper
https://whitepaperdatabase.com/?s=EOS
 EOS White Paper successfully found on WhitepagesDatabase.com, beginning processing
3 / 573
Searching for Cardano White Paper
https://whitepaperdatabase.com/?s=Cardano
 Cardano White Paper successfully found on WhitepagesDatabase.com, beginning processing
4 / 573
Huobi%20Token
Searching for Huobi%20Token White Paper
https://whitepaper



Ethereum%20Classic
Searching for Ethereum%20Classic White Paper
https://whitepaperdatabase.com/?s=Ethereum%20Classic
 Ethereum Classic White Paper successfully found on WhitepagesDatabase.com, beginning processing
9 / 573
 Ethereum Classic White Paper Unavalible on WhitepagesDatabase.com, Searching AllCryptoWhitepages.com
https://www.allcryptowhitepapers.com/?s=Ethereum%20Classic
 Ethereum Classic White Paper successfully found on AllCryptoWhitepages.com, beginning processing
9 / 573
 Ethereum Classic White Paper Unavalible on either site, adding to list
9 / 573
Basic%20Attenti...
Searching for Basic%20Attenti... White Paper
https://whitepaperdatabase.com/?s=Basic%20Attenti...
 Basic Attenti... White Paper Unavalible on WhitepagesDatabase.com, Searching AllCryptoWhitepages.com
https://www.allcryptowhitepapers.com/?s=Basic%20Attenti...
 Basic Attenti... White Paper Unavalible on either site, adding to list
10 / 573
Searching for Decred White Paper
https://whitepaperdatabase.com/?s=Decre

In [27]:
scores

{' Ethereum': 'NA',
 ' Binance Coin': {'neg': 0.0, 'neu': 0.0, 'pos': 0.0, 'compound': 0.0},
 ' EOS': {'neg': 0.016, 'neu': 0.963, 'pos': 0.02, 'compound': 0.3182}}

In [33]:
scored_df = pd.DataFrame(scores).T

In [34]:
scored_df

Unnamed: 0,compound,neg,neu,pos
Ethereum,,,,
Binance Coin,0,0,0,0
EOS,0.3182,0.016,0.963,0.02
Cardano,-0.9001,0.022,0.95,0.028
Huobi Token,,,,
Tezos,0,0,1,0
Cosmos,,,,
Dash,0,0,0,0
Ethereum Classic,,,,
Basic Attenti...,,,,


In [35]:
scored_df.to_csv('../../data/cleandata/wp_scores.csv')