# Imports

In [694]:
import pandas as pd
import json
from urllib.request import Request, urlopen
import re
import datetime
from datetime import datetime as dt
from bs4 import BeautifulSoup as soup
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Begin work on ICO Projects

In [29]:
#ICO Watch List API Wrapper- Can be called with 'live', 'upcoming', and 'finished' on the end for their respective lists, otherwise calls all ICO's
#No API key needed, 1sec limit per call

class ICO_data():
    
    # Initiates the object (self) and allows variables to be set for use in any of the classes funtions
    def __init__(self):
        self.url =' https://api.icowatchlist.com/public/v1/'
        
    def get_json(self):
        '''
        Sends HTTP Request to provided url and returns a json (dictionary) object.

        Arguements: 'url' - Requires a full http address including any applicable API keys.
        '''
        request = Request(self.url, headers={'User-Agent': 'Python'})
        response = urlopen(request)
        raw_data = response.read()
        json_data = json.loads(raw_data)
        return json_data

    def get_ico_df(self):
        '''
        Performs the 'get_json()' funtion and converts it into a Pandas DataFrame
        '''
        json_data = self.get_json()
        ico_list = json_data['ico']['finished']
        df = pd.DataFrame(ico_list)
        return df


     
    
    def preprocess_data(self):
        '''
        Performs the 'get_df' function and removes erronus columns, converts time to DateTime objects and 
        converts the numbers to floats
        '''
        df = self.get_ico_df()
        
        df.drop(columns=['icowatchlist_url', 'image', 'website_link'], inplace = True)
        reordered_columns = ['Name', 'Description', 'Price(USD)', 'Start', 'End', 'ROI(Pct)', 'Timezone']
        df.rename(columns={'all_time_roi': 'ROI(Pct)',
                           'coin_symbol': 'Ticker',
                          'description': 'Description',
                           'end_time': 'End',
                           'name': 'Name',
                           'price_usd': 'Price(USD)',
                          'start_time': 'Start',
                           'timezone': 'Timezone',
                          }, inplace=True)
        df.set_index('Ticker', inplace = True)
        df=df.reindex(columns=reordered_columns)
        #Convert the strings to datetime objects
        df['Start'] = df['Start'].apply(lambda x: dt.strptime(x, "%Y-%m-%d %H:%M:%S"))
        df['End'] = df['End'].apply(lambda x: dt.strptime(x, "%Y-%m-%d %H:%M:%S"))
        df['Price(USD)'] = df['Price(USD)'].replace("NA",'0')
        df['ROI(Pct)'] = df['ROI(Pct)'].replace('NA','0%')
        #Split the price values that are over 1,000 at the ','
        df['Price(USD)'] = df['Price(USD)'].apply(lambda x: re.split(',', x))
        #Split the roi values at their ',', and '%'
        df['ROI(Pct)'] = df['ROI(Pct)'].apply(lambda x: re.split('[, %]', x))
        #Merge the strings back that are over 1,000 and turn them into floats
        try:
            df['Price(USD)'] = df['Price(USD)'].apply(lambda x: float(x[0] + x[1])) 
        #Convert the remaining strings back to floats    
        except:
            df['Price(USD)'] = df['Price(USD)'].apply(lambda x: float(x[0]))
        
        try:
            df['ROI(Pct)'] = df['ROI(Pct)'].apply(lambda x: float(x[0] + x[1])/100) 
        except:
            df['ROI(Pct)'] = df['ROI(Pct)'].apply(lambda x: float(x[0])/100)
        #Create a duration column from the start and end dates
        df["Duration"] = df['End'] - df['Start']
        return df

  

In [371]:
  
def check_cmc(project_name):
    '''
    Check Coin Market Cap for the project, if it exists capture the current price, market cap, volume, circulating supply, total supply, ATH, ATL
    '''
    cmc_base_url = 'https://coinmarketcap.com/currencies/'
    headers={'User-Agent':"Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11"} 
    request=Request(cmc_base_url + project_name, headers=headers) 
    response =urlopen(request)
    soup = BeautifulSoup(response, 'html.parser')
        
        

# Test ICO functions

In [30]:
#Instantiate the object
ico = ICO_data()

In [804]:
#Call the funtion and create the dateframe
clean_df = ico.preprocess_data()
clean_df.head()

Unnamed: 0_level_0,Name,Description,Price(USD),Start,End,ROI(Pct),Timezone,Duration
Ticker,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
REP,Augur,Augur is a decentralized prediction market,12.16,2015-08-17 00:00:00,2015-10-01 00:00:00,19.9569,UTC+0,45 days
LSK,Lisk,It is a cryptocurrency and decentralized appli...,0.78,2016-02-22 00:00:00,2016-03-21 00:00:00,9.3792,UTC+0,28 days
DGD,Digix DAO,A Decentralized Autonomous Organization focuse...,12.4,2016-03-30 12:00:00,2016-03-30 12:00:00,2.8272,UTC+0,0 days
WAVES,Waves,Waves helps to make the launching and coordina...,0.8,2016-04-12 13:00:00,2016-05-31 13:00:00,3.2407,UTC+0,49 days
STRAT,Stratis,Stratis was developed to help organisations de...,0.32,2016-06-21 00:00:00,2016-07-26 00:00:00,42.0832,UTC+0,35 days


# Begin work on Coin Projects

In [751]:
def tokenizer(text):
    """Tokenizes text."""
    addl_stopwords = ['ann', '']
    text = word_tokenize(text)
    text = [word.lower() for word in text]
    regex = re.compile("[^a-zA-Z ]")
    text = [regex.sub('', word) for word in text]
    sw = set(stopwords.words('english') + addl_stopwords)
    lemmatizer = WordNetLemmatizer()
    text = [lemmatizer.lemmatize(word) for word in text]
    clean_text = [word for word in text if word not in sw]
    return clean_text



class Coin_data():
    
    def __init__(self):
        self.btctalk_ann_url = 'https://bitcointalk.org/index.php?board=159.0'
        self.cmc_base_url = 'https://coinmarketcap.com/currencies/'
        self.cmc_coin_url = 'https://coinmarketcap.com/all/views/all/'

    def get_cmc_coins(self):
        headers={'User-Agent':"Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11"} 
        request=Request(self.cmc_coin_url, headers=headers) 
        response =urlopen(request)
        soup = BeautifulSoup(response, 'html.parser')
        coin_list = soup.findAll('a' , {'class':"currency-name-container link-secondary"})
        coins = []
        for coin in coin_list:
            coins.append(coin.text)
        return coins
        
        
    def get_new_projects(self):
        '''
        Gets a list of all new projects from Bitcointalk.org that are listed on CoinMarketCap
        '''
        #Get list of coins on Coin Market Cap
        print("Getting a list of all coins on Coin Market Cap")
        coin_list = self.get_cmc_coins()
        coin_list = [coin.lower() for coin in coin_list]
        
        #Prepare BS4 to scrape bitcointalk.org announcement page
        headers={'User-Agent':"Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11"} 
        request=Request(self.btctalk_ann_url, headers=headers) 
        response =urlopen(request)
        soup = BeautifulSoup(response, 'html.parser')
        
        #Create a list of all the post on the announcement page
        links = soup.findAll('a')
        
        #Capture the url for each post
        links = [url.get('href') for url in links if 'ANN' in url.text]
        
        #For loop over each url saving the content of each page to a dict key
        print('Looping over each url saving the content of each page to a dict key')
        coins = {}
        count = 0
        for url in links:
            count += 1
            headers={'User-Agent':"Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11"} 
            request=Request(url, headers=headers) 
            response =urlopen(request)
            soup = BeautifulSoup(response, 'html.parser')
            coins[count] = soup
            
        #For loop over each dict key (post html page) and captive the title, as well as search the body for key words
        print('Looping over each dict key (post html page) and captive the title, as well as search the body for key words')
        count = 1
        flags = ['gaurenteed', 'profit', 'government', 'approval', 'massive', 'mega', 'rich', 'money']
        name, rating = [],[]
        for i in range(len(coins)):
            count += 1
            scam_meter = 0

            for flag in flags:
                if flag in coins[i+1].text.lower():
                    scam_meter += 1
            name += [coins[i+1].title.text]
            rating += [scam_meter]
            
        #Create a dataframe to store the title and scam rating for each post
        df = pd.DataFrame({
            'Title':name,
            'Scam_Rating':rating,
        })
        return df
#commented out so the function will still run and return a df
    '''        
        #Extract the project name from the title
        print('Verifing if coin is on Coin Market Cap and removing those that are not')
        words, names=[], []
        for i in range(len(df)):
            words += tokenizer(df['Title'][i])
            for word in words:
                if word in coin_list:
                    cmc = 0
#                    names.append(word)
                else:
                    cmc = 1
                names.append(cmc)
#                    names.append('NA')
        print(names)           
#        df['Name'] = names
#        df = df.dropna()
'''                
        
        
        #Extract the ticker from the title
        
        
        #Extract a start date from the body of each post
        
        
        #Extract an end date from the body of each post
        
        
        #Capture the offering price for each coin
        
        
        #Identify the Algo for each project 
        


"        \n        #Extract the project name from the title\n        print('Verifing if coin is on Coin Market Cap and removing those that are not')\n        words, names=[], []\n        for i in range(len(df)):\n            words += tokenizer(df['Title'][i])\n            for word in words:\n                if word in coin_list:\n                    cmc = 0\n#                    names.append(word)\n                else:\n                    cmc = 1\n                names.append(cmc)\n#                    names.append('NA')\n        print(names)           \n#        df['Name'] = names\n#        df = df.dropna()\n"

# Test the Coin functions

In [752]:
#Instantiate the object
coin = Coin_data()

In [806]:
#Call the funtion and create the dateframe
df = coin.get_new_projects()
df.head()

Getting a list of all coins on Coin Market Cap
Looping over each url saving the content of each page to a dict key
Looping over each dict key (post html page) and captive the title, as well as search the body for key words


Unnamed: 0,Title,Scam_Rating
0,[ANN][ICO]HoweyCoins: the only BitcoinTalk-end...,4
1,"Users posting ""joined"" type posts when not req...",1
2,"[ANN] x42 - Feeless, PoS, Masternodes/Smart Co...",3
3,[ANN] Datacoin - Censorship-Free Data Storage,1
4,[ANN][DIVI][ONE-CLICK MASTERNODES] Tiered Mast...,0


In [803]:
#Tokenize each Title and check if any words match the coin_names[] if so add that name to the dataframe otherwise set the name to 'NA'
print('Verifing if coin is on Coin Market Cap and removing those that are not')
words={}
for i in range(len(df)):
    words[i]=tokenizer(df['Title'][i])
words

Verifing if coin is on Coin Market Cap and removing those that are not


{0: ['ico',
  'howeycoins',
  'bitcointalkendorsed',
  'ico',
  'guaranteed',
  'profit'],
 1: ['user',
  'posting',
  'joined',
  'type',
  'post',
  'required',
  'allowed',
  'banned'],
 2: ['x', 'feeless', 'po', 'masternodessmart', 'contractsside', 'blockchains'],
 3: ['divi',
  'oneclick',
  'masternodes',
  'tiered',
  'masternodes',
  'smart',
  'wallet',
  'live'],
 4: ['dash',
  'dash',
  'dashorg',
  'first',
  'selffunding',
  'selfgoverning',
  'crypto',
  'currency'],
 5: ['deeponion',
  'onion',
  'new',
  'private',
  'currency',
  'onion',
  'discover',
  'privacy'],
 6: ['cyber',
  'like',
  'google',
  'cybertendermintknowledge',
  'consensus',
  'supercomputer'],
 7: ['kmd',
  'dpow',
  'komodo',
  'open',
  'composable',
  'smart',
  'chain',
  'platform',
  'secured',
  'b'],
 8: ['qredit', 'hodler', 'enterprise', 'enhanced', 'blockchain', 'suite'],
 9: ['axe', 'decentralized', 'x', 'llmq', 'pow', 'pose'],
 10: ['neo', 'major', 'update', 'neo', 'blockchain', 'comin

In [795]:
for words in words[6]:
    print(words)

cyber
like
google
cybertendermintknowledge
consensus
supercomputer


In [799]:
for i in range(len(words)):
    for words in words[i]:
        print(words)

s


IndexError: string index out of range