# Polymarket and PredictIt webscrapers

This notebook helps you scrape all of the markets listed on Polymarket and PredictIt, and compare their prices for arbitrage opportunities.

# Install packages

In [1]:
pip install py-clob-client

In [2]:
!pip install pandas sentence-transformers scikit-learn numpy
!pip install --upgrade protobuf==5.27.2

# Polymarket Scraper

In [None]:
import pandas as pd
import os
from py_clob_client.constants import POLYGON
from py_clob_client.client import ClobClient
from py_clob_client.clob_types import OrderArgs
from py_clob_client.order_builder.constants import BUY
from google.colab import userdata


host = "https://clob.polymarket.com"
raise ValueError("Insert your polymarket key below!")
key = userdata.get('polymarket_key')
chain_id = POLYGON

# Create CLOB client and get/set API credentials
client = ClobClient(host, key=key, chain_id=chain_id)
client.set_api_creds(client.create_or_derive_api_creds())
def get_all_markets(client):
    cursor = ""
    all_markets = []

    while True:
        try:
            markets = client.get_sampling_markets(next_cursor = cursor)
            # markets = client.get_simplified_markets(next_cursor = cursor)
            markets_df = pd.DataFrame(markets['data'])
            cursor = markets['next_cursor']

            all_markets.append(markets_df)

            if cursor is None:
                break
        except:
            break

    all_df = pd.concat(all_markets)
    all_df = all_df.reset_index(drop=True)

    return all_df

In [None]:
markets = get_all_markets(client)

# All the markets are active and not closed anyways
markets = markets[markets['active'] == True]
markets = markets.loc[:, ['condition_id', 'question_id', 'question', 'tokens']]
markets.head()

Unnamed: 0,condition_id,question_id,question,tokens
0,0x26ee82bee2493a302d21283cb578f7e2fff2dd157438...,0x22b180e61b0628d5a2c3fc05ad0e3bf19a499c492e4e...,Which party wins 2024 US Presidential Election?,[{'token_id': '1101547097368417782972921928726...
1,0xda60399dab4f9cb4dc21b8a7e46fc3e9a141e8da6a23...,0xadaf1dc7f57a76a8ca331b593dc7b2cf55a1e12db711...,Senate control after 2024 election?,[{'token_id': '1064284159723064408056597988215...
2,0xe32b258020c4663576e22423fb9792932a44fd5986d5...,0x5435f5434e4ed38a43a57289d297a20d717dbc583326...,Will ≥10% of Votes go to 3rd Party Candidates ...,[{'token_id': '7595642012662687480103274281352...
3,0xd2d4628c6119f20763ac855b54db1fbb408b518d81bb...,0x82dc7bdf0184d71fb83bc6804d5a1d0ea8cbf1293ee3...,OpenAI announces it has achieved AGI in 2024?,[{'token_id': '1058504388286148844521973109384...
4,0x01a9eea306780839c5cf9a15a572a438c23af6c49b57...,0xc7a8c727d15683c1755ab2d4fcc06d9d2193d7c3204b...,Joe Biden impeached before 2024 election?,[{'token_id': '7595561407791009430045213829302...


# Create comparison functions between both API's

This imports a simple language model to automatically find the prompts which are the same.

In [3]:
import pandas as pd
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

model = SentenceTransformer('all-MiniLM-L6-v2')

# Encode the questions outside of the function so that you don't have to do it every time
full_df_encodings = model.encode(markets['question'].tolist())

def find_similar_question(input_question, df, df_encodings, similarity_threshold=0.95):
    # Encode the input question
    input_encoding = model.encode([input_question])

    # Calculate cosine similarity
    similarities = cosine_similarity(input_encoding, df_encodings)[0]

    # Find the index of the most similar question
    most_similar_index = np.argmax(similarities)

    # Check if the similarity is above the threshold
    if similarities[most_similar_index] >= similarity_threshold:
        return df.iloc[most_similar_index]['question'], df.iloc[most_similar_index]['tokens'], df.iloc[most_similar_index]['condition_id'], similarities[most_similar_index]
    else:
        return df.iloc[most_similar_index]['question'], None, None, similarities[most_similar_index]



from datetime import datetime, timezone
# Get time until market ends
def time_difference_from_now(date_string):
    # Parse the input string to a datetime object
    target_date = datetime.strptime(date_string, "%Y-%m-%dT%H:%M:%SZ").replace(tzinfo=timezone.utc)

    # Get the current time in UTC
    current_time = datetime.now(timezone.utc)

    # Calculate the time difference
    time_difference = current_time - target_date

    # Calculate days, hours, minutes
    days = time_difference.days
    hours, remainder = divmod(time_difference.seconds, 3600)
    minutes, _ = divmod(remainder, 60)

    # Determine if the date is in the past or future
    if time_difference.total_seconds() > 0:
        tense = "ago"
    else:
        tense = "in the future"
        days, hours, minutes = abs(days), abs(hours), abs(minutes)

    return days #, f"{days} days, {hours} hours, and {minutes} minutes {tense}"

from py_clob_client.clob_types import BookParams


# Get the polymarket buy yes and stuff
def get_polymarket_values(input_question, markets_df, df_encodings):
  # input_question = "Will Kamala Harris win the 2024 US presidential election?"
  question, token_id, condition_id, similarity_score = find_similar_question(input_question, markets_df, df_encodings)

  if token_id:
      pass
      # print(f"  Most similar question: {question}")
      # print(f"Most similar question ID: {token_id}")
      # print(f"Similarity score: {similarity_score}")
  else:
      # print(f"  No similar question found. Highest similarity score: {similarity_score}. Most similar question: {question}")
      return None, None, None, None


  for i in range(0, len(token_id)):
    if token_id[i]['outcome'] == 'Yes':
      # buy_yes  = BookParams(token_id=token_id[i]['token_id'],side="BUY")
      best_sell_yes = client.get_price(token_id=token_id[i]['token_id'],side="SELL")['price'] #BookParams(token_id=token_id[i]['token_id'],side="SELL")
    else:
      # buy_no  = BookParams(token_id=token_id[i]['token_id'],side="BUY")
      best_sell_no = client.get_price(token_id=token_id[i]['token_id'],side="SELL")['price']

  days_till_end = None
  try:
    days_till_end = time_difference_from_now(client.get_market(condition_id)['end_date_iso'])
  except:
    pass
  return question, days_till_end, best_sell_yes, best_sell_no


# Function which checks for arbitrage

Note: PredictIt has some fees that you have to account for, which is a 10% capital gains fee and a 5% withdrawl fee. We only care about arbitrage if it is profitable after accounting for these fees

In [None]:
# Assume you invested $1000 in
def check_for_arbitrage(best_yes_polymarket, best_no_polymarket, best_yes_predictit, best_no_predictit):
  # Check for buying yes polymarket, buying no predictit
  if best_yes_polymarket + best_no_predictit < 1:
    # Shares of both sides you bought (goes to $1000)
    shares_purchased = 1000*(1/(best_yes_polymarket + best_no_predictit))
    polymarket_investment = shares_purchased * best_yes_polymarket
    predictit_investment = shares_purchased * best_no_predictit

    # Now two outcomes. The yes wins, or the no wins

    # If no_predictit wins:
    # ***Remove 10% predictit cap gains fee***
    no_profit = 0.99*shares_purchased - 0.1*(0.99-best_no_predictit)*shares_purchased - 1000

    # If yes_polymarket wins:
    yes_profit = 0.01*shares_purchased + shares_purchased - 1000

    if yes_profit > 0 and no_profit > 0:
      return yes_profit, no_profit, shares_purchased
    else:
      return None, None, None


  elif best_yes_predictit + best_no_polymarket < 1:
    shares_purchased = 1000*(1/(best_yes_predictit + best_no_polymarket))
    polymarket_investment = shares_purchased * best_yes_predictit
    predictit_investment = shares_purchased * best_no_polymarket

    # If no_polymarket wins:
    no_profit =  0.01*shares_purchased + shares_purchased - 1000
    # If yes_predictit wins:
    yes_profit = 0.99*shares_purchased - 0.1*(0.99-best_yes_predictit)*shares_purchased - 1000

    if yes_profit > 0 and no_profit > 0:
      return yes_profit, no_profit, shares_purchased
    else:
      return None, None, None

  else:
    return None, None, None




# Running it across the entirety of both sites

If we get a guarenteed 4% return on investment but the market settles tomorrow, the implied yearly return is much greater than 4% thus we should act on this. However, if the market settles in 3 years, this is not worth the investment. The script thus prints out the implied yearly return of the investment for a fair comparison

In [None]:
import requests
import json

def get_full_list(markets_df, df_encodings):

  response = requests.get('https://www.predictit.org/api/marketdata/all/')

  # Check if the request was successful
  if response.status_code == 200:
    # Parse the JSON content
    data = json.loads(response.content)

    # Iterate through each market
    for market in data['markets']:

      # Iterate through each contract in the market
      for contract in market['contracts']:
        # Will ___ win the ___
        if market['name'].lower() != contract['name'].lower():
          try:
            q = "Will " + contract['name'] + " " + market['name'][market['name'].lower().index("win"):]
          except:
            # These are markets that don't have "win" in the name and don't have
            continue
        else:
          q = contract['name']
        question, days_till_end, best_yes_polymarket, best_no_polymarket = get_polymarket_values(q, markets_df, df_encodings)


        if question != None:

          best_yes_polymarket = float(best_yes_polymarket)
          best_no_polymarket = float(best_no_polymarket)

          best_yes_predictit = contract['bestBuyYesCost']
          best_no_predictit = contract['bestBuyNoCost']

          if best_yes_predictit == None or best_no_predictit == None:
            continue

          yes_profit, no_profit, shares_purchased = check_for_arbitrage(best_yes_polymarket, best_no_polymarket, best_yes_predictit, best_no_predictit)

          if yes_profit != None and no_profit != None:
            # We have found arbitrage
            print(f"Found arbitrage in: {q}")
            print(f"  Best YES Polymarket: ${best_yes_polymarket}")
            print(f"  Best NO  Polymarket: ${best_no_polymarket}")
            print(f"  Best YES Predictit:  ${best_yes_predictit}")
            print(f"  Best NO  Predictit:  ${best_no_predictit}")
            print(f"  Shares   Purchased:   {shares_purchased}")
            try:
              days_till_end = float(days_till_end)
              print(f"  YES Profit (given $1000): ${yes_profit:.2f}. Implied yearly return: {((1+yes_profit/1000)**(365.25/days_till_end-1)-1):.2%}")
              print(f"  NO  Profit (given $1000): ${no_profit:.2f}. Implied yearly return: {((1+no_profit/1000)**(365.25/days_till_end-1)-1):.2%}")
              if best_yes_polymarket > best_yes_predictit:
                # Then bought yes on predictit (buy low)
                print(f"    YES Profit % after 5% withdrawl fee: {((1+(yes_profit-50)/1000)**(365.25/days_till_end-1)-1):.2%}")
                print(f"    NO  Profit % after 5% withdrawl fee: {((1+(no_profit)/1000)**(365.25/days_till_end-1)-1):.2%}")
              else:
                # Then bought yes on polymarket (buy low)
                print(f"    YES Profit % after 5% withdrawl fee: {((1+(yes_profit)/1000)**(365.25/days_till_end-1)-1):.2%}")
                print(f"    NO  Profit % after 5% withdrawl fee: {((1+(no_profit-50)/1000)**(365.25/days_till_end-1)-1):.2%}")
            except:
              print(f"  YES Profit (given $1000): ${yes_profit:.2f}.")
              print(f"  NO Profit (given $1000): ${no_profit:.2f}.")
            print(f" ")

  else:
    print(f"Failed to retrieve data: Status code {response.status_code}")

In [None]:
get_full_list(markets, full_df_encodings)

Found arbitrage in: Will Donald Trump win the 2024 US presidential election?
  Best YES Polymarket: $0.607
  Best NO  Polymarket: $0.394
  Best YES Predictit:  $0.54
  Best NO  Predictit:  $0.47
  Shares   Purchased:   1070.663811563169
  YES Profit (given $1000): $11.78. Implied yearly return: 187.89%
  NO  Profit (given $1000): $81.37. Implied yearly return: 116936.63%
    YES Profit % after 5% withdrawl fee: -97.04%
    NO  Profit % after 5% withdrawl fee: 116936.63%
 
Found arbitrage in: Will Kamala Harris win the 2024 US presidential election?
  Best YES Polymarket: $0.395
  Best NO  Polymarket: $0.606
  Best YES Predictit:  $0.52
  Best NO  Predictit:  $0.5
  Shares   Purchased:   1117.31843575419
  YES Profit (given $1000): $128.49. Implied yearly return: 5511227.17%
  NO  Profit (given $1000): $51.40. Implied yearly return: 9142.62%
    YES Profit % after 5% withdrawl fee: 5511227.17%
    NO  Profit % after 5% withdrawl fee: 13.43%
 
