In [3]:
# https://github.com/njday/Kickstarter-Scraper

# Datasets
import pandas as pd
import numpy as np

# formatting tools
import datetime as dt
from pprint import pprint
from itertools import chain
import json

# reddit crawler
import praw

# converting created dates from reddit API into human readable format
from datetime import datetime, timedelta

# make directories for data collection
import os

# copy data structure
import copy

# regular expression search PRAW results
import re

# wait time for api limits and api retry
import time
#import asyncio # Not implemented

# debugging tools
import traceback
import logging

# HTTP calls
import urllib.request as urlreq
import urllib.error as urlerr
import http.cookiejar

# Configure the root logger to emit INFO-and-above records
logging.basicConfig(level=logging.INFO)
# Create a stream (console) handler that accepts records down to DEBUG level
debug_handler = logging.StreamHandler()
debug_handler.setLevel(logging.DEBUG)
# Create a formatter: timestamp - level - message
formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')
debug_handler.setFormatter(formatter)
# Logger.addHandler() returns None, so fetch the logger first and attach the
# handler in a separate statement; otherwise my_logger would be bound to None.
my_logger = logging.getLogger('my_logger')
my_logger.addHandler(debug_handler)

# API retry helper that pauses between API calls

In [4]:
# Minimum spacing between API calls, in seconds: 60 seconds shared by 15 calls
api_restriction = 60 / 15

# Call `func`, retrying on failure, and throttle successful calls to the API rate limit.
# Defaults: retry every 10 seconds, up to 12 attempts (about 2 minutes in total).
def retry_function(func, *args, max_attempts=12, delay=10, **kwargs):
    """Call ``func(*args, **kwargs)``, retrying whenever it raises.

    After a successful call the function sleeps for whatever remains of
    ``api_restriction`` seconds (module-level constant), measured from the
    start of the call, so consecutive calls respect the rate limit.

    Args:
        func: Callable to invoke.
        *args: Positional arguments forwarded to ``func``.
        max_attempts: Maximum number of times ``func`` is called.
        delay: Seconds to wait between failed attempts; may be an int or a
            float. Values <= 0 mean "retry immediately".
        **kwargs: Keyword arguments forwarded to ``func``.

    Returns:
        The value returned by ``func``, or ``None`` when every attempt raised.
    """
    attempts = 0
    while attempts < max_attempts:
        # Monotonic clock: elapsed time is not affected by system clock changes.
        start_time = time.monotonic()
        try:
            result = func(*args, **kwargs)  # Call the function
        except Exception as e:
            print(f"An error occurred: {e}")
            print(f"Function: {func}")
            print(f"Args: {args}")
            traceback.print_exc()
            attempts += 1
            if attempts < max_attempts:
                print(f"Retrying attempt #{attempts} in {delay} seconds...")
                # A single sleep accepts fractional delays; range(delay) would
                # raise TypeError for a float and escape the retry loop.
                if delay > 0:
                    time.sleep(delay)
        else:
            # The throttle lives outside the try block so a problem while
            # waiting can never trigger a second call of an already-successful func.
            api_wait_time = api_restriction - (time.monotonic() - start_time)
            if api_wait_time > 0:
                time.sleep(api_wait_time)
            return result  # Return the result if successful
    print("Max attempts reached. Continuing loop.")
    return None  # Callers must handle None (all attempts failed)


# Import Kickstarter datasets from WebRobots
[WebRobots.io](https://webrobots.io/kickstarter-datasets/) scrapes Kickstarter projects and publishes the results as datasets.

In [5]:
# Import dependencies for handling JSON, URL requests, HTML scraping, CSV Formatting
import json
from urllib.request import urlopen
from bs4 import BeautifulSoup
import pandas
import sys
import re


# collects the video source links found on a description page
def findVideo(htmlIn):
    """Return the ``src`` of the <source> tag inside each landscape video.

    Every <video> element with class "landscape" is looked up on ``htmlIn``;
    for each one the ``src`` attribute of its first <source> tag is taken.

    Returns:
        A list of ``src`` values, one per matching video (empty when none
        match). The full list is returned; callers pick the entry they need.
    """
    videos = htmlIn.findAll("video", {"class": "landscape"})
    return [video.find("source").attrs['src'] for video in videos]


# extract the risks and challenges text from a description page
def findRisks(htmlIn):
    """Return the text of the risks-and-challenges <div> as a one-element list.

    The first <div> matching class "mb3 mb10-sm mb3 js-risks" is located and
    all of its descendant tags are handed to processChildren(). An
    AttributeError is raised when no such <div> exists.
    """
    risks_div = htmlIn.find("div", {"class": "mb3 mb10-sm mb3 js-risks"})
    return processChildren(risks_div.findChildren())


# extract the main description text from a description page
def findDescription(htmlIn):
    """Return the text of the full-description <div> as a one-element list.

    The first <div> matching class
    "full-description js-full-description responsive-media formatted-lists"
    is located and all of its descendant tags are handed to
    processChildren(). An AttributeError is raised when no such <div> exists.
    """
    description_div = htmlIn.find(
        "div",
        {"class": "full-description js-full-description responsive-media formatted-lists"},
    )
    return processChildren(description_div.findChildren())


# extract the text of the "mb3 mb10-sm mb3" section from a description page
def findAI(htmlIn):
    """Return the text of the <div class="mb3 mb10-sm mb3"> as a one-element list.

    NOTE(review): judging by the function name this is presumably the
    project's AI-disclosure section -- confirm against the live page markup.

    The first matching <div> is located and all of its descendant tags are
    handed to processChildren(). An AttributeError is raised when no such
    <div> exists.
    """
    section_div = htmlIn.find("div", {"class": "mb3 mb10-sm mb3"})
    return processChildren(section_div.findChildren())


# joins the text of every valid child tag into one string, returned inside a one-element list
def processChildren(children):
    """Concatenate the cleaned text of each valid tag in ``children``.

    A tag contributes only when checkValidTags() accepts it and its text is
    non-empty after trimming surrounding whitespace and removing
    non-breaking spaces. Each contribution is followed by a single space.

    Returns:
        ``[text]`` -- a list holding one string (``[""]`` when nothing
        contributed).
    """
    pieces = []
    for tag in children:
        # skip tag types that carry no wanted text
        if not checkValidTags(tag):
            continue
        # trim white space / line breaks, then drop non-breaking spaces
        text = tag.getText().strip().strip("\n").replace(u'\xa0', "")
        if text:
            pieces.append(text + " ")
    return ["".join(pieces)]


# decide whether an html tag should contribute text
def checkValidTags(element):
    """Return True when ``element`` should have its text collected.

    Rejected outright: <video>, <source>, <img> and <time> tags. Any other
    element is rejected when ``element.find(text=False)`` returns something
    truthy.
    NOTE(review): that lookup presumably detects nested tags, so only leaf
    elements pass -- confirm against the BeautifulSoup version in use.
    """
    if element.name in ('video', 'source', 'img', 'time'):
        return False
    if element.find(text=False):
        return False
    return True


def findUpdates(htmlIn):
    """Return the text of the timeline <div> as a one-element list.

    The first <div> with class "timeline" is located and all of its
    descendant tags are handed to processChildren(). An AttributeError is
    raised when no such <div> exists.
    """
    timeline_div = htmlIn.find("div", {"class": "timeline"})
    return processChildren(timeline_div.findChildren())


def findFAQ(htmlIn):
    """Return the text of the FAQ section <div> as a one-element list.

    The first <div> matching class "NS_projects__faqs_section js-project-faqs"
    is located and all of its descendant tags are handed to
    processChildren(). An AttributeError is raised when no such <div> exists.
    """
    faq_div = htmlIn.find("div", {"class": "NS_projects__faqs_section js-project-faqs"})
    return processChildren(faq_div.findChildren())


def scrapeData(urlClean):
    """Fetch a project's ``/description`` page and extract its sections.

    Args:
        urlClean: Project URL with no query string; "/description" is
            appended to it before the request is made.

    Returns:
        A list with four entries, always in this order: description,
        AI section, risks, video. An entry is a one-element list holding the
        extracted string on success, or a plain "could not find ..." string
        when that section could not be extracted.

    Raises:
        Whatever ``opener.open`` raises (e.g. ``urllib.error.HTTPError`` for
        a 403 response); fetch errors are deliberately not caught here so the
        caller can retry.
    """
    results = []

    appendages = ["/description", "/updates", "/faqs", "/community"]

    urlScraped = urlClean + appendages[0]

    # Create a cookie jar to store cookies
    cookie_jar = http.cookiejar.CookieJar()

    # Create an opener with support for cookies
    opener = urlreq.build_opener(urlreq.HTTPCookieProcessor(cookie_jar))

    # Headers required or site gives 403: Forbidden error
    hdr = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.3',
        'Accept-Encoding': 'none',
        'Accept-Language': 'en-US,en;q=0.8',
        'Connection': 'keep-alive'}

    # Create a request object with the URL and headers
    req = urlreq.Request(urlScraped, headers=hdr)

    # Perform the HTTP request; the context manager closes the connection
    # as soon as the body has been read.
    with opener.open(req) as response:
        html = response.read()
    soup = BeautifulSoup(html, 'lxml')

    def _strip_risks_heading(text):
        # drop the section heading that is scraped along with the risks text
        return re.sub("(Risks and challenges)*", "", text)

    # (label used in the failure message, extractor, optional clean-up step);
    # the order here fixes the order of the returned list.
    sections = (
        ("description", findDescription, None),
        ("AI section", findAI, None),
        ("risks", findRisks, _strip_risks_heading),
        ("video", findVideo, None),
    )

    for label, extractor, cleanup in sections:
        try:
            value = extractor(soup)[0]
            if cleanup is not None:
                value = cleanup(value)
            results.append([value])
        except Exception as e:
            # Best effort: a missing section must not abort the other sections.
            print(e)
            results.append(f"could not find {label}")
            print(f"could not find {label}")

    return results


# --- Disabled: /updates and /faqs scraping ---------------------------------
# Kept for reference only; it is not executed. It belongs inside scrapeData()
# (before `return results`) and uses `urlClean`, `appendages` and `results`.
#
#     # updates
#     urlScraped = urlClean + appendages[1]
#     html = urlopen(urlScraped)
#     soup = BeautifulSoup(html, 'lxml')
#
#     try:
#         result = re.sub("(Project unsuccessful)*"
#                         "(Project launched)*"
#                         "(Project canceled)*"
#                         , ""
#                         , findUpdates(soup)[0])
#         results.append([result])
#     except Exception as e:
#         print(e)
#         results.append("could not find updates")
#         print("could not find updates")
#
#     # faqs
#     urlScraped = urlClean + appendages[2]
#     html = urlopen(urlScraped)
#     soup = BeautifulSoup(html, 'lxml')
#
#     try:
#         result = re.sub("(Frequently Asked Questions)*"
#                         "(Looks like there aren't any frequently asked questions yet. Ask the project creator directly\.)*"
#                         "(Don't see the answer to your question?)*"
#                         "(Ask the project creator directly. Ask a question)*"
#                         "(Ask a question)*"
#                         , ""
#                         , findFAQ(soup)[0])
#         results.append([result])
#     except Exception as e:
#         print(e)
#         results.append("could not find faqs")
#         print("could not find faqs")


def read(csvToRead, outputDir):
    """Scrape every project listed in a CSV and save the enriched table.

    For each row the project URL is taken from the JSON in the "urls" column
    (``["web"]["project"]``), any "?ref=..." suffix is removed, and the page
    is scraped through ``retry_function(scrapeData, url)``.

    Args:
        csvToRead: Path of the input CSV; it must contain a "urls" column.
        outputDir: Existing directory; the output CSV is written there under
            the same file name as the input.

    Returns:
        The DataFrame with one extra column per scraped section. Rows whose
        scrape failed on every attempt keep missing values in those columns.
    """
    # scrapeData() returns its sections in this order. zip() below stops at the
    # shorter sequence, so "updates"/"faq" are only written if scrapeData
    # actually returns entries for them.
    columns = ("description", "ai", "risks", "video", "updates", "faq")

    datafile = pandas.read_csv(csvToRead)
    csvOutput = os.path.join(outputDir, os.path.basename(csvToRead))

    try:
        # for each row in the csv scrape data and write back to the table
        for i in range(len(datafile)):
            url = json.loads(datafile.loc[i, "urls"])["web"]["project"]
            # raw string: "\?" is not a valid escape in a normal string literal
            urlClean = re.sub(r"(\?ref=)+.*$", "", url)

            # collect data from url, pausing between web requests
            results = retry_function(scrapeData, urlClean)
            if results is None:
                # every attempt failed; leave this row empty and carry on
                print(f"Skipping row {i}: no data scraped for {urlClean}")
                continue

            for column, value in zip(columns, results):
                # successful sections arrive wrapped in a one-element list,
                # failures as a plain message string; store the bare string
                datafile.loc[i, column] = value[0] if isinstance(value, list) else value
    finally:
        # Write once, but also when the loop is interrupted, so rows scraped
        # so far are not lost.
        datafile.to_csv(csvOutput)

    return datafile

# Run the scraper over the downloaded WebRobots export; the enriched CSV is
# written under webrobots/output (created on first use).
output_dir = '/'.join(('webrobots', 'output'))
os.makedirs(output_dir, exist_ok=True)
read(
    'webrobots/raw/Kickstarter_2023-03-09T03_20_04_199Z.zip/Kickstarter.csv',
    output_dir,
)


An error occurred: HTTP Error 403: Forbidden
Function: <function scrapeData at 0x000002BB5EACC550>
Args: ('https://www.kickstarter.com/projects/1820499508/pest-deterrent-grid-with-only-5w-voltage',)
Retrying attempt #1 in 10 seconds...


Traceback (most recent call last):
  File "C:\Users\georg\AppData\Local\Temp\ipykernel_39080\2253502710.py", line 11, in retry_function
    result = func(*args, **kwargs)  # Call the function
  File "C:\Users\georg\AppData\Local\Temp\ipykernel_39080\735047004.py", line 136, in scrapeData
    response = opener.open(req)
  File "C:\Users\georg\AppData\Local\Programs\Python\Python310\lib\urllib\request.py", line 525, in open
    response = meth(req, response)
  File "C:\Users\georg\AppData\Local\Programs\Python\Python310\lib\urllib\request.py", line 634, in http_response
    response = self.parent.error(
  File "C:\Users\georg\AppData\Local\Programs\Python\Python310\lib\urllib\request.py", line 563, in error
    return self._call_chain(*args)
  File "C:\Users\georg\AppData\Local\Programs\Python\Python310\lib\urllib\request.py", line 496, in _call_chain
    result = func(*args)
  File "C:\Users\georg\AppData\Local\Programs\Python\Python310\lib\urllib\request.py", line 643, in http_error_de

KeyboardInterrupt: 

In [6]:
import urllib.request as urlreq
import urllib.error as urlerr
import http.cookiejar

# Debug cell: request one project page directly and show the raw response
# body together with any cookies the server sets.

# Create a cookie jar to store cookies
cookie_jar = http.cookiejar.CookieJar()

# Create an opener with support for cookies
opener = urlreq.build_opener(urlreq.HTTPCookieProcessor(cookie_jar))

url= 'https://www.kickstarter.com/projects/1820499508/pest-deterrent-grid-with-only-5w-voltage/description'

hdr = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11',
       'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
       'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.3',
       'Accept-Encoding': 'none',
       'Accept-Language': 'en-US,en;q=0.8',
       'Connection': 'keep-alive'}

# Create a request object with the URL and headers
req = urlreq.Request(url, headers=hdr)

try:
    # Perform the HTTP request; the context manager closes the connection
    # once the body has been read.
    with opener.open(req) as response:
        content = response.read()
    print(content)
except urlerr.HTTPError as e:
    # The error object carries the response body of the failed request
    # (here: the page returned with the 403), so print it for inspection.
    print(e.read())
finally:
    # Print the received cookies
    for cookie in cookie_jar:
        print(cookie)

b'<!DOCTYPE html><html lang="en-US"><head><title>Just a moment...</title><meta http-equiv="Content-Type" content="text/html; charset=UTF-8"><meta http-equiv="X-UA-Compatible" content="IE=Edge"><meta name="robots" content="noindex,nofollow"><meta name="viewport" content="width=device-width,initial-scale=1"><style>*{box-sizing:border-box;margin:0;padding:0}html{line-height:1.15;-webkit-text-size-adjust:100%;color:#313131}button,html{font-family:system-ui,-apple-system,BlinkMacSystemFont,Segoe UI,Roboto,Helvetica Neue,Arial,Noto Sans,sans-serif,Apple Color Emoji,Segoe UI Emoji,Segoe UI Symbol,Noto Color Emoji}@media (prefers-color-scheme:dark){body{background-color:#222;color:#d9d9d9}body a{color:#fff}body a:hover{color:#ee730a;text-decoration:underline}body .lds-ring div{border-color:#999 transparent transparent}body .font-red{color:#b20f03}body .big-button,body .pow-button{background-color:#4693ff;color:#1d1d1d}body #challenge-success-text{background-image:url(data:image/svg+xml;base64,