In [1]:
from bs4 import BeautifulSoup
import time, os
import requests
import pandas as pd

In [2]:
# Accumulator for the scrape: maps a project title to its list of feature values.
dictionary = dict()

In [3]:
def find_num(string):
    """Return the numeric characters of ``string`` joined in their original order.

    Only characters for which ``str.isnumeric()`` is true are kept, so
    separators such as ``,`` are dropped by the filter itself and no extra
    comma-stripping step is required.

    Args:
        string: Text to scan, e.g. ``'1,234 backers'``.

    Returns:
        str: The kept characters concatenated (``'1234'`` for the example
        above), or ``''`` when ``string`` contains no numeric character.
    """
    return ''.join(char for char in string if char.isnumeric())

In [4]:
def get_num_items(division):
    """Count how many elements iterating over ``division`` yields.

    Args:
        division: Any iterable (it is consumed if it is a one-shot iterator).

    Returns:
        int: Number of elements produced; ``0`` for an empty iterable.
    """
    return sum(1 for _ in division)

In [5]:
def get_project_loc_and_cat(soup):
    """Return the non-empty text lines of the category/location strip.

    Args:
        soup: Parsed page exposing a BeautifulSoup-style ``find``.

    Returns:
        list[str]: The strip's text split on newlines, with empty lines
        removed (whitespace-only lines are kept).

    Raises:
        AttributeError: If the strip ``div`` is not found (``find`` returned
            ``None``).
    """
    strip_div = soup.find(
        'div',
        class_='NS_projects__category_location ratio-16-9 flex items-center',
    )
    raw_lines = strip_div.text.strip('\n').split('\n')
    # filter(None, ...) discards falsy entries, i.e. the empty strings.
    return list(filter(None, raw_lines))

In [6]:
def get_project_location(soup):
    """Return the second-to-last entry of the category/location strip.

    Raises:
        IndexError: If the strip has fewer than two entries.
    """
    entries = get_project_loc_and_cat(soup)
    return entries[-2]

In [7]:
def get_project_category(soup):
    """Return the last entry of the category/location strip.

    Raises:
        IndexError: If the strip has no entries.
    """
    entries = get_project_loc_and_cat(soup)
    return entries[-1]

In [8]:
def get_project_target(soup):
    """Return the text of the ``span.money`` inside the first ``h3.mb0``.

    Args:
        soup: Parsed page exposing a BeautifulSoup-style ``find``.

    Returns:
        str: The money text exactly as it appears in the markup.

    Raises:
        AttributeError: If either element is missing.
    """
    heading = soup.find('h3', class_='mb0')
    money_span = heading.find('span', class_='money')
    return money_span.text

In [9]:
def get_project_goal(soup):
    """Return the text of the ``span.money`` inside the goal ``div``.

    The goal ``div`` is the first one matching the class string
    ``'type-12 medium navy-500'``.

    Returns:
        str: The money text exactly as it appears in the markup.

    Raises:
        AttributeError: If either element is missing.
    """
    goal_div = soup.find('div', class_='type-12 medium navy-500')
    money_span = goal_div.find('span', class_='money')
    return money_span.text

In [10]:
def get_project_funding_period(soup):
    """Return the numeric characters that follow the last ``(`` of the funding-period text.

    The text is read from the ``p.f5`` inside the
    ``div.NS_campaigns__funding_period`` element.

    Returns:
        str: Numeric characters as extracted by ``find_num``.

    Raises:
        AttributeError: If either element is missing.
    """
    period_div = soup.find('div', class_='NS_campaigns__funding_period')
    period_text = period_div.find('p', class_='f5').text
    # Keep only what follows the final '(' (the whole text when there is none).
    after_paren = period_text.rsplit('(', 1)[-1]
    return find_num(after_paren)

In [11]:
def get_project_backers(soup):
    """Return the text of the ``h3.mb0`` inside the first ``div.mb0``.

    Returns:
        str: The heading text with leading and trailing newlines removed.

    Raises:
        AttributeError: If either element is missing.
    """
    stats_div = soup.find('div', class_='mb0')
    count_heading = stats_div.find('h3', class_='mb0')
    return count_heading.text.strip('\n')

In [12]:
def get_project_num_rewards(soup):
    """Count the reward ``li`` entries inside the rewards list.

    Only ``li`` elements matching the class string
    ``'hover-group pledge--inactive pledge-selectable-sidebar'`` are counted.

    Returns:
        int: Number of matching entries.

    Raises:
        AttributeError: If the rewards list ``div`` is missing.
    """
    rewards_list = soup.find(
        'div', class_='NS_projects__rewards_list js-project-rewards'
    )
    reward_items = rewards_list.find_all(
        'li', class_='hover-group pledge--inactive pledge-selectable-sidebar'
    )
    return get_num_items(reward_items)

In [13]:
def get_pledge_backers(soup):
    """Collect the text of every ``div.pledge__backer-stats`` element.

    Returns:
        list[str]: One text per matching element, in document order; empty
        when nothing matches.
    """
    return [
        stats.text
        for stats in soup.find_all('div', class_='pledge__backer-stats')
    ]

In [14]:
def get_project_lowest_reward_backers(soup):
    """Return the numeric characters of the first pledge backer-stats text.

    NOTE(review): treating the first entry as the lowest reward presumably
    assumes the page lists rewards in ascending order -- confirm.

    Raises:
        IndexError: If the page has no backer-stats elements.
    """
    stats_texts = get_pledge_backers(soup)
    first_entry = stats_texts[0]
    return find_num(first_entry)

In [15]:
def get_project_highest_backers(soup):
    """Return the numeric characters of the last pledge backer-stats text.

    NOTE(review): treating the last entry as the highest reward presumably
    assumes the page lists rewards in ascending order -- confirm.

    Raises:
        IndexError: If the page has no backer-stats elements.
    """
    stats_texts = get_pledge_backers(soup)
    last_entry = stats_texts[-1]
    return find_num(last_entry)

In [16]:
def get_project_pwl_tag_bool(soup):
    """Return 1 when the first category/location entry is 'Project We Love', else 0.

    Raises:
        IndexError: If the category/location strip has no entries.
    """
    leading_entry = get_project_loc_and_cat(soup)[0]
    return 1 if leading_entry == 'Project We Love' else 0

In [17]:
def get_project_video_bool(soup):
    """Return 1 when the page contains a ``<video>`` element, else 0.

    Args:
        soup: Parsed page exposing a BeautifulSoup-style ``find``.

    Returns:
        int: ``1`` if ``soup.find('video')`` returns an element, ``0`` if it
        returns ``None``.
    """
    # Identity check: an element type's own __ne__ must not influence the
    # "was anything found" decision.
    return 1 if soup.find('video') is not None else 0

In [18]:
def get_project_title(soup):
    """Return the text of ``div.NS_project_profile__title``.

    Returns:
        str: The title text with leading and trailing newlines removed.

    Raises:
        AttributeError: If the title ``div`` is missing.
    """
    title_div = soup.find('div', class_='NS_project_profile__title')
    return title_div.text.strip('\n')

In [19]:
def get_project_data(soup):
    """Extract every scraped feature of one project page.

    Args:
        soup: Parsed project page.

    Returns:
        dict: A single-entry mapping ``{title: values}`` where ``values`` is a
        list holding, in order: target, category, location, Project-We-Love
        flag, backers, goal, number of rewards, lowest-pledge backers,
        highest-pledge backers, funding period, and video flag.
    """
    title = get_project_title(soup)

    # One extractor per feature; the tuple order fixes the order of the
    # returned value list.
    extractors = (
        get_project_target,
        get_project_category,
        get_project_location,
        get_project_pwl_tag_bool,
        get_project_backers,
        get_project_goal,
        get_project_num_rewards,
        get_project_lowest_reward_backers,
        get_project_highest_backers,
        get_project_funding_period,
        get_project_video_bool,
    )
    return {title: [extract(soup) for extract in extractors]}


In [20]:
# Location of the CSV holding the project URLs. The default is the path the
# notebook was written with; set the KICKSTARTER_URL_CSV environment variable
# to run the notebook from another machine or directory.
US_URL_CSV_PATH = os.environ.get(
    'KICKSTARTER_URL_CSV',
    '/Users/evelynjohnson/Desktop/METIS/Linear_Regression/Project/US_url_csv.csv',
)
US_url_df = pd.read_csv(US_URL_CSV_PATH)

In [21]:
# Preview the first rows of the URL table as loaded from the CSV.
US_url_df.head()

Unnamed: 0.1,Unnamed: 0,0
0,0,https://www.kickstarter.com/projects/blackbria...
1,1,https://www.kickstarter.com/projects/beccastor...
2,2,https://www.kickstarter.com/projects/toxiwatch...
3,3,https://www.kickstarter.com/projects/rugosekoh...
4,4,https://www.kickstarter.com/projects/animogame...


In [22]:
# Replace the auto-generated CSV headers with descriptive column names.
column_names = {'Unnamed: 0': 'index', '0': 'url'}
US_url_df.rename(column_names, axis='columns', inplace=True)

In [23]:
# Drop the redundant saved-index column. errors='ignore' keeps the cell
# idempotent: re-running it after the column is gone no longer raises KeyError.
US_url_df = US_url_df.drop(columns=['index'], errors='ignore')

In [24]:
# Preview the URL table again after the rename and column drop.
US_url_df.head()

Unnamed: 0,url
0,https://www.kickstarter.com/projects/blackbria...
1,https://www.kickstarter.com/projects/beccastor...
2,https://www.kickstarter.com/projects/toxiwatch...
3,https://www.kickstarter.com/projects/rugosekoh...
4,https://www.kickstarter.com/projects/animogame...


In [25]:
# Scrape every project page and merge its features into `dictionary`.
#
# A single bad page (network failure, HTTP error status, or markup that lacks
# an expected element) is recorded in `failed_urls` and skipped, so it cannot
# abort the whole crawl part-way through.
#
# NOTE(review): results are keyed by project title, so two projects sharing a
# title overwrite each other in `dictionary`.
REQUEST_TIMEOUT_SECONDS = 30  # without a timeout, requests.get can block forever
failed_urls = []

for url in US_url_df['url']:
    try:
        response = requests.get(url, timeout=REQUEST_TIMEOUT_SECONDS)
        # Fail fast on 4xx/5xx instead of trying to parse an error page.
        response.raise_for_status()
        page = response.text
        soup = BeautifulSoup(page, 'lxml')
        dictionary.update(get_project_data(soup))
    except (requests.RequestException, AttributeError, IndexError) as error:
        # AttributeError / IndexError are what the extractors raise when an
        # expected element is absent from the page.
        failed_urls.append((url, repr(error)))

In [26]:
# Spot-check the scraped values for one project, looked up by its title.
sample_title = 'Animo: Red Letter Day'
print(dictionary[sample_title])

['$30,638', 'Tabletop Games', 'Rochester, MI', 0, '218', '$15,000', 7, '3', '12', '20', 1]


In [27]:
# Column labels, in the same order as the per-project value lists stored in
# `dictionary`.
feature_columns = [
    'Funding',
    'Category',
    'Location',
    'Project We Love Tag',
    'Number of Backers',
    'Monetary Goal',
    'Number of Pledge-Reward Options',
    'Number of Backers for Lowest Pledge',
    'Number of Backers for Highest Pledge',
    'Length of Funding Period',
    'Video Present',
]

# One row per project title (the dictionary keys become the index).
kickstarter_data = pd.DataFrame.from_dict(
    dictionary,
    orient='index',
    columns=feature_columns,
)

In [28]:
# Persist the scraped table. DataFrame.to_csv returns None when it is given a
# path, so its result is not bound to a name.
kickstarter_data.to_csv('kickstarter_data.csv')


In [29]:
# Sanity check: (number of scraped projects, number of feature columns).
kickstarter_data.shape

(1080, 11)