<a href="https://colab.research.google.com/github/elishaaquino/KickstarterAnalysis/blob/master/Data_Collection_and_Cleaning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Data Collection and Cleaning**



In [0]:
import json
import time

import pandas as pd
import requests
from bs4 import BeautifulSoup

# `pandas.io.json.json_normalize` has been deprecated since pandas 1.0 in
# favour of the top-level `pandas.json_normalize`, and the old import path
# fails on pandas 2.x. Prefer the new location and fall back to the legacy one
# only on installations that predate it, so the name `json_normalize` used
# further down keeps working either way.
try:
  from pandas import json_normalize
except ImportError:
  from pandas.io.json import json_normalize

In [0]:
# Mount Google Drive inside the Colab VM at /content/drive.
# This cell needs the `google.colab` package, so it only runs on Colab.
from google.colab import drive
drive.mount('/content/drive')

In [0]:
# Load combined_csv.csv into `df_projects`, the frame cleaned by the cells below.
# NOTE(review): the path sits directly under /content, not under the
# /content/drive mount point -- confirm the file is uploaded/copied there first.
df_projects = pd.read_csv("/content/combined_csv.csv")


In [0]:
# Columns of the raw export that no later cell in this notebook reads.
# ('static_usd_rate' was listed twice in the original list; it appears once here.)
unused_project_columns = [
  'country_displayable_name', 'creator', 'currency_trailing_code',
  'disable_communication', 'friends', 'is_backing',
  'permissions', 'slug', 'static_usd_rate',
  'unread_messages_count', 'unseen_activity_count',
  'usd_type', 'converted_pledged_amount', 'currency_symbol',
  'fx_rate', 'current_currency', 'id', 'is_starrable',
  'is_starred', 'photo', 'profile', 'urls',
  'state_changed_at',
]

# Rebind instead of mutating in place so the result is an explicit new frame.
df_projects = df_projects.drop(columns=unused_project_columns)

In [0]:
import re

# Regexes tried in order against each `source_url`; the first one that matches
# supplies the parent category through its single capture group.
#   1. text between the prefix and a trailing "%20&%20video"
#   2. text between the prefix and the last "/" (the group is greedy)
#   3. everything after the prefix
CATEGORY_URL_PATTERNS = (
  'https://www.kickstarter.com/discover/categories/(.*)%20&%20video',
  'https://www.kickstarter.com/discover/categories/(.*)/',
  'https://www.kickstarter.com/discover/categories/(.*)',
)


def extract_parent_category(link):
  """Return the parent-category slug found in a discover URL.

  Args:
    link: value taken from the `source_url` column.

  Returns:
    The first capture group of the first pattern in CATEGORY_URL_PATTERNS
    that matches, or "NA" when `link` is not a string (e.g. a missing value)
    or none of the patterns match. "NA" is the same placeholder this notebook
    uses for missing locations.
  """
  # re.search raises TypeError on non-string input such as NaN.
  if not isinstance(link, str):
    return "NA"
  for pattern in CATEGORY_URL_PATTERNS:
    result = re.search(pattern, link)
    if result is not None:
      return result.group(1)
  # Previously this case crashed with AttributeError on `None.group(1)`.
  return "NA"


categories = [extract_parent_category(link) for link in df_projects["source_url"]]

# The URL is only needed to derive the category, so discard it afterwards.
df_projects = df_projects.drop(columns=['source_url'])

In [0]:
# Wrap the extracted parent categories in a one-column frame and attach it to
# the projects by matching row index on both sides.
df_categories = pd.DataFrame(data={'Parent Category': categories})
df_projects = df_projects.merge(df_categories, left_index=True, right_index=True)

In [0]:
# Each `category` cell holds a JSON object as text; keep only its "name" field.
project_categories = [
  json.loads(raw_category)["name"] for raw_category in df_projects["category"]
]

In [0]:
# Attach the child category names by row index, then discard the raw JSON
# `category` column they were parsed from.
df_child_categories = pd.DataFrame(data={'Category Name': project_categories})
df_projects = df_projects.merge(df_child_categories, left_index=True, right_index=True)
df_projects = df_projects.drop(columns=["category"])

In [0]:
import numpy as np

# Pull the "name" and "type" fields out of each JSON-encoded `location` value in
# a single pass, so every JSON string is parsed once. Missing locations are
# recorded with the placeholder "NA".
location_names = []
location_types = []
for location in df_projects["location"]:
  # pd.isna recognises any missing-value marker; the former `is np.nan`
  # identity test only matched the np.nan singleton and would have sent any
  # other NaN object on to json.loads (TypeError).
  if pd.isna(location):
    location_names.append("NA")
    location_types.append("NA")
  else:
    parsed_location = json.loads(location)
    location_names.append(parsed_location["name"])
    location_types.append(parsed_location["type"])

# Attach both derived columns by row index and discard the raw JSON column.
df_locations = pd.DataFrame({'Location': location_names, 'Location Type': location_types})
df_projects = pd.merge(df_projects, df_locations, right_index=True, left_index=True)
df_projects = df_projects.drop(columns=["location"])

In [0]:
# Campaign length in days. Both columns are still numeric at this point (they
# are later fed to datetime.fromtimestamp, i.e. treated as epoch seconds).
seconds_per_day = 86400
live_duration = df_projects["deadline"] - df_projects["launched_at"]
df_projects["Amount of Time Live"] = live_duration / seconds_per_day

In [0]:
import datetime

def convert_time(time):
  """Format a Unix timestamp as a 'YYYY-MM-DD HH:MM:SS' string in UTC.

  The timestamp is rendered in UTC rather than the kernel's local timezone,
  so the same input yields the same string on every machine.

  Args:
    time: seconds since the Unix epoch (int or float). The parameter name
      shadows the imported `time` module inside this function only; it is
      kept unchanged so existing callers are unaffected.

  Returns:
    The formatted date-time string.
  """
  moment = datetime.datetime.fromtimestamp(time, tz=datetime.timezone.utc)
  return moment.strftime('%Y-%m-%d %H:%M:%S')
  
# Replace the three timestamp columns with their formatted string form.
for timestamp_column in ("deadline", "created_at", "launched_at"):
  df_projects[timestamp_column] = df_projects[timestamp_column].apply(convert_time)

In [0]:
# Display the cleaned projects frame for a visual check.
df_projects

Unnamed: 0,backers_count,blurb,country,created_at,currency,deadline,goal,launched_at,name,pledged,spotlight,staff_pick,state,usd_pledged,Parent Category,Category Name,Location,Location Type,Amount of Time Live
0,0,A play performed at the FCO Global Summit on t...,GB,2014-04-24 22:47:23,GBP,2014-06-21 11:00:00,1500,2014-05-25 22:51:35,HIDDEN: The FCO Plays,0.0,False,False,failed,0.000000,theater,Plays,London,Town,26.505845
1,1,Peko Chan inspired stickers based on your favo...,US,2019-05-20 20:02:56,USD,2019-07-05 22:44:03,300,2019-05-21 22:44:03,Peko Chan Stickers,1.0,False,False,failed,1.000000,art,Digital Art,Irvine,Town,45.000000
2,105,"Adventure Guild: the social, mobile RPG. Creat...",US,2015-11-11 00:32:41,USD,2016-01-02 00:00:00,15000,2015-12-01 12:07:56,Adventure Guild,15220.0,True,False,successful,15220.000000,games,Mobile Games,Rochester,Town,31.494491
3,16,A beautiful painting of one of the most vivid ...,US,2018-11-03 10:50:51,USD,2019-01-01 04:59:00,5121,2018-11-08 22:19:44,Digital Painting of the Prophet Ezekiel’s Visi...,370.0,False,False,canceled,370.000000,art,Digital Art,Atlanta,Town,53.277269
4,2,Recorrer el planeta a través del buceo para en...,ES,2017-09-07 10:07:34,EUR,2017-10-22 14:24:36,5000,2017-09-07 14:24:36,Buceo y reportajes,2.0,False,False,failed,2.382070,journalism,Journalism,España,Town,45.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3734,2,I am a student at the University of Nebraska l...,US,2015-01-23 05:44:37,USD,2015-03-21 04:44:00,15000,2015-01-23 06:27:04,Studying and Journaling,37.0,False,False,failed,37.000000,journalism,Print,Montpellier,Town,56.928426
3735,15,The Nixie Experiment-1: A steel watch with cus...,US,2017-11-19 23:59:16,USD,2018-01-17 15:28:42,9000,2017-12-18 15:28:42,NXI Watch - Nixie Tube Watch with Custom Dials,9142.0,True,False,successful,9142.000000,technology,Wearables,Palm Beach,Town,30.000000
3736,1,Utah news publications often go unchecked when...,US,2016-10-18 18:37:14,USD,2016-11-27 20:15:35,500,2016-10-18 19:15:35,Independent News - Utah - Journalism,25.0,False,False,failed,25.000000,journalism,Web,Salt Lake City,Town,40.041667
3737,1,Rich Hlywka's first Zine. A tell all story abo...,CA,2014-02-09 02:53:47,CAD,2014-03-13 02:10:41,100,2014-02-11 03:10:41,A Zine: A pound past Nowhere An inch away from...,20.0,False,False,failed,18.127699,publishing,Poetry,Edmonton,Town,29.958333


In [0]:
# Write the cleaned frame to pastProjects.csv in the current working directory.
# No `index=` argument is passed, so pandas' default applies and the row index
# is written as the first column.
df_projects.to_csv("pastProjects.csv")

### Scraping for live projects

In [0]:
# Scrape the "newest" discover listing page by page and collect the JSON blob
# that each project card carries in its `data-project` attribute.
LAST_PAGE = 200              # inclusive; pages 0..200 are requested
REQUEST_TIMEOUT = 30         # seconds; without a timeout a stalled connection hangs forever
PAUSE_BETWEEN_PAGES = 0.5    # seconds to wait between consecutive requests

info = []
for page in range(LAST_PAGE + 1):
  response = requests.get(
    "https://www.kickstarter.com/discover/advanced?sort=newest&seed=2639808&page=%d" % page,
    timeout=REQUEST_TIMEOUT)

  if response.ok:
    soup = BeautifulSoup(response.content, "html.parser")
    for card in soup.find_all("div", {"class" :"js-react-proj-card"}):
      payload = card.attrs.get("data-project")
      # A card without the attribute would make json.loads(None) raise TypeError.
      if payload is not None:
        info.append(json.loads(payload))
  else:
    # Report error responses instead of silently parsing an error page.
    print("Skipping page %d: HTTP %d" % (page, response.status_code))

  time.sleep(PAUSE_BETWEEN_PAGES)

In [0]:
# Flatten the list of scraped project dicts into a table. Nested keys become
# dotted column names (e.g. "category.name", which a later cell renames).
df_table = json_normalize(info)

In [0]:
# Save the normalized scrape, before any column cleaning, to
# notCleanLiveProjects.csv in the current working directory.
df_table.to_csv("notCleanLiveProjects.csv")

In [0]:
# Top-level fields of the scraped projects that the cleaned output omits.
live_columns_to_drop = [
  'slug', 'disable_communication', 'currency_symbol',
  'currency_trailing_code', 'is_starrable', 'static_usd_rate',
  'fx_rate', 'current_currency', 'is_liked', 'is_disliked',
  'usd_type', 'id', 'friends', 'is_starred', 'is_backing',
  'permissions', 'country_displayable_name', 'state_changed_at',
  'converted_pledged_amount',
]

# Normalize the scraped records again into a fresh frame and strip those fields.
df_kickstarters = json_normalize(info).drop(columns=live_columns_to_drop)


In [0]:
# Give the four nested fields worth keeping their final names first, so the
# prefix-based pruning below no longer matches them.
kept_nested_columns = {
  "category.parent_name": "Parent Category",
  "category.name": "Category Name",
  "location.name": "Location",
  "location.type": "Location Type",
}
df_kickstarters = df_kickstarters.rename(columns=kept_nested_columns)

# Every remaining column whose name starts with one of these prefixes is dropped.
nested_prefixes = ('urls.', 'profile.', 'photo.', 'location.', 'creator.', 'category.')

is_unwanted = df_kickstarters.columns.str.startswith(nested_prefixes[0])
for nested_prefix in nested_prefixes[1:]:
  is_unwanted = is_unwanted | df_kickstarters.columns.str.startswith(nested_prefix)

unwanted = df_kickstarters.columns[is_unwanted]

df_kickstarters = df_kickstarters.drop(columns=unwanted)

In [0]:
# Campaign length in days. Both columns are still numeric at this point (they
# are later passed to convert_time).
seconds_per_day = 86400
live_duration = df_kickstarters["deadline"] - df_kickstarters["launched_at"]
df_kickstarters["Amount of Time Live"] = live_duration / seconds_per_day

In [0]:
# Replace the three timestamp columns with their formatted string form, using
# the convert_time helper defined earlier in the notebook.
for timestamp_column in ("deadline", "created_at", "launched_at"):
  df_kickstarters[timestamp_column] = df_kickstarters[timestamp_column].apply(convert_time)

In [0]:
# Display the cleaned live-projects frame for a visual check.
df_kickstarters

Unnamed: 0,name,blurb,goal,pledged,state,country,currency,deadline,created_at,launched_at,staff_pick,backers_count,usd_pledged,spotlight,percent_funded,Location,Location Type,Category Name,Parent Category,Amount of Time Live
0,Love Earth,We will be creating a Multi-media Art Installa...,1000.0,0.0,live,US,USD,2020-04-03 02:00:00,2020-03-09 20:56:11,2020-03-11 03:19:37,False,0,0.0,False,0.000000,Jacksonville,Town,Public Art,Art,22.944711
1,Phoenix Forged Dice,"A Dice company that specializes in handmade, c...",500.0,0.0,live,US,USD,2020-04-21 02:00:00,2020-03-10 04:53:49,2020-03-11 03:19:21,False,0,0.0,False,0.000000,Orem,Town,Tabletop Games,Games,40.944896
2,Words N Words,Words N Words is a game that you can play agai...,30000.0,0.0,live,US,USD,2020-04-10 03:03:15,2020-03-08 20:01:47,2020-03-11 03:03:15,False,0,0.0,False,0.000000,Birmingham,Town,Mobile Games,Games,30.000000
3,Segundo Encuentro Nacional de Maromeros,Realización del Segundo Encuentro Nacional de ...,89999.0,0.0,live,MX,MXN,2020-05-04 02:37:04,2020-02-25 21:56:40,2020-03-11 02:37:04,False,0,0.0,False,0.000000,Santa Maria Tlahuitoltepec,Town,Public Art,Art,54.000000
4,Monkey & The Bee takes on Athleisure & Sportsw...,Monkey & The Bee is a graphic t-shirt line tha...,9750.0,301.0,live,US,USD,2020-04-10 02:15:19,2020-03-07 20:35:38,2020-03-11 02:15:19,False,3,301.0,False,3.087179,Dallas,Town,Apparel,Fashion,30.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1711,Empire of the Stars - 4X Board Game,Empire of the Stars is a fast 4X board game of...,35000.0,26870.0,live,US,USD,2020-03-14 00:00:00,2019-12-19 21:46:40,2020-02-20 13:00:01,False,354,26870.0,False,76.771429,Beech Grove,Town,Tabletop Games,Games,22.458322
1712,A New Day For Cray On The Bay,"A rappin', rhymin' children's book about a cra...",8100.0,6241.0,live,US,USD,2020-03-21 11:51:09,2019-09-23 16:51:45,2020-02-20 12:51:09,False,81,6241.0,False,77.049383,Mashpee,Town,Children's Books,Publishing,29.958333
1713,Feynman Cove - The Most Versatile Dive Watch,Feynman Cove - The Most Versatile Dive-watch w...,50000.0,74284.0,live,SG,SGD,2020-03-11 11:50:31,2020-02-10 10:19:17,2020-02-20 12:50:31,False,75,53382.88994444,False,148.568000,Singapore,Town,Accessories,Fashion,19.958333
1714,Timor Heritage Field,A Swiss made watch inspired by the British Mil...,80000.0,114264.0,live,GB,GBP,2020-03-21 11:00:11,2019-08-02 10:38:32,2020-02-20 12:00:11,False,167,148566.138498,False,142.830000,Newcastle upon Tyne,Town,Product Design,Design,29.958333


In [0]:
# Write the cleaned live-projects frame to liveProjects.csv in the current
# working directory. No `index=` argument is passed, so pandas' default applies
# and the row index is written as the first column.
df_kickstarters.to_csv("liveProjects.csv")