# 9章 Kickstarterの分析、機械学習を使わないという選択肢

## 9.2 Kickstarterのクローラーを作成する

In [None]:
import urllib.request
import json
import os
import time

# Crawl the Kickstarter project search JSON endpoint for each category id and
# store every returned project as result/<project id>.json.

# Make sure the output directory for the per-project JSON files exists.
os.makedirs('result', exist_ok=True)

search_term = ''
sort_key = 'newest'
category_list = [16, 331, 332, 333, 334, 335, 336, 337, 52, 362, 338, 51, 339, 340, 341, 342] # technology category
base_query = 'https://www.kickstarter.com/projects/search.json?term={term}&category_id={category_id}&page={page_id}&sort={sort}'
# Every request is sent with a browser-like User-Agent header.
headers = {
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:96.0) Gecko/20100101 Firefox/96.0"
}

for category_id in category_list:
    project_count = 0
    # Request at most 200 result pages per category.
    for page_id in range(1, 201):
        query = base_query.format(term=search_term, category_id=category_id, page_id=page_id, sort=sort_key)
        request = urllib.request.Request(query, headers=headers)
        try:
            # The context manager closes the HTTP response even when reading
            # or decoding fails; the timeout keeps a stalled connection from
            # blocking the crawl forever.
            with urllib.request.urlopen(request, timeout=30) as response:
                response_json = json.loads(response.read().decode("utf-8"))
        except (OSError, ValueError) as error:
            # OSError covers urllib's URLError/HTTPError and socket timeouts;
            # ValueError covers JSON and UTF-8 decoding errors. Give up on
            # this category and continue with the next one.
            print(category_id, 'page', page_id, 'failed:', error)
            break

        projects = response_json.get('projects')
        if not projects:
            # An empty (or missing) page means there is nothing more to fetch
            # for this category.
            break

        project_count += len(projects)
        total_hits = response_json.get('total_hits', 0)
        # Guard the percentage against a zero or missing total_hits.
        percent = round(float(project_count) / total_hits * 100, 2) if total_hits else 0.0

        print(category_id, 'progress', project_count, '/', total_hits, percent, '%')

        for project in projects:
            filepath = 'result/{}.json'.format(project['id'])
            # "with" guarantees the file is flushed and closed on any error.
            with open(filepath, 'w') as fp:
                fp.write(json.dumps(project, sort_keys=True, indent=2))

        # Pause between page requests to avoid hammering the server.
        time.sleep(1)

## 9.3 JSONデータをCSVに変換する

In [None]:
import glob
import pandas
import pandas.io.json

# Convert the crawled per-project JSON files into a single CSV file.
project_list = []

# Use glob to find the files in the result folder and load each of them.
for filename in glob.glob("result/*.json"):
    # "with" closes each file right away instead of leaking the handle.
    with open(filename) as project_file:
        project_list.append(json.load(project_file))

# Convert the list of (nested) project dicts into a DataFrame.
# pandas.json_normalize is the public API; the pandas.io.json.json_normalize
# alias is deprecated.
df = pandas.json_normalize(project_list)

# Convert the unixtime columns whose names end with "_at" to datetime.
datetime_columns = [column for column in df.columns if column.endswith("_at")]
for column in datetime_columns:
    df[column] = pandas.to_datetime(df[column], unit='s')

# Serialize the DataFrame to a CSV-formatted str.
csv_data = df.to_csv()

# The CSV is meant to be opened in Excel on Windows, so encode it as CP932.
# Characters that CP932 cannot represent are dropped ("ignore").
csv_data = csv_data.encode("cp932", "ignore")

# Write the result.
with open("kickstarter_result.csv", "wb") as fp:
    fp.write(csv_data)