# Getting Started - Collecting Data using the API


This file contains the code to extract data from ravelry.com using the API.

API documentation: https://www.ravelry.com/api

Used read-only credentials for data retrieval. I already had a ravelry account (for my knitting work), but it is free to create an account to get the API keys.

Got data for patterns sorted by popularity. I collected data for about 10,000 patterns, so it took a long time to do the final download, which is why I am keeping this in a separate file.


In [5]:
import http.client
import base64
import json
import requests
import pandas as pd

In [15]:
# load API token username and password
# credentials that are tied to my ravelry account removed prior to submission.
# They are read from environment variables so that they never have to be written into
# this file: set RAVELRY_USERNAME and RAVELRY_PASSWORD before running.
# If a variable is not set, the value falls back to an empty string.
import os

authUsername = os.environ.get('RAVELRY_USERNAME', '')
authPassword = os.environ.get('RAVELRY_PASSWORD', '')

# test function - scrape one page of 1000 patterns (page 80, sorted by number of projects) - not actually used in the final scraping, just wanted to see how it worked
def get_pattern_info_test(query, authUsername, authPassword, page=80, page_size=1000):
    """Request a single page of pattern search results, sorted by project count.

    Args:
        query: Search text sent as the ``query`` parameter (may be an empty string).
        authUsername: API username used for HTTP basic authentication.
        authPassword: API password used for HTTP basic authentication.
        page: Page number to request. Defaults to 80, the page the original
            test requested.
        page_size: Number of results requested per page. Defaults to 1000.

    Returns:
        The ``requests.Response`` object, unchecked: the caller is responsible
        for inspecting the status code and decoding the body.
    """
    res = requests.get(
        'https://api.ravelry.com/patterns/search.json',
        # Passing the values through ``params`` lets requests URL-encode them, so a
        # query containing spaces, '&' or '#' cannot corrupt the query string.
        params={'query': query, 'sort': 'projects', 'page_size': page_size, 'page': page},
        auth=requests.auth.HTTPBasicAuth(authUsername, authPassword),
        # Without a timeout a stalled connection would block forever.
        timeout=60,
    )
    return res

In [16]:
# run test function - scrape one page of patterns - not actually used in the final scraping, just wanted to see how it worked

test_res = get_pattern_info_test('', authUsername, authPassword)
test_result = json.loads(test_res.content)
# one row per pattern returned by the search
patterns_test_df = pd.DataFrame(test_result['patterns'])

patterns_test_df

Unnamed: 0,free,id,name,permalink,personal_attributes,first_photo,designer,pattern_author,pattern_sources
0,False,470186,Unraveling the Mystery,unraveling-the-mystery,,"{'id': 38062368, 'sort_order': 1, 'user_id': 1...","{'crochet_pattern_count': 0, 'favorites_count'...","{'crochet_pattern_count': 0, 'favorites_count'...","[{'amazon_rating': None, 'amazon_reviews': Non..."
1,True,470281,Cute and Easy Cat Hat,cute-and-easy-cat-hat,,"{'id': 37994073, 'sort_order': 1, 'user_id': 3...","{'crochet_pattern_count': 85, 'favorites_count...","{'crochet_pattern_count': 85, 'favorites_count...","[{'amazon_rating': None, 'amazon_reviews': Non..."
2,True,471196,Tristan's Triangles,tristans-triangles,,"{'id': 43498361, 'sort_order': 1, 'user_id': 3...","{'crochet_pattern_count': 0, 'favorites_count'...","{'crochet_pattern_count': 0, 'favorites_count'...","[{'amazon_rating': None, 'amazon_reviews': Non..."
3,True,471546,8-Hour Shawl,8-hour-shawl-2,,"{'id': 38135293, 'sort_order': 1, 'user_id': 4...","{'crochet_pattern_count': 95, 'favorites_count...","{'crochet_pattern_count': 95, 'favorites_count...","[{'amazon_rating': None, 'amazon_reviews': Non..."
4,False,473633,Maylee,maylee,,"{'id': 39596645, 'sort_order': 1, 'user_id': 7...","{'crochet_pattern_count': 0, 'favorites_count'...","{'crochet_pattern_count': 0, 'favorites_count'...","[{'amazon_rating': None, 'amazon_reviews': Non..."
...,...,...,...,...,...,...,...,...,...
995,True,28592,Bonnie Blanket,bonnie-blanket,,"{'id': 15201313, 'sort_order': 1, 'user_id': 1...","{'crochet_pattern_count': 21, 'favorites_count...","{'crochet_pattern_count': 21, 'favorites_count...","[{'amazon_rating': None, 'amazon_reviews': Non..."
996,False,29095,Lavender Hat & Scarf,lavender-hat--scarf,,"{'id': 3216013, 'sort_order': 1, 'user_id': 38...","{'crochet_pattern_count': 0, 'favorites_count'...","{'crochet_pattern_count': 0, 'favorites_count'...","[{'amazon_rating': None, 'amazon_reviews': Non..."
997,False,29479,Forest jacket,forest-jacket,,"{'id': 742896, 'sort_order': 1, 'user_id': 217...","{'crochet_pattern_count': 13, 'favorites_count...","{'crochet_pattern_count': 13, 'favorites_count...","[{'amazon_rating': None, 'amazon_reviews': Non..."
998,False,29598,"Mittens, Mittens, Mittens (AC-001)",mittens-mittens-mittens-ac-001,,"{'id': 70739190, 'sort_order': 1, 'user_id': 4...","{'crochet_pattern_count': 0, 'favorites_count'...","{'crochet_pattern_count': 0, 'favorites_count'...","[{'amazon_rating': None, 'amazon_reviews': Non..."


In [17]:
# function to scrape the first 10 pages (1000 patterns per page) of patterns sorted by popularity, based on what worked during testing

def get_pattern_info(authUsername, authPassword, pages=10, page_size=1000):
    """Collect pattern ids from the pattern search endpoint, sorted by popularity.

    Pages 1 through ``pages`` are requested one after another.

    Args:
        authUsername: API username used for HTTP basic authentication.
        authPassword: API password used for HTTP basic authentication.
        pages: Number of result pages to request. Defaults to 10.
        page_size: Number of results requested per page. Defaults to 1000.

    Returns:
        A list of lists: one inner list per requested page, holding the ``id``
        value of every pattern on that page (an empty inner list if a page
        has no patterns).

    Raises:
        requests.HTTPError: If the API answers with a 4xx/5xx status
            (for example, when the credentials are rejected).
    """
    pattern_ids = []
    for page in range(1, pages + 1):
        res = requests.get(
            'https://api.ravelry.com/patterns/search.json',
            params={'sort': 'popularity', 'page_size': page_size, 'page': page},
            # A (username, password) tuple is requests' shorthand for HTTP basic auth.
            auth=(authUsername, authPassword),
            # Without a timeout a stalled connection would block forever.
            timeout=60,
        )
        # Fail loudly on an error response instead of hitting a confusing
        # KeyError / JSON decoding error further down.
        res.raise_for_status()
        # Read the ids straight from the decoded JSON; no DataFrame is needed
        # just to pull out one field, and an empty page simply yields [].
        pattern_ids.append([pattern['id'] for pattern in res.json()['patterns']])
    return pattern_ids

In [18]:
# fetch the pattern ids: the result is a list of lists of integer ids (one inner list per page of results)

pattern_ids = get_pattern_info(authUsername=authUsername, authPassword=authPassword)

In [19]:
# collapse the per-page lists into one flat list, converting every integer id to a string on the way

ids_list = [id_text for page_ids in pattern_ids for id_text in map(str, page_ids)]

print(ids_list)

['1258283', '990044', '754478', '857493', '709323', '443533', '418518', '1279445', '1039033', '899479', '1039035', '130787', '1267291', '927223', '1159708', '766246', '819716', '317565', '1091238', '426231', '124400', '1168670', '840489', '754847', '799982', '632425', '625513', '1359184', '894237', '710742', '315418', '1214447', '1341105', '788421', '891114', '1062698', '967629', '832035', '1284321', '616533', '338883', '211562', '580119', '870739', '894972', '1259894', '1187153', '1334019', '959471', '410814', '1033093', '492166', '273024', '1127084', '971032', '908945', '957863', '1287102', '1233471', '3156', '528611', '493402', '1321260', '464893', '1224946', '1312345', '927031', '588220', '672877', '672420', '1316764', '106061', '1164269', '631393', '1014247', '1063022', '324885', '1301509', '437525', '1331457', '1130250', '111517', '894713', '1038036', '301227', '987217', '605', '1039034', '731318', '1272189', '924707', '1320708', '866404', '910492', '1246049', '299131', '518078',

In [20]:
# next step is to retrieve details for an individual pattern id.

# trial run: request the details of a single, hard-coded pattern id to check that the call works
test1 = requests.get(
    'https://api.ravelry.com/patterns.json?ids={}'.format(990044),
    auth=requests.auth.HTTPBasicAuth(authUsername, authPassword),
)
result1 = json.loads(test1.content)
# transpose so that the pattern id becomes the row label and each field a column
pattern_detail_test_df = pd.DataFrame(result1['patterns']).transpose()

pattern_detail_test_df

Unnamed: 0,comments_count,craft,created_at,currency,currency_symbol,difficulty_average,difficulty_count,download_location,downloadable,favorites_count,...,unlisted_product_ids,updated_at,url,volumes_in_library,yardage,yardage_description,yardage_max,yarn_list_type,yarn_weight,yarn_weight_description
990044,80,"{'id': 2, 'name': 'Knitting', 'permalink': 'kn...",2019/12/23 10:25:36 -0500,GBP,£,2.407254,2178,"{'type': 'ravelry', 'free': False, 'url': 'htt...",True,42072,...,,2023/07/03 03:55:16 -0400,https://ysolda.com/products/musselburgh,,130,130 - 610 yards,610,2,"{'crochet_gauge': None, 'id': 15, 'knit_gauge'...",Any gauge - designed for any gauge


In [None]:
# create the final scraping function that pulls in pattern details for every pattern in the top 10,000 by popularity.
# at first I tried to request the ids in batches of 50, 25, or 10, but those requests were timing out
# I re-wrote the function to loop through all the ids one at a time and that worked better

def get_pattern_details(authUsername, authPassword, pattern_ids):
    """Download the details of each pattern id, one request per id, into one DataFrame.

    Args:
        authUsername: API username used for HTTP basic authentication.
        authPassword: API password used for HTTP basic authentication.
        pattern_ids: Iterable of pattern ids (strings or integers); each one is
            sent as the ``ids`` query parameter of its own request.

    Returns:
        A DataFrame with the rows of all per-request frames stacked together.
        The original row labels are moved into a ``patt_id`` column. When
        ``pattern_ids`` is empty, an empty DataFrame with only the ``patt_id``
        column is returned.

    Raises:
        requests.HTTPError: If the API answers any request with a 4xx/5xx status.
    """
    # Materialise once so that generators are supported and emptiness can be checked.
    pattern_ids = list(pattern_ids)
    if not pattern_ids:
        # pd.concat raises ValueError on an empty list, so handle "nothing to do" here.
        return pd.DataFrame(columns=['patt_id'])

    pattern_details_dataframes = []

    # A single Session reuses the underlying connection instead of opening a new
    # one for every id, which matters when looping over thousands of ids.
    with requests.Session() as session:
        session.auth = (authUsername, authPassword)
        for pattern_id in pattern_ids:
            response = session.get(
                'https://api.ravelry.com/patterns.json',
                params={'ids': pattern_id},
                # Without a timeout a stalled connection would block the whole run.
                timeout=60,
            )
            # Stop on an error response rather than failing later with a KeyError.
            response.raise_for_status()
            result = response.json()
            # result['patterns'] appears to be keyed by pattern id (TODO confirm);
            # transposing puts those keys on the row axis, one column per field.
            details_df = pd.DataFrame(result['patterns']).T
            pattern_details_dataframes.append(details_df)

    pattern_details_df = pd.concat(pattern_details_dataframes).reset_index().rename(columns={'index': 'patt_id'})
    return pattern_details_df

In [None]:
# download the details for every collected id and write them to pattern_details.csv.
# This takes a bit to run, so I am going to do my data cleaning & analysis in another file.

pattern_details = get_pattern_details(
    authUsername=authUsername,
    authPassword=authPassword,
    pattern_ids=ids_list,
)
pattern_details.to_csv(path_or_buf='pattern_details.csv', index=False)