## Data source:
https://xkcd.com/json.html

In [1]:
import os
import re
import time

import pandas as pd

## This section will download a dataframe of all the xkcd comics from 0-newest

In [2]:
# Optional smoke test: fetch a single known comic (#614) to confirm that the
# xkcd JSON endpoint is reachable and to preview which fields it returns.
# This cell doesn't need to be run.
sampleUrl = "https://xkcd.com/614/info.0.json"
test = pd.read_json(sampleUrl, typ="series")

# One-row frame whose columns are the JSON field names of that comic.
xkcdDf = pd.DataFrame(columns=test.index)
xkcdDf.loc[0] = test
xkcdDf

Unnamed: 0,month,num,link,year,news,safe_title,transcript,alt,img,title,day
0,7,614,,2009,,Woodpecker,[[A man with a beret and a woman are standing ...,If you don't have an extension cord I can get ...,https://imgs.xkcd.com/comics/woodpecker.png,Woodpecker,24


In [3]:
%%time
#pulls the newest comic
newestXkcd = pd.read_json("https://xkcd.com/info.0.json" , typ = "series")
xkcdDf = pd.DataFrame(columns = newestXkcd.index)
#gets the number of the newest comic so we can work backwards from there
newestNumber = newestXkcd["num"]

errorIssues = []
for issue in range(newestNumber):
    try:
        currentLink = "https://xkcd.com/{num}/info.0.json".format(num = issue)
        xkcdDf.loc[issue] = pd.read_json(currentLink , typ = "series")
    except:
        errorIssues += [issue]
        
    print("\r Currently on Issue#: {0}".format(issue), end='')
    time.sleep(.1)

print("")    
xkcdDf

 Currently on Issue#: 2441Wall time: 13min 6s


Unnamed: 0,month,num,link,year,news,safe_title,transcript,alt,img,title,day
1,1,1,,2006,,Barrel - Part 1,[[A boy sits in a barrel which is floating in ...,Don't we all.,https://imgs.xkcd.com/comics/barrel_cropped_(1...,Barrel - Part 1,1
2,1,2,,2006,,Petit Trees (sketch),[[Two trees are growing on opposite sides of a...,"'Petit' being a reference to Le Petit Prince, ...",https://imgs.xkcd.com/comics/tree_cropped_(1).jpg,Petit Trees (sketch),1
3,1,3,,2006,,Island (sketch),"[[A sketch of an Island]]\n{{Alt:Hello, island}}","Hello, island",https://imgs.xkcd.com/comics/island_color.jpg,Island (sketch),1
4,1,4,,2006,,Landscape (sketch),[[A sketch of a landscape with sun on the hori...,There's a river flowing through the ocean,https://imgs.xkcd.com/comics/landscape_cropped...,Landscape (sketch),1
5,1,5,,2006,,Blown apart,[[A black number 70 sees a red package.]]\n70:...,Blown into prime factors,https://imgs.xkcd.com/comics/blownapart_color.jpg,Blown apart,1
...,...,...,...,...,...,...,...,...,...,...,...
2437,3,2437,,2021,,Post-Vaccine Party,,"[Future update] Well, someone accidentally dro...",https://imgs.xkcd.com/comics/post_vaccine_part...,Post-Vaccine Party,15
2438,3,2438,,2021,,Siri,,Alexa defeated her in a battle hinging on the ...,https://imgs.xkcd.com/comics/siri.png,Siri,17
2439,3,2439,,2021,,Solar System Cartogram,,"For sentimental reasons, every active Mars rov...",https://imgs.xkcd.com/comics/solar_system_cart...,Solar System Cartogram,19
2440,3,2440,,2021,,Epistemic Uncertainty,,"Luckily, unlike in our previous study, we have...",https://imgs.xkcd.com/comics/epistemic_uncerta...,Epistemic Uncertainty,22


In [4]:
# Comics that didn't download properly: heading, the list itself, then a blank line
missingReport = ["Missing comic #'s", str(errorIssues), ""]
print("\n".join(missingReport))

Missing comic #'s
[0, 404]



Notice there is no comic #0 or #404; #404 is most likely a joke on the HTTP 404 "Not Found" error.  
I thought there would be a #0 since xkcd has tons of coding-related humor, but I guess not.  
Comic #1965 also hit a strange error that had never happened before.  
After investigating, the same JSON link worked perfectly fine when opened manually. Strange.

In [5]:
# number of comics in df (including errors)
# Count = newest comic number minus the failed downloads,
# +1 because of them not starting the index at 0.
totalComics = newestNumber - len(errorIssues) + 1
print('Newest comic number', newestNumber)
print('Total Comics: ', totalComics)

Newest comic number 2442
Total Comics:  2441


## Now making a system that UPDATES the csv rather than starting fresh every time

In [6]:
#finding the newest dataframe

def existingTableNumbers(fileNames):
    """Return the table numbers found in a list of file names.

    Only names of the exact form ``xkcd_data_<digits>.csv`` are considered;
    anything else (checkpoint folders, stray files, ...) is ignored instead of
    raising. The digits may have any length, so numbers above 999 are parsed
    in full.

    Parameters
    ----------
    fileNames : iterable of str
        Directory entries, e.g. the result of ``os.listdir('data')``.

    Returns
    -------
    list of int
        The parsed numbers, in the order the matching names were given.
    """
    tablePattern = re.compile(r'xkcd_data_(\d+)\.csv')
    matches = (tablePattern.fullmatch(name) for name in fileNames)
    return [int(match.group(1)) for match in matches if match]


# Make sure the directory exists so a first run starts at table 1 instead of crashing.
os.makedirs('data', exist_ok=True)
datas = os.listdir('data')
print("files in data directory:", datas,"", sep = '\n')

tableNumbers = existingTableNumbers(datas)
if len(tableNumbers) == 0:
    # no saved table yet (directory empty or holding only unrelated files)
    currentNum = 1
else:
    print('Table nums that exist', tableNumbers)
    currentNum = max(tableNumbers) + 1
print('num for the next table: ', currentNum)

files in data directory:
['xkcd_data_001.csv', 'xkcd_data_002.csv', 'xkcd_data_003.csv', 'xkcd_data_004.csv']

Table nums that exist [1, 2, 3, 4]
num for the next table:  5


### ONLY run this if the data directory is empty or you need to save the data downloaded in the section above

In [7]:
# Save the downloaded frame as the next numbered table: data/xkcd_data_NNN.csv,
# with the number zero-padded to three digits. The row index is not written.
nextFileName = 'xkcd_data_{:03d}.csv'.format(currentNum)
outfp = os.path.join('data', nextFileName)
print('path to next file: ', outfp)
xkcdDf.to_csv(outfp, index=False)

path to next file:  data\xkcd_data_005.csv


## Now for updating

In [8]:
#do this next time when there is more data
# Load the table numbered one below currentNum (zero-padded to three digits)
# and display it.
previousFileName = 'xkcd_data_{:03d}.csv'.format(currentNum - 1)
infp = os.path.join('data', previousFileName)
pd.read_csv(infp)

Unnamed: 0,month,num,link,year,news,safe_title,transcript,alt,img,title,day
0,1,1,,2006,,Barrel - Part 1,[[A boy sits in a barrel which is floating in ...,Don't we all.,https://imgs.xkcd.com/comics/barrel_cropped_(1...,Barrel - Part 1,1
1,1,2,,2006,,Petit Trees (sketch),[[Two trees are growing on opposite sides of a...,"'Petit' being a reference to Le Petit Prince, ...",https://imgs.xkcd.com/comics/tree_cropped_(1).jpg,Petit Trees (sketch),1
2,1,3,,2006,,Island (sketch),"[[A sketch of an Island]]\n{{Alt:Hello, island}}","Hello, island",https://imgs.xkcd.com/comics/island_color.jpg,Island (sketch),1
3,1,4,,2006,,Landscape (sketch),[[A sketch of a landscape with sun on the hori...,There's a river flowing through the ocean,https://imgs.xkcd.com/comics/landscape_cropped...,Landscape (sketch),1
4,1,5,,2006,,Blown apart,[[A black number 70 sees a red package.]]\n70:...,Blown into prime factors,https://imgs.xkcd.com/comics/blownapart_color.jpg,Blown apart,1
...,...,...,...,...,...,...,...,...,...,...,...
79,3,82,,2006,,Frame,[[A stick figure stands alone in the centre of...,...,https://imgs.xkcd.com/comics/frame.jpg,Frame,29
80,3,83,,2006,,Katamari,[[A girl stands on the left. A man is sitting...,"As the King of All Cosmos remarked, 'Is it tha...",https://imgs.xkcd.com/comics/katamari.jpg,Katamari,31
81,4,84,,2006,,National Language,This happened to my friend:\n[[Men and women a...,She's pretty sharp when provoked.,https://imgs.xkcd.com/comics/national_language...,National Language,3
82,4,85,,2006,,Paths,[[Blueprint of a campus. Two buildings in the ...,"It's true, I think about this all the time.",https://imgs.xkcd.com/comics/paths.jpg,Paths,5
