In [1]:
import json, urllib
import pandas as pd
import numpy as np
import gc # For managing garbage collector

In [66]:
# Building blocks of the API calls
BASE_ENDPOINT = 'http://archive.org/wayback/available?url='
TIMESTAMP_ARG = '&timestamp='

# Any CSV formatted as per our usual project standards can go in here.
def CSV_PROCESS(csvlocation):
    URL_LIST = df_URLs = pd.read_csv(csvlocation, low_memory = False, usecols=["NCESSCH", "URL"]) 
    URL_LIST = URL_LIST['URL'].tolist()
    return URL_LIST

# This function takes a list of URLs, checks if each one is on the Wayback Machine, and if it is, then a constructed URL will be outputted.
# If the optional timestamp parameter is entered, it will check if a snapshot from that date is available. 
# Timestamps must be formatted as YYYYMMDD strings, not integers, otherwise it'll throw a SyntaxError.
# INPUT: Shallow Python list of string URLs
# OUTPUT: Shallow Python list of 
def URL_PROCESS(urls, timestamp = ""):
    result = []
    for url in urls:
        if timestamp:
            endpoint = BASE_ENDPOINT + url + TIMESTAMP_ARG + timestamp
        else:
            endpoint = BASE_ENDPOINT + url
        
        response = json.load(urllib.request.urlopen(endpoint))
        found = len(response['archived_snapshots']) > 0
        
        if found:
            result.append(response['archived_snapshots']['closest']['url']) 
            
        if not found:
            result.append(0)
            
    return result

## Testing on a small sample of valid URLS

In [68]:
# Load data
gc.disable()
charterdf = pd.read_pickle('../../web_scraping/data/charters_valid_urls_2015.pkl')
gc.enable()

print(charterdf.shape)

(10862, 702)


In [69]:
charterdf.dropna(subset = ["URL"], how = "all", inplace = True) #dropping nan values for URL column
charterdf.head()

Unnamed: 0,NCESSCH,URL,LAT1516,LON1516,AM,AS,BL,HI,HP,TR,...,ESS_VALID_STR,PROG_VALID_COUNT,PROG_VALID_STR,RIT_VALID_COUNT,RIT_VALID_STR,INQUIRY_RATIO,DISCIPLINE_RATIO,ESS_VALID_RATIO,PROG_VALID_RATIO,RIT_VALID_RATIO
0,10019700000.0,http://www.maef.net/,,,,,,,,,...,-3.049363,18.0,-2.997944,51.0,-2.545622,0.007424,0.000446,0.000893,0.001005,0.002847
1,20000100000.0,https://education.alaska.gov/DOE_Rolodex/Schoo...,60.796131,-161.765194,167.0,0.0,0.0,0.0,0.0,0.0,...,,0.0,,0.0,,,,,,
2,20015000000.0,https://www.kgbsd.org/ketchikancharter,55.347001,-131.641191,74.0,37.0,2.0,5.0,4.0,5.0,...,-6.0,1.0,-3.123525,0.0,-6.0,0.003762,0.000752,0.0,0.000752,0.0
3,20015000000.0,http://www.tongassschool.org/,55.347001,-131.641191,57.0,12.0,4.0,6.0,1.0,11.0,...,-3.673297,5.0,-3.752448,31.0,-2.96001,0.009768,0.000177,0.000212,0.000177,0.001096
4,20018000000.0,https://aquarian.asdk12.org/,61.192407,-149.916872,10.0,11.0,6.0,19.0,2.0,51.0,...,-6.0,0.0,-6.0,0.0,-6.0,0.0,0.0,0.0,0.0,0.0


In [50]:
sample_data = charterdf.iloc[0:10][["NCESSCH", "URL"]] #taking a small subset of 10 items

In [47]:
sample_data

Unnamed: 0,NCESSCH,URL
0,10019700000.0,http://www.maef.net/
1,20000100000.0,http://web.archive.org/web/20181128123424/http...
2,20015000000.0,0
3,20015000000.0,0
4,20018000000.0,0
5,20018000000.0,http://web.archive.org/web/20181128123424/http...
8,20018000000.0,0
10,20018000000.0,0
11,20018000000.0,0
12,20018000000.0,http://web.archive.org/web/20181128123424/http...


In [46]:
wayback_url = URL_PROCESS(sample_data["URL"]) #creating a list of valid wayback machine urls
wayback_url

['http://web.archive.org/web/20180412015059/http://www.maef.net:80/',
 'http://web.archive.org/web/20170606173900/http://www.kgbsd.org/ketchikancharter',
 'http://web.archive.org/web/20180421031400/http://tongassschool.org:80/',
 'http://web.archive.org/web/20180408075758/http://aquarian.asdk12.org:80/',
 'http://web.archive.org/web/20180806044913/http://winterberrycharterschool.com/',
 'http://web.archive.org/web/20180822074451/http://www.frontiercs.org/',
 'http://web.archive.org/web/20180327115410/http://highlandacademy.asdk12.org:80/']

In [51]:
wayback_url = URL_PROCESS(sample_data["URL"]) #creating a list of valid wayback machine urls
wayback_url

['http://web.archive.org/web/20180412015059/http://www.maef.net:80/',
 'http://web.archive.org/web/20181128123424/http://0.0.0.0/',
 'http://web.archive.org/web/20181128123424/http://0.0.0.0/',
 'http://web.archive.org/web/20181128123424/http://0.0.0.0/',
 'http://web.archive.org/web/20181128123424/http://0.0.0.0/',
 'http://web.archive.org/web/20181128123424/http://0.0.0.0/',
 'http://web.archive.org/web/20181128123424/http://0.0.0.0/']

In [67]:
#df = pd.DataFrame(np.array(my_list).reshape(3,3), columns = list("abc"))
pd.set_option('display.max_colwidth', -1)

wayback_df = pd.DataFrame(wayback_url, columns = ["URL"])
wayback_df["NCESSCH"] = ["1.001970e+10", "2.001500e+10", "2.001500e+10",
                        "2.001800e+10", "2.001800e+10", "2.001800e+10", "2.001800e+10"]
wayback_df = wayback_df[["NCESSCH", "URL"]]
wayback_df

Unnamed: 0,NCESSCH,URL
0,10019700000.0,http://web.archive.org/web/20180412015059/http://www.maef.net:80/
1,20015000000.0,http://web.archive.org/web/20170606173900/http://www.kgbsd.org/ketchikancharter
2,20015000000.0,http://web.archive.org/web/20180421031400/http://tongassschool.org:80/
3,20018000000.0,http://web.archive.org/web/20180408075758/http://aquarian.asdk12.org:80/
4,20018000000.0,http://web.archive.org/web/20180806044913/http://winterberrycharterschool.com/
5,20018000000.0,http://web.archive.org/web/20180822074451/http://www.frontiercs.org/
6,20018000000.0,http://web.archive.org/web/20180327115410/http://highlandacademy.asdk12.org:80/


## Testing CSV with same format

In [60]:
wayback.head()

Unnamed: 0,NCESSCH,URL,LAT1516,LON1516,AM,AS,BL,HI,HP,TR,...,ESS_VALID_STR,PROG_VALID_COUNT,PROG_VALID_STR,RIT_VALID_COUNT,RIT_VALID_STR,INQUIRY_RATIO,DISCIPLINE_RATIO,ESS_VALID_RATIO,PROG_VALID_RATIO,RIT_VALID_RATIO
1,20000100000.0,https://education.alaska.gov/DOE_Rolodex/Schoo...,60.796131,-161.765194,167.0,0.0,0.0,0.0,0.0,0.0,...,,0.0,,0.0,,,,,,
2,20015000000.0,https://www.kgbsd.org/ketchikancharter,55.347001,-131.641191,74.0,37.0,2.0,5.0,4.0,5.0,...,-6.0,1.0,-3.123525,0.0,-6.0,0.003762,0.000752,0.0,0.000752,0.0
3,20015000000.0,http://www.tongassschool.org/,55.347001,-131.641191,57.0,12.0,4.0,6.0,1.0,11.0,...,-3.673297,5.0,-3.752448,31.0,-2.96001,0.009768,0.000177,0.000212,0.000177,0.001096
4,20018000000.0,https://aquarian.asdk12.org/,61.192407,-149.916872,10.0,11.0,6.0,19.0,2.0,51.0,...,-6.0,0.0,-6.0,0.0,-6.0,0.0,0.0,0.0,0.0,0.0
5,20018000000.0,https://education.alaska.gov/DOE_Rolodex/Schoo...,61.1981,-149.876,30.0,16.0,18.0,43.0,18.0,69.0,...,,0.0,,0.0,,,,,,


In [70]:
wayback = charterdf.iloc[1:20]
wayback['URL'] = URL_PROCESS(wayback["URL"])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [71]:
wayback

Unnamed: 0,NCESSCH,URL,LAT1516,LON1516,AM,AS,BL,HI,HP,TR,...,ESS_VALID_STR,PROG_VALID_COUNT,PROG_VALID_STR,RIT_VALID_COUNT,RIT_VALID_STR,INQUIRY_RATIO,DISCIPLINE_RATIO,ESS_VALID_RATIO,PROG_VALID_RATIO,RIT_VALID_RATIO
1,20000100000.0,0,60.796131,-161.765194,167.0,0.0,0.0,0.0,0.0,0.0,...,,0.0,,0.0,,,,,,
2,20015000000.0,http://web.archive.org/web/20170606173900/http...,55.347001,-131.641191,74.0,37.0,2.0,5.0,4.0,5.0,...,-6.0,1.0,-3.123525,0.0,-6.0,0.003762,0.000752,0.0,0.000752,0.0
3,20015000000.0,http://web.archive.org/web/20180421031400/http...,55.347001,-131.641191,57.0,12.0,4.0,6.0,1.0,11.0,...,-3.673297,5.0,-3.752448,31.0,-2.96001,0.009768,0.000177,0.000212,0.000177,0.001096
4,20018000000.0,http://web.archive.org/web/20180408075758/http...,61.192407,-149.916872,10.0,11.0,6.0,19.0,2.0,51.0,...,-6.0,0.0,-6.0,0.0,-6.0,0.0,0.0,0.0,0.0,0.0
5,20018000000.0,0,61.1981,-149.876,30.0,16.0,18.0,43.0,18.0,69.0,...,,0.0,,0.0,,,,,,
8,20018000000.0,http://web.archive.org/web/20180806044913/http...,61.19445,-149.791641,15.0,8.0,5.0,14.0,1.0,26.0,...,-3.492173,10.0,-3.190948,34.0,-2.659441,0.009728,0.000193,0.000322,0.000644,0.002191
10,20018000000.0,http://web.archive.org/web/20180822074451/http...,61.194914,-149.891223,27.0,8.0,1.0,16.0,4.0,20.0,...,-3.89137,2.0,-3.590229,6.0,-3.113052,0.007451,0.0,0.000128,0.000257,0.000771
11,20018000000.0,http://web.archive.org/web/20180327115410/http...,61.193958,-149.775759,8.0,9.0,10.0,13.0,7.0,23.0,...,-6.0,0.0,-6.0,0.0,-6.0,0.0,0.0,0.0,0.0,0.0
12,20018000000.0,0,61.173154,-149.895616,8.0,9.0,1.0,23.0,3.0,48.0,...,,0.0,,0.0,,,,,,
13,20018000000.0,http://web.archive.org/web/20180302202615/http...,61.21766,-149.810421,176.0,1.0,2.0,20.0,3.0,109.0,...,-6.0,0.0,-6.0,0.0,-6.0,0.0,0.0,0.0,0.0,0.0


## Output

In [68]:
wayback_df.to_csv('../../scrapy-cluster/kafka-monitor/wayback_test.csv', index=False)

In [72]:
#saving subset with similar format as 2016 CSV file
wayback.to_csv('../../scrapy-cluster/kafka-monitor/wayback_format.csv', index=False)

In [None]:
# I need this helper function for my manual conversion

In [None]:
def converter():
    