## Setting up the primary URL data frame
- Import the DAS inventory and the Little Forest (LF) list of subdomains.
- Create a new data frame that lists the domain and subdomains.
- Engineer new features from request/response data.
- Export new dataframe with new features.

In [2]:
import pandas as pd
import numpy as np
import requests

# pd.options.display.max_rows = 4000

## Notebook Functions

In [3]:
# This function parses a URL into a dictionary that describes the "main" domain name (the top two levels, e.g. harvard.edu), the "pre-domain", and each subdomain.
# It is used to expand the `data` dataframe (i.e. Little Forest's domain list) with new features.

def parse_path(url):
    """Break a dotted host name into its primary domain, pre-domain and subdomains.

    Parameters
    ----------
    url : str
        Dot-separated host name, e.g. "abcs.mgh.harvard.edu".

    Returns
    -------
    dict
        "name"       - the string that was passed in,
        "domain"     - the last two labels joined with "." (e.g. "harvard.edu"),
        "predomain"  - every label left of the domain joined with "."
                       (empty string when there is none),
        "subdomainN" - one entry per pre-domain label, numbered from the label
                       closest to the domain (subdomain1) outwards.
    """
    labels = url.split(".")
    leading_labels = labels[:-2]

    parsed = {
        "name": url,
        "domain": ".".join(labels[-2:]),
        "predomain": ".".join(leading_labels),
    }

    # Walk the pre-domain labels right-to-left so that subdomain1 is the
    # label sitting immediately next to the primary domain.
    for level, label in enumerate(reversed(leading_labels), start=1):
        parsed[f"subdomain{level}"] = label

    return parsed
    

In [4]:
# Making a request to a url to gather information about the url's status.

def find_url_status_via_request(url, timeout=4.0):
    '''
    Request ``http://<url>`` and summarise the response as a flat dictionary.

    Parameters
    ----------
    url : str
        Host name without a scheme (e.g. "accessibility.harvard.edu").
        "http://" is prepended before the request is made.
    timeout : float, optional
        Value passed to ``requests.request`` as ``timeout`` (default 4.0).

    Returns
    -------
    dict
        Built by ``return_status``. Keys:
        name: the URL passed to the function,
        res_status: status code of the final response, -1 if there was no response,
        status_message: one of "HARVARD_KEY_REDIRECT", "LOGIN_REDIRECT",
            "HTTPS_REDIRECT", "URL_REDIRECT", "VALID_REDIRECT", "404",
            "CLIENT_ERROR", "SERVER_ERROR", "VALID", or "empty" if the request failed,
        redirect: 1 if the request was redirected, 0 otherwise,
        redirect_code: status code of the first redirect hop, 0 if not redirected,
        public: 1 if the site is considered public, 0 otherwise,
        login: 1 if a login page was detected, 0 otherwise,
        harvard_key: 1 if a Harvard Key redirect was detected, 0 otherwise,
        resolved_url: final URL of the response, -1 if there was no response,
        success: -1 if the request raised, 0 if unsuccessful/uncertain, 1 if successful,
        assess: "KEEP", "CHECK" or "REMOVE",
        note: human-readable details about the request/response.

    Notes
    -----
    The function does not raise: any exception from the request is caught and
    reported as a "REMOVE" row with the exception text in ``note``.
    '''
    req_url = "http://" + url

    # Defaults describe "no response received"; they are returned unchanged
    # (apart from assess/note) when the request raises.
    res_status = -1        # status code of the response, -1 if none
    status_message = "empty"
    rd = 0                 # 1 if redirected
    rd_code = 0            # status code of the first redirect hop
    public = 0             # 1 if the site is public
    login = 0              # 1 if a login page is detected
    hk = 0                 # 1 if Harvard Key is detected
    res_url = -1           # final URL, location of the response
    success = -1           # 1 success, 0 unsuccessful/uncertain, -1 error
    assess = "CHECK"       # "KEEP", "CHECK" or "REMOVE"
    note = "empty"

    try:
        res = requests.request("GET", req_url, timeout=timeout)
        final_url = res.url
        res_url = final_url
        res_status = res.status_code

        if res.history:
            # NOTE(review): redirected responses are classified by their final
            # URL only; the final status code is not inspected here, so a
            # redirect ending in a 4xx/5xx still lands in one of these
            # branches - confirm this is intended.
            rd = 1
            rd_code = res.history[0].status_code
            print(f'Redirect -----  {rd_code}')

            if ("ezp-prod1" in final_url) or ("harvard.edu/login" in final_url) or (".pin1." in final_url):
                status_message = "HARVARD_KEY_REDIRECT"
                hk = 1
                login = 1
                success = 1
                assess = "KEEP"
                note = (f'HARVARD KEY: {req_url} Not public. REDIRECT to {res.url}')
            elif ("login" in final_url.lower()) or ("adfs" in final_url) or ("logon" in final_url.lower()):
                status_message = "LOGIN_REDIRECT"
                login = 1
                success = 0
                assess = "CHECK"
                note = (f'FIREWALL/LOGIN: requested {req_url} and redirected to {res.url}. Potential login page. Status: {res.status_code}')
            elif ("https" in final_url) and (url in final_url):
                # Same host, upgraded to https.
                status_message = "HTTPS_REDIRECT"
                public = 1
                success = 1
                assess = "KEEP"
                note = (f'HTTP Check: requested {req_url} and redirected to {res.url} https server. Status: {res.status_code}')
            elif url not in final_url:
                # Landed on a different host than the one requested.
                status_message = "URL_REDIRECT"
                public = 1
                success = 0
                assess = "CHECK"
                note = (f'CHECK: requested {req_url} and redirected to {res.url}. Status: {res.status_code}')
            else:
                status_message = "VALID_REDIRECT"
                public = 1
                success = 1
                assess = "KEEP"
                note = (f'VALID: {req_url}: {res.status_code}')
        elif res.status_code == 404:
            status_message = "404"
            public = 1
            success = 0
            assess = "CHECK"
            note = (f'404: {req_url}: {res.status_code}')
        elif 400 <= res.status_code < 500:
            status_message = "CLIENT_ERROR"
            public = 1
            success = 0
            assess = "CHECK"
            note = (f'CLIENT ERROR: {req_url} returned a {res.status_code} error. ')
        elif 500 <= res.status_code < 600:
            status_message = "SERVER_ERROR"
            public = 1
            success = 0
            assess = "REMOVE"
            note = (f'SERVER ERROR: {req_url} returned a {res.status_code} error. ')
        else:
            status_message = "VALID"
            public = 1
            success = 1
            assess = "KEEP"
            note = (f'VALID: {req_url}: {res.status_code}')
    except Exception as err:
        # Deliberately broad: connection errors, timeouts and anything else are
        # all recorded as a row to REMOVE instead of aborting a long batch run.
        # The exception is formatted directly; requests' HTTPError has no
        # `.code` attribute, so reading it would itself raise AttributeError.
        note = (f'ERROR: {req_url} : {err}')
        assess = "REMOVE"

    return return_status(url, res_status, status_message, rd, rd_code, public, login, hk, res_url, success, assess, note)
    
def return_status(url, status, mssg, rd, rd_code, public, fw, hk, res_url, sc, assess, note):
    '''
    Package the request results as one row (dictionary) for the data frame.
    Used inside the find_url_status_via_request function.

    Output key <- argument:
        name           <- url      (original domain)
        res_status     <- status   (response status code)
        status_message <- mssg     (short label for the outcome)
        redirect       <- rd       (redirected: 1 true, 0 false)
        redirect_code  <- rd_code  (e.g. 301, 302...)
        public         <- public   (1 true, 0 false)
        login          <- fw       (login detected: 1 true, 0 false)
        harvard_key    <- hk       (Harvard Key detected: 1 true, 0 false)
        resolved_url   <- res_url  (final url)
        success        <- sc       (success = 1, uncertain = 0, fail = -1)
        assess         <- assess
        note           <- note
    '''
    column_names = (
        "name",
        "res_status",
        "status_message",
        "redirect",
        "redirect_code",
        "public",
        "login",
        "harvard_key",
        "resolved_url",
        "success",
        "assess",
        "note",
    )
    column_values = (url, status, mssg, rd, rd_code, public, fw, hk, res_url, sc, assess, note)
    return dict(zip(column_names, column_values))

In [5]:
# A safety check before merging and exporting new request data with the existing request data.
# Check that the columns match before merging and exporting the data.

def check_merge_export_df(rd_df, prd_df):
    """Append a batch of request results to the existing results and export them.

    Parameters
    ----------
    rd_df : pandas.DataFrame
        Existing request data.
    prd_df : pandas.DataFrame
        Newly collected (partial) request data to append.

    Behaviour
    ---------
    The two frames must have exactly the same column labels in the same order.
    If they do, the rows of ``prd_df`` are appended to ``rd_df`` with a fresh
    0..n-1 index and the result is written to "domain_requests.csv" and
    "domain_reqs.xls" in the current working directory, replacing any existing
    files. Otherwise nothing is written and a message is printed.

    Returns None. The caller's ``rd_df`` is not modified; the combined frame
    only exists in the exported files.
    """
    # Every column must match. Comparing plain lists is order-sensitive and
    # also copes with frames that have a different number of columns, where an
    # elementwise Index comparison would raise.
    if list(rd_df.columns) != list(prd_df.columns):
        print("Columns do not match. Check partial_request_data.")
        return

    print("Adding new request data to request_data data frame...")
    combined = pd.concat([rd_df, prd_df], ignore_index=True)
    print("Resetting the index...")
    # reset_index returns a new frame, so the result has to be kept.
    combined = combined.reset_index(drop=True)
    print("Exporting to domain_requests.csv, replacing old csv...")
    combined.to_csv("domain_requests.csv")
    print("Exporting to domain_reqs.xls, replacing old xls...")
    combined.to_excel("domain_reqs.xls")
    print("Done.")

In [6]:
# Read in the Little Forest (LF) survey of domains.
# The file path is relative to the directory the notebook is run from.
data = pd.read_csv("data/LF_Survey.csv")

In [7]:
data.head()

Unnamed: 0,name,http status code,title,Unnamed: 3,Unnamed: 4,200,555
0,abcs.mgh.harvard.edu,200.0,ABCs - MICCAI 2020 Challenge,,,301.0,69.0
1,abel.harvard.edu,200.0,Harvard Mathematics Department : Home page,,,302.0,30.0
2,about.my.harvard.edu,200.0,Service Portal - IT Help,,,303.0,1.0
3,ac-web.dce.harvard.edu,200.0,AC-WEB: Academic Computing,,,307.0,4.0
4,academicresourcecenter.harvard.edu,200.0,Academic Resource Center,,,400.0,2.0


In [8]:
# Remove the columns that are not needed downstream.
# Reassigning (instead of inplace=True) together with errors="ignore" keeps
# this cell safe to re-run: a second run is a no-op rather than a KeyError.
data = data.drop(columns=['Unnamed: 3', 'Unnamed: 4', '200', '555'], errors="ignore")

In [9]:
data.head()

Unnamed: 0,name,http status code,title
0,abcs.mgh.harvard.edu,200.0,ABCs - MICCAI 2020 Challenge
1,abel.harvard.edu,200.0,Harvard Mathematics Department : Home page
2,about.my.harvard.edu,200.0,Service Portal - IT Help
3,ac-web.dce.harvard.edu,200.0,AC-WEB: Academic Computing
4,academicresourcecenter.harvard.edu,200.0,Academic Resource Center


In [10]:
# Add a "url" column holding, for each name, the list of its dot-separated
# labels (e.g. abcs.mgh.harvard.edu -> [abcs, mgh, harvard, edu]).
data["url"] = data["name"].str.split(pat='.')


## Parse URLs into discrete subdomains

In [11]:
# Test the parse_path function on a single name (the row at position 10)
# and display the resulting dictionary.
test_url = data.iloc[10]["name"]
parse_path(test_url)

{'name': 'agupubs-onlinelibrary-wiley-com.ezp-prod1.hul.harvard.edu',
 'domain': 'harvard.edu',
 'predomain': 'agupubs-onlinelibrary-wiley-com.ezp-prod1.hul',
 'subdomain1': 'hul',
 'subdomain2': 'ezp-prod1',
 'subdomain3': 'agupubs-onlinelibrary-wiley-com'}

In [12]:
# Create a new DataFrame with the subdomains parsed out.
# parse_path returns one dictionary per row; result_type='expand' turns the
# dictionary keys into columns. Names with fewer subdomain levels have no
# value in the higher-numbered subdomainN columns.
new_data = data.apply(lambda x: parse_path(x["name"]), axis=1, result_type='expand')

In [13]:
new_data.head()

Unnamed: 0,name,domain,predomain,subdomain1,subdomain2,subdomain3,subdomain4,subdomain5,subdomain6
0,abcs.mgh.harvard.edu,harvard.edu,abcs.mgh,mgh,abcs,,,,
1,abel.harvard.edu,harvard.edu,abel,abel,,,,,
2,about.my.harvard.edu,harvard.edu,about.my,my,about,,,,
3,ac-web.dce.harvard.edu,harvard.edu,ac-web.dce,dce,ac-web,,,,
4,academicresourcecenter.harvard.edu,harvard.edu,academicresourcecenter,academicresourcecenter,,,,,


In [14]:
# Merge the LF survey data and the parsed subdomain dataframe on the "name" column.
# NOTE(review): this assumes every name is unique in both frames; a repeated
# name would produce extra rows in the merge - TODO confirm (e.g. with
# data["name"].is_unique) before relying on the row count.
merged_data = data.merge(new_data, on="name")

In [15]:
merged_data.head()

Unnamed: 0,name,http status code,title,url,domain,predomain,subdomain1,subdomain2,subdomain3,subdomain4,subdomain5,subdomain6
0,abcs.mgh.harvard.edu,200.0,ABCs - MICCAI 2020 Challenge,"[abcs, mgh, harvard, edu]",harvard.edu,abcs.mgh,mgh,abcs,,,,
1,abel.harvard.edu,200.0,Harvard Mathematics Department : Home page,"[abel, harvard, edu]",harvard.edu,abel,abel,,,,,
2,about.my.harvard.edu,200.0,Service Portal - IT Help,"[about, my, harvard, edu]",harvard.edu,about.my,my,about,,,,
3,ac-web.dce.harvard.edu,200.0,AC-WEB: Academic Computing,"[ac-web, dce, harvard, edu]",harvard.edu,ac-web.dce,dce,ac-web,,,,
4,academicresourcecenter.harvard.edu,200.0,Academic Resource Center,"[academicresourcecenter, harvard, edu]",harvard.edu,academicresourcecenter,academicresourcecenter,,,,,


# Get URL Responses for new features.

This portion of the notebook makes a request to each URL in the inventory to get information on the URL's status based on the response. It can take a long time to run on all the URLs.
Since all the URLs have already been run there is no need to run this again. The results have been merged with the subdomain data and exported to the `domain_requests.csv` and `domain_reqs.xls` files. 

In [16]:
# Try the request helper on a single domain. This makes a live HTTP request,
# so it needs network access.
testURL = find_url_status_via_request("accessibility.harvard.edu")

Redirect -----  301


In [17]:
testURL

{'name': 'accessibility.harvard.edu',
 'res_status': 200,
 'status_message': 'HTTPS_REDIRECT',
 'redirect': 1,
 'redirect_code': 301,
 'public': 1,
 'login': 0,
 'harvard_key': 0,
 'resolved_url': 'https://accessibility.harvard.edu/',
 'success': 1,
 'assess': 'KEEP',
 'note': 'HTTP Check: requested http://accessibility.harvard.edu and redirected to https://accessibility.harvard.edu/ https server. Status: 200'}

<p style="border: 2px solid black; padding:15px">
The following cells are commented out because they have already been run to get the request details that were added to the dataframe. Running the requests on all of the domains takes a lot of time, so I ended up doing it in batches, which isn't clearly described here. You can see the results in <span style="font-family:monospace">domain_requests.csv</span> or the <span style="font-family:monospace">domain_name_exploration</span> notebook.
</p>

In [112]:
# Running on a chunk of the original dataframe to "batch" the requests.
# Running all 25000 URLs at once seems to run into issues.

# partial_df = merged_data[20000:].copy()

In [113]:
# partial_df.shape

(5000, 12)

In [1]:
# Create a new dataframe to hold the results of the requests made. 

# requests_partial_data = partial_df.apply(lambda x: find_url_status_via_request(x["name"]), axis=1, result_type='expand')

In [115]:
# requests_partial_data.shape

(5000, 12)

In [116]:
# Take a look at the new data

# requests_partial_data.head()

Unnamed: 0,name,res_status,status_message,redirect,redirect_code,public,login,harvard_key,resolved_url,success,assess,note
20000,w0049377.mgh.harvard.edu,-1,empty,0,0,0,0,0,-1,-1,REMOVE,ERROR: http://w0049377.mgh.harvard.edu : HTTPC...
20001,w0049418.mgh.harvard.edu,-1,empty,0,0,0,0,0,-1,-1,REMOVE,ERROR: http://w0049418.mgh.harvard.edu : HTTPC...
20002,w0049469.dfci.harvard.edu,-1,empty,0,0,0,0,0,-1,-1,REMOVE,ERROR: http://w0049469.dfci.harvard.edu : HTTP...
20003,w0049550.mgh.harvard.edu,-1,empty,0,0,0,0,0,-1,-1,REMOVE,ERROR: http://w0049550.mgh.harvard.edu : HTTPC...
20004,w0049592.mgh.harvard.edu,-1,empty,0,0,0,0,0,-1,-1,REMOVE,ERROR: http://w0049592.mgh.harvard.edu : HTTPC...


In [117]:
# Merge the request data with the existing data.

# partial_request_data = partial_df.merge(requests_partial_data, on="name")

In [118]:
# Have a look at the new dataframe
# partial_request_data.tail()

Unnamed: 0,name,http status code,title,url,domain,predomain,subdomain1,subdomain2,subdomain3,subdomain4,...,status_message,redirect,redirect_code,public,login,harvard_key,resolved_url,success,assess,note
4995,zoom.cfa.harvard.edu,,,"[zoom, cfa, harvard, edu]",harvard.edu,zoom.cfa,cfa,zoom,,,...,empty,0,0,0,0,0,-1,-1,REMOVE,ERROR: http://zoom.cfa.harvard.edu : HTTPConne...
4996,zop-v.cfa.harvard.edu,,,"[zop-v, cfa, harvard, edu]",harvard.edu,zop-v.cfa,cfa,zop-v,,,...,empty,0,0,0,0,0,-1,-1,REMOVE,ERROR: http://zop-v.cfa.harvard.edu : HTTPConn...
4997,zztestbuildingsw1.fas.harvard.edu,,,"[zztestbuildingsw1, fas, harvard, edu]",harvard.edu,zztestbuildingsw1.fas,fas,zztestbuildingsw1,,,...,empty,0,0,0,0,0,-1,-1,REMOVE,ERROR: http://zztestbuildingsw1.fas.harvard.ed...
4998,zzz36.mgh.harvard.edu,,,"[zzz36, mgh, harvard, edu]",harvard.edu,zzz36.mgh,mgh,zzz36,,,...,empty,0,0,0,0,0,-1,-1,REMOVE,ERROR: http://zzz36.mgh.harvard.edu : HTTPConn...
4999,zzz38.mgh.harvard.edu,,,"[zzz38, mgh, harvard, edu]",harvard.edu,zzz38.mgh,mgh,zzz38,,,...,empty,0,0,0,0,0,-1,-1,REMOVE,ERROR: http://zzz38.mgh.harvard.edu : HTTPConn...


## Updating the inventory list

Merging the request data from the partial data frame with the existing data from previous checks.
Since all the URLs have already been run and merged there is no need to run this again. The results have been merged and exported to the `domain_requests.csv` and `domain_reqs.xls` files. 

In [119]:
# TO RESET
# requests_data = pd.DataFrame([], columns = partial_request_data.columns)

In [120]:
# READ IN THE EXISTING DATA
# request_data = pd.read_csv("domain_requests.csv", index_col = 0)

In [18]:
# Checking the size of the dataframe. It should correspond to the segment of rows just run.
# So if rows 2000:3000 were run the dataframe should have about 2000 rows.

# request_data.shape

In [122]:
# Take a peek at the data.
# request_data.head()

Unnamed: 0,name,http status code,title,url,domain,predomain,subdomain1,subdomain2,subdomain3,subdomain4,...,status_message,redirect,redirect_code,public,login,harvard_key,resolved_url,success,assess,note
0,abcs.mgh.harvard.edu,200.0,ABCs - MICCAI 2020 Challenge,"['abcs', 'mgh', 'harvard', 'edu']",harvard.edu,abcs.mgh,mgh,abcs,,,...,HTTPS_REDIRECT,1,301,1,0,0,https://abcs.mgh.harvard.edu/,1,KEEP,HTTP Check: requested http://abcs.mgh.harvard....
1,abel.harvard.edu,200.0,Harvard Mathematics Department : Home page,"['abel', 'harvard', 'edu']",harvard.edu,abel,abel,,,,...,VALID,0,0,1,0,0,http://abel.harvard.edu/,1,KEEP,VALID: http://abel.harvard.edu: 200
2,about.my.harvard.edu,200.0,Service Portal - IT Help,"['about', 'my', 'harvard', 'edu']",harvard.edu,about.my,my,about,,,...,URL_REDIRECT,1,301,1,0,0,https://harvard.service-now.com/ithelp,0,CHECK,CHECK: requested http://about.my.harvard.edu a...
3,ac-web.dce.harvard.edu,200.0,AC-WEB: Academic Computing,"['ac-web', 'dce', 'harvard', 'edu']",harvard.edu,ac-web.dce,dce,ac-web,,,...,HTTPS_REDIRECT,1,302,1,0,0,https://ac-web.dce.harvard.edu/,1,KEEP,HTTP Check: requested http://ac-web.dce.harvar...
4,academicresourcecenter.harvard.edu,200.0,Academic Resource Center,"['academicresourcecenter', 'harvard', 'edu']",harvard.edu,academicresourcecenter,academicresourcecenter,,,,...,HTTPS_REDIRECT,1,301,1,0,0,https://academicresourcecenter.harvard.edu/,1,KEEP,HTTP Check: requested http://academicresourcec...


In [123]:
# Take a look at the end of the imported csv data.

# request_data.tail()

Unnamed: 0,name,http status code,title,url,domain,predomain,subdomain1,subdomain2,subdomain3,subdomain4,...,status_message,redirect,redirect_code,public,login,harvard_key,resolved_url,success,assess,note
12493,w0049250.mgh.harvard.edu,,,"['w0049250', 'mgh', 'harvard', 'edu']",harvard.edu,w0049250.mgh,mgh,w0049250,,,...,empty,0,0,0,0,0,-1,-1,REMOVE,ERROR: http://w0049250.mgh.harvard.edu : HTTPC...
12494,w0049280.mgh.harvard.edu,,,"['w0049280', 'mgh', 'harvard', 'edu']",harvard.edu,w0049280.mgh,mgh,w0049280,,,...,empty,0,0,0,0,0,-1,-1,REMOVE,ERROR: http://w0049280.mgh.harvard.edu : HTTPC...
12495,w0049311.mgh.harvard.edu,,,"['w0049311', 'mgh', 'harvard', 'edu']",harvard.edu,w0049311.mgh,mgh,w0049311,,,...,empty,0,0,0,0,0,-1,-1,REMOVE,ERROR: http://w0049311.mgh.harvard.edu : HTTPC...
12496,w0049368.mgh.harvard.edu,,,"['w0049368', 'mgh', 'harvard', 'edu']",harvard.edu,w0049368.mgh,mgh,w0049368,,,...,empty,0,0,0,0,0,-1,-1,REMOVE,ERROR: http://w0049368.mgh.harvard.edu : HTTPC...
12497,w0049375.mgh.harvard.edu,,,"['w0049375', 'mgh', 'harvard', 'edu']",harvard.edu,w0049375.mgh,mgh,w0049375,,,...,empty,0,0,0,0,0,-1,-1,REMOVE,ERROR: http://w0049375.mgh.harvard.edu : HTTPC...


In [124]:
# See where the data left off.
# The end of the request_data should line up with the head of the partial_request_data

# partial_request_data.tail()

Unnamed: 0,name,http status code,title,url,domain,predomain,subdomain1,subdomain2,subdomain3,subdomain4,...,status_message,redirect,redirect_code,public,login,harvard_key,resolved_url,success,assess,note
4995,zoom.cfa.harvard.edu,,,"[zoom, cfa, harvard, edu]",harvard.edu,zoom.cfa,cfa,zoom,,,...,empty,0,0,0,0,0,-1,-1,REMOVE,ERROR: http://zoom.cfa.harvard.edu : HTTPConne...
4996,zop-v.cfa.harvard.edu,,,"[zop-v, cfa, harvard, edu]",harvard.edu,zop-v.cfa,cfa,zop-v,,,...,empty,0,0,0,0,0,-1,-1,REMOVE,ERROR: http://zop-v.cfa.harvard.edu : HTTPConn...
4997,zztestbuildingsw1.fas.harvard.edu,,,"[zztestbuildingsw1, fas, harvard, edu]",harvard.edu,zztestbuildingsw1.fas,fas,zztestbuildingsw1,,,...,empty,0,0,0,0,0,-1,-1,REMOVE,ERROR: http://zztestbuildingsw1.fas.harvard.ed...
4998,zzz36.mgh.harvard.edu,,,"[zzz36, mgh, harvard, edu]",harvard.edu,zzz36.mgh,mgh,zzz36,,,...,empty,0,0,0,0,0,-1,-1,REMOVE,ERROR: http://zzz36.mgh.harvard.edu : HTTPConn...
4999,zzz38.mgh.harvard.edu,,,"[zzz38, mgh, harvard, edu]",harvard.edu,zzz38.mgh,mgh,zzz38,,,...,empty,0,0,0,0,0,-1,-1,REMOVE,ERROR: http://zzz38.mgh.harvard.edu : HTTPConn...


In [126]:
# Merge and export data

# check_merge_export_df(request_data,partial_request_data)

Adding new request data to request_data data frame...
Resetting the index...
Exporting to domain_requests.csv, replacing old csv...
Exporting to domain_requests.xls, replacing old xls...
Done.


In [127]:
# Checking the export

# request_data = pd.read_csv("domain_requests.csv", index_col = 0)

In [22]:
# The row count should have increased by the number of rows in the partial_request_data dataframe.

# request_data.shape

In [24]:
# request_data.tail()