# Travel.State.Gov Visa Issuances

In [19]:
# Mount Google Drive at /content/drive/ (only works inside a Colab runtime).
from google.colab import drive
drive.mount('/content/drive/')

Mounted at /content/drive/


In [20]:
%cd drive/Shareddrives/Data\ Products\ Team/Products/Immigration\ Data\ Hub/DataRepo/

/content/drive/Shareddrives/Data Products Team/Products/Immigration Data Hub/DataRepo


In [10]:
!pip install PyPDF2
!pip install tabula-py

Collecting tabula-py
  Downloading tabula_py-2.3.0-py3-none-any.whl (12.0 MB)
[K     |████████████████████████████████| 12.0 MB 94 kB/s 
[?25hCollecting distro
  Downloading distro-1.6.0-py2.py3-none-any.whl (19 kB)
Installing collected packages: distro, tabula-py
Successfully installed distro-1.6.0 tabula-py-2.3.0


In [11]:
from pathlib import Path
import requests

from bs4 import BeautifulSoup
import pandas as pd
from PyPDF2 import PdfFileReader
import tabula


from urllib.parse import urljoin, urlparse, urljoin

# Use the fully qualified option name: the bare "max_rows" pattern matches more
# than one option in recent pandas releases and raises OptionError.
pd.set_option("display.max_rows", 400)

**Monthly visa issuances:**

The State Department releases monthly data on visa issuances, for both immigrant visas and nonimmigrant visas.  

The data list how many visas are issued in each visa category and for each country.

We have been tracking how slowly the government has been issuing visas during the pandemic, by visa type. 

So each month when the new data are released, 
* I download the PDF, use an online PDF to excel converter, and 
* then create a pivot table in Excel to sum up visa issuances by visa category. 
* Then I copy that into my tracking spreadsheet, and use VLOOKUP to line the data up with prior months of data. 

I have formulas in Excel to add up the various visa classes into the categories that I track. But since new visa categories keep popping up, I always have to check and fix those formulas each month. Is there a more efficient way of doing any of this?

In [12]:
# Sample July 2021 report URLs: one broken down by post, one by FSC / place of birth.
example_pdf1 = "/".join(
    [
        "https://travel.state.gov/content/dam/visas/Statistics",
        "Immigrant-Statistics/MonthlyIVIssuances",
        "JULY%202021%20-%20IV%20Issuances%20by%20Post%20and%20Visa%20Class.pdf",
    ]
)

example_pdf2 = "/".join(
    [
        "https://travel.state.gov/content/dam/visas/Statistics",
        "Immigrant-Statistics/MonthlyIVIssuances",
        "JULY%202021%20-%20IV%20Issuances%20by%20FSC%20or%20Place%20of%20Birth%20and%20Visa%20Class.pdf",
    ]
)

**Source Data Url**

In [13]:
# Page that is scanned for PDF links (monthly immigrant visa issuances).
url = (
    "https://travel.state.gov/content/travel/en/legal/visa-law0/visa-statistics/"
    "immigrant-visa-statistics/monthly-immigrant-visa-issuances.html"
)


In [21]:
def download_all_pdf_links(url, output_folder):
    """
    Download every PDF linked from a webpage into ``output_folder``.

    Each ``<a>`` tag whose ``href`` contains ".pdf" (case-insensitive) is
    fetched and saved as ``<link text>.pdf``.

    url: address of the page to scan for PDF links.
    output_folder: destination directory; created (with parents) if missing.

    Returns None; progress is reported with print().
    Raises requests.HTTPError if the page itself cannot be fetched.
    """
    from urllib.parse import unquote

    output_folder = Path(output_folder)
    output_folder.mkdir(exist_ok=True, parents=True)

    # Fetch the page and fail loudly rather than parsing an error page.
    page = requests.get(url, timeout=60)
    page.raise_for_status()
    soup = BeautifulSoup(page.text, "html.parser")

    failed = 0
    for link in soup.find_all("a", href=True):
        href = link["href"]
        if ".pdf" not in href.lower():
            continue

        # urljoin resolves absolute, root-relative and page-relative hrefs alike.
        pdf_url = urljoin(url, href)

        # The link text becomes the file name: collapse whitespace and remove
        # path separators so it cannot point outside output_folder.
        name = " ".join(link.text.split()).replace("/", "-").replace("\\", "-")
        if not name:
            # Links without text fall back to the file name found in the URL.
            name = Path(unquote(urlparse(pdf_url).path)).stem

        try:
            response = requests.get(pdf_url, timeout=60)
        except requests.RequestException as exc:
            failed += 1
            print("File ", f"{name}.pdf", f" failed: {exc}")
            continue

        if response.status_code == 200:
            outpath = output_folder / f"{name}.pdf"
            # The context manager closes the file even if the write fails.
            with open(str(outpath), "wb") as pdf:
                pdf.write(response.content)
            print("File ", f"{name}.pdf", " downloaded")
        else:
            failed += 1
            print("File ", f"{name}.pdf", " not found.")

    if failed:
        print(f"{failed} PDF file(s) could not be downloaded")
    else:
        print("All PDF files downloaded")

In [22]:
# Save every PDF linked from `url` into the local "visa_test" folder.
download_all_pdf_links(url, 'visa_test')

File  March 2017 - IV Issuances by FSC or Place of Birth and Visa Class.pdf  downloaded
File  March 2017 - IV Issuances by Post and Visa Class.pdf  downloaded
File  April 2017 - IV Issuances by FSC or Place of Birth and Visa Class.pdf  downloaded
File  April 2017 - IV Issuances by Post and Visa Class.pdf  downloaded
File  May 2017 - IV Issuances by FSC or Place of Birth and Visa Class.pdf  downloaded
File  May 2017 - IV Issuances by Post and Visa Class.pdf  downloaded
File  June 2017 - IV Issuances by FSC or Place of Birth and Visa Class.pdf  downloaded
File  June 2017 - IV Issuances by Post and Visa Class.pdf  downloaded
File  July 2017 - IV Issuances by FSC or Place of Birth and Visa Class.pdf  downloaded
File  July 2017 - IV Issuances by Post and Visa Class.pdf  downloaded
File  August 2017 - IV Issuances by FSC or Place of Birth and Visa Class.pdf  downloaded
File  August 2017 - IV Issuances by Post and Visa Class.pdf  downloaded
File  September 2017 - IV Issuances by FSC or Place 

In [26]:
# Sample July 2021 reports. Both are named so neither assignment is a dead store;
# `path` keeps the value it ended up with before (the "by Post" report).
# NOTE(review): these point at "visas/" while the download cell writes to "visa_test" — confirm the folder.
fsc_path = "visas/July 2021 - IV Issuances by FSC or Place of Birth and Visa Class.pdf"
post_path = "visas/July 2021 - IV Issuances by Post and Visa Class.pdf"
path = post_path

In [76]:
def _normalize_page_table(df, data_cols):
    """Trim one page's raw table to ``data_cols``; return None if it does not fit."""
    if df.shape[1] > len(data_cols):
        # Drop every all-empty column, whatever its label (including label 0).
        df = df.dropna(axis=1, how="all")
    if df.shape[1] != len(data_cols):
        return None
    # The first two rows of each page are header rows, not data.
    df = df.iloc[2:].copy()
    df.columns = data_cols
    return df


def _to_int_counts(values):
    """Convert strings such as "1,234" to integers."""
    return values.astype(str).str.replace(",", "", regex=False).astype(int)


def get_table_data(path, data_cols=["Post", "Visa Class", "Issuances"]):
    """
    Extract the issuance table from a PDF report, one page at a time.

    path: location of the PDF file.
    data_cols: names for the table's columns; the first is the label column
        searched for the "GRAND TOTAL" row and the last holds the counts.

    Returns a DataFrame with columns ``data_cols``, the count column converted
    to int and the grand-total row removed.

    Raises ValueError if no page yields a usable table, and AssertionError if
    the grand-total row is missing or disagrees with the sum of the data rows.
    """
    data_cols = list(data_cols)  # never touch the shared default list
    label_col, count_col = data_cols[0], data_cols[-1]

    n_pages = PdfFileReader(path).getNumPages()
    page_frames = []
    # tabula numbers pages from 1.
    for page_number in range(1, n_pages + 1):
        page_tables = tabula.read_pdf(
            path,
            pages=str(page_number),
            lattice=True,
            pandas_options={"header": None},
        )
        page_df = (
            _normalize_page_table(page_tables[0], data_cols) if page_tables else None
        )
        if page_df is None:
            # Skipped rows are caught by the grand-total check below.
            print(f"ERROR on portion of table: {path}")
            continue
        page_frames.append(page_df)

    if not page_frames:
        raise ValueError(f"No table data could be extracted from {path}")

    # One concat instead of repeated DataFrame.append (removed in pandas 2.0).
    full_table = pd.concat(page_frames, ignore_index=True)

    # astype(str) keeps the match safe when a label cell is empty (NaN).
    is_grand_total = (
        full_table[label_col]
        .astype(str)
        .str.upper()
        .str.contains("GRAND TOTAL", regex=False)
    )
    grand_total = full_table[is_grand_total]
    full_table = full_table[~is_grand_total].copy()

    full_table[count_col] = _to_int_counts(full_table[count_col])
    table_grand_total = int(full_table[count_col].sum())

    if grand_total.empty:
        raise AssertionError(f"Warning - No Grand Total Row Found in {path}")
    # Parse before summing: summing the raw strings would concatenate them.
    row_grand_total = int(_to_int_counts(grand_total[count_col]).sum())

    if table_grand_total != row_grand_total:
        raise AssertionError(
            f"Warning - Grand Total Row Does Not Equal Sum of Rows {row_grand_total} vs {table_grand_total}"
        )
    print("Data successfully extracted.")
    return full_table


def get_data_by_year_month(pdf_folder, year, month, report):
    """
    Find the downloaded report for one month and extract its table.

    pdf_folder: folder holding the downloaded PDF files.
    year: year to look for in the file name (e.g. 2021).
    month: month name to look for in the file name (e.g. "July").
    report: (options) -->  post | fsc  ("posts" is accepted as an alias of "post")

    A file matches when its lower-cased name contains the year, the month and
    the report keyword. Candidates are checked in sorted order so the choice is
    deterministic, and only ``.pdf`` files are considered.

    Returns the DataFrame from get_table_data, or None when nothing matches.
    """
    pdf_folder = Path(pdf_folder)
    report = report.lower()
    if report == "posts":
        # "posts" never appears in the file names, so map it to "post".
        report = "post"
    data_cols = (
        ["Post", "Visa Class", "Issuances"]
        if report == "post"
        else ["FSC", "Visa Class", "Issuances"]
    )
    wanted = (str(year).lower(), str(month).lower(), report)
    for file in sorted(pdf_folder.iterdir()):
        if not file.is_file() or file.suffix.lower() != ".pdf":
            continue
        fn = file.name.lower()
        if all(part in fn for part in wanted):
            return get_table_data(str(file), data_cols=data_cols)
    return None


def summarize_by_visa_class(df, label, visa_class_map=None):
    """
    Total the "Issuances" column per visa class.

    df: DataFrame with "Visa Class" and "Issuances" columns; it is not modified.
    label: name given to the returned Series.
    visa_class_map: optional dict mapping raw visa classes to the categories
        to report. Classes missing from the map become NaN and therefore drop
        out of the totals.

    Returns a Series named ``label``, indexed by class/category (index name "vc").
    """
    visa_class = df["Visa Class"]
    if visa_class_map:
        visa_class = visa_class.map(visa_class_map)
    # assign() works on a copy, so the caller's frame never gains a "vc" column.
    out = df.assign(vc=visa_class).groupby("vc")["Issuances"].sum()
    out.name = label
    return out

In [29]:
# English month names in calendar order, used to match report file names.
months = (
    "January February March April May June "
    "July August September October November December"
).split()

In [58]:
pwd

'/content/drive/Shareddrives/Data Products Team/Products/Immigration Data Hub/DataRepo'

In [77]:
# Collect one per-visa-class issuance Series for every month that has an FSC report on disk.
fsc_processed_data = []
for year in range(2018, 2022):
    for month in months:
        print(year, month)
        data = get_data_by_year_month("./visa_test/", year, month, "fsc")
        if data is None:
            # No matching report in the folder for this month; skip it.
            continue
        fsc_processed_data.append(
            summarize_by_visa_class(data, f"{year}-{month}")
        )


2018 January




Data successfully extracted.
2018 February




Data successfully extracted.
2018 March




Data successfully extracted.
2018 April




Data successfully extracted.
2018 May




Data successfully extracted.
2018 June




Data successfully extracted.
2018 July




Data successfully extracted.
2018 August




Data successfully extracted.
2018 September




Data successfully extracted.
2018 October




Data successfully extracted.
2018 November




Data successfully extracted.
2018 December




Data successfully extracted.
2019 January




Data successfully extracted.
2019 February




Data successfully extracted.
2019 March




Data successfully extracted.
2019 April




Data successfully extracted.
2019 May




Data successfully extracted.
2019 June




Data successfully extracted.
2019 July




Data successfully extracted.
2019 August




Data successfully extracted.
2019 September




Data successfully extracted.
2019 October




Data successfully extracted.
2019 November




Data successfully extracted.
2019 December




Data successfully extracted.
2020 January




Data successfully extracted.
2020 February




Data successfully extracted.
2020 March




Data successfully extracted.
2020 April




Data successfully extracted.
2020 May




Data successfully extracted.
2020 June




Data successfully extracted.
2020 July




Data successfully extracted.
2020 August




Data successfully extracted.
2020 September




Data successfully extracted.
2020 October




Data successfully extracted.
2020 November




Data successfully extracted.
2020 December




Data successfully extracted.
2021 January




Data successfully extracted.
2021 February
Data successfully extracted.
2021 March
Data successfully extracted.
2021 April




Data successfully extracted.
2021 May




Data successfully extracted.
2021 June




Data successfully extracted.
2021 July




Data successfully extracted.
2021 August
2021 September
2021 October
2021 November
2021 December


In [81]:
# Put the monthly Series side by side (one column per "year-month" label) and save the result.
# NOTE(review): the file name says "19_21" but the loop that fills fsc_processed_data starts at 2018 — confirm the intended name.
pd.concat(fsc_processed_data, axis=1).to_pickle('fsc_19_21.pkl')