# Download LSMS
This repo helps you download the LSMS data. Some manual work is still required, but much less of it.

In [1]:
from bs4 import BeautifulSoup
from tqdm import tqdm

import json
import os
import pandas as pd
import re
import requests

In [2]:
# Survey metadata table; the cells below read its "url", "name" and "year" columns.
lsms_meta_csv: str = "../../data/countries_meta/counties_lsms_time_valid.csv"
df: pd.DataFrame = pd.read_csv(lsms_meta_csv)

You have to specify your World Bank login data in the accounts JSON file, or remove the following block and hard-code your credentials in the block after it. Please be careful not to push them to a public repository, or to one that you will later make public.

The JSON file should be located at the top level of the project.

In [3]:
# Load the credentials file. The encoding is given explicitly so that
# non-ASCII characters (e.g. in a password) decode the same on every platform
# instead of depending on the locale's default encoding.
with open("../../accounts.json", "r", encoding="utf-8") as f:
    # The cell below expects the layout {"worldbank": {"user": ..., "pw": ...}}.
    auth_data: dict = json.load(f)

In [4]:
# Credentials are taken from the "worldbank" entry of the accounts file.
worldbank_account = auth_data["worldbank"]
user: str = worldbank_account["user"]
pw: str = worldbank_account["pw"]

Perform Login

In [5]:
def login(session: requests.Session, user: str, pw: str) -> None:
    """Send the login form of the World Bank microdata site.

    The credentials are posted as form data through the given session. The
    response is not inspected, so a rejected login is not reported here.

    Args:
        session (requests.Session): Session used to send the login request.
        user (str): Value sent in the "email" form field.
        pw (str): Value sent in the "password" form field.
    """
    session.post(
        "https://microdata.worldbank.org/index.php/auth/login",
        data={"email": user, "password": pw, "submit": "Login"},
    )

In [6]:
# A single session is shared by all later cells so the login state is reused.
session: requests.Session = requests.Session()
login(session, user=user, pw=pw)

Consent

In [7]:
# Consent step: for every survey, fetch its "get-microdata" page and post the
# request form back with the agreement box ticked.
for url in tqdm(df["url"], total=len(df)):
    surveyid: str = url.rsplit("/", 1)[-1]  # last path segment of the catalog URL
    form_url: str = url + "/get-microdata"
    soup: any = BeautifulSoup(session.get(form_url).content)
    # The form is submitted with the title taken from the page header.
    surveytitle: str = soup.find("h1", {"id": "dataset-title"}).span.text
    session.post(
        form_url,
        data={
            "surveytitle": surveytitle,
            "surveyid": surveyid,
            "id": "",
            "abstract": "Research project to predict poverty.",
            "chk_agree": "on",
            "submit": "Submit",
        },
    )


100%|██████████| 74/74 [00:58<00:00,  1.26it/s]


Download (this works). It only reduces the amount of clicking: you still have to check the results for false positives at the end.

In [8]:
# Download one archive per survey into ../../data/lsms/raw/<name>/<year>/.
# Download links are located through their "data-filename" attribute; the CSV
# archive is preferred and the SPSS archive is the fallback. Surveys that
# cannot be handled automatically have their URL printed for manual work.
regex_csv: re.Pattern = re.compile(r".*CSV.*")
# Trailing ".*" (instead of ".") keeps this pattern consistent with the CSV
# one and no longer requires a character after "SPSS".
regex_spss: re.Pattern = re.compile(r".*SPSS.*")
for _, row in tqdm(df.iterrows(), total=len(df)):
    path = f"../../data/lsms/raw/{row['name']}/{row['year']}"
    # exist_ok avoids the separate (and racy) existence check.
    os.makedirs(path, exist_ok=True)
    url: str = row["url"]
    res: bytes = session.get(url + "/get-microdata").content
    soup: BeautifulSoup = BeautifulSoup(res)
    # Some surveys show a "Terms and conditions" page first; accept it and
    # continue with the page returned by that request.
    if "Terms and conditions" in [x.text for x in soup.find_all("h1")]:
        data: dict[str, str] = {
            "accept": "Accept"
        }
        res = session.post(url + "/get-microdata", data=data).content
        soup = BeautifulSoup(res)
    try:
        # Look the link up once per format, preferring CSV over SPSS.
        link = soup.find("a", {"data-filename": regex_csv})
        if link is None:
            link = soup.find("a", {"data-filename": regex_spss})
        if link is None:
            # Make the "nothing to download" case explicit instead of relying
            # on a TypeError from subscripting None.
            raise LookupError("no CSV or SPSS download link found")
        href: str = link["href"]
        title: str = link["title"]

        target: str = f"{path}/{title}"
        if os.path.exists(target):
            continue
        download = session.get(href)
        # Do not store an HTTP error page as if it were the data archive: the
        # file would look like a finished download and be skipped on re-runs.
        download.raise_for_status()
        with open(target, "wb") as f:
            f.write(download.content)
    except Exception:
        # "Exception" rather than a bare except, so Ctrl-C (KeyboardInterrupt)
        # can still stop this long-running loop.
        print(url) # for manual work
        login(session, user, pw)

 15%|█▍        | 11/74 [02:07<12:04, 11.49s/it]

https://microdata.worldbank.org/index.php/catalog/2331


 18%|█▊        | 13/74 [02:10<06:36,  6.50s/it]

https://microdata.worldbank.org/index.php/catalog/2315


 20%|██        | 15/74 [02:14<04:11,  4.26s/it]

https://microdata.worldbank.org/index.php/catalog/2314


 22%|██▏       | 16/74 [02:15<03:08,  3.25s/it]

https://microdata.worldbank.org/index.php/catalog/2313


 28%|██▊       | 21/74 [03:59<20:40, 23.40s/it]

https://microdata.worldbank.org/index.php/catalog/3016


 61%|██████    | 45/74 [10:44<06:33, 13.56s/it]

https://microdata.worldbank.org/index.php/catalog/3062


 62%|██████▏   | 46/74 [10:45<04:34,  9.80s/it]

https://microdata.worldbank.org/index.php/catalog/2882


 64%|██████▎   | 47/74 [10:46<03:14,  7.19s/it]

https://microdata.worldbank.org/index.php/catalog/1576


 65%|██████▍   | 48/74 [10:47<02:18,  5.34s/it]

https://microdata.worldbank.org/index.php/catalog/297


 66%|██████▌   | 49/74 [10:48<01:41,  4.07s/it]

https://microdata.worldbank.org/index.php/catalog/902


100%|██████████| 74/74 [19:36<00:00, 15.90s/it]


The following surveys are missing and require manual work:
- [Ghana 1999](https://microdata.worldbank.org/index.php/catalog/2331): Data hosted on gov. server
- [Ghana 1992](https://microdata.worldbank.org/index.php/catalog/2315): Data hosted on gov. server
- [Ghana 1989](https://microdata.worldbank.org/index.php/catalog/2314): Data hosted on gov. server
- [Ghana 1988](https://microdata.worldbank.org/index.php/catalog/2313): Data hosted on gov. server
- [Malawi 2011](https://microdata.worldbank.org/index.php/catalog/3016): Other Term and No CSV
- [South Africa 2015](https://microdata.worldbank.org/index.php/catalog/3062): Terms by worldbank
- [South Africa 2015](https://microdata.worldbank.org/index.php/catalog/2882): Data hosted on gov. server
- [South Africa 1999](https://microdata.worldbank.org/index.php/catalog/1576): Data hosted on gov. server
- [South Africa 1993](https://microdata.worldbank.org/index.php/catalog/297): No CSV, or SPSS
- [South Africa 1993](https://microdata.worldbank.org/index.php/catalog/902): Data hosted on gov. server