# Goji Data Compilation

<a href="https://colab.research.google.com/github/jasmine-schoch/goji-data-analysis/blob/main/goji_data_compilation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Import needed libraries and define constants

In [1]:
import math
import numpy as np
import os
import pandas as pd
import pyarrow
from slugify import slugify

# Google Sheets document that lists, one row per company, the links to that
# company's information:
DATA_SOURCES_GOOGLE_SHEETS = "https://docs.google.com/spreadsheets/d/1oEaZVl3YF6JCxS63wsk10HO2dCVFs_CyfZ-cmlMFU0c/edit?usp=sharing"
# Appended to a Sheets URL, in place of everything after its last "/", to
# request the sheet as CSV
GOOGLE_SHEETS_TO_CSV_SUFFIX = "/export?format=csv"
# Directory where each downloaded company sheet is cached as a Feather file
CACHE_DIR = "local_cache/"

## Initialize Variables and Data

### Define output data frame and columns

In [2]:
# Output frame: starts with no columns (one is added per company later) and
# one labelled row per score category, followed by the two link rows
Final = pd.DataFrame(
    index=[
        "Total",
        "Carbon Emmisions",
        "Water Usage",
        "Ethical Sourcing",
        "Labor Rights",
        "Transparency & Policy",
        "DEI",
        "More Info",
        "URL",
    ]
)

### Load the data source URLs into a data frame

In [3]:
# Swap everything after the last "/" of the sharing link for the CSV export
# suffix, then read the sheet (it has no header row) straight into a frame
last_slash = DATA_SOURCES_GOOGLE_SHEETS.rindex("/")
csv_url = f"{DATA_SOURCES_GOOGLE_SHEETS[:last_slash]}{GOOGLE_SHEETS_TO_CSV_SUFFIX}"
data_source_urls_df = pd.read_csv(csv_url, header=None)

## Get all company data
Load data from local cache if available. Download the data otherwise.

In [27]:
# Both dictionaries are keyed by the company website URL (column 2 of the
# data sources sheet):
#   url_to_data          -> the company's sheet as a data frame
#   url_to_more_info_url -> column 3 of the data sources sheet, as a string
# NOTE(review): rows with a missing website URL would presumably all use a NaN
# key and could overwrite one another here — confirm against the sheet.
url_to_data = {}
url_to_more_info_url = {}

# The cache directory has to exist before Feather files can be written to it
if not os.path.exists(CACHE_DIR):
    os.makedirs(CACHE_DIR, exist_ok=True)

for _, source in data_source_urls_df.iterrows():
    # Cache file name is derived from column 0 of the data sources sheet
    cache_filename = f"{CACHE_DIR}{slugify(source[0])}.feather"

    try:
        # Prefer the local copy when one exists
        data = pd.read_feather(cache_filename)
    except FileNotFoundError:
        # No local copy yet: download the sheet (column 1) as CSV
        sheet_url = source[1]
        export_url = sheet_url[: sheet_url.rindex("/")] + GOOGLE_SHEETS_TO_CSV_SUFFIX
        data = pd.read_csv(export_url, header=None)

        # The integer column labels are turned into strings before the frame
        # is written to the Feather cache
        data.columns = data.columns.map(str)
        data.to_feather(cache_filename)

    company_url = source[2]
    url_to_data[company_url] = data
    url_to_more_info_url[company_url] = str(source[3])

## Compile each company's data into `Final`

In [34]:
# Sheet rows (0-based, all read from column 9) that are summed for each score,
# listed in the same order as the first seven rows of Final
score_rows = (
    (68,),  # Total
    (84,),  # Carbon Emmisions
    (85,),  # Water Usage
    (80, 81),  # Ethical Sourcing
    (71, 72, 73, 76, 79, 92),  # Labor Rights
    (69, 75, 83, 87, 89, 91, 92),  # Transparency & Policy
    (90,),  # DEI
)
# NOTE(review): row 92 is counted under both Labor Rights and
# Transparency & Policy — confirm this is intended.

for url, data in url_to_data.items():
    # One integer per score: the sum of that score's cells
    scores = [sum(int(data.iloc[r, 9]) for r in rows) for rows in score_rows]

    # Cell (2, 2) of the company sheet labels the new column
    # (presumably the company name — TODO confirm)
    column_label = data.iloc[2, 2]

    # Scores first, then the "More Info" link and the website URL
    Final[column_label] = scores + [url_to_more_info_url[url], url]

## Calculate percentile rankings

In [33]:
from bisect import bisect_left

# Positional layout of Final: row 0 is "Total", the last two rows hold the
# "More Info" and "URL" strings, and every row in between is a category score.
category_rows = range(1, len(Final) - 2)

# Replace each raw category score with the percentage (rounded to an integer)
# of companies whose score in that category is strictly lower.
# "Total" is not ranked here because it is recomputed from the category
# percentiles below.
for row_pos in category_rows:
    raw_scores = list(Final.iloc[row_pos, :])  # snapshot before overwriting
    ranked = sorted(raw_scores)
    company_count = len(raw_scores)
    for col_pos, score in enumerate(raw_scores):
        # Index of the first occurrence in the sorted list == number of
        # strictly smaller scores
        lower_count = bisect_left(ranked, score)
        Final.iloc[row_pos, col_pos] = int(round((lower_count / company_count) * 100))

# "Total" for each company is the rounded mean of its category percentiles
for col_pos in range(len(Final.columns)):
    category_percentiles = Final.iloc[1:-2, col_pos]
    Final.iloc[0, col_pos] = int(
        round(sum(category_percentiles) / len(category_rows))
    )

## Save the results to a file

In [35]:
# Re-order the rows so the category scores come first, followed by "Total"
# and the two link rows
Reversed = Final.loc[
    [
        "Carbon Emmisions",
        "Water Usage",
        "Ethical Sourcing",
        "Labor Rights",
        "Transparency & Policy",
        "DEI",
        "Total",
        "More Info",
        "URL",
    ]
]

# Swap the axes so each company becomes a row and each score a column.
# transpose() is used because DataFrame.swapaxes is deprecated in pandas >= 2.1.
Reversed = Reversed.transpose()

# Export the data frame to a file
Reversed.to_csv("data.csv")

### Extra: Save match list for `manifest.json`

In [18]:
# Remove all nan/null URLs
urls = [url for url in Final.loc["URL"] if not pd.isnull(url)]

# Turn each URL into a quoted match pattern: make sure it ends with "/",
# append "*", and wrap the result in double quotes.
# endswith() is used so an empty string does not raise IndexError.
urls = ['"' + (url if url.endswith("/") else url + "/") + '*"' for url in urls]

# Write the patterns as a bracketed, comma-separated list; the context manager
# closes the file even if the write fails
with open("urls.txt", "w") as urls_txt:
    urls_txt.write("[" + ",".join(urls) + "]")