# Processing of raw data to eliminate anomalies

In [None]:
import pandas as pd
import os

## Process addresses.csv

In [None]:
# Clean the raw address table; the first line of the CSV is the header row.
addresses = pd.read_csv('../data/raw/addresses.csv', header=0)

# str.capitalize() upper-cases the first character and lower-cases the rest.
# It runs before de-duplication so rows that differ only in the casing of
# `country` become identical and are dropped together.
addresses = (
    addresses
    .assign(country=addresses['country'].str.capitalize())
    .drop_duplicates()
    .sort_values(by='supplier_id')
)

# Persist the cleaned table without the pandas index column.
addresses.to_csv('../data/processed/addresses_cleaned.csv', index=False)

## Process articles.csv

In [None]:

# Clean the raw article table; the first line of the CSV is the header row.
articles = pd.read_csv('../data/raw/articles.csv', header=0)

# Switch the column headers to lower-case snake_case names.
articles = articles.rename(columns={'Article_ID': 'article_id', 'Article': 'article', 'Industry': 'industry'})

# str.capitalize() upper-cases the first character and lower-cases the rest.
articles[['article', 'industry']] = articles[['article', 'industry']].apply(lambda x: x.str.capitalize())

# Strip any double-quote characters from the IDs and convert them to integers.
# The values are cast to str first because the .str accessor raises on a
# numeric column and yields NaN for non-string items in a mixed column.
# The quote is a literal character, so no regex matching is needed.
articles['article_id'] = (
    articles['article_id']
    .astype(str)
    .str.replace('"', '', regex=False)
    .astype(int)
)

articles.sort_values(by='article_id', inplace=True)

# Persist the cleaned table without the pandas index column.
articles.to_csv('../data/processed/articles_cleaned.csv', index=False)


## Process indices.csv

In [None]:
# Clean the raw indices table; the first line of the CSV is the header row.
# The misspelled 'enivronmental_risk' header is corrected on load.
indices = (
    pd.read_csv('../data/raw/indices.csv', header=0)
    .rename(columns={'enivronmental_risk': 'environmental_risk'})
)

# Force both columns to numeric dtype; errors='coerce' turns any value that
# cannot be parsed as a number into NaN instead of raising.
numeric_columns = ['human_rights_index', 'environmental_risk']
indices[numeric_columns] = indices[numeric_columns].apply(pd.to_numeric, errors='coerce')

indices = indices.sort_values(by='country_id')

# Missing values are written out explicitly as the text 'NaN'.
indices.to_csv('../data/processed/indices_cleaned.csv', index=False, na_rep='NaN')

## Process orders.csv

In [None]:
# Clean the raw orders table; the first line of the CSV is the header row.
orders = pd.read_csv('../data/raw/orders.csv', header=0)

# Strip any double-quote characters from the supplier IDs and convert them to
# integers. The values are cast to str first because the .str accessor raises
# on a numeric column and yields NaN for non-string items in a mixed column.
# The quote is a literal character, so no regex matching is needed.
orders['supplier_id'] = (
    orders['supplier_id']
    .astype(str)
    .str.replace('"', '', regex=False)
    .astype(int)
)

orders.sort_values(by='order_id', inplace=True)

# Persist the cleaned table without the pandas index column.
orders.to_csv('../data/processed/orders_cleaned.csv', index=False)

## Process suppliers.csv

In [None]:
# Clean the raw suppliers table; the first line of the CSV is the header row.
# The 'domain' column is removed before de-duplication, so rows that differ
# only in that column count as duplicates of each other.
suppliers = (
    pd.read_csv('../data/raw/suppliers.csv', header=0)
    .drop(columns='domain')
    .drop_duplicates()
    .sort_values(by=['status', 'supplier_id'])
)

# Persist the cleaned table without the pandas index column.
suppliers.to_csv('../data/processed/suppliers_cleaned.csv', index=False)