# Notebook for initial data exploration

In [None]:
#import necessary libraries for data analysis
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

## Exploration of addresses.csv

### Identify duplicated supplier IDs in addresses.csv

In [None]:
addresses = pd.read_csv('../data/raw/addresses.csv', header=0)

# Count the distinct countries recorded for each supplier_id; a supplier_id
# that maps to more than one country is a conflicting key.
duplicates = addresses.groupby('supplier_id')['country'].nunique().reset_index()
duplicates = duplicates[duplicates['country'] > 1]

# Show each affected supplier_id together with all of its rows.
# The loop variable is not called `id`, so the builtin of that name is not
# overwritten for the cells that run afterwards.
for supplier_id in duplicates['supplier_id']:
    # Select the supplier's rows once and reuse the selection for both prints.
    supplier_rows = addresses[addresses['supplier_id'] == supplier_id]
    print(f"Supplier ID {supplier_id} is present at multiple positions {supplier_rows.index.values}\n")
    print(supplier_rows)

### Identify inconsistent capitalization in addresses.csv

In [None]:
# Group the original country spellings by their lower-cased form; a group
# holding more than one spelling differs only in capitalization.
lowercase_country = addresses['country'].str.lower()
country_variants = addresses.groupby(lowercase_country)['country'].unique()
for lowered, spellings in country_variants.items():
    if len(spellings) <= 1:
        continue
    print(f"Country '{lowered}' has multiple variants: {spellings}")

## Exploration of articles.csv

### Identify inconsistent capitalization in articles.csv

In [None]:
articles = pd.read_csv('../data/raw/articles.csv', header=0)

# Group the original 'Industry' spellings by their lower-cased form; a group
# holding more than one spelling differs only in capitalization.
lowercase_industry = articles['Industry'].str.lower()
article_variants = articles.groupby(lowercase_industry)['Industry'].unique()
for lowered, spellings in article_variants.items():
    if len(spellings) <= 1:
        continue
    print(f"Industry '{lowered}' has multiple variants: {spellings}")

## Exploration of indices.csv

### Checking for duplicates

In [None]:
indices = pd.read_csv('../data/raw/indices.csv', header=0)

# keep=False flags every row whose country_id occurs more than once,
# including the first occurrence.
repeated_country_id = indices.duplicated(subset='country_id', keep=False)
duplicate_ids = indices[repeated_country_id]

# Report the affected rows, or state that there are none.
if duplicate_ids.empty:
    print("No duplicated country_id found.")
else:
    print("Duplicated country_id entries:")
    print(duplicate_ids)

### Identify missing values for human_rights_index and enivronmental_risk

In [None]:
indices = pd.read_csv('../data/raw/indices.csv', header=0)

# Coerce both index columns to numbers; entries that cannot be parsed become
# NaN and are therefore counted as missing below.
index_columns = ['human_rights_index', 'enivronmental_risk']
indices[index_columns] = indices[index_columns].apply(pd.to_numeric, errors='coerce')

# Missing / present counts for the human rights index
hri_is_missing = indices['human_rights_index'].isna()
missing_count_hri = hri_is_missing.sum()
present_values_count_hri = (~hri_is_missing).sum()

# Missing / present counts for the environmental risk
env_is_missing = indices['enivronmental_risk'].isna()
missing_count_env = env_is_missing.sum()
present_values_count_env = (~env_is_missing).sum()

print(f"Number of present values in 'human_rights_index': {present_values_count_hri}")
print(f"Number of missing values in 'human_rights_index': {missing_count_hri}")

print(f"Number of present values in 'enivronmental_risk': {present_values_count_env}")
print(f"Number of missing values in 'enivronmental_risk': {missing_count_env}")


### Plotting human rights index values

In [None]:
# Plot a histogram of 'human_rights_index' with a separate bar for missing values.
# Uses `indices` and `missing_count_hri`, which are not defined in this cell.

# x position of the extra "missing" bar, to the right of the 0-100 histogram range
missing_bar_x = 105
# Regular axis ticks: 0, 10, ..., 100
regular_ticks = np.linspace(0, 100, 11)

# Plot histogram for available values (NaN entries are dropped here and
# represented by the separate bar below)
plt.figure(figsize=(10,6))
plt.hist(indices['human_rights_index'].dropna(), bins=100, range=(0,100), color='skyblue', edgecolor='black', label='Available values')

# Add a bar for missing values
plt.bar(missing_bar_x, missing_count_hri, width=2, color='orange', label='Missing values')

plt.xlabel('Human Rights Index')
plt.ylabel('Count')
plt.title('Distribution of Human Rights Index (including missing values)')
plt.legend()
# Place the 'Missing' tick at the bar's own x position so the label sits
# directly under the bar instead of beside it.
plt.xticks(list(regular_ticks) + [missing_bar_x], labels=[str(int(tick)) for tick in regular_ticks] + ['Missing'])
plt.show()

### Plotting environmental risk values

In [None]:
# Plot a histogram of 'enivronmental_risk' with a separate bar for missing values.
# Uses `indices` and `missing_count_env`, which are not defined in this cell.

# x position of the extra "missing" bar, to the right of the 0-100 histogram range
missing_bar_x = 105
# Regular axis ticks: 0, 10, ..., 100
regular_ticks = np.linspace(0, 100, 11)

# Plot histogram for available values (NaN entries are dropped here and
# represented by the separate bar below).
# NOTE(review): range=(0,100) assumes the risk values lie in 0-100; values
# outside that range would not be shown by the histogram - confirm against the data.
plt.figure(figsize=(10,6))
plt.hist(indices['enivronmental_risk'].dropna(), bins=100, range=(0,100), color='green', edgecolor='black', label='Available values')

# Add a bar for missing values
plt.bar(missing_bar_x, missing_count_env, width=2, color='orange', label='Missing values')

# Axis label and title name the column that is actually plotted.
plt.xlabel('Environmental Risk')
plt.ylabel('Count')
plt.title('Distribution of Environmental Risk (including missing values)')
plt.legend()
# Place the 'Missing' tick at the bar's own x position so the label sits
# directly under the bar instead of beside it.
plt.xticks(list(regular_ticks) + [missing_bar_x], labels=[str(int(tick)) for tick in regular_ticks] + ['Missing'])
plt.show()

### Plotting distribution of environmental risk values

In [None]:
# Percentage share of each distinct non-null 'enivronmental_risk' value,
# ordered by the value itself.
env_risk_values = indices['enivronmental_risk'].dropna()
env_risk_shares = env_risk_values.value_counts(normalize=True)
env_risk_counts = env_risk_shares.sort_index() * 100

# Draw the shares as a bar chart on a fresh figure.
plt.figure(figsize=(12,6))
env_risk_counts.plot(kind='bar', color='green', edgecolor='black')
plt.title('Percentage Distribution of Environmental Risk (non-null values)')
plt.xlabel('Environmental Risk')
plt.ylabel('Percentage (%)')
plt.show()

### Investigating correlation between human rights index and environmental risk

In [None]:
# Scatterplot of human_rights_index against enivronmental_risk.
# Each distinct value pair is drawn once; its marker size reflects how many
# rows share that pair.
pair_columns = ['human_rights_index', 'enivronmental_risk']
counts = indices.groupby(pair_columns).size().reset_index(name='count')
marker_sizes = counts['count']*10

plt.figure(figsize=(8,6))
plt.scatter(
    counts['human_rights_index'],
    counts['enivronmental_risk'],
    s=marker_sizes,
    alpha=0.7,
    color='purple',
    edgecolor='k',
)
plt.title('Scatterplot of Human Rights Index vs. Environmental Risk (point size = frequency)')
plt.xlabel('Human Rights Index')
plt.ylabel('Environmental Risk')
plt.grid(True)
plt.show()

## Exploration of suppliers.csv

### Checking for duplicates

In [None]:
suppliers = pd.read_csv('../data/raw/suppliers.csv', header=0)

# keep=False flags every row whose supplier_id occurs more than once,
# including the first occurrence.
repeated_supplier_id = suppliers.duplicated(subset='supplier_id', keep=False)
duplicate_ids = suppliers[repeated_supplier_id]

# Report the affected rows, or state that there are none.
if duplicate_ids.empty:
    print("No duplicated supplier_id found.")
else:
    print("Duplicated supplier_id entries:")
    print(duplicate_ids)