In [35]:
import json
import os
import re
from collections import defaultdict

import numpy as np
import pandas as pd


# Loading Datasets
- item_df: dataset of the scraped articles of clothing
- higg_df: sustainability index dataset from Higg MSI

In [36]:
# Directory that holds E-Weaver_data.csv (and receives the exported scores).
# An empty string resolves to the current working directory.
folder_path = ''

# os.path.join avoids the leading '/' that plain string concatenation produces
# when folder_path is empty (which would point at the filesystem root).
item_df = pd.read_csv(os.path.join(folder_path, 'E-Weaver_data.csv'), index_col=0)
higg_df = pd.read_json('../data/HIGG_MSI_data')

In [37]:
# Strip the words "fabric"/"fiber" (any case) out of the Higg material names,
# then lower-case the result and trim surrounding whitespace.
regex_pat = re.compile('(fabric|fiber)', flags=re.IGNORECASE)
stripped_names = higg_df['MATERIAL'].str.replace(regex_pat, '', regex=True)
higg_df.loc[:, 'MATERIAL'] = stripped_names.str.lower().str.strip()

# NOTE: only one item contains polystyrene; it might be dropped because the
# Higg data has no entry for it.
# Check with: df[df['polystyrene'] != 0]

## Correcting names and replacing columns
In our scraped dataset, many materials appear under synonymous names or do not exist directly in the Higg MSI (e.g. 'cupro' is a type of rayon but is not listed in the Higg MSI, so we treat it as general rayon when measuring sustainability).

Furthermore, many Higg MSI textiles are listed under a single combined name (e.g. 'viscose/rayon'), so we create a new column, `EQUIV_NAME`, that holds the alternative names for those materials.

In [38]:
# "polystyrene" appears misspelled in the Higg MSI; rename it to the spelling
# used by the item data.
higg_df.replace('polysterene (ps) plastic', 'polystyrene', inplace=True)

# Alternative (item-data) names for Higg MSI materials; stored in EQUIV_NAME.
equiv_names_col = {'flax': 'linen',
                   'viscose/rayon': 'rayon, cupro',
                   'elastane/spandex': 'spandex',
                   'goat leather': 'lambskin',
                   'acetate, triacetate': 'triacetate'}

# Guarded so the cell can be re-run: DataFrame.insert raises ValueError when
# the column already exists. Materials without an alternative name get the
# string 'nan'.
if 'EQUIV_NAME' not in higg_df.columns:
    higg_df.insert(loc=2,
                   column='EQUIV_NAME',
                   value=[equiv_names_col.get(name, 'nan') for name in higg_df['MATERIAL']])

# Collapse the combined Higg names to a single name each:
# viscose/rayon -> viscose, elastane/spandex -> elastane,
# "acetate, triacetate" -> acetate.
higg_df.replace({'viscose/rayon': 'viscose',
                 'elastane/spandex': 'elastane',
                 'acetate, triacetate': 'acetate'}, inplace=True)

## Connecting Item Composition to Higg MSI - creating scores

## Normalization reference values, summing to single score ("comp_score" Function)
- Global Warming: $1.00 \times 10^{0}$ (kg CO$_2$ eq)$^{-1}$
- Eutrophication: $1.00 \times 10^{3}$ (kg PO$_4$ eq)$^{-1}$
- Water Scarcity: $3.31 \times 10^{1}$ (m$^3$)$^{-1}$
- Resource Depletion, Fossil Fuels: $7.59 \times 10^{-2}$ MJ$^{-1}$

In [39]:
def comp_score(material_scores, ratios, weights = 1):
    """Combine per-material impact values into one composition-weighted score.

    Each material's four impact values are multiplied by a normalization
    factor (and optionally by ``weights``), summed, and scaled by the
    material's share of the item. The per-material results are added up.

    Parameters
    ----------
    material_scores : sequence of array-like
        One entry per material holding its impact values in the order
        Global Warming, Eutrophication, Water Scarcity,
        Resource Depletion (Fossil Fuels). An empty entry (material with no
        scores) is skipped. Lists, tuples and numpy arrays are accepted.
    ratios : sequence of numbers
        Share of each material in the item, paired positionally with
        ``material_scores``.
    weights : number or array-like of length 4, optional
        Extra multiplier applied to the normalized impact values (default 1).

    Returns
    -------
    number
        The summed score; ``0`` when no material contributed.
    """
    # Normalization reference value (NR) per impact category.
    # NOTE(review): the notebook's markdown lists these constants with inverse
    # units (kg^-1, MJ^-1, ...), which suggests they may already be
    # normalization factors; confirm that inverting them here is intended.
    norm_values = {
        'Global Warming': 1,
        'Eutrophication': 1000,
        'Water Scarcity': 33.1,
        'Resource Depletion, Fossil Fuels': 0.0759,
    }

    # The normalization factor (NF) is the inverse of the reference value.
    # Kept as an array so plain-list score vectors multiply element-wise
    # instead of raising TypeError (list * list).
    NF = 1 / np.array(list(norm_values.values()), dtype=float)
    weights = np.asarray(weights, dtype=float)

    comp_sustain_score = 0
    for score_i, ratio_i in zip(material_scores, ratios):
        score_i = np.asarray(score_i, dtype=float)
        if score_i.size == 0:
            # Material has no scores; it contributes nothing.
            continue
        comp_sustain_score += np.sum(score_i * NF * weights) * ratio_i

    return comp_sustain_score

## Connect material names to Higg MSI name and scores ("item_to_higg" function)

Function below (item_to_higg) creates a new dataframe with the textile sustainability scores scaled by their relative composition in the item.

 - **Note:** this function should later be updated so that it can add scores incrementally rather than going through the entire dataset each time.

In [40]:
def _lookup_higg_scores(higg_df, material, index_names):
    """Return the Higg impact values of the row matching ``material``, or ``[]`` if none matches."""
    # Literal substring match against the Higg material name. na=False makes
    # a missing name count as "no match" instead of producing a NaN mask.
    in_material = higg_df['MATERIAL'].str.contains(material, regex=False, na=False)
    if in_material.any():
        candidates = higg_df[in_material]
        if len(candidates) > 1:
            # Several rows match: prefer the textile one. If none of them is
            # a textile, fall back to the first match instead of raising
            # IndexError.
            textiles = candidates[candidates['MAT_FAMILY'] == 'TEXTILES']
            if len(textiles) > 0:
                candidates = textiles
        return candidates[index_names].values[0]

    # Otherwise try the alternative names stored in EQUIV_NAME.
    in_equiv = higg_df['EQUIV_NAME'].str.contains(material, regex=False, na=False)
    if in_equiv.any():
        return higg_df[in_equiv][index_names].values[0]

    # Material is not in the Higg data.
    return []


def item_to_higg(item_df, higg_df):
    """Attach Higg impact values and a composite score to every item.

    Parameters
    ----------
    item_df : pandas.DataFrame
        One row per item, with one column per material name listed in
        ``mat_names`` below; a value of 0 means the material is absent.
    higg_df : pandas.DataFrame
        Higg data with the columns ``MATERIAL``, ``EQUIV_NAME``,
        ``MAT_FAMILY`` and the four impact columns in ``index_names``.

    Returns
    -------
    pandas.DataFrame
        Indexed like ``item_df`` with the columns ``mat_higg`` (list of impact
        vectors, ``[]`` for unmatched materials), ``ratio`` (list of the
        item's non-zero material values) and ``comp_score`` (result of
        ``comp_score`` for that item).
    """
    # Names of the Higg impact columns.
    index_names = ['Global Warming', 'Eutrophication', 'Water Scarcity',
                   'Resource Depletion, Fossil Fuels']

    mat_names = ['linen', 'nylon', 'cotton', 'lyocell', 'lambskin',
                 'elastane', 'wool', 'viscose', 'polyamide', 'silk', 'acrylic',
                 'polyester', 'modal', 'polystyrene', 'rayon', 'spandex', 'cupro',
                 'hemp']

    mat_dict = {}
    for item_index, row in item_df[mat_names].iterrows():
        # Keep only the materials actually present in this item.
        present = row[row != 0]
        ratios = list(present.values)
        mat_higg = [_lookup_higg_scores(higg_df, mat_i, index_names)
                    for mat_i in present.index]

        mat_dict[item_index] = {
            'mat_higg': mat_higg,
            'ratio': ratios,
            'comp_score': comp_score(mat_higg, ratios),
        }

    return pd.DataFrame(mat_dict).T

In [41]:
# Build the per-item table holding each item's Higg impact values,
# material ratios and composite score.
item_higg_df = item_to_higg(item_df, higg_df)

## Users input their ranking of concerns to weight each of the metrics

This may be integrated with the comp_score function later if we want to assign weights to the different characterizations.

In [42]:
# User's ranking of each concern; normal_user_pref takes 1/rank, so rank 1
# receives the largest weight.
# normal_user_pref reads only the VALUES, in insertion order, and pairs them
# with: Global Warming, Eutrophication, Water Scarcity,
# Resource Depletion (Fossil Fuels). The entries must therefore stay in that order.
# NOTE(review): the key 'gloabl_warming' looks misspelled; the keys are not
# read by normal_user_pref, so confirm nothing else depends on it before renaming.
user_concern_preference = {'gloabl_warming': 1, 'ocean': 4, 'water':3, 'resource_depletion':2}

In [43]:
def normal_user_pref(user_pref):
    """Turn a ranking of concerns into per-column weights.

    The values of ``user_pref`` are read in insertion order (keys are
    ignored). Each rank is inverted, rounded to one decimal place, and the
    results are divided by their sum. The weights are returned keyed by the
    Higg impact column names, paired positionally.
    """
    impact_columns = ('Global Warming', 'Eutrophication',
                      'Water Scarcity', 'Resource Depletion, Fossil Fuels')

    # Rank -> inverse rank, rounded to one decimal place.
    ranks = np.array(list(user_pref.values()))
    inverse_ranks = np.round(1/ranks, decimals=1)

    # Scale by the total of the rounded inverse ranks.
    weights = inverse_ranks/sum(inverse_ranks)

    return dict(zip(impact_columns, weights))

In [44]:
user_norms = normal_user_pref(user_concern_preference)

# Multiply each Higg impact column by the user's weight for that column
# (columns and weights are paired positionally).
weighted_columns = list(user_norms.keys())
column_weights = list(user_norms.values())
pref_weighted_textiles = higg_df[weighted_columns] * column_weights

# Adding in goodonyou ratings

In [53]:
goy_path = '../data/GOY_brand_data.json'

# Loaded with json.load rather than pd.read_json: despite its name, goy_df is
# a plain dict keyed by brand name.
# The encoding is set explicitly so the file is not decoded with the
# platform's locale default.
with open(goy_path, 'r', encoding='utf-8') as f:
    goy_df = json.load(f)

In [106]:
from fuzzywuzzy import fuzz


def _closest_goy_brand(brand, candidates):
    """Return the candidate whose lower-cased name has the highest fuzz.ratio to ``brand``.

    Ties go to the earliest candidate. Returns None when no candidate scores
    above 0, so a brand never inherits the match found for a previous brand.
    """
    best_ratio = 0
    best_match = None
    brand_lower = brand.lower()
    for candidate in candidates:
        ratio = fuzz.ratio(brand_lower, candidate.lower())
        if ratio > best_ratio:
            best_ratio = ratio
            best_match = candidate
    return best_match


# Map every brand in the item data to its closest Good On You brand name.
goy_brand_names = list(goy_df.keys())
matches = {brand: _closest_goy_brand(brand, goy_brand_names)
           for brand in item_df['brand'].unique()}

# Manual override for 'Mate', assigned by hand instead of by the fuzzy matcher.
matches['Mate']='MATE the Label'

In [109]:
def assign_ratings(target_brand):
    """Return the 'rating' stored in goy_df for the brand that ``matches`` maps ``target_brand`` to."""
    return goy_df[matches[target_brand]]['rating']


# One rating per item row, in item_df order.
ratings_list = [assign_ratings(brand) for brand in item_df['brand']]


In [122]:
# Per-item brand rating multiplied element-wise by the item's composite Higg
# score; rows are paired by position and the result keeps item_df's index.
# NOTE(review): confirm that the brand rating and the Higg score point in the
# same direction (higher = better vs. higher = worse) before interpreting the product.
brand_ratings = pd.DataFrame(ratings_list, index=item_df.index)
comp_scores = item_higg_df[['comp_score']].values
full_weighted_sus_score = brand_ratings * comp_scores

In [123]:
# os.path.join avoids writing to the filesystem root ('/item_sus_score_df.csv')
# when folder_path is an empty string.
full_weighted_sus_score.to_csv(os.path.join(folder_path, 'item_sus_score_df.csv'))