# Standardize categorical columns to be used as labels for dataset

### Description

- Use when there is variability between templates for categorical columns which should be consistent in combined dataset
- Can also use for large dataset cleaning
- Edit categorical column string contents to match a standard needed in the future combined dataset
- Use fuzzy matching for user-generated spelling issues
- Useful when templates are inconsistent in labeling
- Can incorporate into cleaning function when iterating through files in a directory 

## Import libraries

In [1]:
import pandas as pd
from rapidfuzz import process, fuzz

## Import data

In [2]:
dfa1 = pd.read_csv('data_labels_raw.csv')

In [3]:
dfa1

Unnamed: 0,sector,department,id,count_1,count_2
0,finance,retirment,12.104,1412,3064
1,Finance,retriment_2,12.221,19662,42758
2,admin,Purchassing,21.632,34,72
3,administrative,account1,22.701,14,19
4,AM,account1,22.702,634,778


## Edit categorical labels to match standard

### Update 'sector' column to fix capitalization, abbreviation inconsistencies

In [4]:
# Check for different values in column to see what needs to be fixed
dfa1['sector'].unique()

array(['finance', 'Finance', 'admin', 'administrative', 'AM'],
      dtype=object)

In [5]:
# Fix capialization inconsistencies
dfa1['sector'] = dfa1['sector'].str.capitalize()

In [6]:
dfa1['sector'].unique()

array(['Finance', 'Admin', 'Administrative', 'Am'], dtype=object)

In [7]:
# Eliminate abbreviations
dfa1.loc[dfa1['sector'].str.contains('admin|am', case = False), 'sector'] = 'Administrative'

In [8]:
dfa1['sector'].unique()

array(['Finance', 'Administrative'], dtype=object)

### Update 'department' column to fix spelling inconstencies & eliminate numbering

In [9]:
dfa1['department'].unique()

array(['retirment', 'retriment_2', 'Purchassing', 'account1'],
      dtype=object)

In [10]:
# Create a dictionary with corrected labels
label_dict = {'Retirement': 'Retirement', # Edit these to include any values desired as labels in cleaned template
    'Purchasing': 'Purchasing', 
    'Accounting': 'Accounting'}

# Turn dictionary into a list of keys to search against with rapidfuzz
label_keys = list(label_dict.keys())

print(label_keys)

['Retirement', 'Purchasing', 'Accounting']


In [11]:
# Write a function using rapidfuzz to find best fuzzy match for label & return corrected label
def fix_spelling(label, label_keys, label_dict, score_cutoff = 60):
    # label = values in 'department' column
    # label_keys = list of keys made from label dictionary
    # label_dict = dictionary defined above with corrected label names
    # score_cutoff = minimum threshold for fuzzy match function to not overcorrect label names- adjust if needed
    
    if pd.isna(label):
        return label # Returns the original label if label is null
        
    match = process.extractOne(label, label_keys, scorer = fuzz.ratio, score_cutoff = score_cutoff)
    
    if match:
        return label_dict[match[0]]
    else:
        return label # Returns the original label if the spelling is too inaccurate for the function to fix

In [12]:
dfa1['department'] = dfa1['department'].apply(lambda x: fix_spelling(x, label_keys, label_dict))

In [13]:
dfa1['department'].unique()

array(['Retirement', 'Purchasing', 'Accounting'], dtype=object)

In [14]:
dfa1

Unnamed: 0,sector,department,id,count_1,count_2
0,Finance,Retirement,12.104,1412,3064
1,Finance,Retirement,12.221,19662,42758
2,Administrative,Purchasing,21.632,34,72
3,Administrative,Accounting,22.701,14,19
4,Administrative,Accounting,22.702,634,778


### Update 'id' column to add sector & department abbreviations for standardization

In [15]:
dfa1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5 entries, 0 to 4
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   sector      5 non-null      object 
 1   department  5 non-null      object 
 2   id          5 non-null      float64
 3   count_1     5 non-null      int64  
 4   count_2     5 non-null      int64  
dtypes: float64(1), int64(2), object(2)
memory usage: 332.0+ bytes


In [16]:
# Create id_new column to update ids to include sector & department abbreviations
dfa1['id_new'] = dfa1['sector'].str[0] + '_' + dfa1['department'].str[0] + '_' + dfa1['id'].astype(str)

In [17]:
dfa1

Unnamed: 0,sector,department,id,count_1,count_2,id_new
0,Finance,Retirement,12.104,1412,3064,F_R_12.104
1,Finance,Retirement,12.221,19662,42758,F_R_12.221
2,Administrative,Purchasing,21.632,34,72,A_P_21.632
3,Administrative,Accounting,22.701,14,19,A_A_22.701
4,Administrative,Accounting,22.702,634,778,A_A_22.702


In [18]:
# Drop original 'id' column
dfa1 = dfa1.drop('id', axis = 1)

In [19]:
# Move 'id_new' to 'id' position
col = dfa1.pop('id_new')
dfa1.insert(2, 'id_new', col)

# Rename 'id_new' to 'id'
dfa1.rename(columns = {'id_new': 'id'}, inplace = True)

In [20]:
dfa1

Unnamed: 0,sector,department,id,count_1,count_2
0,Finance,Retirement,F_R_12.104,1412,3064
1,Finance,Retirement,F_R_12.221,19662,42758
2,Administrative,Purchasing,A_P_21.632,34,72
3,Administrative,Accounting,A_A_22.701,14,19
4,Administrative,Accounting,A_A_22.702,634,778


## Export data

In [21]:
dfa1.to_csv('cleaned_data_labels.csv', encoding = 'utf-8', index = False, header = True)