# Setup

Please run everything in the set up, and double check the working directory so that the data can be read from that same directory

In [1]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import os 
import re
from difflib import SequenceMatcher

Make sure to set to the correct working directory

In [2]:
#Change working directory to the same place where you saved the test datasets
#os.chdir('C:/Users/luos/OneDrive - DFO-MPO/Python') #change directory
os.getcwd() #check where the directory is (and whether the change was successful or not)

'c:\\Users\\luos\\OneDrive - DFO-MPO\\Python'

Function to read either csv or xlsx data 

In [2]:
# Function 0: Reading the dataset file
def read_data(dataset_path):  
    _, file_extension = os.path.splitext(dataset_path)
    if file_extension == '.csv':  
        df = pd.read_csv(dataset_path)    
    elif file_extension == '.xlsx':
        df = pd.read_excel(dataset_path)  
    else:
        print('Unsupported file type')  
        df = None  
    return df

# Data Quality Tests

### Consistency

#### Consistency Type 1 (C1)

Calculate consistency score of a dataset

This code is best run on CSV data where the column names are in the first row. It can also accept files that are in xlsx formats but it will only take data from the first sheet if there are more than one sheet in the excel file.

Limitations: It will not check for differences in capitalization of the same word (since all the words will be changed to lower case before the similarity score is calculated)

In [15]:
# Consistency Type 1 (C1) function

# Dictionary mapping Canadian province abbreviations to their full names
province_abbreviations = {
    'BC': 'British Columbia',
    'ON': 'Ontario',
    'QC': 'Quebec',
    'AB': 'Alberta',
    'MB': 'Manitoba',
    'SK': 'Saskatchewan',
    'NS': 'Nova Scotia',
    'NB': 'New Brunswick',
    'NL': 'Newfoundland and Labrador',
    'PE': 'Prince Edward Island',
    'NT': 'Northwest Territories',
    'YT': 'Yukon',
    'NU': 'Nunavut'
}

def normalize_text(text, remove_numbers=False):
    """
    Normalize input text by converting to lowercase, stripping whitespace,
    replacing province abbreviations with full names, and removing non-alphanumeric characters.
    Optionally remove numbers based on the flag.
    """
    text = str(text).lower().strip()
    for abbr, full in province_abbreviations.items():
        text = re.sub(r'\b' + abbr.lower() + r'\b', full.lower(), text)
    if remove_numbers:
        text = re.sub(r'\d+', '', text)
    text = ''.join(char for char in text if char.isalnum() or char.isspace())
    return ' '.join(text.split())

def extract_numbers(text):
    """
    Extract all numbers from the input text and return them as a list of strings.
    """
    return re.findall(r'\d+', text)

def remove_short_numbers(text):
    """
    Remove numbers with 1 or 2 digits from the input text.
    """
    return re.sub(r'\b\d{1,4}\b', '', text)

def numeric_similarity(num1_list, num2_list):
    """
    Calculate the similarity between two lists of numbers by comparing each digit.
    Return the proportion of matching digits.
    """
    num1, num2 = ' '.join(num1_list), ' '.join(num2_list)
    matches = sum(1 for a, b in zip(num1, num2) if a == b)
    max_length = max(len(num1), len(num2))
    return matches / max_length if max_length > 0 else 0

def string_similarity(str1, str2):
    """
    Calculate the similarity between two strings using the SequenceMatcher from difflib.
    Return the similarity ratio.
    """
    return SequenceMatcher(None, str1, str2).ratio()

def calculate_cosine_similarity(text_list, ref_list, Stop_Words):
    """
    Calculate the cosine similarity between lists of texts using TF-IDF vectorization.
    """
    vectorizer = TfidfVectorizer(stop_words=Stop_Words, analyzer='word', ngram_range=(1, 2))
    ref_vec = vectorizer.fit_transform(ref_list)
    text_vec = vectorizer.transform(text_list)
    return cosine_similarity(text_vec, ref_vec)

def contains_short_number(num_list):
    """
    Check if any number in the list has 1 or 2 digits.
    """
    return any(len(num) <= 4 for num in num_list)

def numbers_match(num_list1, num_list2):
    """
    Check if any number in the first list is present in the second list.
    """
    return any(num in num_list2 for num in num_list1)

def calculate_combined_similarity(df, unique_observations, text_similarity_matrix):
    """
    Combine text and numeric similarities into a single similarity matrix.
    """
    # Make a copy of the text similarity matrix to modify it
    combined_sim_matrix = np.copy(text_similarity_matrix)
    
    # Extract numeric parts from each unique observation
    numeric_parts = [extract_numbers(obs) for obs in unique_observations]
    
    # Iterate over each pair of unique observations to calculate numeric similarity
    for i, num_i in enumerate(numeric_parts):
        for j, num_j in enumerate(numeric_parts):
            if i != j:
                # Calculate the numeric similarity for the current pair
                num_sim = numeric_similarity(num_i, num_j)
                
                # Update the combined similarity matrix with the maximum value between text and numeric similarity
                combined_sim_matrix[i, j] = max(combined_sim_matrix[i, j], num_sim)
    
    # Iterate over each pair of unique observations to calculate string similarity
    for i, obs_i in enumerate(unique_observations):
        for j, obs_j in enumerate(unique_observations):
            if i != j:
                # Calculate the string similarity for the current pair
                seq_sim = string_similarity(obs_i, obs_j)
                
                # Update the combined similarity matrix with the maximum value between existing and sequence matcher 
                combined_sim_matrix[i, j] = max(combined_sim_matrix[i, j], seq_sim)
    
    return combined_sim_matrix

def average_consistency_score(cosine_sim_matrix, threshold):
    """
    Calculate the average consistency score based on the cosine similarity matrix and a given threshold.
    """
    num_rows, num_columns = cosine_sim_matrix.shape
    inconsistency = 0

    for i in range(num_rows):
        if np.any((cosine_sim_matrix[i] > threshold) & (cosine_sim_matrix[i] <= 1.0000000)):
            inconsistency += 1
    
    return (num_rows - inconsistency) / num_rows

def process_and_calculate_similarity(dataset_path, column_names, threshold, Stop_Words=['the', 'and']):
    """
    Process the dataset, normalize the text, and calculate the similarity scores for multiple columns.
    """
    # Read the dataset from the provided Excel file path
    df = read_data(dataset_path)
    overall_consistency_scores = []

    # Iterate over each specified column
    for column_name in column_names:
        # Normalize the text in the specified column and store the results in a new column
        df[f'Normalized {column_name}'] = df[column_name].apply(normalize_text)
        
        # Get unique normalized observations by removing duplicates and NaN values
        unique_observations = pd.unique(df[f'Normalized {column_name}'].dropna().values.ravel())
        
        # Calculate the cosine similarity matrix for the unique normalized observations
        text_sim_matrix = calculate_cosine_similarity(unique_observations.tolist(), unique_observations.tolist(), Stop_Words)
        
        # Set the diagonal of the similarity matrix to 0 to ignore self-similarity
        np.fill_diagonal(text_sim_matrix, 0)
        
        # Combine text similarity with numeric similarity to get a final similarity matrix
        combined_sim_matrix = calculate_combined_similarity(df, unique_observations, text_sim_matrix)
        
        # Initialize columns in the dataframe to store the recommended organization matches and all matches
        df[f'Recommended {column_name}'] = None
        df[f'All Matches {column_name}'] = None

        # Iterate over each normalized organization in the dataframe
        for i, norm_org in enumerate(df[f'Normalized {column_name}']):
            # Find the index of the current normalized organization in the unique observations
            try:
                current_index = np.where(unique_observations == norm_org)[0][0]
            except IndexError:
                df.at[i, f'Recommended {column_name}'] = "No significant match"
                df.at[i, f'All Matches {column_name}'] = []
                continue
            
            # Get the similarities for the current organization from the combined similarity matrix
            similarities = combined_sim_matrix[current_index]
            
            # Find the indices and values of all matching organizations
            matched_indices = np.where(similarities >= threshold)[0]
            all_matches = [unique_observations[idx] for idx in matched_indices]
            all_match_scores = [similarities[idx] for idx in matched_indices]

            best_score = 0
            best_match = "No significant match"

            # Extract numbers from the current organization
            num_list_current = extract_numbers(norm_org)

            for idx in matched_indices:
                candidate_match = unique_observations[idx]
                num_list_candidate = extract_numbers(candidate_match)

                if contains_short_number(num_list_current) or contains_short_number(num_list_candidate):
                    # If short numbers are present, ensure they match; otherwise, skip this match
                    if not numbers_match(num_list_current, num_list_candidate):
                        continue
                    # Recalculate similarity excluding short numbers
                    norm_org_no_nums = remove_short_numbers(norm_org)
                    candidate_no_nums = remove_short_numbers(candidate_match)
                    recalculated_similarity = string_similarity(norm_org_no_nums, candidate_no_nums)
                    if recalculated_similarity > best_score:
                        best_score = recalculated_similarity
                        best_match = candidate_match
                else:
                    if similarities[idx] > best_score:
                        best_score = similarities[idx]
                        best_match = candidate_match

            # Assign the best match to the dataframe
            if best_score > threshold:
                df.at[i, f'Recommended {column_name}'] = f"{best_match} ({best_score:.2f})"
            else:
                df.at[i, f'Recommended {column_name}'] = "No significant match"

            # Store all matches
            df.at[i, f'All Matches {column_name}'] = ', '.join([f"{match} ({score:.2f})" for match, score in zip(all_matches, all_match_scores) if score > threshold])

        # Calculate the overall consistency score for the current column
        consistency_score = average_consistency_score(text_sim_matrix, threshold)
        overall_consistency_scores.append(consistency_score)

    # Calculate the overall consistency score as the average of individual consistency scores
    overall_consistency_score = np.mean(overall_consistency_scores)
    df['Overall Consistency Score'] = overall_consistency_score

    return df 

Test dataset

In [13]:
# Test Consistency Calculations
processed_df = process_and_calculate_similarity(
    dataset_path='data/restoration projects_dataportal.csv', # Define clear path for the data file with quotes
    column_names = ['Project Name'], 
    threshold = 0.91)

processed_df['Overall Consistency Score'].min()

1.0

##### Habitat Restoration Data for the Portal

In [16]:
processed_df = process_and_calculate_similarity(
    dataset_path = 'data/test/SalmonHabitatRestorationProjects_DataPortal_June_FinalFields_20240613.csv', # Define clear path for the data file with quotes
    column_names=['project_name'], 
    threshold = 0.91, 
    Stop_Words = ['']
    )  

processed_df['Overall Consistency Score'].min()

1.0

##### Salmonid Enhancement Program Post-Season Reports

In [25]:
processed_df = process_and_calculate_similarity(
    dataset_path = 'data/test/Salmonid_Enhancement_Program_Releases.xlsx', # Define clear path for the data file with quotes
    column_names=['PROJ_NAME', 'FACILITY_NAME'], 
    threshold = 0.91, 
    Stop_Words = ['']
    )  

processed_df['Overall Consistency Score'].min()

0.9883483483483484

other tests

In [23]:
processed_df = process_and_calculate_similarity(
    dataset_path = 'data/test/Hatchery Releases EPDA Data - NOT FINAL.xlsx', # Define clear path for the data file with quotes
    column_names=['PROJ_NAME', 'FACILITY_NAME'], 
    threshold = 0.91, 
    Stop_Words = ['']
    )  

processed_df['Overall Consistency Score'].min()

0.9901664467242773

In [8]:
avg_consistency_score = process_and_calculate_similarity(
    dataset_path = 'data/Pacific Salmon Population Unit Crosswalk_Final_20240513.xlsx', # Define clear path for the data file with quotes
    selected_columns=['STOCK_POP_NAME'], 
    threshold = 0.91, 
    Stop_Words = ['']
    )  

avg_consistency_score 

0.9556493049921575

#### Consistency Type 2 (C2)

Calculate consistency score of datasets with a reference list

The compared columns in question must be identical to the ref list, otherwise they will be penalized more harshly.

In [24]:
# Function 1: Get names used for a single column  
def get_names_used_for_column(df, column_name):  
    unique_observations = pd.unique(df[column_name].dropna().values.ravel())  
    return unique_observations  

# Function 2: Calculate Cosine Similarity  
def calculate_cosine_similarity(text_list, ref_list, Stop_Words):  
    count_vectorizer = CountVectorizer(stop_words= Stop_Words)  
    ref_vec = count_vectorizer.fit_transform(ref_list).todense()  
    ref_vec_array = np.array(ref_vec) 
    text_vec = count_vectorizer.transform(text_list).todense()  
    text_vec_array = np.array(text_vec) 
    cosine_sim = np.round((cosine_similarity(text_vec_array, ref_vec_array)), 2)  
    return cosine_sim  

# Function 3: Average Consistency Score  
def average_consistency_score(cosine_sim_df, threshold=0.91):
    num_rows, num_columns = cosine_sim_df.shape
    total_count = 0  # This will count all values above or equal to the threshold  
    
    for i in range(num_rows):
        if np.max(cosine_sim_df[i]) >= threshold: #Include all comparisons 
            total_count += 1
    total_observations = num_rows  # Total number of observations  
    average_consistency_score = total_count / total_observations  
    return average_consistency_score 
   
def process_and_calculate_similarity_ref(dataset_path, column_mapping, ref_dataset_path = None, threshold = 0.91, Stop_Words = 'activity'):    
    #Read the data file  
    df = read_data(dataset_path)  
  
    # Initialize ref_df if a ref dataset is provided  
    if ref_dataset_path:  
        df_ref = read_data(ref_dataset_path)  
        ref_data = True #Flag to indicate we are using a ref dataset  
    else:  
        ref_data = False #No ref dataset, compare within the same dataset  
  
    all_consistency_scores = []    
      
    for selected_column, m_selected_column in column_mapping.items():    
        if ref_data:  
             # Compare to ref dataset    
            unique_observations = get_names_used_for_column(df_ref, m_selected_column)    
        else:    
            # Use own column for comparison    
            unique_observations = get_names_used_for_column(df, selected_column)  
              
        cosine_sim_matrix = calculate_cosine_similarity(df[selected_column].dropna(), unique_observations, Stop_Words=Stop_Words)    
        column_consistency_score = average_consistency_score(cosine_sim_matrix, threshold)    
        all_consistency_scores.append(column_consistency_score)    
  
    # Calculate the average of all consistency scores    
    overall_avg_consistency = sum(all_consistency_scores) / len(all_consistency_scores) if all_consistency_scores else None    
  
    return overall_avg_consistency 

Test

In [11]:
column_mapping = {'STOCK_CU_NAME':'CU_Display', 'STOCK_CU_INDEX':'FULL_CU_IN'}  #the pattern for comparison is 'dataset column' : 'reference column'
process_and_calculate_similarity_ref(
    dataset_path='data/test/2024-03-28 1_qryThermal_NatEmerg.xlsx', 
    column_mapping=column_mapping, 
    ref_dataset_path = 'data/Pacific Salmon Population Unit Crosswalk_Final_20240513.xlsx',
    threshold = 1, 
    Stop_Words = [''])

0.993045537439778

##### Habitat Restoration Projects

In [19]:
column_mapping = {'CU_Name':'CU_Display', 'FULL_CU_IN':'FULL_CU_IN','SMU_Display':'SMU_Display', 'SMU_ID':'SMU_ID'}  #the pattern for comparison is 'dataset column' : 'reference column'
process_and_calculate_similarity_ref(
    dataset_path='data/test/SalmonHabitatRestorationProjects_DataPortal_June_FinalFields_20240613.csv', 
    column_mapping=column_mapping, 
    ref_dataset_path = 'data/Pacific Salmon Population Unit Crosswalk_Final_20240513.xlsx',
    threshold = 1,
    Stop_Words = [''])

0.9924551386623165

##### Salmonid Enhancement Program Post-Season Reports 

In [31]:
column_mapping = {'STOCK_CU_NAME':'CU_Display', 'STOCK_CU_INDEX':'FULL_CU_IN'}  #the pattern for comparison is 'dataset column' : 'reference column'
process_and_calculate_similarity_ref(
    dataset_path='data/test/Salmonid_Enhancement_Program_Releases.xlsx', 
    column_mapping=column_mapping, 
    ref_dataset_path = 'data/Pacific Salmon Population Unit Crosswalk_Final_20240513.xlsx',
    threshold = 1,
    Stop_Words = [''])

0.9930453481408895

##### Pacific Salmon Outlook

In [26]:
column_mapping = {'SMU NAME':'SMU_Display', 'CU NAME':'CU_Display', 'FULL CU INDEX':'FULL_CU_IN'}  #the pattern for comparison is 'dataset column' : 'reference column'
process_and_calculate_similarity_ref(
    dataset_path='data/test/salmonoutlook_dataportal_DRAFT_PLACEHOLDER.xlsx', 
    column_mapping=column_mapping, 
    ref_dataset_path = 'data/Pacific Salmon Population Unit Crosswalk_Final_20240513.xlsx',
    threshold = 1,
    Stop_Words = [''])

1.0

### Accuracy

#### Accuracy Type 1 (A1, Mixed Data Types, Symbols in Numerics) 

In [8]:
# Function 1: Using isdigit to find non-numerical entries
def find_non_digits(s):  
    # Ensure the value is treated as a string  
    s = str(s)  
    return [char for char in s if not (char.isdigit() or char == '.')]  

# Function 2 : Calculate the score
def accuracy_score(dataset_path, selected_columns):
    adf = read_data(dataset_path)
    selected_columns = [col for col in adf.columns if col in selected_columns] 

    all_accuracy_scores = []
    
    for column_name in selected_columns:  
        # Drop NA, null, or blank values from column  
        column_data = adf[column_name].dropna()  
          
        total_rows = len(column_data)  
          
        if total_rows > 0:  # to avoid division by zero  
            non_digit_chars_per_row = column_data.apply(find_non_digits)  
            non_numerical_count = non_digit_chars_per_row.apply(lambda x: len(x) > 0).sum()   
            accuracy_score = (total_rows - non_numerical_count) / total_rows  
            all_accuracy_scores.append(accuracy_score)    
  
    overall_accuracy_score = sum(all_accuracy_scores) / len(all_accuracy_scores) if all_accuracy_scores else None   

    return overall_accuracy_score  

Test

In [52]:
accuracy_score(
    dataset_path = 'data/test/SEP Facilities.xlsx',
    selected_columns = ['LicNo', 'FRN'])

0.9639175257731958

##### Salmonoid Enhancement Program (SEP) Post-Season Reports

In [34]:
accuracy_score(
    dataset_path = 'data/test/Salmonid_Enhancement_Program_Releases.xlsx',
    selected_columns = ['AVE_WEIGHT','AVE_LENGTH', 'TotalRelease'])

1.0

In [39]:
test1  =read_data('data/test/Hatchery Releases EPDA Data - NOT FINAL.xlsx')

In [47]:
import pandas as pd
from pandas.api.types import is_string_dtype
from pandas.api.types import is_numeric_dtype
is_numeric_dtype(test1['AVE_LENGTH'])


True

#### Accuracy Type 2 (A2 Outliers)

In [35]:
def find_outliers_iqr(dataset_path, selected_columns, groupby_column = None, threshold = 1.5, minimum_score= 0.85):  
    df = read_data(dataset_path)
    outliers_dict = {}

   # If a groupby column is specified, perform the IQR calculation within each group  
    if groupby_column:  
        grouped = df.groupby(groupby_column)  
        for column in selected_columns:  
            # Apply the outlier detection for each group  
            outliers = grouped[column].apply(lambda x: ((x < x.quantile(0.25) - threshold * (x.quantile(0.75) - x.quantile(0.25))) |  
                                                        (x > x.quantile(0.75) + threshold * (x.quantile(0.75) - x.quantile(0.25))))) 
            # Combine the outlier Series into a single Series that corresponds to the original DataFrame index  
            outliers_dict[column] = (1 - outliers.groupby(groupby_column).mean())
    else:  
        # Perform the IQR calculation on the whole column if no groupby column is specified  
        for column in selected_columns:  
            Q1 = df[column].quantile(0.25)  
            Q3 = df[column].quantile(0.75)  
            IQR = Q3 - Q1  
  
            lower_bound = Q1 - threshold * IQR  
            upper_bound = Q3 + threshold * IQR  
  
            outliers = (df[column] < lower_bound) | (df[column] > upper_bound)  
            outliers_dict[column] = (1 - outliers.mean())
    
    # compute final score  
    #total_groups = len(outliers_dict)  
    #groups_above = sum(1 for score in outliers_dict.values() if score > minimum_score)  
    #final_score = groups_above / total_groups if total_groups > 0 else 0  
    
    final_score = {}
    
    for key in outliers_dict.keys():
        arr = outliers_dict[key].values
        value_out = np.sum(arr > minimum_score)/len(arr)
        final_score[key] = value_out
  
    return outliers_dict, final_score        
    

Tests

In [70]:
find_outliers_iqr(
    dataset_path='data/test/new Escapement Data 2022.xlsx',
    selected_columns=['Escapement_Total'],
    groupby_column='Species',
    threshold=1.5,
    minimum_score= 0.85
)

({'Escapement_Total': Species
  Chinook    0.929138
  Chum       0.935484
  Coho       0.879154
  Pink       0.966667
  Sockeye    0.892490
  Name: Escapement_Total, dtype: float64},
 {'Escapement_Total': 1.0})

In [72]:
find_outliers_iqr(
    dataset_path='data/test/new Escapement Data 2022.xlsx',
    selected_columns=['Escapement_Total'],
    groupby_column=['SMU'],
    threshold=1.5,
    minimum_score= 0.85
)

({'Escapement_Total': SMU
  ALSEK CHINOOK SALMON                           0.978723
  ALSEK SOCKEYE SALMON                           1.000000
  ECVI/MAINLAND PINK SALMON                      0.800000
  FRASER CHUM SALMON                             0.952381
  FRASER FALL RUN 41 CHINOOK SALMON              0.985714
  FRASER PINK SALMON - ODD                       0.971429
  FRASER SOCKEYE SALMON - EARLY SUMMER           0.886301
  FRASER SOCKEYE SALMON - LATE                   0.871233
  FRASER SOCKEYE SALMON - SUMMER                 0.899543
  FRASER SOCKEYE SALMON -EARLY STUART            0.917808
  FRASER SPRING RUN 42 CHINOOK SALMON            0.971429
  FRASER SPRING RUN 52 CHINOOK SALMON            0.933333
  FRASER SUMMER RUN 41 CHINOOK SALMON            0.961905
  FRASER SUMMER RUN 52 CHINOOK SALMON            0.974286
  INNER SOUTH COAST CHUM SALMON                  1.000000
  INTERIOR FRASER COHO SALMON                    0.949749
  JOHNSTONE STRAIT/MAINLAND INLET COHO SALMON 

##### Salmonid Enhancement Program

In [36]:
find_outliers_iqr(
    dataset_path='data/test/Salmonid_Enhancement_Program_Releases.xlsx',
    selected_columns=['TotalRelease'],
    groupby_column= ['FACILITY_NAME','SPECIES_NAME', 'BROOD_YEAR'],
    threshold=1.5,
    minimum_score= 0.85
)

({'TotalRelease': FACILITY_NAME      SPECIES_NAME  BROOD_YEAR
  232nd Street Pond  Chum          1995          1.0
                                   1996          1.0
                                   1997          1.0
                                   1998          1.0
                                   1999          1.0
                                                ... 
  Zeballos Schools   Chinook       2013          1.0
                     Chum          2012          1.0
                                   2019          1.0
                                   2022          1.0
                     Coho          2018          1.0
  Name: TotalRelease, Length: 16564, dtype: float64},
 {'TotalRelease': 0.9448804636561217})

In [78]:
import statistics
scores = [0.9613893007870423, 0.9447359415991787]
print(statistics.mean(scores))

0.9530626211931106


#### Accuracy Type 3 (A3 Duplicates)

In [10]:
# function 1: finding duplicates
def find_duplicates_and_percentage(dataset_path):

    df = read_data(dataset_path)

    # Find duplicate rows
    duplicate_rows = df[df.duplicated(keep=False)]
    
    # Calculate percentage of duplicate rows
    total_rows = len(df)
    total_duplicate_rows = len(duplicate_rows)
    percentage_duplicate = 1-(total_duplicate_rows / total_rows)
    
    # Print duplicate rows
    print("Duplicate Rows:")
    print(duplicate_rows)
    
    # Print percentage of duplicate rows
    print(f"\nDuplication Score: {percentage_duplicate*100}%")

Test

In [6]:
find_duplicates_and_percentage(dataset_path = 'data/test/new Escapement Data 2022.xlsx')

Duplicate Rows:
Empty DataFrame
Columns: [Year, CU_ID, CU_Name, SMU, Species, Escapement_Wild, Escapement_Total, Recruits_Wild, Recruits_Total, IntStatus.Status, IntStatus.Year, Contact, Comments]
Index: []

Duplication Score: 100.0%


##### Habitat Restoration Projects

In [20]:
find_duplicates_and_percentage(dataset_path= 'data/test/SalmonHabitatRestorationProjects_DataPortal_June_FinalFields_20240613.csv')

Duplicate Rows:
Empty DataFrame
Columns: [Unnamed: 0, site_species_id, project_name, project_description, reporting_fy, site_latitude, site_longitude, ecosystem_type, species_name, CU_Name, FULL_CU_IN, SMU_Display, SMU_ID]
Index: []

Duplication Score: 100.0%


##### Salmonid Enhancement Program (SEP)

In [40]:
find_duplicates_and_percentage(dataset_path='data/test/Salmonid_Enhancement_Program_Releases.xlsx')

Duplicate Rows:
      PROGRAM_CODE         PROJ_NAME SPECIES_NAME RUN_NAME  BROOD_YEAR  \
83             AFS  Homalco-Taggares         Coho     Fall        1999   
84             AFS  Homalco-Taggares         Coho     Fall        1999   
254            AFS         Victor Cr      Sockeye     Fall        2005   
256            AFS         Victor Cr      Sockeye     Fall        2005   
1049           CDP       Fort Babine      Chinook   Summer        2002   
...            ...               ...          ...      ...         ...   
30375          PIP     Poco Hatchery         Coho     Fall        2008   
30387          PIP     Poco Hatchery         Coho     Fall        2010   
30388          PIP     Poco Hatchery         Coho     Fall        2010   
32596          PIP       Terminal Cr         Coho     Fall        2006   
32597          PIP       Terminal Cr         Coho     Fall        2006   

           STOCK_NAME STOCK_PROD_AREA_CODE  STOCK_GFE_ID  \
83           Orford R              

##### Pacific Salmon Outlook

In [9]:
find_duplicates_and_percentage(dataset_path='data/test/salmonoutlook_dataportal_DRAFT_PLACEHOLDER.xlsx')

Duplicate Rows:
      Year                      Area  SPECIES  \
1378  2021  Fraser River/BC Interior  Sockeye   
1379  2021  Fraser River/BC Interior  Sockeye   
1470  2021               South Coast  Chinook   
1471  2021               South Coast  Chinook   
1472  2021               South Coast  Chinook   
1616  2021               South Coast     Pink   
1617  2021               South Coast     Pink   

                                  SMU NAME                       CU NAME  \
1378  FRASER SOCKEYE SALMON - EARLY SUMMER  Francois-Early Summer Timing   
1379  FRASER SOCKEYE SALMON - EARLY SUMMER  Francois-Early Summer Timing   
1470                 MIDDLE GEORGIA STRAIT                           NaN   
1471                 MIDDLE GEORGIA STRAIT                           NaN   
1472                 MIDDLE GEORGIA STRAIT                           NaN   
1616                      WCVI PINK SALMON         West Vancouver Island   
1617                      WCVI PINK SALMON         West Van

##### Crosswalk: SMU-CU-DU

In [10]:
find_duplicates_and_percentage(dataset_path='data/Pacific Salmon Population Unit Crosswalk_Final_20240513.xlsx')

Duplicate Rows:
    FULL_CU_IN       CU_Display          CU_DFO_Area CU_Species  CU_Type  \
496      CM-46  PORCUPINE RIVER  YUKON TRANSBOUNDARY       Chum  Current   
497      CM-46  PORCUPINE RIVER  YUKON TRANSBOUNDARY       Chum  Current   

           SMU_Display      SMU_ID         SMU_DFO_Area SMU_Species  
496  YUKON CHUM SALMON  CM-YTRA-01  YUKON TRANSBOUNDARY        Chum  
497  YUKON CHUM SALMON  CM-YTRA-01  YUKON TRANSBOUNDARY        Chum  

Duplication Score: 99.60079840319361%


##### Pacific Salmon Business Glossary

In [12]:
find_duplicates_and_percentage(dataset_path='data/test/PSSI_Salmon_Business Glossary_V1.xlsx')

Duplicate Rows:
Empty DataFrame
Columns: [Term Name

, Acronym

, Topic

, Definition

, Synonyms

, Related_Term

]
Index: []

Duplication Score: 100.0%


  for idx, row in parser.parse():


### Completeness

The threshold is for removing a column that meets the threshold of the percentage of blanks.

In [21]:
def completeness_test(dataset_path, exclude_columns = [], threshold=0.75):  
    dataset = read_data(dataset_path)

    # Exclude the 'Comment' column if it exists in the dataset  
    if 'Comment' in dataset.columns:  
        dataset = dataset.drop(columns=['Comment'])  
  
    # Exclude columns in exclude_columns if they exist in the dataset    
    dataset = dataset.drop(columns=[col for col in exclude_columns if col in dataset.columns])
    
    # Calculate the percentage of non-null (non-missing) values in each column  
    is_null_percentage = dataset.isna().mean()  
      
    # Identify columns with non-null percentage less than or equal to the threshold  
    columns_to_keep = is_null_percentage[is_null_percentage <= threshold].index  
      
    # Keep columns that exceed the threshold of non-null values  
    dataset2 = dataset[columns_to_keep]  
      
    # Calculate the actual percentage of non-missing values in the dataset  
    total_non_missing = dataset2.notna().sum().sum()  
    total_obs = dataset2.shape[0] * dataset2.shape[1]  
    completeness_score = total_non_missing / total_obs  
      
    return completeness_score  

Test

In [8]:
completeness_test('data/test/2023-sep-production-plan-en.csv', threshold = 0.75)

0.9629798335334796

In [16]:
completeness_test('data/restoration projects_dataportal.csv', threshold = 0.75)

0.8995842553476833

##### Habitat Restoration Projects

In [22]:
completeness_test(dataset_path='data/test/salmon_projects SalmonHabitatRest DP June.csv',
                  exclude_columns=['CU_Name', 'FULL_CU_IN', 'SMU_Display', 'SMU_ID'],
                  threshold = 0.75)

0.9475298408488063

##### Salmonid Enhancement Program

In [42]:
completeness_test(dataset_path='data/test/Salmonid_Enhancement_Program_Releases.xlsx',
                  threshold = 0.75)

0.9604727880589949

##### Pacific Salmon Outlook

In [20]:
completeness_test(dataset_path='data/test/salmonoutlook_dataportal_DRAFT_PLACEHOLDER.xlsx',
                  threshold = 0.75)

0.9515664543153853

##### Crosswalk: SMU-CU-DU

In [21]:
completeness_test(dataset_path='data/Pacific Salmon Population Unit Crosswalk_Final_20240513.xlsx',
                  threshold = 0.75)

0.932579285872699

##### Pacific Salmon Business Glossary

In [35]:
completeness_test(dataset_path='data/test/PSSI_Salmon_Business Glossary_V1.xlsx',
                  exclude_columns=['Acronym\n\n', 'Synonyms\n\n', 'Related_Term\n\n'],
                  threshold = 0.75)

  for idx, row in parser.parse():


0.9990942028985508

### Timeliness

In [16]:
from datetime import datetime

def calc_timeliness(refresh_date, cycle_day):
    refresh_date = pd.to_datetime(refresh_date)
    unupdate_cycle = np.max([((datetime.now() - refresh_date).days/cycle_day)-1, 0])

    #unupdate_cycle = np.floor((datetime.now() - refresh_date).days/cycle_day)
    #print((datetime.now() - refresh_date).days/cycle_day)
    return np.max([0, 100 - (unupdate_cycle * (100/3))])

In [17]:
calc_timeliness('2022-12-01', cycle_day=365)

86.30136986301369

# Output Reports
Run all the functions above first before running this section

#### Note that output reports can be generated through the data quality tests of
<p>    - Consistency type 1
<p>    - Accuracy type 2
<p>    - Accuracy type 3
<p>    - Completeness
<p>          
<p>  *Completeness test does not require an output report (just find the blanks in the dataset). The rest can be found below

### Consistency Type 2

In [34]:
def compare_datasets(dataset_path, column_mapping, ref_dataset_path = None):      
    # Read the data file      
    df = read_data(dataset_path)      
      
    # Initialize ref_df if a ref dataset is provided      
    if ref_dataset_path:      
        df_ref = read_data(ref_dataset_path)      
        ref_data = True #Flag to indicate we are using a ref dataset      
    else:      
        ref_data = False #No ref dataset, compare within the same dataset      
      
    for selected_column, m_selected_column in column_mapping.items():        
        if ref_data:      
             # Compare to ref dataset        
            unique_observations = get_names_used_for_column(df_ref, m_selected_column)    
        else:        
            # Use own column for comparison        
            unique_observations = get_names_used_for_column(df, selected_column)    
              
        # Iterate over each row in the selected column    
        column_results = []  
        for value in df[selected_column]:    
            # Check if the value exists in unique_observations and append the result to column_results  
            if pd.isnull(value):  
                column_results.append(False) # or True, depending on how you want to handle NaN values  
            else:  
                column_results.append(value in unique_observations)  
          
        # Add the results as a new column in the DataFrame  
        df[selected_column + '_comparison'] = column_results  
        
    return df  

In [35]:
column_mapping = {'STOCK_CU_NAME':'CU_Display', 'STOCK_CU_INDEX':'FULL_CU_IN'}  #the pattern for comparison is 'dataset column' : 'reference column'
compare_datasets(
    dataset_path='data/test/Salmonid_Enhancement_Program_Releases.xlsx', 
    column_mapping=column_mapping, 
    ref_dataset_path = 'data/Pacific Salmon Population Unit Crosswalk_Final_20240513.xlsx')

Unnamed: 0,PROGRAM_CODE,PROJ_NAME,SPECIES_NAME,RUN_NAME,BROOD_YEAR,STOCK_NAME,STOCK_PROD_AREA_CODE,STOCK_GFE_ID,STOCK_GFE_NAME,STOCK_POP_ID,...,REL_WATERSHED_CODE,REL_LATITUDE,REL_LONGITUDE,RELEASE_STAGE_NAME,RELEASE_YEAR,START_DATE,END_DATE,TotalRelease,STOCK_CU_NAME_comparison,STOCK_CU_INDEX_comparison
0,AFS,Emily Lk,Sockeye,Summer,2001,Tankeeah R,CCST,1001.0,TANKEEAH RIVER,51935.0,...,910-463800-00000-00000-0000-0000-000-000-000-0...,52.298184,-128.261590,Unfed,2002,20020401,20020402,25954,True,True
1,AFS,Emily Lk,Sockeye,Summer,2002,Tankeeah R,CCST,1001.0,TANKEEAH RIVER,51935.0,...,910-463800-00000-00000-0000-0000-000-000-000-0...,52.298184,-128.261590,Unfed,2003,20030404,20030404,41982,True,True
2,AFS,Emily Lk,Sockeye,Summer,2003,Tankeeah R,CCST,1001.0,TANKEEAH RIVER,51935.0,...,910-463800-00000-00000-0000-0000-000-000-000-0...,,,Fed Fry,2004,20040623,20040623,63829,True,True
3,AFS,Emily Lk,Sockeye,Summer,2004,Tankeeah R,CCST,1001.0,TANKEEAH RIVER,51935.0,...,910-463800-00000-00000-0000-0000-000-000-000-0...,,,Fed Fry,2005,20050316,20050316,9526,True,True
4,AFS,Emily Lk,Sockeye,Summer,2005,Tankeeah R,CCST,1001.0,TANKEEAH RIVER,51935.0,...,910-463800-00000-00000-0000-0000-000-000-000-0...,52.298184,-128.261590,Unfed,2006,20060303,20060303,64686,True,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
37550,RRD,Yukalup Ch,Pink,Fall,1997,Chilliwack R,LWFR,62.0,CHILLIWACK RIVER,46983.0,...,100-065700-09700-00000-0000-0000-000-000-000-0...,49.125663,-122.098166,Chan Fry,1998,19980201,19980331,300000,True,True
37551,RRD,Yukalup Ch,Pink,Fall,1999,Chilliwack R,LWFR,62.0,CHILLIWACK RIVER,46983.0,...,100-065700-09700-00000-0000-0000-000-000-000-0...,49.125663,-122.098166,Chan Fry,2000,20000201,20000331,300000,True,True
37552,RRD,Yukalup Ch,Pink,Fall,2001,Chilliwack R,LWFR,62.0,CHILLIWACK RIVER,46983.0,...,100-065700-09700-00000-0000-0000-000-000-000-0...,49.125663,-122.098166,Chan Fry,2002,20020201,20020331,300000,True,True
37553,RRD,Yukalup Ch,Pink,Fall,2003,Chilliwack R,LWFR,62.0,CHILLIWACK RIVER,46983.0,...,100-065700-09700-00000-0000-0000-000-000-000-0...,49.125663,-122.098166,Chan Fry,2004,20040201,20040430,400000,True,True


### Accuracy Type 1

In [19]:
# Function 1: Using isdigit to find non-numerical entries  
def find_non_digits(s):    
    # Ensure the value is treated as a string    
    s = str(s)    
    return [char for char in s if not (char.isdigit() or char == '.')]  
  
# Function 2 : Check if each row has only numbers in each selected column and add results as new columns  
def add_only_numbers_columns(dataset_path, selected_columns):  
    adf = read_data(dataset_path)  
    selected_columns = [col for col in adf.columns if col in selected_columns]   
  
    for column_name in selected_columns:    
        adf[column_name+'_Only_Numbers'] = adf[column_name].apply(lambda x: len(find_non_digits(x)) == 0)  
  
    return adf  

Test

In [20]:
add_only_numbers_columns(
    dataset_path = 'data/test/SEP Facilities.xlsx',
    selected_columns = ['LicNo', 'FRN'])

Unnamed: 0,Legal_Name,Facility_Name,CIP_Program,FRN,LicType,LicNo,Licensee,OpGrp,FacName,Latitude,...,LICENSEE_ADDRESS_LINE1,LICENSEE_ADDRESS_LINE2,LICENSEE_CITY,LICENSEE_POSTAL_CODE,LICENSEE_PROVINCE,LICENSEE_PROVINCE_CODE,LICENSEE_BUSINESS_PHONE_NUMBER,LICENSEE_EMAIL_ADDRESS,FRN_Only_Numbers,LicNo_Only_Numbers
0,4Mile Creek Enhancement Society,4 Mile Creek Hatchery / San Juan Hatchery,CEDP,8987,SEP Community Involvement,A129621,"Community Advisor, Lower Van Is, Cowichan R & ...",4 Mile Creek Enhancement Society,4 Mile Creek Hatchery Project,48.590000,...,5245 Trans Canada Highway,,Duncan,V0R 2C0,British Columbia,BC,(250) 466-4007,Heather.Wright@dfo-mpo.gc.ca,True,False
1,4Mile Creek Enhancement Society,4 Mile Creek Hatchery / San Juan Hatchery,CEDP,8988,SEP Community Involvement,129548,"Community Advisor, Lower Van Is, Cowichan R & ...",Port Renfrew Enhancement Society,San Juan River Seapen Project,48.550000,...,5245 Trans Canada Highway,,Duncan,V0R 2C0,British Columbia,BC,(250) 466-4007,Heather.Wright@dfo-mpo.gc.ca,True,True
2,A Rocha Canada - Houston,Buck Creek Canfor Hatchery,PIP,9005,SEP Community Involvement,129533,"Community Advisor, Smithers and Northwestern BC",A Rocha Canada,A Rocha Project,54.396110,...,_,,-,-,British Columbia,BC,-,Jonathan.Minson@dfo-mpo.gc.ca,True,True
3,Abbotsford Ravine Park Salmonid Enhancement So...,Ravine Park Hatchery / Matsqui Slough Watershe...,PIP,9068,SEP Community Involvement,",129606","Community Advisor, Mission/Abbotsford to past ...",Abbotsford Ravine Park Salmon Enhancement Society,Ravine Park Project,49.045270,...,4222 Columbia Valley Hwy,,Cultus Lake,V2R 5B6,British Columbia,BC,(604) 378-4216,Paul.Neufeld@dfo-mpo-gc.ca,True,False
4,Alberni Valley Enhancement Association,Jake Leyenaar Hatchery / Dave Chitty Resource ...,PIP,9006,SEP Community Involvement,129532,"Community Advisor, Central W Coast of Van Is, ...",Alberni Valley Enhancement Association,Alberni Project,49.284000,...,4706 Tebo Avenue,,Port Alberni,V9Y 8B1,British Columbia,BC,250-918-4782,Ryan.Cyr@dfo-mpo.gc.ca,True,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
92,Tsolum River Restoration Society,Tsolum River Hatchery,PIP,9078,SEP Community Involvement,129536,"Community Advisor, Central E&W Van Is, Nanoose...",Tsolum River Restoration Society,Tsolum River Hatchery,49.615080,...,148 Port Augusta Street,,Comox,V9M 3N6,British Columbia,BC,(250) 465-8348,Jacob.Melville@dfo-mpo.gc.ca,True,True
93,University of Northern British Columbia,Quesnel River Research Centre,Other,9067,SEP Community Involvement,129614,"Community Advisor, Central Interior: North of ...","University of Northern British Columbia, Quesn...",Quesnel River Research Centre Project,52.617440,...,_,,-,-,British Columbia,BC,-,Tyler.Thibault@dfo-mpo.gc.ca,True,True
94,West Vancouver Streamkeepers Society,Nelson Creek Hatchery,PIP,9067,SEP Community Involvement,129596,"Community Advisor, West Vancouver and Howe Sound",West Vancouver Streamkeepers,Nelson Creek Project,49.364840,...,4500 Capilano Park Road,,North Vancouver,V7R 4L3,British Columbia,BC,0,Gillian.Steele@dfo-mpo.gc.ca,True,True
95,Western Forest Products,Cordy Creek Hatchery,PIP,9028,SEP Community Involvement,129878,"Community Advisor, N Vancouver Island; Mainlan...",Western Forest Products,Holberg-Cordy Creek Project,50.659810,...,148 Port Augusta Street,,Comox,V9M 3N6,British Columbia,BC,(250) 703-3270,Dave.Davies@dfo-mpo.gc.ca,True,True


#### Salmonid Enhancement Program

In [22]:
add_only_numbers_columns(
    dataset_path = 'data/test/Salmonid_Enhancement_Program_Releases.xlsx',
    selected_columns = ['TotalRelease'])

Unnamed: 0,PROGRAM_CODE,PROJ_NAME,SPECIES_NAME,RUN_NAME,BROOD_YEAR,STOCK_NAME,STOCK_PROD_AREA_CODE,STOCK_GFE_ID,STOCK_GFE_NAME,STOCK_POP_ID,...,REL_WATERBODY_NAME,REL_WATERSHED_CODE,REL_LATITUDE,REL_LONGITUDE,RELEASE_STAGE_NAME,RELEASE_YEAR,START_DATE,END_DATE,TotalRelease,TotalRelease_Only_Numbers
0,AFS,Emily Lk,Sockeye,Summer,2001,Tankeeah R,CCST,1001.0,TANKEEAH RIVER,51935.0,...,TANKEEAH RIVER,910-463800-00000-00000-0000-0000-000-000-000-0...,52.298184,-128.261590,Unfed,2002,20020401,20020402,25954,True
1,AFS,Emily Lk,Sockeye,Summer,2002,Tankeeah R,CCST,1001.0,TANKEEAH RIVER,51935.0,...,TANKEEAH RIVER,910-463800-00000-00000-0000-0000-000-000-000-0...,52.298184,-128.261590,Unfed,2003,20030404,20030404,41982,True
2,AFS,Emily Lk,Sockeye,Summer,2003,Tankeeah R,CCST,1001.0,TANKEEAH RIVER,51935.0,...,,910-463800-00000-00000-0000-0000-000-000-000-0...,,,Fed Fry,2004,20040623,20040623,63829,True
3,AFS,Emily Lk,Sockeye,Summer,2004,Tankeeah R,CCST,1001.0,TANKEEAH RIVER,51935.0,...,,910-463800-00000-00000-0000-0000-000-000-000-0...,,,Fed Fry,2005,20050316,20050316,9526,True
4,AFS,Emily Lk,Sockeye,Summer,2005,Tankeeah R,CCST,1001.0,TANKEEAH RIVER,51935.0,...,TANKEEAH RIVER,910-463800-00000-00000-0000-0000-000-000-000-0...,52.298184,-128.261590,Unfed,2006,20060303,20060303,64686,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
37550,RRD,Yukalup Ch,Pink,Fall,1997,Chilliwack R,LWFR,62.0,CHILLIWACK RIVER,46983.0,...,CHILLIWACK/VEDDER RIVER,100-065700-09700-00000-0000-0000-000-000-000-0...,49.125663,-122.098166,Chan Fry,1998,19980201,19980331,300000,True
37551,RRD,Yukalup Ch,Pink,Fall,1999,Chilliwack R,LWFR,62.0,CHILLIWACK RIVER,46983.0,...,CHILLIWACK/VEDDER RIVER,100-065700-09700-00000-0000-0000-000-000-000-0...,49.125663,-122.098166,Chan Fry,2000,20000201,20000331,300000,True
37552,RRD,Yukalup Ch,Pink,Fall,2001,Chilliwack R,LWFR,62.0,CHILLIWACK RIVER,46983.0,...,CHILLIWACK/VEDDER RIVER,100-065700-09700-00000-0000-0000-000-000-000-0...,49.125663,-122.098166,Chan Fry,2002,20020201,20020331,300000,True
37553,RRD,Yukalup Ch,Pink,Fall,2003,Chilliwack R,LWFR,62.0,CHILLIWACK RIVER,46983.0,...,CHILLIWACK/VEDDER RIVER,100-065700-09700-00000-0000-0000-000-000-000-0...,49.125663,-122.098166,Chan Fry,2004,20040201,20040430,400000,True


# Data Quality Test Log