In [1]:
import common_functions
import os
import pandas as pd
from rapidfuzz import fuzz, process
from tqdm import tqdm  # For progress tracking
import common_functions
import string

In [2]:
def _operator_xlsx_for(site_name):
    """Create the FilePathManager for `site_name` and return it with its operator workbook path."""
    manager = common_functions.FilePathManager(site_name, "NA")
    return manager, manager.get_file_paths()['file_path_xlsx_operator']


# `site` and `file_manager` are left pointing at the last site (GYG) on purpose:
# the "Groups" path and the logger below are both derived from that manager.
site = "Headout"
file_manager, headout_file_path_xlsx_operator = _operator_xlsx_for(site)

site = "Musement"
file_manager, musement_file_path_xlsx_operator = _operator_xlsx_for(site)

site = "GYG"
file_manager, gyg_file_path_xlsx_operator = _operator_xlsx_for(site)

# NOTE(review): stored under a "viator" name, but built by rewriting the GYG
# workbook path to 'Operators_Groups' -- confirm this is the intended file.
gyg_paths = file_manager.get_file_paths()
viator_file_path_xlsx_operator = gyg_paths['file_path_xlsx_operator'].replace('Operators_GYG', 'Operators_Groups')

logger = common_functions.LoggerManager(file_manager)


In [3]:
# Operator workbooks to load: Headout is the reference set, the others are the
# comparison pool.
dataset_files = [headout_file_path_xlsx_operator, musement_file_path_xlsx_operator, gyg_file_path_xlsx_operator, viator_file_path_xlsx_operator]  # Add all your dataset filenames here

# Load datasets into a dictionary of DataFrames keyed by bare file name
datasets = {}
for file in dataset_files:
    # Later cells look datasets up by file name (e.g. 'Operators_Headout.xlsx'),
    # so the key must be the bare name whichever separator the path uses.
    # Splitting on "\\" alone left the whole path as the key for '/'-style paths.
    dataset_name = file.replace("\\", "/").rsplit("/", 1)[-1]
    datasets[dataset_name] = pd.read_excel(file)


In [None]:
# Translation table that deletes every ASCII punctuation character; built once
# instead of on every call.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)


def preprocess(text):
    """Normalise a cell value for fuzzy comparison.

    Missing values (None / NaN / pd.NA) become ''. Anything else is converted
    to text, lower-cased, stripped of ASCII punctuation, and has runs of
    whitespace collapsed to single spaces.

    Non-string cells (for example a purely numeric title read from Excel) are
    coerced with str() rather than raising AttributeError on `.lower()`.
    """
    if pd.isna(text):
        return ''
    text = str(text).lower()
    # Remove punctuation
    text = text.translate(_PUNCTUATION_TABLE)
    # Remove extra whitespace
    return ' '.join(text.split())

# Filler words that are dropped before titles and cities are compared.
stopwords = {
    'and', 'or', 'the', 'a', 'an', 'to', 'from', 'of', 'in', 'with', 'on', 'for', 'by',
    # Add more stopwords as needed
}


def remove_stopwords(text):
    """Return `text` with every whitespace-separated token found in `stopwords` removed.

    The remaining tokens are re-joined with single spaces.
    """
    return ' '.join(token for token in text.split() if token not in stopwords)

# Add normalised copies of 'Tytul' and 'City' to every loaded dataset.
# A failure in one dataset is logged and the loop moves on to the next one.
for name, df in datasets.items():
    try:
        cleaned_titles = df['Tytul'].apply(preprocess)
        df['Tytul_preprocessed'] = cleaned_titles.apply(remove_stopwords)

        cleaned_cities = df['City'].apply(preprocess)
        df['City_preprocessed'] = cleaned_cities.apply(remove_stopwords)

        logger.logger_info.info(f"Preprocessed 'Tytul' and 'City' columns for dataset: {name}")
    except Exception as e:
        logger.logger_err.error(f"Error preprocessing dataset {name}: {e}")


In [5]:
def get_similarity_threshold(text_length):
    """Return the minimum similarity score required for a text of `text_length` characters.

    Shorter texts need a stricter score: 90 below 20 characters, 80 below 40,
    and 70 for anything longer.
    """
    # (exclusive upper length bound, threshold), checked from shortest to longest
    for upper_bound, threshold in ((20, 90), (40, 80)):
        if text_length < upper_bound:
            return threshold
    return 70


In [6]:
def calculate_similarity(a, b):
    """Return RapidFuzz's `fuzz.token_set_ratio` score for `a` and `b`.

    NOTE(review): no cell visible in this notebook calls this wrapper; the
    matching loop hands `fuzz.token_set_ratio` to `process.extract` directly.
    """
    return fuzz.token_set_ratio(a, b)


In [None]:
# Dataset A is the Headout workbook; every other dataset is matched against it.
dataset_A = datasets.get('Operators_Headout.xlsx')
if dataset_A is None:
    logger.logger_err.error("Dataset A not found. Please ensure 'Operators_Headout.xlsx' is in the dataset folder.")
    raise FileNotFoundError("Dataset A not found.")

# Make 'Operator' text in every dataset. Missing operators turn into the
# string 'nan' here; a dataset that cannot be converted is logged and left as is.
for name, df in datasets.items():
    try:
        operator_as_text = df['Operator'].astype(str)
        df['Operator'] = operator_as_text
    except Exception as e:
        logger.logger_err.error(f"Error converting 'Operator' to string in dataset {name}: {e}")

# Result columns, created empty so every row of Dataset A has a value
new_columns = ['Matched_Operators', 'Similarity_Scores', 'Matched_Tytuls', 'Links']
for col in new_columns:
    dataset_A[col] = ''

logger.logger_info.info("Initialized new columns: Matched_Operators, Similarity_Scores, Matched_Tytuls, Links.")


In [None]:
# Everything except the Headout workbook forms the comparison pool
other_datasets = [frame for key, frame in datasets.items() if key != 'Operators_Headout.xlsx']
if not other_datasets:
    logger.logger_err.error("No other datasets found to compare with Dataset A.")
    raise ValueError("No other datasets available for comparison.")

# Stack the pool into one frame with a fresh 0..n-1 index, then keep the first
# row for each (title, city) pair.
# NOTE(review): because 'Operator' is not part of the subset, a second operator
# selling the same title in the same city is discarded here -- confirm intended.
combined_other = (
    pd.concat(other_datasets, ignore_index=True)
    .drop_duplicates(subset=['Tytul_preprocessed', 'City_preprocessed'])
)
logger.logger_info.info("Combined all other datasets into 'combined_other' DataFrame.")


In [None]:
# Define a minimum length for Tytul to be considered for matching
MIN_TYTL_LENGTH = 5  # Adjust as needed

# Separator used to pack several matches into one cell. The explode step splits
# the cells on this exact string again, so packed values must not contain it.
MATCH_SEPARATOR = ', '


def _cell_text(value):
    """Return `value` as text that is safe inside a MATCH_SEPARATOR-joined cell.

    Missing values become '' (str.join rejects NaN floats, which previously made
    a row lose all of its matches), and any separator occurring inside the value
    is rewritten to '; ' so that splitting the cell later yields exactly one
    item per match.
    """
    if pd.isna(value):
        return ''
    return str(value).replace(MATCH_SEPARATOR, '; ')


total_rows = len(dataset_A)
logger.logger_info.info(f"Starting mapping operators for {total_rows} entries in Dataset A.")

# Split the comparison pool by city once, instead of re-filtering the whole
# frame for every row of Dataset A.
candidates_by_city = {
    city_key: city_rows
    for city_key, city_rows in combined_other.groupby('City_preprocessed', sort=False)
}

# Initialize tqdm progress bar
with tqdm(total=total_rows, desc="Processing Dataset A", unit="row") as pbar:
    for index, row in dataset_A.iterrows():
        try:
            city = row['City_preprocessed']
            tytul = row['Tytul_preprocessed']

            # Skip if tytul is too short. The progress bar is advanced only in
            # `finally`, which also runs on `continue`; updating here as well
            # counted every skipped row twice.
            if len(tytul) < MIN_TYTL_LENGTH:
                logger.logger_info.info(f"Skipped '{row['Tytul']}' due to short length.")
                continue

            # Determine similarity threshold based on tytul length
            threshold = get_similarity_threshold(len(tytul))

            # Candidates from the same city (no entry means none exist)
            same_city = candidates_by_city.get(city)
            if same_city is None:
                logger.logger_info.info(f"No matches found for '{row['Tytul']}' in city '{row['City']}'.")
                continue

            # RapidFuzz drops everything scoring below `score_cutoff` and returns
            # the rest sorted by score, best first. For a pandas Series each hit
            # is (choice, score, index label).
            good_matches = process.extract(
                tytul,
                same_city['Tytul_preprocessed'],
                scorer=fuzz.token_set_ratio,
                limit=None,
                score_cutoff=threshold
            )

            logger.logger_info.info(f"Found {len(good_matches)} good matches for '{row['Tytul']}' in city '{row['City']}' with threshold {threshold}.")

            if good_matches:
                operator_similarity = {}
                matched_tytuls = {}
                links = {}

                for matched_tytul_preprocessed, similarity, record_key in good_matches:
                    # The index label identifies the matched record directly
                    # (combined_other is built with a unique 0..n-1 index), so
                    # no second scan by title is needed.
                    matched_record = same_city.loc[record_key]
                    operator = matched_record['Operator']
                    link = matched_record.get('Link', '')  # '' when there is no 'Link' column
                    # Ensure operator is a valid string ('nan' is what astype(str) makes of NaN)
                    if not isinstance(operator, str) or operator.lower() == 'nan':
                        logger.logger_err.error(f"Invalid Operator '{operator}' for Tytul: '{matched_tytul_preprocessed}'. Skipping this match.")
                        continue
                    # Skip operators whose whole name is a stopword
                    if operator.lower() in stopwords:
                        logger.logger_info.info(f"Skipped generic operator '{operator}' for Tytul: '{matched_tytul_preprocessed}'.")
                        continue
                    # Keep the highest similarity score for each operator
                    if operator not in operator_similarity or similarity > operator_similarity[operator]:
                        operator_similarity[operator] = similarity
                        matched_tytuls[operator] = matched_record['Tytul']  # Original Tytul
                        links[operator] = link

                # One entry per operator, in best-score-first order
                matched_operators_unique = [_cell_text(op) for op in operator_similarity]
                similarity_scores_unique = [str(int(sim)) for sim in operator_similarity.values()]
                matched_tytuls_unique = [_cell_text(matched_tytuls[op]) for op in operator_similarity]
                links_unique = [_cell_text(links[op]) for op in operator_similarity]

                # Assign to the DataFrame
                dataset_A.at[index, 'Matched_Operators'] = MATCH_SEPARATOR.join(matched_operators_unique)
                dataset_A.at[index, 'Similarity_Scores'] = MATCH_SEPARATOR.join(similarity_scores_unique)
                dataset_A.at[index, 'Matched_Tytuls'] = MATCH_SEPARATOR.join(matched_tytuls_unique)
                dataset_A.at[index, 'Links'] = MATCH_SEPARATOR.join(links_unique)

                logger.logger_info.info(
                    f"Matched Operators for '{row['Tytul']}': {matched_operators_unique} with similarities {similarity_scores_unique}, Matched Tytuls {matched_tytuls_unique}, Links {links_unique}"
                )
        except Exception as e:
            # row.get keeps the handler itself from raising when 'Tytul' is absent
            logger.logger_err.error(f"Error processing row index {index} for Tytul '{row.get('Tytul')}': {e}")
        finally:
            pbar.update(1)  # Exactly one tick per row: success, skip, or failure


In [None]:
# ==========================================
# Step 8: Explode the Matched Columns into Separate Rows
# ==========================================
# Produces `dataset_A_exploded`: one row per (Dataset A row, matched operator),
# sorted by an integer 'Similarity_Score'. Rows of Dataset A without any match
# are not part of the exploded frame.

try:
    match_columns = ['Matched_Operators', 'Similarity_Scores', 'Matched_Tytuls', 'Links']
    dataset_A_exploded = dataset_A.copy()

    # The packed cells must be text before they can be split
    for col in match_columns:
        dataset_A_exploded[col] = dataset_A_exploded[col].fillna('').astype(str)

    # Rows without matches carry '' in the packed columns. Leave them out up
    # front instead of pushing '' through the score conversion, which logged one
    # error per unmatched row before dropping it anyway.
    unmatched = dataset_A_exploded['Similarity_Scores'] == ''
    if unmatched.any():
        logger.logger_info.info(f"Leaving out {int(unmatched.sum())} rows without matches from the exploded dataset.")
    dataset_A_exploded = dataset_A_exploded[~unmatched].copy()

    # Split the concatenated strings into lists
    for col in match_columns:
        dataset_A_exploded[col] = dataset_A_exploded[col].str.split(', ')

    # Verify that all lists have the same number of elements (computed once)
    operator_counts = dataset_A_exploded['Matched_Operators'].apply(len)
    consistent = pd.Series(True, index=dataset_A_exploded.index)
    for col in match_columns[1:]:
        consistent &= dataset_A_exploded[col].apply(len) == operator_counts

    mismatched_counts = dataset_A_exploded[~consistent]
    if not mismatched_counts.empty:
        logger.logger_err.error(f"Found {len(mismatched_counts)} rows with mismatched list lengths. These rows will be skipped during explosion.")
        # Optionally, log the first few rows for inspection
        logger.logger_err.error(f"Sample mismatched rows:\n{mismatched_counts.head()}")
        # Remove these rows to prevent explosion errors
        dataset_A_exploded = dataset_A_exploded[consistent]

    # Explode the lists to create separate rows for each match
    dataset_A_exploded = dataset_A_exploded.explode(match_columns)

    # Define a safe conversion function for similarity scores
    def safe_convert_sim(x):
        """Return `x` as an int, or None (after logging) when it is not numeric."""
        try:
            return int(float(x))
        except (TypeError, ValueError):  # TypeError covers None and other non-text values
            logger.logger_err.error(f"Cannot convert similarity score '{x}' to int.")
            return None

    # Convert similarity scores to integers
    dataset_A_exploded['Similarity_Score'] = dataset_A_exploded['Similarity_Scores'].apply(safe_convert_sim)

    # Drop rows where similarity score conversion failed
    num_failed_conversions = dataset_A_exploded['Similarity_Score'].isnull().sum()
    if num_failed_conversions > 0:
        logger.logger_err.error(f"Found {num_failed_conversions} rows with invalid similarity scores after conversion. These rows will be removed.")
        dataset_A_exploded = dataset_A_exploded.dropna(subset=['Similarity_Score'])

    # A single failed conversion turns the whole column into floats; with the
    # failures removed it can be made a true integer column again.
    dataset_A_exploded['Similarity_Score'] = dataset_A_exploded['Similarity_Score'].astype(int)

    # Sort the exploded DataFrame by 'Similarity_Score' in descending order
    dataset_A_exploded = dataset_A_exploded.sort_values(by='Similarity_Score', ascending=False)

    # Drop the old 'Similarity_Scores' column
    dataset_A_exploded = dataset_A_exploded.drop(columns=['Similarity_Scores'])

    # Each row now holds a single operator, so use the singular name
    dataset_A_exploded = dataset_A_exploded.rename(columns={'Matched_Operators': 'Matched_Operator'})

    logger.logger_info.info("Exploded matched operators, similarity scores, matched Tytuls, and links into separate rows, sorted by similarity.")
except Exception as e:
    logger.logger_err.error(f"Error exploding matched operators: {e}")



In [None]:
# ==========================================
# Step 9: Save the Updated Datasets
# ==========================================
# Writes two workbooks: Dataset A with the packed match columns, and the
# exploded one-row-per-match version. Both paths are relative file names.
# Both writes share one try block, so a failure in the first write (or a
# missing `dataset_A_exploded`) is logged once and the remaining write is skipped.

try:
    updated_dataset_path = 'Operators_Headout_updated.xlsx'  # Replace with your desired path
    dataset_A.to_excel(updated_dataset_path, index=False)
    logger.logger_done.info(f"Successfully saved the updated Dataset A to '{updated_dataset_path}'.")
    
    # Save the exploded version
    exploded_dataset_path = 'Operators_Headout_exploded.xlsx'  # Replace with your desired path
    dataset_A_exploded.to_excel(exploded_dataset_path, index=False)
    logger.logger_done.info(f"Successfully saved the exploded Dataset A to '{exploded_dataset_path}'.")
except Exception as e:
    logger.logger_err.error(f"Error saving updated datasets: {e}")
