In [1]:
import os
import csv
import pandas as pd
from difflib import get_close_matches
import numpy as np
import re
from transliterate import translit
from unidecode import unidecode
import Levenshtein


In [None]:
# Debug aid: display the kernel's current working directory.
os.getcwd()

# INGEST DATA

In [3]:
# Location of the Capology salary export. Set the CAPOLOGY_CSV_PATH environment
# variable to run the notebook on another machine; the default is the original
# author's local path, so existing runs behave exactly as before.
CAPOLOGY_CSV_PATH = os.environ.get(
    'CAPOLOGY_CSV_PATH',
    '/Users/finneganlaister-smith/Downloads/DEV ENVIRONMENT/data-science-jupyter-template-main/latest_capology_data_money_fixed.csv',
)
csv_leagues = pd.read_csv(CAPOLOGY_CSV_PATH)

In [4]:
# Sanity check: number of rows per league in the loaded table.
csv_leagues['League'].value_counts()

League
Italian Serie A             8796
English Premier League      7488
Turkish Süper Lig          7441
French Ligue 1              7184
Spanish Liga Santander      6921
Portuguese Primeira Liga    6743
1. Bundesliga               6454
Dutch Eredivisie            6358
Name: count, dtype: int64

# FUNCTIONS

In [5]:
def is_cyrillic(input_string):
    """Return True when ``input_string`` contains at least one Cyrillic character.

    The previous implementation returned True for *any* non-ASCII text, so
    accented Latin names were also treated as Cyrillic.

    Args:
        input_string: Text to inspect.

    Returns:
        bool: True if any character lies in the Cyrillic Unicode blocks.
    """
    # U+0400-U+052F covers the "Cyrillic" and "Cyrillic Supplement" blocks.
    return any('\u0400' <= char <= '\u052f' for char in input_string)

In [6]:
def cyrillic_to_latin(input_string):
    """Transliterate Russian Cyrillic text to Latin characters.

    Args:
        input_string: Text to transliterate.

    Returns:
        The result of ``translit(input_string, 'ru', reversed=True)``. If that
        call raises, the error is printed and ``input_string`` is returned
        unchanged.
    """
    try:
        return translit(input_string, 'ru', reversed=True)
    except Exception as exc:
        # Best effort: report the problem and fall back to the raw input.
        print(f"Error: {exc}")
    return input_string

In [7]:
def find_best_match(array, final_tokens, ORIGINAL_STRING):
    """Pick the candidate name that best matches a tokenised scoreboard name.

    Args:
        array: Iterable of candidate full names (strings).
        final_tokens: Cleaned name tokens, e.g. ``['P', 'Sarr']``.
        ORIGINAL_STRING: The raw name (jersey number removed). It is only used
            to detect whether the name contains an apostrophe or backtick.

    Returns:
        The candidate with the smallest Levenshtein distance to the joined
        tokens (ties go to the earliest candidate), or ``None`` when no
        candidate survives the filters or ``final_tokens`` is empty.
    """
    # Nothing to match against; previously this raised IndexError.
    if not final_tokens:
        return None

    expected_name = ' '.join(final_tokens)

    # Keep candidates that start with the first token; if none do, retry with
    # the second token. A single-token name has no second token to try
    # (previously an IndexError).
    filtered_names = [name for name in array if name.startswith(final_tokens[0])]
    if not filtered_names and len(final_tokens) >= 2:
        filtered_names = [name for name in array if name.startswith(final_tokens[1])]
    if not filtered_names:
        return None

    # If the raw name has an apostrophe/backtick, require one in the candidate too.
    if "'" in ORIGINAL_STRING or "`" in ORIGINAL_STRING:
        filtered_names = [name for name in filtered_names if "'" in name or "`" in name]
        if not filtered_names:
            return None

    # NOTE(review): the expected name keeps its spaces while candidates have
    # theirs removed before the distance is computed. Kept as in the original;
    # confirm this asymmetry is intended.
    target = unidecode(expected_name)
    return min(
        filtered_names,
        key=lambda name: Levenshtein.distance(target, unidecode(name.replace(" ", ""))),
    )

In [8]:
def process_string_newest_ii(input_string):
    """Clean a scoreboard-style player name and move initials to the front.

    A leading one- or two-digit jersey number (followed by '.' or a space) is
    removed. An initial found in the first or second token is placed ahead of
    the longer name words, a trailing abbreviation ending in '.' is moved to
    the front, and every token is finally stripped of characters outside the
    allowed letter set.

    Args:
        input_string: Raw name, e.g. ``"17. Sarr P."``.

    Returns:
        tuple: ``(joined_string, final_tokens)`` - the cleaned tokens joined by
        single spaces, and the token list itself.
    """
    full_word = r'^[A-Za-zÀ-ÖØ-öø-ÿ]{2,}$'
    initial_like = r'^([A-Za-zÀ-ÖØ-öø-ÿ]+\.)+$|[A-Za-zÀ-ÖØ-öø-ÿ]\.$|[A-Za-zÀ-ÖØ-öø-ÿ]$'

    tokens = re.sub(r'^\d{1,2}[. ]', '', input_string).split()
    assembled = ""

    # Leading token that is not a plain word: keep it if it looks like an initial.
    if tokens and re.match(full_word, tokens[0]) is None:
        leading = re.match(initial_like, tokens[0])
        if leading is not None:
            assembled = assembled + leading.group()

    # Every token carrying at least two alphabetic characters is a name word.
    name_words = [tok for tok in tokens if sum(1 for ch in tok if ch.isalpha()) >= 2]
    if name_words:
        assembled = assembled + " " + " ".join(name_words)

    # Second token that looks like an initial: its letters go to the very front.
    if len(tokens) > 1 and re.match(full_word, tokens[1]) is None:
        trailing = re.match(initial_like, tokens[1])
        if trailing is not None:
            letters_only = "".join(filter(str.isalpha, trailing.group()))
            assembled = letters_only + " " + assembled

    # A phrase ending in '.' ends with an abbreviation: move it (without its
    # trailing periods) to the front and drop it from the end.
    if assembled.endswith("."):
        moved = assembled.split()[-1].rstrip('.')
        assembled = ' '.join((moved + " " + assembled).split()[:-1]).strip()

    final_tokens = [re.sub(r'[^A-Za-zÀ-ÖØ-öø-ÿćč-]', '', piece) for piece in assembled.split()]

    # Moving an abbreviation can duplicate the first token; keep only one copy.
    if len(final_tokens) >= 2 and final_tokens[0] == final_tokens[1]:
        del final_tokens[0]

    return " ".join(final_tokens), final_tokens

In [9]:
def extract_first_name(match_apostrophes_accounted, lastname_match):
    """Return the part of a full name that precedes the last name.

    Args:
        match_apostrophes_accounted: Full name string.
        lastname_match: Last name expected to occur inside the full name.

    Returns:
        The text before the first occurrence of ``lastname_match``, stripped
        of surrounding whitespace; ``None`` (after printing a message) when
        the last name does not occur in the full name.
    """
    if lastname_match not in match_apostrophes_accounted:
        print(f"{lastname_match} not found in {match_apostrophes_accounted}")
        return None

    before_lastname, _, _ = match_apostrophes_accounted.partition(lastname_match)
    return before_lastname.strip()

In [10]:
def add_backticks(lastname_match, original_string_nojersey):
    """Re-insert apostrophes/backticks from the raw name into a stripped last name.

    The previous version used offsets measured from the start of the whole
    raw string, so a raw name whose last name is not the first token
    (``"A. N`Diaye"``) produced a misplaced mark (``"NDia`ye"``). Offsets are
    now taken from the raw token that equals the last name once its marks are
    removed. If no such token exists, the original whole-string offsets are
    used, so inputs that worked before give the same result.

    Args:
        lastname_match: Last name without apostrophes/backticks.
        original_string_nojersey: Raw name with the jersey number removed.

    Returns:
        str: ``lastname_match`` with the marks inserted at the matching offsets.
    """
    quote_chars = ("`", "'")

    # Prefer the raw token that corresponds to the last name.
    reference = original_string_nojersey
    for token in original_string_nojersey.split():
        if not any(mark in token for mark in quote_chars):
            continue
        stripped = token.replace("`", "").replace("'", "")
        if stripped.casefold() == lastname_match.casefold():
            reference = token
            break

    # Inserting in ascending order keeps later offsets valid, because each
    # offset already counts the marks that precede it in the reference.
    for index, char in enumerate(reference):
        if char in quote_chars and 0 <= index < len(lastname_match):
            lastname_match = lastname_match[:index] + char + lastname_match[index:]

    return lastname_match



In [None]:
def find_closest_string_OLD(input_string, string_list):
    """Look up a name in ``string_list`` by fuzzy match, then by last-word containment.

    Args:
        input_string: Name to search for.
        string_list: Candidate names.

    Returns:
        - the single best fuzzy match (similarity cutoff 0.8) if there is one;
        - otherwise the only candidate containing the last word of
          ``input_string``, or the list of such candidates when there are several;
        - otherwise the string ``"No close match found."``.
    """
    best = get_close_matches(input_string, string_list, n=1, cutoff=0.8)
    if best:
        return best[0]

    # Fall back to candidates that contain the last word of the input.
    surname = input_string.split()[-1]
    containing = [entry for entry in string_list if surname in entry]
    if not containing:
        return "No close match found."
    return containing[0] if len(containing) == 1 else containing
            

In [None]:
def find_closest_string_new(input_string, string_list, input_final_tokens, ORIGINAL_NAME_STRING):
    """Find the candidate in ``string_list`` that best matches a player name.

    Fixes over the previous version: ``closest_match_4`` is always defined
    (it raised NameError on several paths), a single substring match is
    returned instead of only printed, and short raw names no longer raise
    IndexError.

    Args:
        input_string: Cleaned name used for the fuzzy search.
        string_list: Candidate names (e.g. all players of one nationality).
        input_final_tokens: Cleaned name tokens, passed to ``find_best_match``.
        ORIGINAL_NAME_STRING: Raw name as it appeared in the source.

    Returns:
        - a single candidate string when one is identified;
        - the result of ``find_best_match`` (possibly ``None``) when the loose
          fuzzy search yields several candidates;
        - a list of candidates when several contain the last name word;
        - the string ``"No close match found."`` otherwise.
    """
    no_match = "No close match found."

    # 1. Strict fuzzy match.
    closest_match = get_close_matches(input_string, string_list, n=1, cutoff=0.8)
    if closest_match:
        return closest_match[0]

    # 2. Loose fuzzy match (difflib defaults); disambiguate several hits.
    closest_match_ii = get_close_matches(input_string, string_list)
    if closest_match_ii:
        if len(closest_match_ii) >= 2:
            original_string_nojersey = re.sub(r'^\d+(\.)?\s*', '', ORIGINAL_NAME_STRING)
            return find_best_match(closest_match_ii, input_final_tokens, original_string_nojersey)
        return closest_match_ii[0]

    # 3. No fuzzy match: work from the last meaningful word of the raw name.
    words = ORIGINAL_NAME_STRING.split()
    if not words:
        return no_match

    closest_match_2 = []
    closest_match_3 = []
    closest_match_4 = []

    last_word = words[-1]
    if last_word.strip('.') in ('ii', 'II') and len(words) >= 2:
        # Trailing "II" suffix: the word before it is the name or an initial.
        last_word = words[-2]
        if last_word.endswith('.'):
            # An initial/abbreviation: the name is one word further left.
            if len(words) >= 3:
                last_word = words[-3]
                closest_match_3 = get_close_matches(last_word, string_list)
                if not closest_match_3:
                    # Very loose cutoff for heavily accented names (the "Krejčí" case).
                    closest_match_3 = get_close_matches(last_word, string_list, n=1, cutoff=0.3)
        else:
            closest_match_2 = get_close_matches(last_word, string_list, n=1, cutoff=0.8)
    elif not last_word.endswith('.'):
        closest_match_4 = get_close_matches(last_word, string_list, n=1, cutoff=0.8)

    # Candidates that literally contain the chosen word.
    matching_strings = [s for s in string_list if last_word in s]
    if matching_strings:
        if len(matching_strings) == 1:
            return matching_strings[0]
        return matching_strings

    # Same priority order as the original fallback chain.
    for fallback in (closest_match_3, closest_match_2, closest_match_4):
        if fallback:
            return fallback[0]
    return no_match

In [11]:
def find_closest_string_newEST(input_string, string_list, input_final_tokens, ORIGINAL_NAME_STRING):
    """Find the single candidate in ``string_list`` that matches a player name.

    Fixes over the previous version: an empty ``input_string`` no longer
    raises IndexError; a failed ``find_best_match`` now returns the
    no-match sentinel instead of an implicit ``None`` after debug prints;
    the always-empty fallback variables and their print-only branches are
    removed.

    Args:
        input_string: Cleaned name used for the fuzzy search.
        string_list: Candidate names (e.g. all players of one nationality).
        input_final_tokens: Cleaned name tokens, passed to ``find_best_match``.
        ORIGINAL_NAME_STRING: Raw name as it appeared in the source.

    Returns:
        str: the matched candidate, or ``"No close match found."`` when no
        single candidate can be identified.
    """
    no_match = "No close match found."
    # Loose cutoff used only for the hard-coded 'Krejčí' special case.
    krejci_cutoff = 0.380952

    # 1. Strict fuzzy match.
    closest_match = get_close_matches(input_string, string_list, n=1, cutoff=0.8)
    if closest_match:
        return closest_match[0]

    # 2. Loose fuzzy match (difflib defaults).
    closest_match_ii = get_close_matches(input_string, string_list)
    if closest_match_ii:
        if len(closest_match_ii) == 1:
            return closest_match_ii[0]
        # Several loose hits: choose using initials/apostrophes and edit distance.
        original_string_nojersey = re.sub(r'^\d+(\.)?\s*', '', ORIGINAL_NAME_STRING)
        best = find_best_match(closest_match_ii, input_final_tokens, original_string_nojersey)
        return best if best is not None else no_match

    # 3. No fuzzy match: fall back to the last word of the cleaned name.
    search_words = input_string.split()
    if not search_words:
        return no_match
    last_word = search_words[-1]

    if last_word == 'Krejčí':
        match_krejci = get_close_matches(last_word, string_list, n=1, cutoff=krejci_cutoff)
        if match_krejci:
            return match_krejci[0]

    # Accept a substring hit only when it is unambiguous.
    matching_strings = [s for s in string_list if last_word in s]
    if len(matching_strings) == 1:
        return matching_strings[0]
    return no_match

In [12]:
def filter_candidates(NAMESTRING, LISTCANDIDATES):
    """Keep the candidates that begin with the first token of ``NAMESTRING``.

    The comparison is case-insensitive and the token is matched literally
    (regex metacharacters are escaped).

    Args:
        NAMESTRING: Name whose first whitespace-delimited token is the prefix.
        LISTCANDIDATES: Iterable of candidate strings.

    Returns:
        list: Candidates starting with that token, in their original order.
    """
    leading_token = re.split(r'\s', NAMESTRING)[0]
    prefix_re = re.compile(fr'^{re.escape(leading_token)}', re.IGNORECASE)
    return [candidate for candidate in LISTCANDIDATES if prefix_re.match(candidate)]


In [13]:
def remove_accents_from_strings(input_array):
    """Return a copy of ``input_array`` with every string passed through ``unidecode``.

    Args:
        input_array: NumPy array with ``object`` dtype holding strings.

    Returns:
        numpy.ndarray: Array of the same shape with the converted strings.

    Raises:
        ValueError: If ``input_array`` is not an object-dtype NumPy array.
    """
    if not isinstance(input_array, np.ndarray) or input_array.dtype != np.dtype('O'):
        raise ValueError("Input must be a NumPy array of strings")

    # np.vectorize raises on size-0 input when otypes is not set, so an empty
    # candidate list is answered directly.
    if input_array.size == 0:
        return np.empty(input_array.shape, dtype=str)

    remove_accents_vectorized = np.vectorize(unidecode)
    return remove_accents_vectorized(input_array)

In [14]:
def find_names_with_accents(target_name, name_array):
    """Find the names whose ``unidecode`` form equals that of ``target_name``.

    Args:
        target_name: Name to look for.
        name_array: NumPy array with ``object`` dtype holding candidate names.

    Returns:
        The single matching name itself when exactly one name matches;
        otherwise a NumPy array of the matching names (empty when none match).

    Raises:
        ValueError: If ``name_array`` is not an object-dtype NumPy array.
    """
    if not isinstance(name_array, np.ndarray) or name_array.dtype != np.dtype('O'):
        raise ValueError("Input must be a NumPy array of strings")

    # np.vectorize raises on size-0 input when otypes is not set; an empty
    # candidate array simply has no matches.
    if name_array.size == 0:
        return name_array.copy()

    target_name_without_accents = unidecode(target_name)

    def has_accent_match(name):
        return unidecode(name) == target_name_without_accents

    matching_names = name_array[np.vectorize(has_accent_match)(name_array)]

    if len(matching_names) == 1:
        return matching_names[0]
    return matching_names

In [15]:
def remove_apostrophes_backticks(input_array):
    """Return a copy of ``input_array`` with apostrophes and backticks removed.

    Args:
        input_array: NumPy array with ``object`` dtype holding strings.

    Returns:
        numpy.ndarray: Array of the same shape with ``'`` and ````` `` removed
        from every element.

    Raises:
        ValueError: If ``input_array`` is not an object-dtype NumPy array.
    """
    if not isinstance(input_array, np.ndarray) or input_array.dtype != np.dtype('O'):
        raise ValueError("Input must be a NumPy array of strings")

    # np.vectorize raises on size-0 input when otypes is not set.
    if input_array.size == 0:
        return np.empty(input_array.shape, dtype=str)

    # Plain str.replace on each element; np.char.replace on scalars is not needed.
    remove_chars_vectorized = np.vectorize(lambda s: s.replace("'", '').replace("`", ''))
    return remove_chars_vectorized(input_array)

In [16]:
def transform_korean_name(name):
    """Swap and hyphenate the first two words of a name.

    ``"Koo Jacheol"`` becomes ``"Jacheol-Koo"``: the second word comes first,
    then a hyphen, then the first word, each passed through
    ``str.capitalize``. Words after the second are dropped.

    Args:
        name: Name string.

    Returns:
        str: The transformed name, or ``name`` unchanged when it has fewer
        than two whitespace-separated words.
    """
    pieces = name.split()
    if len(pieces) < 2:
        return name

    first_word, second_word = pieces[0], pieces[1]
    return second_word.capitalize() + "-" + first_word.capitalize()

In [17]:
def remove_apostrophes_backticks_single_string(input_string):
    """Return ``input_string`` with every apostrophe and backtick removed.

    Args:
        input_string: Text to clean.

    Returns:
        str: The text without ``'`` and ````` `` characters.

    Raises:
        ValueError: If ``input_string`` is not a ``str``.
    """
    if not isinstance(input_string, str):
        raise ValueError("Input must be a string")

    return "".join(char for char in input_string if char not in ("'", "`"))


In [18]:
def filter_names_first_initial_lastname(database, search_string):
    """Return the names that contain every word of ``search_string``.

    Matching is case-insensitive substring containment, so a one-letter
    initial matches any name containing that letter. Non-string entries of
    ``database`` are ignored.

    Args:
        database: Iterable of names (may contain non-string entries).
        search_string: Whitespace-separated search words.

    Returns:
        list: Matching names in their original order.
    """
    needles = search_string.lower().split()

    hits = []
    for entry in database:
        if not isinstance(entry, str):
            continue
        text = str(entry)
        lowered = text.lower()
        if all(needle in lowered for needle in needles):
            hits.append(text)
    return hits

In [33]:
def get_names_with_conditions(df):
    """List the names that occur in exactly one row whose 'Weekly Salary' is missing.

    The previous version re-filtered the whole DataFrame once per unique
    name; this one counts occurrences in a single pass and returns the same
    names in the same (row) order.

    Args:
        df: DataFrame with 'Name' and 'Weekly Salary' columns.

    Returns:
        list: Names meeting both conditions, ordered by row position.
    """
    # Number of rows sharing each row's name; missing names map to NaN and
    # therefore never equal 1.
    appearances = df['Name'].map(df['Name'].value_counts())
    lone_without_salary = (appearances.eq(1) & df['Weekly Salary'].isna()).to_numpy()
    return df.loc[lone_without_salary, 'Name'].tolist()

# Names that appear in exactly one row of csv_leagues and have no
# 'Weekly Salary' value there.
# NOTE(review): this cell needs csv_leagues from the ingest cell, and
# result_names_null is not referenced in the visible part of the notebook.
result_names_null = get_names_with_conditions(csv_leagues)


# TESTING / WORKING

In [None]:
#NEW TEST CASES

#WORKING - 1 CALL TO FIND CLOSEST STRING
# example_problem = "10. Mane Sa."
# natl_test = 'Senegal'
# input_year_test = "19.06.18"

#WORKING - 1 CALL TO FIND CLOSEST STRING
# example_problem = "9. Son Heungmin"
# natl_test = 'South Korea'
# input_year_test = '17.01.17'

#WORKING - 1 CALL TO FIND CLOSEST STRING
# example_problem = "13. Koo Jacheol"
# natl_test = 'South Korea'
# input_year_test = '26.06.14'

#WORKING - 1 CALL TO FIND CLOSEST STRING
# example_problem = "17. Sarr P."
# natl_test = 'Senegal'
# input_year_test = "04.12.22"

#WORKING - 1 CALL TO FIND CLOSEST STRING
# example_problem = '4. Torres Pau'
# natl_test = 'Spain'
# input_year_test = "14.11.21"

#WORKING - 1 CALL TO FIND CLOSEST STRING
# example_problem = "6 Sane S."
# natl_test = 'Senegal'
# input_year_test = '19.06.18'

#WORKING - 1 CALL TO FIND CLOSEST STRING
# example_problem = '11 A Kolarov'
# natl_test = 'Serbia'
# input_year_test = '15.10.13'

#WORKING - 1 CALL TO FIND CLOSEST STRING
# example_problem = "22. Wague M." 
# natl_test = "Senegal"
# input_year_test = "24.06.18"

#WORKING - 1 CALL TO FIND CLOSEST STRING
# example_problem = "17. Badou Ndiaye"
# natl_test = "Senegal"
# input_year_test = "28.01.17"

#WORKING - 1 CALL TO FIND CLOSEST STRING
# example_problem = "8. Sylla Y." 
# natl_test = 'Mali'
# input_year_test = "17.01.17"

#WORKING - 1 CALL TO FIND CLOSEST STRING
# example_problem = "6. Krejčí L. II."
# natl_test = 'Czech Republic'
# input_year_test = '02.06.22'

#WORKING - 2 CALLs TO FIND CLOSEST STRING (accent case)
# example_problem = "6. Saiss"
# natl_test = 'Morocco'
# input_year_test = "30.01.22"


#WORKING - 3 CALLS FIND CLOSEST STRING (backtick case)
# example_problem = "9. Eto`o"
# natl_test = 'Cameroon'
# input_year_test = '13.06.14'


#WORKING - 1 CALL TO FIND CLOSEST STRING
# example_problem = "13. A. N`Diaye"
# natl_test = 'Senegal'
# input_year_test = "14.11.2017"


###WORKING - 3 CALLS TO FIND CLOSEST STRING, 1 MORE TO NEW FUNCTION
# example_problem = "17 A. Baba"
# natl_test = "Ghana" 
# input_year_test = "07.10.16"



###AAA CASES
#WORKING - 1 CALL, FOUND THE WRONG PLAYER. 

# example_problem = "17. N'Diaye M."
# natl_test = 'Mali'
# input_year_test = '17.01.17'
##############################this returned Mahamadou N'Diaye but Mamoutou N'Diaye actually played in that match

###BBB CASES
#WORKING - 3 CALLS TO FIND CLOSEST STRING. DIDN'T FIND 
# example_problem = '10. Zelarayan'
# natl_test = 'Armenia'
# input_year_test = "08.10.21"

###$$$ CASES 
#WORKING - 1 CALL, FOUND THE RIGHT PLAYER BUT NAN SALARY DATA
# example_problem = "J. Seri"
# natl_test = 'Cote d\'Ivoire'
# input_year_test = '17.11.15'



##BROKEN CASES

#2. Traore Ha-i Mali



In [32]:
#Draft 2 - Moving step by step
#
# Resolve one scoreboard-style name (example_problem) to a player name in
# csv_leagues, limited to players of nationality natl_test. Up to three
# lookups are tried in order: names as stored, names with accents removed,
# names with apostrophes/backticks removed. On success candidate_name holds
# the matched name; otherwise it stays "".

# example_problem = "17. N'Diaye M."
# natl_test = 'Mali'
# input_year_test = '17.01.17'


# Inputs for this run: raw name, nationality and match date.
example_problem = "J. Seri"
natl_test = 'Cote d\'Ivoire'
input_year_test = '17.11.15'


# Stays empty unless one of the lookups below identifies a player.
candidate_name = ""

# Unique names of the players with this nationality.
dataset_nationality = csv_leagues[csv_leagues['Nationality'] == f"{natl_test}"]['Name'].unique()

if(is_cyrillic(example_problem)):
    # Transliterate to Latin characters before cleaning.
    example_problem = cyrillic_to_latin(example_problem)

# Remove the jersey number and put initials in front of the name words.
search_name, final_tokens_name = process_string_newest_ii(example_problem)

# Lookup 1: search the names exactly as stored.
# result = find_closest_string(search_name, dataset_nationality)
result = find_closest_string_newEST(search_name, dataset_nationality, final_tokens_name, example_problem)
if(result in dataset_nationality):
    print(result)
    #RETURN
    candidate_name = result
else:
    # Lookup 2: search again with accents removed from the stored names.
    print('no initial match found: ', search_name)
    nationality_names_accents_removed = remove_accents_from_strings(dataset_nationality)
    match_accent_accounted = find_closest_string_newEST(search_name, nationality_names_accents_removed,final_tokens_name, example_problem)
    if(match_accent_accounted in nationality_names_accents_removed):
        #print(match_accent_accounted)
        # Map the accent-free match back to the accented name(s) in the dataset.
        matching_names_with_accents = find_names_with_accents(match_accent_accounted, dataset_nationality)
        if(type(matching_names_with_accents) == str):
            # Exactly one accented name corresponds to the match.
            print(matching_names_with_accents)
            #RETURN
            candidate_name = matching_names_with_accents
        elif(len(matching_names_with_accents) == 0):
            print(f'accent-less name found: {match_accent_accounted}. But name not in original dataset')
        else:
            # Several accented names share the same accent-free form.
            print(f'multiple names found after adding accents: {matching_names_with_accents}')


        #MAKE SURE THE NAME WITH ACCENTS IS IN DATASET NATIONALITY
    else:
        # Lookup 3: search again with apostrophes/backticks removed from the stored names.
        print('no accent match found: ', search_name)

        dataset_nationality_backticks = remove_apostrophes_backticks(dataset_nationality) #dataset_nationality_updated
        match_apostrophes_accounted = find_closest_string_newEST(search_name, dataset_nationality_backticks,final_tokens_name, example_problem)

        if(match_apostrophes_accounted in dataset_nationality_backticks):
            #print(match_apostrophes_accounted)

            # Rebuild the stored spelling: put the marks back into the last
            # word and prepend whatever precedes it in the match.
            lastname_match = match_apostrophes_accounted.split()[-1]
            original_string_nojersey = re.sub(r'^\d+(\.)?\s*', '', example_problem)
            correct_lastname = add_backticks(lastname_match, original_string_nojersey)
            correct_firstname = extract_first_name(match_apostrophes_accounted, lastname_match)

            correct_name_full = correct_firstname + ' ' + correct_lastname

            if(correct_name_full in dataset_nationality):
                print(correct_name_full)
                #RETURN
                candidate_name = correct_name_full
            elif(correct_name_full.replace('`', "'") in dataset_nationality):
                # Try the same name with backticks written as apostrophes.
                print(correct_name_full.replace('`', "'"))
                #RETURN
                candidate_name = correct_name_full.replace('`', "'")
            else:
                print(f'backtick-less name found: {match_apostrophes_accounted}. But name not in original dataset')

        else:
            print('no backtick match found: ', search_name)

if(candidate_name != ""):
    # A player was identified: look up the salary rows for the match year.
    #
    # Fixes over the previous version:
    #   * a four-digit year ("14.11.2017") is used as-is instead of becoming "202017";
    #   * prev_/next_season_that_guy now hold the earlier/later season (they were swapped);
    #   * the "not in this season but in a neighbouring one" message said "was" instead of "wasn't".

    ###CHECK AGAINST NULL LIST

    # input_year_test is a dotted date whose third field is the year.
    yearstr = input_year_test.split(".")[2]
    full_num = yearstr if len(yearstr) == 4 else '20' + yearstr
    match_year = int(full_num)

    player_rows = csv_leagues[csv_leagues['Name'] == candidate_name]
    that_season_that_guy = player_rows[player_rows['Season'] == match_year]

    #NO DATA FOR YEAR OF THE MATCH
    if(len(that_season_that_guy) == 0):

        prev_season_that_guy = player_rows[player_rows['Season'] == (match_year - 1)]
        next_season_that_guy = player_rows[player_rows['Season'] == (match_year + 1)]
        thatguy_3seasons = pd.concat([that_season_that_guy, prev_season_that_guy, next_season_that_guy], ignore_index=True)

        #NO DATA FOR YEAR BEFORE OR AFTER THE MATCH
        if(len(thatguy_3seasons) == 0):
            print(f"{candidate_name} wasn't in the db in {full_num}, {match_year + 1} or {match_year - 1} ")
            ###AAA

        #SOME DATA FOR YEAR BEFORE OR AFTER THE MATCH
        else:
            print(f"{candidate_name} wasn't in the db in {full_num}, but was in {match_year + 1} or {match_year - 1} ")

    #SOME DATA FOR YEAR OF THE MATCH
    else:
        print(f"{candidate_name} is in dataset for {int(full_num)} in {full_num}")

        ###CHECK AGAINST NULL LIST

        array_with_nan = that_season_that_guy['Yearly Salary'].unique()

        # Convert the values to numeric; anything unparsable becomes NaN.
        numeric_values = pd.to_numeric(array_with_nan, errors='coerce')

        ###$$$
        if np.isnan(numeric_values).all():
            # The player is present for this season but has no usable salary.
            print("Test case: All-NaN slice encountered")
            # TODO: pick a fallback. Ideas from earlier drafts: use the player's
            # own salary from the previous/next season, or the league-wide
            # mean/median 'Inflation-Adjusted Yearly Salary' for that season.

        else:
            # Report the largest salary recorded for the season, ignoring NaNs.
            max_value_excluding_nan = np.nanmax(numeric_values)
            print(f"salary in {input_year_test} is {max_value_excluding_nan}")

else:
    #No matches after 3 tries
    # Last resort: search every name in the dataset (all nationalities) for
    # names containing each search token, keep those starting with the first
    # token, then keep those recorded with the requested nationality.
    result = filter_names_first_initial_lastname(csv_leagues['Name'].unique(), search_name)
    list_left = filter_candidates(search_name, result)

    natl_list = [
        name for name in list_left
        if natl_test in csv_leagues.loc[csv_leagues['Name'] == name, 'Nationality'].unique()
    ]

    if(len(natl_list) == 1):
        #one match remaining.
        #RETURN
        print(f'after filtering 4th time found {natl_list[0]}')

    elif(len(natl_list) == 2):
        #still not quite matched up
        # TODO: two candidates remain; probably should be handled as part of BBB.
        pass
    else:
        ###BBB
        print('no match after 4')
    


Jean Michaël Seri
Jean Michaël Seri is in dataset for 2015 in 2015
Test case: All-NaN slice encountered


# Debug

## Process String

In [None]:
#TEST CASES

# nationality_string = 'Senegal'
# #input_string = '13. A. N`Diaye'
# input_string = '10. N\'Diaye M.'

#to try 
# 12. Rui Patricio - Portugal ####DONE
#2. O`Donnell S.G. - Scotland 
#6. Maksimovic Nem. - Serbia  ####DONE
#22. Lazović Dar. - Serbia ####DONE
#9. Fuellkrug - Germany ####DONE
#12. Toko Ekambi - Cameroon ####DONE
#6. Krejči L. II. - Czech Republic
#input_string = "4 J O'Shea"
#nationality_string = 'Ireland'


#NDIAYE CASE III - RESOLVED
# input_string = "13. A. N`Diaye"
# nationality_string = 'Senegal'

#NDIAYE CASE II - need to fix still 
# input_string = '17. N\'Diaye M.'
# nationality_string = 'Mali'

#Pape Sarr Case
# input_string = '17. Sarr P.'
# nationality_string = 'Senegal'

#NOT IN DB CASE 
# input_string = "10. Zelarayan"
# nationality_string = 'Armenia'


In [None]:
#CASES - DRAFT 2
# input_string = "10. Mane Sa."
# nationality_string = 'Senegal'
# input_string = "7. De Bruyne"
# nationality_string = 'Belgium'

# input_string = "9. Son Heungmin"
# nationality_string = 'South Korea'

# input_string = "17. Sarr P."
# nationality_string = 'Senegal'

# input_string = "6. Saiss"
# nationality_string = 'Morocco'

#ETOO CASE
# input_string = "9. Eto`o"
# nationality_string = 'Cameroon'

#Pau Torres case
# input_string = '4. Torres Pau'
# nationality_string = 'Spain'


In [None]:
# Debug walk-through: turn a raw lineup entry ("<jersey>. <name parts>") into
# the list of name tokens and the search string used by the matching cells.
input_string = "6. Krejčí L. II."
nationality_string = 'Czech Republic'

# Patterns shared by the first-token and second-token checks below.
# _WORD_RE: a token made only of 2+ letters from the listed ranges.
# _INITIAL_RE: dotted initials ("A.", "S.G.") or a single bare letter.
_WORD_RE = re.compile(r'^[A-Za-zÀ-ÖØ-öø-ÿ]{2,}$')
_INITIAL_RE = re.compile(r'^([A-Za-zÀ-ÖØ-öø-ÿ]+\.)+$|[A-Za-zÀ-ÖØ-öø-ÿ]\.$|[A-Za-zÀ-ÖØ-öø-ÿ]$')

# Strip a leading one- or two-digit jersey number followed by "." or a space.
cleaned_string = re.sub(r'^\d{1,2}[. ]', '', input_string)
tokens = cleaned_string.split()
final_string = ""

# First token: if it is not a plain word, keep it when it looks like an initial.
if tokens and not _WORD_RE.match(tokens[0]):
    initial_match = _INITIAL_RE.match(tokens[0])
    if initial_match:
        final_string += initial_match.group()

# Main phrase: every token that contains at least two alphabetic characters.
main_phrase = " ".join(
    word for word in tokens
    if sum(1 for char in word if char.isalpha()) >= 2
)
if main_phrase:
    final_string = f"{final_string} {main_phrase}"

# Second token: a trailing initial (e.g. "L.") is moved to the front, letters only.
if len(tokens) > 1 and not _WORD_RE.match(tokens[1]):
    end_initial_match = _INITIAL_RE.match(tokens[1])
    if end_initial_match:
        final_string = "{} {}".format(
            "".join(filter(str.isalpha, end_initial_match.group())),
            final_string,
        )

# When the phrase ends in a period, rotate the last word (minus trailing
# periods) to the front and drop it from the end.
if final_string.endswith("."):
    last_word = final_string.split()[-1].rstrip('.')
    final_string = ' '.join(f"{last_word} {final_string}".split()[:-1]).strip()

# Tokenise on whitespace and strip every character outside the allowed set.
final_tokens = [
    re.sub(r'[^A-Za-zÀ-ÖØ-öø-ÿćč-]', '', piece)
    for piece in final_string.split()
]
# Collapse a duplicated leading token.
if len(final_tokens) >= 2 and final_tokens[0] == final_tokens[1]:
    del final_tokens[0]

string_for_search = " ".join(final_tokens)
final_tokens



## Find closest string (all accents included)

In [None]:
# Final whitespace-separated token of the processed search string.
last_word = string_for_search.rsplit(maxsplit=1)[-1]
last_word

In [None]:

# Resolve `string_for_search` against the names recorded for `nationality_string`,
# trying progressively looser strategies and printing whichever one produced a result:
#   0. strict difflib match (n=1, cutoff=0.8)
#   ii. difflib match with default n/cutoff, disambiguated by find_best_match
#   then substring search on the last word, plus a hard-coded Krejčí special case.

# Candidate pool: distinct names in csv_leagues whose Nationality equals nationality_string.
nationality_name_list = csv_leagues[csv_leagues['Nationality'] == f"{nationality_string}"]['Name'].unique()
#nationality_name_list = remove_accents_from_strings(nationality_name_list)

matching_strings = ""
#nationality_name_list = remove_apostrophes_backticks(nationality_name_list)

index_match = ""
# Bound up front: the reporting chain below tests match_krejci on every
# no-match path, and it used to be undefined (NameError) unless the
# Krejčí special case had run.
match_krejci = []
closest_match = get_close_matches(string_for_search, nationality_name_list, n=1, cutoff=0.8)

# Placeholders for strategies that this cell never populates; kept so the
# reporting chain below stays in its original order.
closest_match_3 = []
closest_match_2 = []
closest_match_4 = []

# Check if a close match is found
if closest_match:
    #RETURN
    # Membership is checked on the matched name itself, not on the one-element list.
    print('0', closest_match[0], closest_match[0] in nationality_name_list)
else:
    # Retry with difflib's default n and cutoff.
    closest_match_ii = get_close_matches(string_for_search, nationality_name_list)
    #closest_match_ii = get_close_matches(string_for_search, nationality_names_accents_removed)
    if closest_match_ii:
        if type(closest_match_ii) == list and len(closest_match_ii) >= 2:
            # Several candidates: let find_best_match choose, using the
            # original entry with its jersey number removed.
            original_string_nojersey = re.sub(r'^\d+(\.)?\s*', '', input_string)
            result_1 = find_best_match(closest_match_ii, final_tokens, original_string_nojersey)
            if pd.isna(result_1):
                #none of the closest match II candidates were a good match
                pass
            else:
                #A name came back
                print('closest match ii best match: ' + result_1)
                closest_match_ii = result_1
                #RETURN HERE
        else:
            #CLOSE MATCH RETURNED 1 GUY
            print('1 match ' + closest_match_ii[0])
            #RETURN HERE
    else:
        # no close match ii (took off n and cutoff)
        last_word = string_for_search.split()[-1]#.split('.')
        if last_word == 'Krejčí':
            # Special case with a much lower cutoff.
            # NOTE(review): 0.380952 looks hand-tuned; its origin is not shown here.
            match_krejci = get_close_matches(last_word, nationality_name_list, n=1, cutoff=0.380952)
            if len(match_krejci) == 1:
                match_krejci = match_krejci[0]

        # Fall back to every candidate that contains the last word as a substring.
        matching_strings = [s for s in nationality_name_list if last_word in s]

    if matching_strings:
        if len(matching_strings) == 1:
            print('matching strings ' + matching_strings[0])
        else:
            print(matching_strings)
            #return matching_strings

    # Report the first strategy that produced something. closest_match is
    # always empty in this branch, so its former check here was unreachable
    # and has been dropped.
    if closest_match_ii:
        if type(closest_match_ii) == str:
            # find_best_match narrowed the candidates down to a single name.
            print('ii single', closest_match_ii, closest_match_ii in nationality_name_list)
        #RETURN
        else:
            print('ii', closest_match_ii[0], closest_match_ii[0] in nationality_name_list)
    elif closest_match_3:
        #RETURN
        print('3', closest_match_3[0], closest_match_3[0] in nationality_name_list)
    elif closest_match_2:
        #RETURN
        print('2', closest_match_2[0], closest_match_2[0] in nationality_name_list)
    elif closest_match_4:
        #RETURN
        print('4', closest_match_4[0], closest_match_4[0] in nationality_name_list)
    elif match_krejci:
        print('krejci', match_krejci)
    elif index_match != "":
        print('end ' + index_match)
    else:
        #RETURN
        print("No close match found.")

In [None]:
# Display the current value of last_word (the preceding matching cell
# reassigns it in its no-close-match branch).
last_word

In [None]:
#where it's happening

# match string Romain Saiss
# end Romain Saiss6. Saiss [24]

In [None]:
#KOREA CASE
# End-to-end check of the lookup helpers on a South Korean lineup entry,
# including the Korean-name retry when the first lookup finds nothing.
example_problem = "11. Koo Jacheol"
natl_test = 'South Korea'

# Candidate pool: distinct names recorded for the test nationality.
nationality_mask = csv_leagues['Nationality'] == f"{natl_test}"
dataset_nationality = csv_leagues[nationality_mask]['Name'].unique()

# Transliterate first when the entry contains non-ASCII characters.
if is_cyrillic(example_problem):
    example_problem = cyrillic_to_latin(example_problem)

search_name = process_string_newest_ii(example_problem)
result = find_closest_string(search_name, dataset_nationality)

# A list result means several candidates; keep the first one that survives filtering.
if type(result) == list:
    result = filter_candidates(search_name, result)[0]

if result != 'No close match found.':
    print(f"lineups data says {search_name} , results data has {result}")
elif natl_test == 'South Korea':
    # Hard-coded spelling correction applied before the Korean-name transform.
    if search_name == 'Ki Sungyueng':
        search_name = 'Ki Sungyong'
    example_problem_korean_name = transform_korean_name(search_name)

    result = find_closest_string(example_problem_korean_name, dataset_nationality)

    # Stay silent when the retry also finds nothing.
    if result != 'No close match found.':
        print(f"lineups data says {search_name} , results data has {result} after making Korea update")