In [513]:
import os
import pandas as pd
import numpy as np
import regex as re

# Set Up

### Get CSV and check duplicates

In [514]:
# Let pandas display every row of a frame; 8353 is the row count of the dat index loaded below.
# The full option key is used: abbreviated keys only work while they match a single option.
pd.set_option('display.max_rows', 8353)

In [515]:
# Load the dat-file index, keeping only the file name and size columns,
# and capitalise the two headers (the big table also uses a `Name` column).
dat_index_headers = {'name':'Name', 'size':'Size'}
df_dat = pd.read_csv('data/dat_files_index.csv', usecols=('name', 'size')).rename(columns=dat_index_headers)
print(df_dat.shape)
df_dat.head()

(8353, 2)


Unnamed: 0,Name,Size
0,a18.DAT,945
1,a18sm.DAT,1795
2,A6014-S.DAT,6116
3,A6016-S.DAT,5914
4,A6018-S.DAT,5889


In [516]:
# Check whether the dat index contains duplicated file names.
# NOTE(review): the printed label says "bigtable" although this cell inspects the dat index.
dat_name_counts = df_dat['Name'].value_counts()
doublon_datfile = dat_name_counts.index[dat_name_counts.values > 1]
print(f'Nombre de doublons dans la bigtable : {len(doublon_datfile)}')
print(list(doublon_datfile))

Nombre de doublons dans la bigtable : 0
[]


In [517]:
# Load the wing "big table", keeping only the name and the family of each entry.
bigtable_columns = ('Name', 'Family')
df_bigtable = pd.read_csv('data/ailes_avion.csv', usecols=bigtable_columns)
print(df_bigtable.shape)
df_bigtable.head()

(6324, 2)


Unnamed: 0,Name,Family
0,63A108 MOD C,NASA
1,A18,Uncategorized
2,A18 (SMOOTHED),Uncategorized
3,A6014-S,Ayers
4,A6016-S,Ayers


In [518]:
# Check whether the big table contains duplicated names.
bigtable_name_counts = df_bigtable['Name'].value_counts()
doublon_bigtable = bigtable_name_counts.index[bigtable_name_counts.values > 1]
print(f'Nombre de doublons dans la bigtable : {len(doublon_bigtable)}')
print(list(doublon_bigtable))

Nombre de doublons dans la bigtable : 2
['FX 66-17AII-182', 'BOEING 737 MIDSPAN']


### Execute Basic Regex and exceptions

In [519]:
# Build the join key `Name_modified` on the dat side: the file name without its
# `.DAT` ending, lowercased. The dot is escaped (raw string) so that only a literal
# ".DAT" suffix is stripped; an unescaped "." would match any character before "DAT".
df_dat['Name_modified'] = df_dat['Name'].apply(lambda x: re.split(r"\.DAT$", str(x))[0])
df_dat['Name_modified'] = df_dat['Name_modified'].apply(lambda x: str(x).lower())
# Big-table names carry no extension: the key is simply the lowercased name.
df_bigtable['Name_modified'] = df_bigtable['Name'].apply(lambda x: str(x).lower())

In [520]:
# Manual assignment of the join key for the duplicated big-table names, after verification.
# NOTE(review): rows are addressed by hard-coded index labels (596, 597, 154, 155); presumably these
# are the 'FX 66-17AII-182' and 'BOEING 737 MIDSPAN' duplicates reported above — re-check if the CSV changes.
df_bigtable.loc[596, ['Name_modified']] = 'fx6617ai'
df_bigtable.loc[597, ['Name_modified']] = 'fx6617a2'
df_bigtable.loc[154, ['Name_modified']] = 'b737c'
df_bigtable.loc[155, ['Name_modified']] = 'b737b'

In [521]:
# Family exceptions beginning by 'g'
# Each key is a lowercased big-table name; the value is the join key to use instead.
g_family_exceptions = {
    'gu25-5(11)8': 'gu255118',
    'gs-1': 'gs1',
    'griffith 30% suction airfoil': 'griffith30symsuction',
    'goe 167 (v.karman prop.2)': 'goe167',
    'glenn martin 2': 'glennmartin2',
    'glenn martin 3': 'glennmartin3',
    'glenn martin 4': 'glennmartin4',
}
for big_name, replacement in g_family_exceptions.items():
    df_bigtable.loc[df_bigtable['Name_modified'] == big_name, 'Name_modified'] = replacement

In [522]:
# Eppler names that the generic rules cannot map: lowercased big-table name -> join key.
eppler_exceptions = {
    'eppler 377mod': 'e377m',
    'eppler e193gu-k24': 'e193gu-k24',
    'eppler ste 87(-3)-914': 'ste87391',
    'eppler ste 871-514': 'ste87151',
    'eppler stf 863-615': 'stf86361',
}
for big_name, replacement in eppler_exceptions.items():
    df_bigtable.loc[df_bigtable['Name_modified'] == big_name, 'Name_modified'] = replacement

In [523]:
# "Smoothed" variants whose join key is set by hand: lowercased big-table name -> join key.
smoothed_exceptions = {
    'aquila 9.3% smoothed': 'aquilasm',
    'bergey bw-3 (smoothed)': 'bw3',
    'clark-y 11.7% smoothed': 'clarkysm',
    'cody robertson cr 001 r/c hand-launch low reynolds number airfoil (smoothed)': 'cr001sm',
    'dale house dh4009 (smoothed) used on the storm r/c aerobatic aircraft': 'dh4009sm',
    'fx 60-100 10.0% smoothed': 'fx60100sm',
    'fx 63-137 13.7% smoothed': 'fx63137sm',
    'fx 74-cl5-140 mod (smoothed)': 'fx74modsm',
    'mb253515 15.0% smoothed': 'mb253515sm',
    'r140 12.04% (smoothed)': 'r140sm',
    'spica 11.73% smoothed': 'spicasm',
    'usnps4 (smoothed)': 'usnps4',
    'wb-135/35 13.5% smoothed': 'wb13535sm',
}
for big_name, replacement in smoothed_exceptions.items():
    df_bigtable.loc[df_bigtable['Name_modified'] == big_name, 'Name_modified'] = replacement

In [524]:
# Boeing / KC-135 sections whose join key is set by hand: lowercased big-table name -> join key.
boeing_exceptions = {
    'boeing 707 .08 span': 'b707a',
    'boeing 707 .19 span': 'b707b',
    'boeing 707 .40 span': 'b707c',
    'boeing 707 .54 span': 'b707d',
    'boeing 707 .99 span': 'b707e',
    'boeing 103': 'boe103',
    'boeing 106': 'boe106',
    'boeing 737 root': 'b737a',
    'boeing 737 outboard': 'b737d',
    'kc-135 bl124.32': 'kc135b',
    'kc-135 bl200.76': 'kc135c',
    'kc-135 bl351.6': 'kc135d',
    'kc-135 bl52.44': 'kc135a',
}
for big_name, replacement in boeing_exceptions.items():
    df_bigtable.loc[df_bigtable['Name_modified'] == big_name, 'Name_modified'] = replacement

In [525]:
# NACA M6 variants whose join key is set by hand: lowercased big-table name -> join key.
naca_m6_exceptions = {
    'naca m6 (65%)': 'm6_65',
    'naca m6 (85%)': 'm6_85',
}
for big_name, replacement in naca_m6_exceptions.items():
    df_bigtable.loc[df_bigtable['Name_modified'] == big_name, 'Name_modified'] = replacement

# Functions

### Basic Functions

In [526]:
# Function to merge both datasets and return the merged frame
def merge_df(df_bigtable, df_dat, on_column):
    """Left-join the big table with the dat index and report how many rows matched.

    Parameters
    ----------
    df_bigtable : pandas.DataFrame
        Left frame; every one of its rows is kept.
    df_dat : pandas.DataFrame
        Right frame. Columns shared with the left frame (other than the key)
        receive the ``_big`` / ``_dat`` suffixes.
    on_column : str
        Name of the key column present in both frames.

    Returns
    -------
    pandas.DataFrame
        The merged frame. It must expose a ``Name_dat`` column (either because
        both inputs carry a ``Name`` column or because the right frame already
        has ``Name_dat``); a missing value there marks an unmatched row.
    """
    df_merge = pd.merge(df_bigtable, df_dat, on=on_column, how='left', suffixes=('_big', '_dat'))
    nb_total = df_merge.shape[0]
    # A row is unmatched when the dat side brought no name.
    nb_mismatch = int(df_merge['Name_dat'].isna().sum())
    nb_match = nb_total - nb_mismatch
    print(f'Le dataset contient {nb_total} valeurs dont {nb_match} correspondent aux fichiers dat.')
    print(f'Il reste {nb_mismatch} valeurs à matcher.')
    return df_merge


# Function to get the list of families that are fully matched
def get_family_done(df_merge, nb):
    """Print and return the `nb` largest families whose rows are all matched.

    A family counts as fully matched when its number of non-null ``Name_big``
    values equals its number of non-null ``Name_dat`` values. The returned
    Series is indexed by family and holds the ``Name_big`` count, sorted in
    descending order.
    """
    counts_by_family = df_merge.groupby(['Family']).count()
    is_complete = counts_by_family['Name_big'] == counts_by_family['Name_dat']
    complete_sizes = counts_by_family.loc[is_complete, 'Name_big']
    res = complete_sizes.sort_values(ascending=False).head(nb)
    print(res)
    return res


# Function to get the number of missing values per family, i.e. the mismatches
def groupna_family(df_merge, nb):
    """Print the `nb` families with the most unmatched big-table rows.

    For every family that is not fully matched, the number of unmatched rows is
    the count of non-null ``Name_big`` values minus the count of non-null
    ``Name_dat`` values. The same column (``Name_dat``) is used both to select
    the incomplete families and to count, so the function no longer needs a
    ``Size`` column. Nothing is returned; the result is only printed.
    """
    family = df_merge.groupby(['Family']).count()
    mask_family = (family['Name_big'] != family['Name_dat'])
    family_na = family[mask_family].copy()
    family_na['nb_na'] = family_na['Name_big'] - family_na['Name_dat']
    # We are interested in the 'nb' families with the most missing matches
    print(family_na['nb_na'].sort_values(ascending=False).head(nb))


# Function to initialize df_dat_left and df_big_left
# We want to look only through the wings left in each dataframe in order to avoid corrupting good matches
def initiate_df_left(df_bigtable, df_dat, df_merge):
    """Build the two frames of rows that are still unmatched after the first merge.

    * ``df_dat_left``: dat rows whose ``Name_modified`` has no counterpart in
      ``df_bigtable`` (right join, then rows with a null ``Name_big``).
    * ``df_big_left``: rows of ``df_merge`` with a null ``Name_dat``.

    Both frames get three working columns: ``First letter`` (first character of
    ``Name_modified``) and two copies of ``Name_modified`` named
    ``Name_modified_by_family`` and ``Name_modified_by_pattern``. The columns
    that are entirely null on each side are dropped before returning
    ``(df_dat_left, df_big_left)``.
    """
    dat_vs_big = pd.merge(df_bigtable, df_dat, on='Name_modified', how='right', suffixes=('_big', '_dat'))
    df_dat_left = dat_vs_big.loc[dat_vs_big['Name_big'].isna()].copy()
    print(f"Unmatched values left in the dat folder : {df_dat_left.shape}")
    df_big_left = df_merge.loc[df_merge['Name_dat'].isna()].copy()
    print(f"Unmatched values left in big table : {df_big_left.shape}")

    for frame in (df_dat_left, df_big_left):
        # First character of each name, used later to work letter by letter
        frame['First letter'] = [name[0] for name in frame['Name_modified']]
        # Working copies of the key, one per matching strategy
        frame['Name_modified_by_family'] = frame['Name_modified'].copy()
        frame['Name_modified_by_pattern'] = frame['Name_modified'].copy()

    # On the dat side Name_big and Family are NaN; on the big side Name_dat and Size are NaN
    df_dat_left = df_dat_left.drop(columns=['Name_big', 'Family'])
    df_big_left = df_big_left.drop(columns=['Name_dat', 'Size'])

    return df_dat_left, df_big_left

### Functions to match using family-specific patterns

In [527]:
# Function to filter both unmatched frames by first letter
def get_letter_group(df_dat_left, df_big_left, letter):
    """Return the rows of each frame whose ``First letter`` equals `letter`.

    Note the ordering: the dat frame is the first argument, but the result is
    ``(big_letter, dat_letter)``. The shapes of both subsets and the difference
    of their row counts are printed.
    """
    big_letter = df_big_left.loc[df_big_left['First letter'].eq(letter)]
    dat_letter = df_dat_left.loc[df_dat_left['First letter'].eq(letter)]
    print(f'Big table number of wings left beginning by {letter} : {big_letter.shape}')
    print(f'Dat folder number of wings left beginning by {letter} : {dat_letter.shape}')
    row_gap = big_letter.shape[0] - dat_letter.shape[0]
    print(f'Difference : {row_gap}')
    return big_letter, dat_letter


# Function to run a list of transformations over the names of one family
def try_pattern_family(big_letter, family, pattern_list):
    """Apply each callable of `pattern_list`, in order, to one family's names.

    Only the rows whose ``Family`` equals `family` are touched, and only their
    ``Name_modified_by_family`` column. The input frame is left untouched; a
    modified copy is returned.
    """
    result = big_letter.copy()
    in_family = result['Family'] == family
    for transform in pattern_list:
        current_names = result.loc[in_family, 'Name_modified_by_family']
        result.loc[in_family, 'Name_modified_by_family'] = current_names.apply(transform)
    return result


# Function to bring the new names (Name_modified_by_family) back into the frame of big-table wings left to match
def incorporate_family_pattern(df_big_left, big_letter):
    """Overwrite ``Name_modified_by_family`` with the values found in `big_letter`.

    Rows are paired on ``Name_big``. Where `big_letter` has no value for a row,
    the existing ``Name_modified_by_family`` is kept. In the returned frame the
    ``Name_modified_by_family`` column ends up last and the index is reset by
    the merge.

    NOTE(review): because the pairing is a merge on ``Name_big``, a name that
    appears several times on both sides would multiply rows — confirm that
    names are unique here.
    """
    updates = big_letter[['Name_big', 'Name_modified_by_family']]
    merged = df_big_left.merge(updates, on='Name_big', how='left', suffixes=('_left', '_big'))
    merged['Name_modified_by_family_big'] = merged['Name_modified_by_family_big'].fillna(
        merged['Name_modified_by_family_left']
    )
    merged = merged.drop(columns=["Name_modified_by_family_left"])
    return merged.rename(columns={'Name_modified_by_family_big':'Name_modified_by_family'})

### Functions to match using global pattern

In [528]:
def set_df_left(df_dat_left, df_merge, on_column):
    """Recompute the unmatched rows of both sides after a matching round.

    Parameters
    ----------
    df_dat_left : pandas.DataFrame
        Dat rows that were candidates in the round that produced `df_merge`.
    df_merge : pandas.DataFrame
        Result of that round; must contain ``Name_big`` and ``Name_dat``.
    on_column : str
        Key column the round was merged on.

    Returns
    -------
    (df_dat_left, df_big_left)
        Dat rows with no counterpart in `df_merge`, and rows of `df_merge`
        with a null ``Name_dat``. In both, columns that are entirely null are
        dropped, the ``_dat`` / ``_big`` fragments are removed from the column
        names and the name column is restored to ``Name_dat`` / ``Name_big``.

    Both sides are handled identically on an explicit copy without in-place
    edits, so no filtered slice is mutated (which raised SettingWithCopyWarning
    on the dat side).
    """
    # Dat side: right join, then keep the dat rows that found no big-table row.
    dat_vs_merge = pd.merge(df_merge, df_dat_left, on=on_column, how='right', suffixes=('_big', '_dat'))
    df_dat_left = dat_vs_merge[dat_vs_merge['Name_big'].isna()].copy()
    df_dat_left = df_dat_left.dropna(axis=1, how='all')
    df_dat_left = df_dat_left.set_axis([re.sub('_dat', "", str(col)) for col in df_dat_left.columns], axis=1)
    df_dat_left = df_dat_left.rename(columns={'Name':'Name_dat'})
    print(f"Unmatched values left in the dat folder : {df_dat_left.shape}")

    # Big side: rows of the merge for which the dat side brought nothing.
    df_big_left = df_merge[df_merge['Name_dat'].isna()].copy()
    df_big_left = df_big_left.dropna(axis=1, how='all')
    df_big_left = df_big_left.set_axis([re.sub('_big', "", str(col)) for col in df_big_left.columns], axis=1)
    df_big_left = df_big_left.rename(columns={'Name':'Name_big'})
    print(f"Unmatched values left in big table : {df_big_left.shape}")
    return df_dat_left, df_big_left


def try_pattern(df_big_left, df_dat_left, pattern_list):
    """Normalise the names of both frames with the same list of transformations.

    ``Name_modified_by_pattern`` is reset from ``Name_modified`` on BOTH frames
    (previously the reset line was duplicated for the big table and the dat
    side kept whatever a former round had left), then every callable of
    `pattern_list` is applied in order to both columns.

    The two frames are modified in place and returned as
    ``(df_dat_left, df_big_left)`` — note the order, which differs from the
    parameter order.
    """
    # Start every round from the untouched key on both sides.
    df_big_left['Name_modified_by_pattern'] = df_big_left['Name_modified'].copy()
    df_dat_left['Name_modified_by_pattern'] = df_dat_left['Name_modified'].copy()

    for pattern in pattern_list:
        df_big_left['Name_modified_by_pattern'] = df_big_left.loc[:, 'Name_modified_by_pattern'].apply(pattern)
        df_dat_left['Name_modified_by_pattern'] = df_dat_left.loc[:, 'Name_modified_by_pattern'].apply(pattern)

    return df_dat_left, df_big_left

# Application

In [529]:
# First pass: exact match of the big table against the dat index on the lowercased name.
df_merge = merge_df(df_bigtable, df_dat, 'Name_modified')
print('Famille complètes : ')
# The 10 largest families whose rows are all matched already.
done_list = get_family_done(df_merge,10)
# Total number of distinct values in the Family column of the big table.
print("Nombre de familles total : ", len(df_bigtable['Family'].unique()))

Le dataset contient 6324 valeurs dont 4949 correspondent aux fichiers dat.
Il reste 1375 valeurs à matcher.
Famille complètes : 
Family
NACA 65-series       608
NACA 66-series       604
NACA 67-series       548
NACA 5-digit         401
Habbe                137
Horten               133
Joukowsky            132
Riblett 30-series    114
Riblett 40-series    110
Riblett 37-series    110
Name: Name_big, dtype: int64
Nombre de familles total :  68


In [530]:
# DROP only after all matching is done
# Remove from the dat index every file whose key contains 'horten' or 'joukowsky'
# (both families are listed as complete in the output of the first merge).
for family_keyword in ('horten', 'joukowsky'):
    rows_to_drop = df_dat.index[df_dat['Name_modified'].str.contains(family_keyword)]
    df_dat.drop(rows_to_drop, inplace=True)

### **1. Create dataframes with wings left to match**

In [531]:
# Isolate the rows still unmatched on each side after the first exact merge.
df_dat_left, df_big_left = initiate_df_left(df_bigtable, df_dat, df_merge)

Unmatched values left in the dat folder : (2366, 5)
Unmatched values left in big table : (1375, 5)


### **2. Regex to match the family Yost,Eiffel,Eppler**

In [532]:
# Print the 5 families with the most unmatched big-table rows.
groupna_family(df_merge, 5)

Family
Gottingen        381
Eppler           196
Uncategorized    138
Wortmann         107
NASA              54
Name: nb_na, dtype: int64


- Family Yost: the names correspond once spaces, dots and `/` are deleted from the big-table names in `big_e`
- Family Eiffel: the names correspond once the content between parentheses and the spaces are deleted; special case for `eiffel 10 (wright) - 1903 wright flyer airfoil`, where everything after the `-` is dropped
- Family Eppler: the names correspond once `eppler` is replaced by `e` and the spaces are removed

In [533]:
# Unmatched rows whose name starts with 'e' (the dat frame is passed first, the big-table subset is returned first).
big_e, dat_e = get_letter_group(df_dat_left, df_big_left, 'e')

Big table number of wings left beginning by e : (216, 6)
Dat folder number of wings left beginning by e : (217, 6)
Difference : -1


In [534]:
# Raw strings are used for every pattern holding a regex escape: "\s", "\.", "\("
# and "\)" are not valid Python string escapes, so a plain literal only works by
# accident and triggers an invalid-escape warning.

# Yost: remove whitespace, dots and slashes.
yost = [lambda x: re.sub(r"\s", "", str(x)),
        lambda x: re.sub(r"\.", "", str(x)),
        lambda x: re.sub("/", "", str(x))]

# Eiffel: remove parenthesised text, then whitespace, then keep what precedes the first "-".
eiffel = [lambda x: re.sub(r'\(.*?\)', "", str(x)),
          lambda x: re.sub(r"\s", "", str(x)),
          lambda x: re.split("-", str(x))[0]]

# Eppler: shorten "eppler" to "e" (collapsing a resulting "ee"), drop the words
# "hydrofoil" and "strut", then hyphens, parentheses and whitespace.
eppler = [lambda x: re.sub("eppler", "e", str(x)),
          lambda x: re.sub("ee", "e", str(x)),
          lambda x: re.sub("hydrofoil", "", str(x)),
          lambda x: re.sub("strut", "", str(x)),
          lambda x: re.sub("-", "", str(x)),
          lambda x: re.sub(r"\(", "", str(x)),
          lambda x: re.sub(r"\)", "", str(x)),
          lambda x: re.sub(r"\s", "", str(x))]

big_e = try_pattern_family(big_e, 'Yost', yost)
big_e = try_pattern_family(big_e, 'Eiffel', eiffel)
big_e = try_pattern_family(big_e, 'Eppler', eppler)


In [535]:
# Bring the family-specific names computed for the 'e' group back into the unmatched big-table rows
df_big_left = incorporate_family_pattern(df_big_left, big_e)
# Retry the exact merge, this time on the family-normalised name
df_merge = merge_df(df_big_left, df_dat_left, 'Name_modified_by_family')

Le dataset contient 1375 valeurs dont 208 correspondent aux fichiers dat.
Il reste 1167 valeurs à matcher.


### **3. Regex to match the family Gottingen**

In [536]:
# Recompute the unmatched rows of both sides after the previous merge,
# then keep the names starting with 'g'.
df_dat_left, df_big_left = set_df_left(df_dat_left, df_merge, 'Name_modified_by_family')
big_g, dat_g = get_letter_group(df_dat_left, df_big_left, 'g')

Unmatched values left in the dat folder : (2158, 6)
Unmatched values left in big table : (1167, 6)
Big table number of wings left beginning by g : (399, 6)
Dat folder number of wings left beginning by g : (401, 6)
Difference : -2


In [537]:
# Gottingen: remove whitespace, then any parenthesised text.
# Raw strings: "\s", "\(" and "\)" are regex escapes, not valid Python string escapes.
gottingen = [lambda x: re.sub(r"\s", "", str(x)),
             lambda x: re.sub(r'\(.*?\)', "", str(x))]
big_g = try_pattern_family(big_g, 'Gottingen', gottingen)
# Bring the new names (Name_modified_by_family) back into the frame of big-table wings left to match
df_big_left = incorporate_family_pattern(df_big_left, big_g)
# Merge to match
df_merge = merge_df(df_big_left, df_dat_left, 'Name_modified_by_family')

Le dataset contient 1167 valeurs dont 372 correspondent aux fichiers dat.
Il reste 795 valeurs à matcher.


### **4. Global regex with quick manual verification**

In [538]:
# Recompute the unmatched rows of both sides after the merge on Name_modified_by_family.
df_dat_left, df_big_left = set_df_left(df_dat_left, df_merge, 'Name_modified_by_family')

Unmatched values left in the dat folder : (1786, 6)
Unmatched values left in big table : (795, 6)


In [539]:
# Global normalisation applied to both sides: remove whitespace, hyphens and percent signs.
# Raw string for "\s": it is a regex escape, not a valid Python string escape.
pattern_global = [lambda x: re.sub(r"\s", "", str(x)),
                  lambda x: re.sub("-", "", str(x)),
                  lambda x: re.sub("%", "", str(x))]
df_dat_left, df_big_left = try_pattern(df_big_left, df_dat_left, pattern_global)
df_merge = merge_df(df_big_left, df_dat_left, 'Name_modified_by_pattern')

Le dataset contient 795 valeurs dont 245 correspondent aux fichiers dat.
Il reste 550 valeurs à matcher.


In [540]:
# Print the 10 families with the most unmatched big-table rows after the global pattern.
groupna_family(df_merge, 10)

Family
Uncategorized    100
NASA              49
Hepperle          49
Selig et. al.     48
Quabeck           42
Leinauer          25
Wortmann          22
NACA/Munk         22
Boeing            19
Barth             16
Name: nb_na, dtype: int64


### **5. keep trying**

In [541]:
# Recompute the unmatched rows of both sides after the merge on Name_modified_by_pattern.
df_dat_left, df_big_left = set_df_left(df_dat_left, df_merge, 'Name_modified_by_pattern')

Unmatched values left in the dat folder : (1541, 6)
Unmatched values left in big table : (550, 6)


In [542]:
# Wider global normalisation: remove whitespace and the punctuation . , / - _ % ( )
# Raw strings for the regex escapes ("\s", "\.", "\(", "\)"), which are not valid Python string escapes.
pattern_global = [lambda x: re.sub(r"\s", "", str(x)),
                  lambda x: re.sub(r"\.", "", str(x)),
                  lambda x: re.sub(",", "", str(x)),
                  lambda x: re.sub("/", "", str(x)),
                  lambda x: re.sub("-", "", str(x)),
                  lambda x: re.sub("_", "", str(x)),
                  lambda x: re.sub("%", "", str(x)),
                  lambda x: re.sub(r"\(", "", str(x)),
                  lambda x: re.sub(r"\)", "", str(x))]
df_dat_left, df_big_left = try_pattern(df_big_left, df_dat_left, pattern_global)
df_merge = merge_df(df_big_left, df_dat_left, 'Name_modified_by_pattern')

Le dataset contient 550 valeurs dont 126 correspondent aux fichiers dat.
Il reste 424 valeurs à matcher.


In [543]:
# Print the 10 families with the most unmatched big-table rows after the punctuation pattern.
groupna_family(df_merge, 10)

Family
Uncategorized    89
Hepperle         49
Selig et. al.    48
Leinauer         25
NACA/Munk        22
NASA             20
Boeing           19
Wortmann         19
Lockheed         14
Gulfstream       14
Name: nb_na, dtype: int64


### **6. Smooth -> sm**

In [544]:
# Recompute the unmatched rows of both sides after the merge on Name_modified_by_pattern.
df_dat_left, df_big_left = set_df_left(df_dat_left, df_merge, 'Name_modified_by_pattern')

Unmatched values left in the dat folder : (1415, 6)
Unmatched values left in big table : (424, 6)


In [545]:
# Same punctuation clean-up as before, followed by the abbreviation "smoothed" -> "sm".
# Raw strings for the regex escapes ("\s", "\.", "\(", "\)"), which are not valid Python string escapes.
pattern_global = [lambda x: re.sub(r"\s", "", str(x)),
                  lambda x: re.sub(r"\.", "", str(x)),
                  lambda x: re.sub(",", "", str(x)),
                  lambda x: re.sub("/", "", str(x)),
                  lambda x: re.sub("-", "", str(x)),
                  lambda x: re.sub("_", "", str(x)),
                  lambda x: re.sub("%", "", str(x)),
                  lambda x: re.sub(r"\(", "", str(x)),
                  lambda x: re.sub(r"\)", "", str(x)),
                  lambda x: re.sub("smoothed", "sm", str(x))]
df_dat_left, df_big_left = try_pattern(df_big_left, df_dat_left, pattern_global)
df_merge = merge_df(df_big_left, df_dat_left, 'Name_modified_by_pattern')

Le dataset contient 424 valeurs dont 13 correspondent aux fichiers dat.
Il reste 411 valeurs à matcher.


### **7. Removing what is between parentheses, with manual verification**

In [546]:
# Recompute the unmatched rows of both sides after the merge on Name_modified_by_pattern.
df_dat_left, df_big_left = set_df_left(df_dat_left, df_merge, 'Name_modified_by_pattern')

Unmatched values left in the dat folder : (1402, 6)
Unmatched values left in big table : (411, 6)


In [547]:
# Remove the parenthesised text, then whitespace.
# Raw strings: "\(", "\)" and "\s" are regex escapes, not valid Python string escapes.
pattern_global = [lambda x: re.sub(r'\(.*?\)', "", str(x)),
                  lambda x: re.sub(r"\s", "", str(x))]
df_dat_left, df_big_left = try_pattern(df_big_left, df_dat_left, pattern_global)

# Keep the removed parenthesised fragments (as a list per row) for the manual check.
df_big_left['parenthesis_content'] = df_big_left['Name_modified'].apply(lambda x: re.findall(r'\(.*?\)', str(x)))
df_dat_left['parenthesis_content'] = df_dat_left['Name_modified'].apply(lambda x: re.findall(r'\(.*?\)', str(x)))

df_merge = merge_df(df_big_left, df_dat_left, 'Name_modified_by_pattern')

Le dataset contient 411 valeurs dont 29 correspondent aux fichiers dat.
Il reste 382 valeurs à matcher.


In [548]:
# Print the 5 families with the most unmatched big-table rows after removing parenthesised text.
groupna_family(df_merge, 5)

Family
Uncategorized    70
Hepperle         49
Selig et. al.    28
Leinauer         25
NACA/Munk        22
Name: nb_na, dtype: int64


### **8. Family Hepperle**

In [549]:
# Recompute the unmatched rows of both sides after the previous merge,
# then keep the names starting with 'm'.
df_dat_left, df_big_left = set_df_left(df_dat_left, df_merge, 'Name_modified_by_pattern')
big_m, dat_m = get_letter_group(df_dat_left, df_big_left, 'm')

Unmatched values left in the dat folder : (1373, 7)
Unmatched values left in big table : (382, 7)
Big table number of wings left beginning by m : (57, 7)
Dat folder number of wings left beginning by m : (86, 7)
Difference : -29


In [550]:
# Hepperle: the key is the concatenation of the first two whitespace-separated tokens of the name.
# Slicing with [:2] keeps a single-token name unchanged instead of raising IndexError,
# and the raw string avoids the invalid "\s" string escape.
hepperle = [lambda x: "".join(re.split(r"\s", str(x))[:2])]
big_m = try_pattern_family(big_m, 'Hepperle', hepperle)

In [551]:
# Bring the Hepperle-specific names back into the unmatched big-table rows,
# then retry the exact merge on the family-normalised name.
df_big_left = incorporate_family_pattern(df_big_left, big_m)
df_merge = merge_df(df_big_left, df_dat_left, 'Name_modified_by_family')

Le dataset contient 382 valeurs dont 49 correspondent aux fichiers dat.
Il reste 333 valeurs à matcher.


### **9. Family NACA/Munk**

In [552]:
# Recompute the unmatched rows of both sides after the previous merge,
# then keep the names starting with 'n'.
df_dat_left, df_big_left = set_df_left(df_dat_left, df_merge, 'Name_modified_by_family')
big_n, dat_n = get_letter_group(df_dat_left, df_big_left, 'n')

Unmatched values left in the dat folder : (1324, 7)
Unmatched values left in big table : (333, 7)
Big table number of wings left beginning by n : (55, 7)
Dat folder number of wings left beginning by n : (1023, 7)
Difference : -968


In [553]:
# NACA/Munk: remove the "naca" prefix text, then whitespace.
# Raw string for "\s": it is a regex escape, not a valid Python string escape.
nacamunk = [lambda x: re.sub("naca", "", str(x)),
            lambda x: re.sub(r"\s", "", str(x))]
big_n = try_pattern_family(big_n, 'NACA/Munk', nacamunk)
df_big_left = incorporate_family_pattern(df_big_left, big_n)
df_merge = merge_df(df_big_left, df_dat_left, 'Name_modified_by_family')
# After this merge, discard every remaining dat file whose key contains 'naca'
# so that they are no longer candidates in the following rounds.
df_dat_left.drop(df_dat_left[df_dat_left['Name_modified'].str.contains('naca')].index, inplace=True)

Le dataset contient 333 valeurs dont 22 correspondent aux fichiers dat.
Il reste 311 valeurs à matcher.


### **10. Family Gulfstream**

In [554]:
# Recompute the unmatched rows of both sides after the merge on Name_modified_by_family.
df_dat_left, df_big_left = set_df_left(df_dat_left, df_merge, 'Name_modified_by_family')

Unmatched values left in the dat folder : (323, 7)
Unmatched values left in big table : (311, 7)


In [555]:
# As much of giii rows in both dataset its the Gulfstream family 
giii_dat = df_dat_left[df_dat_left['Name_modified'].str.contains("giii")]
giii_big = df_big_left[df_big_left['Name_modified'].str.contains("giii")]
giinb = giii_big['Name_modified'].apply(lambda x : (re.findall('\d+', str(x))))
giinb = [int(nb[0]) for nb in giinb.values]
giinb = sorted(giinb)
giiiletter = ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n']
dict_giii = {}
[dict_giii.update({('giiibl'+str(nb)) : ('giii'+str(letter))}) for nb, letter in zip(giinb, giiiletter)]
df_big_left['Name_modified_by_pattern'] = df_big_left['Name_modified_by_pattern'].map(dict_giii).fillna(df_big_left['Name_modified_by_pattern'])

df_merge = merge_df(df_big_left, df_dat_left, 'Name_modified_by_pattern')

Le dataset contient 311 valeurs dont 14 correspondent aux fichiers dat.
Il reste 297 valeurs à matcher.


# Similarity test

### Init

In [556]:
# Get the unmatched names of each dataset
df_dat_left, df_big_left = set_df_left(df_dat_left, df_merge, 'Name_modified_by_pattern')

# Choose some regex to make the remaining names uniform
# (raw strings for "\s", "\.", "\(", "\)": these are regex escapes, not valid Python string escapes)
pattern_global = [lambda x: re.sub(r"\s", "", str(x)),
                  lambda x: re.sub(r"\.", "", str(x)),
                  lambda x: re.sub(",", "", str(x)),
                  lambda x: re.sub("/", "", str(x)),
                  lambda x: re.sub("-", "", str(x)),
                  lambda x: re.sub("_", "", str(x)),
                  lambda x: re.sub("%", "", str(x)),
                  lambda x: re.sub(r"\(", "", str(x)),
                  lambda x: re.sub(r"\)", "", str(x)),
                  lambda x: re.sub("smoothed", "sm", str(x))]

# Apply the pattern before trying to match by similarity
df_dat_left, df_big_left = try_pattern(df_big_left, df_dat_left, pattern_global)

# Save the remaining dataframes before the similarity assignment if needed
# df_dat_left.to_csv('data/df_dat_left.csv')
# df_big_left.to_csv('data/df_big_left.csv')

Unmatched values left in the dat folder : (309, 7)
Unmatched values left in big table : (297, 7)


In [557]:
# Sizes of the two frames that go into the similarity search.
print(df_dat_left.shape)
print(df_big_left.shape)

(309, 7)
(297, 7)


In [558]:
# Get the arrays of names to try matching, one big/dat pair per naming variant:
# the lowercased name, ...
list_big_modified = df_big_left['Name_modified'].values
list_dat_modified = df_dat_left['Name_modified'].values

# ... the name after the global pattern, ...
list_big_modified_by_pattern = df_big_left['Name_modified_by_pattern'].values
list_dat_modified_by_pattern = df_dat_left['Name_modified_by_pattern'].values

# ... and the name after the family-specific patterns.
list_big_modified_by_family = df_big_left['Name_modified_by_family'].values
list_dat_modified_by_family = df_dat_left['Name_modified_by_family'].values

### Function

In [559]:
from difflib import SequenceMatcher
def similar(a, b):
    """Return the difflib similarity ratio between `a` and `b` (a float between 0 and 1)."""
    matcher = SequenceMatcher(None, a, b)
    return matcher.ratio()

from fuzzywuzzy import fuzz
def similar_fuzz(string1, string2):
    """Return fuzzywuzzy's partial-ratio score for the two strings."""
    partial_score = fuzz.partial_ratio(string1, string2)
    return partial_score

def find_match(list_big, list_dat):
    """Find, for each name of `list_big`, its most similar name in `list_dat`.

    Two scores are computed for every pair: `similar` (difflib ratio) and
    `similar_fuzz` (fuzzywuzzy partial ratio). For each big-table name the best
    candidate of each score is kept.

    Returns a DataFrame with one row per name of `list_big` and the columns
    ``Name``, ``Match``, ``Score`` (best `similar` score and its candidate),
    ``Match_2`` and ``Score_2`` (best `similar_fuzz` score and its candidate).
    When no candidate scores above 0 the match is the placeholder "temp".
    An empty `list_big` yields an empty frame with the same columns.
    """
    ratio = []
    word = []
    match = []
    partial_match = []
    partial_ratio = []

    for i in list_big:
        maxscore = 0
        maxpartial = 0
        storeword = "temp"
        storeword_partial = "temp"
        for j in list_dat:
            score = similar(i, j)
            partial_score = similar_fuzz(i, j)
            # Strict comparison: on equal scores the first candidate met is kept.
            # Handling ties explicitly would be a worthwhile improvement.
            if score > maxscore:
                maxscore = score
                storeword = j
            if partial_score > maxpartial:
                maxpartial = partial_score
                storeword_partial = j
        word.append(i)
        match.append(storeword)
        ratio.append(maxscore)
        partial_match.append(storeword_partial)
        partial_ratio.append(maxpartial)

    # Build the frame once, after the loop: it was previously rebuilt at every
    # iteration and left undefined when `list_big` was empty.
    df_similar = pd.DataFrame(data={'Name' : word, 'Match' : match, 'Score' : ratio, 'Match_2' : partial_match, 'Score_2' : partial_ratio})
    return df_similar

In [560]:
def manual_verif_by_score(df, score):
    """Return the rows whose ``Score`` lies in ``[score - 0.1, score)``, best score first."""
    lower_bound = score - 0.1
    in_band = df['Score'].lt(score) & df['Score'].ge(lower_bound)
    return df.loc[in_band].sort_values(by=['Score'], ascending=False)

def manual_validation(df, score):
    """Set ``checked`` to True for rows whose ``Score`` lies in ``[score - 0.1, score)``.

    ``df`` is modified in place and also returned. If the ``checked`` column does
    not exist yet, the assignment creates it.
    """
    upper_bound = score
    lower_bound = score - 0.1
    in_band = df['Score'].ge(lower_bound) & df['Score'].lt(upper_bound)
    validated_labels = df.loc[in_band].index
    df.loc[validated_labels, 'checked'] = True
    return df

### Test

In [561]:
# Run the matcher once per name normalisation, then print the best score each one reaches.
df_similar_modified = find_match(list_big=list_big_modified, list_dat=list_dat_modified)
df_similar_modified_by_pattern = find_match(
    list_big=list_big_modified_by_pattern,
    list_dat=list_dat_modified_by_pattern,
)
df_similar_modified_by_family = find_match(
    list_big=list_big_modified_by_family,
    list_dat=list_dat_modified_by_family,
)
print(
    *(
        candidates['Score'].max()
        for candidates in (
            df_similar_modified,
            df_similar_modified_by_pattern,
            df_similar_modified_by_family,
        )
    ),
    sep='\n',
)

0.875
0.9411764705882353
0.9411764705882353


We will work with df_similar_modified_by_pattern: its best score (0.941) beats df_similar_modified (0.875) and ties with df_similar_modified_by_family.

In [562]:
# Bookkeeping columns used by the manual review.
# 'checked' starts at False and is switched on later by manual_validation.
df_similar_modified_by_pattern['checked'] = False

# 'same': both similarity measures selected the same candidate.
df_similar_modified_by_pattern['same'] = (
    df_similar_modified_by_pattern['Match'] == df_similar_modified_by_pattern['Match_2']
)

# 'unique': the selected candidate was selected for this name only.
mask_unique = df_similar_modified_by_pattern['Match'].value_counts() == 1
index_unique = mask_unique[mask_unique].index.to_list()
df_similar_modified_by_pattern['unique'] = (
    df_similar_modified_by_pattern['Match'].isin(index_unique)
)

### Result where the match is unique

In [563]:
# Work on an independent copy restricted to the matches flagged as unique.
df = df_similar_modified_by_pattern.loc[df_similar_modified_by_pattern['unique']].copy()
print(df.shape)
df.head()

(175, 8)


Unnamed: 0,Name,Match,Score,Match_2,Score_2,checked,same,unique
1,ag44ct02f,ag44ct02r,0.888889,ag44ct02r,89,False,True,True
2,ag455ct02frot,ag455ct02r,0.869565,ag455ct02r,90,False,True,True
3,ag45c03f,ag45c03,0.933333,ag45c03,100,False,True,True
4,ag45ct02f,ag45ct02r,0.888889,ag455ct02r,89,False,False,True
5,ag46c03f,ag46c03,0.933333,ag46c03,100,False,True,True


In [564]:
# Size of each 0.1-wide score band, from the 1.0 band down to the 0.2 band.
# A plain loop is used instead of a list comprehension so the cell does not
# also emit a meaningless list of None values as its output.
for x in range(10, 1, -1):
    print(f'{x/10} : {manual_verif_by_score(df, x/10).shape}')

1.0 : (25, 8)
0.9 : (49, 8)
0.8 : (39, 8)
0.7 : (29, 8)
0.6 : (13, 8)
0.5 : (11, 8)
0.4 : (3, 8)
0.3 : (2, 8)
0.2 : (0, 8)


[None, None, None, None, None, None, None, None, None]

In [565]:
# Review the band of scores in [1 - 0.1, 1), then flag all of its rows as checked.
# The review table is not rendered here: only a cell's last expression is displayed.
manual_verif_by_score(df=df, score=1)
df = manual_validation(df=df, score=1)

In [566]:
# Review the band of scores in [0.9 - 0.1, 0.9), then flag all of its rows as checked.
# The review table is not rendered here: only a cell's last expression is displayed.
manual_verif_by_score(df=df, score=0.9)
df = manual_validation(df=df, score=0.9)

In [567]:
# One wrong match was found during the review; it is overridden by hand
# before the 0.8 band (scores in [0.8 - 0.1, 0.8)) is validated.
df.loc[df['Name'].eq('jwl0438821'), 'Match'] = 'jwl043'
manual_verif_by_score(df=df, score=0.8)  # result not displayed (not the last expression)
df = manual_validation(df=df, score=0.8)

In [568]:
# One wrong match was found during the review; it is overridden by hand
# before the 0.7 band (scores in [0.7 - 0.1, 0.7)) is validated.
df.loc[df['Name'].eq('ssv2316vansrv10'), 'Match'] = 'ssv2316'
manual_verif_by_score(df=df, score=0.7)  # result not displayed (not the last expression)
df = manual_validation(df=df, score=0.7)

In [569]:
# Four wrong matches were found during the review; they are overridden by hand
# (big-table name -> dat name) before the 0.6 band is validated.
for big_name, dat_name in {
    'uag92170sf': 'uag92170sf',
    'grummank2': 'k2',
    'grummank3': 'k3',
    'jh35': 'jh35',
}.items():
    df.loc[df['Name'] == big_name, 'Match'] = dat_name

manual_verif_by_score(df=df, score=0.6)  # result not displayed (not the last expression)
df = manual_validation(df=df, score=0.6)

In [570]:
# Review the band of scores in [0.5 - 0.1, 0.5), then flag all of its rows as checked.
# The review table is not rendered here: only a cell's last expression is displayed.
manual_verif_by_score(df=df, score=0.5)
df = manual_validation(df=df, score=0.5)

In [571]:
# One wrong match was found during the review; it is overridden by hand
# before the 0.4 band (scores in [0.4 - 0.1, 0.4)) is validated.
df.loc[df['Name'].eq('lockheedc5abl4882'), 'Match'] = 'c5b'

manual_verif_by_score(df=df, score=0.4)  # result not displayed (not the last expression)
df = manual_validation(df=df, score=0.4)

In [572]:
# One wrong match was found during the review; it is overridden by hand
# before the 0.3 band (scores in [0.3 - 0.1, 0.3)) is validated.
# NOTE(review): the replacement value contains an uppercase 'W' while the names
# handled here are lowercase -- confirm it equals the dat-side key used in the later merge.
df.loc[df['Name'].eq('flyingwingpeewee30'), 'Match'] = 'flyingwingpeeWee30'

manual_verif_by_score(df=df, score=0.3)  # result not displayed (not the last expression)
df = manual_validation(df=df, score=0.3)

In [573]:
# Flag this single row as checked by hand.
df.loc[df['Name'].eq('naca63215modb'), 'checked'] = True

### Result not unique

In [574]:
# Attach the reviewed matches to the big table, joining on the pattern-normalised name.
df['Name_similar'] = df['Name'].copy()
df_big_left['Name_similar'] = df_big_left['Name_modified_by_pattern'].copy()

df_merge = df_big_left.merge(df, how='left', on='Name_similar', suffixes=('_big', '_match'))

# Rows without a 'Match' value had no counterpart in the reviewed matches.
nb_mismatch = int(df_merge['Match'].isna().sum())
nb_match = len(df_merge) - nb_mismatch
print(f'Le dataset contient {df_merge.shape[0]} valeurs dont {nb_match} correspondent aux fichiers dat.')
print(f'Il reste {nb_mismatch} valeurs à matcher.')

Le dataset contient 297 valeurs dont 175 correspondent aux fichiers dat.
Il reste 122 valeurs à matcher.


In [575]:
# Dat files still unmatched: right-join the reviewed matches onto the dat table,
# keep the dat rows that received no match, and drop the columns left entirely empty.
df['Match_similar'] = df['Match'].copy()
df_dat_left['Match_similar'] = df_dat_left['Name_modified_by_pattern'].copy()
df_dat_left = df.merge(df_dat_left, how='right', on='Match_similar', suffixes=('_match', '_dat'))
df_dat_left = df_dat_left.loc[df_dat_left['Match'].isna()].dropna(axis=1, how='all')
print(f"Unmatched values left in the dat folder : {df_dat_left.shape}")

# Big-table names still unmatched after the previous merge.
df_big_left = df_merge.loc[df_merge['Match'].isna()].dropna(axis=1, how='all')
print(f"Unmatched values left in big table : {df_big_left.shape}")

# Keep only the columns needed for the next matching round.
df_big_left = df_big_left[['Name_big', 'Family', 'Name_modified', 'Name_similar', 'parenthesis_content']]
df_dat_left = df_dat_left[['Match_similar', 'Name_modified', 'Name_dat', 'parenthesis_content']]

Unmatched values left in the dat folder : (137, 8)
Unmatched values left in big table : (122, 8)


In [576]:
# Second matching round, restricted to the names left unmatched on both sides.
list_big_left, list_dat_left = (
    df_big_left['Name_similar'].values,
    df_dat_left['Match_similar'].values,
)
df_similar_left = find_match(list_big=list_big_left, list_dat=list_dat_left)

In [577]:
# Size of each 0.1-wide score band for the second round, from the 1.0 band down to 0.2.
# A plain loop is used instead of a list comprehension so the cell does not
# also emit a meaningless list of None values as its output.
for x in range(10, 1, -1):
    print(f'{x/10} : {manual_verif_by_score(df_similar_left, x/10).shape}')

1.0 : (6, 5)
0.9 : (11, 5)
0.8 : (25, 5)
0.7 : (15, 5)
0.6 : (15, 5)
0.5 : (29, 5)
0.4 : (16, 5)
0.3 : (5, 5)
0.2 : (0, 5)


[None, None, None, None, None, None, None, None, None]

In [578]:
# Review the band of scores in [1 - 0.1, 1), then flag all of its rows as checked.
# df_similar_left has no 'checked' column yet; the assignment inside
# manual_validation creates it.
manual_verif_by_score(df=df_similar_left, score=1)
df_similar_left = manual_validation(df=df_similar_left, score=1)

In [579]:
# Review of the 0.9 band (scores in [0.9 - 0.1, 0.9)).
manual_verif_by_score(df_similar_left, 0.9)  # result not displayed (not the last expression)

# In this band Match_2 is the better candidate, so promote it to Match.
# The previous bounds (Score < 0.8 and Score >= 0.9) could never both hold,
# so the substitution silently never ran. The bounds below mirror the band
# that manual_validation flags just after.
# NOTE(review): the substitution now takes effect, so downstream match counts
# may differ from the outputs recorded in this notebook -- re-check them.
mask_score = (df_similar_left['Score'] < 0.9) & (df_similar_left['Score'] >= 0.9 - 0.1)
df_similar_left.loc[mask_score, 'Match'] = df_similar_left.loc[mask_score, 'Match_2']
df_similar_left = manual_validation(df_similar_left, 0.9)

In [580]:
# Rows that were never validated have no 'checked' value: discard their automatic match.
df_similar_left.loc[df_similar_left['checked'].isnull(), 'Match'] = None

In [581]:
# Matches assigned by hand after inspection (big-table name -> dat name).
# Two big-table names intentionally point at the same dat name ('marske4').
manual_matches_left = {
    'hq17': 'hq17',
    'qa001sm': 'qa001sm',
    'qa002sm': 'qa002sm',
    's301010384': 's3010',
    's301409585': 's3014',
    's301609587': 's3016',
    's302109584': 's3021',
    'qa002': 'qa002',
    'jh817': 'jh817',

    'du80176v1': 'du80176v1',
    'jh25': 'jh25',
    'qa001': 'qa001',
    'qa003': 'qa003',
    'du80141': 'du80141',
    'hawkertempest61semispan': 'tempest2',
    'hawkertempest375semispan': 'tempest1',
    'hawkertempest9677semispan': 'tempest3',
    'du80176v1alt': 'du80176v1alt',
    'lockheedc141bl0': 'c141a',
    's1010hpvairfoil': 's1010',

    'fage&collins1': 'fg1',
    'fage&collins2': 'fg2',
    'fage&collins3': 'fg3',
    'fage&collins4': 'fg4',
    'defiantcanardbl110': 'defcnd2',
    'defiantcanardbl145': 'defcnd3',

    'marskepioneerianaca2311243012ahybrid': 'marske2',
    'marskepioneeriidrootnaca431012a24112hybrid': 'marske3',
    'marskepioneeriidtipnaca431012a': 'marske4',
    'marskepioneeriidtipnaca431012a*833hybrid': 'marske4',
    'marskemonarchnaca43012a': 'marske5',
    'lockheedgeorgiasupercritical': 'lg10sc',
    'be656865tc75cambersuitableforf1atowlinegliderc': 'be6568',
    'be669966maxtc99maxcamberclmax21atre100000trans': 'be6699',
    'sikorskysc1094r8': 'sc1094r8',
    'dragonflycanard': 'drgnfly',
}
for big_name, dat_name in manual_matches_left.items():
    df_similar_left.loc[df_similar_left['Name'] == big_name, 'Match'] = dat_name

### New Merge

In [582]:
# Attach the second-round matches to the names that were still unmatched.
# In the recorded run the merge returns 124 rows for 122 input names, which
# indicates that some 'Name_similar' keys occur more than once.
df_similar_left['Name_similar'] = df_similar_left['Name'].copy()
df_merge = df_big_left.merge(
    df_similar_left, how='left', on='Name_similar', suffixes=('_big', '_match')
)

# Rows without a 'Match' value are still unmatched.
nb_mismatch = int(df_merge['Match'].isna().sum())
nb_match = len(df_merge) - nb_mismatch
print(f'Le dataset contient {df_merge.shape[0]} valeurs dont {nb_match} correspondent aux fichiers dat.')
print(f'Il reste {nb_mismatch} valeurs à matcher.')

Le dataset contient 124 valeurs dont 53 correspondent aux fichiers dat.
Il reste 71 valeurs à matcher.


In [583]:
# Dat files still unmatched: right-join the second-round matches onto the dat table,
# keep the dat rows that received no match, and drop the columns left entirely empty.
df_similar_left['Match_similar'] = df_similar_left['Match'].copy()
df_dat_left = df_similar_left.merge(
    df_dat_left, how='right', on='Match_similar', suffixes=('_match', '_dat')
)
df_dat_left = df_dat_left.loc[df_dat_left['Match'].isna()].dropna(axis=1, how='all')
print(f"Unmatched values left in the dat folder : {df_dat_left.shape}")

# Big-table names still unmatched after the previous merge.
df_big_left = df_merge.loc[df_merge['Match'].isna()].dropna(axis=1, how='all')
print(f"Unmatched values left in big table : {df_big_left.shape}")

# Keep only the columns needed for the next matching round.
df_big_left = df_big_left[['Name_big', 'Family', 'Name_modified', 'Name_similar', 'parenthesis_content']]
df_dat_left = df_dat_left[['Match_similar', 'Name_modified', 'Name_dat', 'parenthesis_content']]

Unmatched values left in the dat folder : (102, 4)
Unmatched values left in big table : (71, 9)


In [584]:
# Third matching round on whatever is still unmatched on both sides.
list_big_left, list_dat_left = (
    df_big_left['Name_similar'].values,
    df_dat_left['Match_similar'].values,
)

df_similar_left2 = find_match(list_big=list_big_left, list_dat=list_dat_left)

In [585]:
# Size of each 0.1-wide score band for the third round.
for threshold in (1, 0.9, 0.8, 0.7, 0.6, 0.5, 0.4, 0.3):
    print(f"{threshold} : ", manual_verif_by_score(df_similar_left2, threshold).shape)

1 :  (0, 5)
0.9 :  (0, 5)
0.8 :  (24, 5)
0.7 :  (4, 5)
0.6 :  (4, 5)
0.5 :  (20, 5)
0.4 :  (16, 5)
0.3 :  (3, 5)


In [586]:
# 'unique': the selected candidate was selected for this name only.
# isin() replaces the former row-by-row list-membership loop (same flags, linear time).
mask_unique = df_similar_left2['Match'].value_counts() == 1
index_unique = mask_unique[mask_unique].index.to_list()
df_similar_left2['unique'] = df_similar_left2['Match'].isin(index_unique)
# 'check' records the manual validation status; nothing is validated yet.
df_similar_left2['check'] = False

# Rows removed by hand before the split.
# NOTE(review): these labels equal the index of the four names shown as still
# unmatched at the end of the notebook -- presumably names with no dat file; confirm.
df_similar_left2 = df_similar_left2.drop(index=[14, 19, 30, 44])

# Explicit copies: both frames are written to in later cells, and writing into
# a bare boolean-mask slice raises SettingWithCopyWarning.
df_unique = df_similar_left2[df_similar_left2['unique']].copy()
df_duplicates = df_similar_left2[~df_similar_left2['unique']].copy()
print(df_unique.shape)
print(df_duplicates.shape)

(5, 7)
(62, 7)


In [587]:
# Unique matches are accepted as they are.
df_similar_left2.loc[df_similar_left2['unique'] == True, 'check'] = True
# df_unique was extracted from df_similar_left2 in the previous cell and is an
# independent frame, so the flag set above does not reach it. It is df_unique
# (not df_similar_left2) that is concatenated into df_final, so set the flag there too.
df_unique = df_unique.assign(check=True)

In [588]:
# Keep the sorted frame: sort_values returns a new object, so the former bare call
# had no effect. Rebinding also detaches df_duplicates from the frame it was sliced from.
df_duplicates = df_duplicates.sort_values(by=['Score'], ascending=False, ignore_index=True)

# For non-unique matches, start from the partial-ratio candidate...
df_duplicates['Match'] = df_duplicates['Match_2']

# ...then apply the matches assigned by hand (big-table name -> dat name).
manual_matches_duplicates = {
    'davisbasicb24wing': 'davis_corrected',
    'lockheedc141bl1136': 'c141b',
    'lockheedc141bl42657': 'c141c',
    'lockheedc141bl61061': 'c141d',
    'lockheedc141bl76111': 'c141e',
    'lockheedc141bl95889': 'c141f',
    'lockheedc5abl1256': 'c5e',
    'lockheedc5abl576': 'c5c',
    'lockheedc5abl7586': 'c5d',
    'marskexm1df1430': 'marske1',
    'naca001034a=08cli=02': 'naca001034a08cli0',
    'ronczlowdragflyingwing': 'marske7',
}
for big_name, dat_name in manual_matches_duplicates.items():
    df_duplicates.loc[df_duplicates['Name'] == big_name, 'Match'] = dat_name

# Everything is considered validated except the three names below.
df_duplicates['check'] = True
unvalidated_names = ['deesokay230', 'naca001264a=08cli=02', 'x35lowdragbody']
df_duplicates.loc[df_duplicates['Name'].isin(unvalidated_names), 'check'] = False

In [589]:
# Stack the hand-corrected duplicates on top of the unique matches, with a fresh 0..n-1 index.
df_final = pd.concat((df_duplicates, df_unique), axis=0, ignore_index=True)

In [590]:
# Cyrano II P-30 cyranoiip30
# jhsym10 JHSYM-10
# naca6621812a=6p51tip	 not found
# swallowp30 SwallowP-30
# deesokay230: probably a problem

In [591]:
# Attach the third-round matches to the names that were still unmatched.
df_final['Name_similar'] = df_final['Name'].copy()
df_merge = df_big_left.merge(
    df_final, how='left', on='Name_similar', suffixes=('_big', '_match')
)

# Rows without a 'Match' value are still unmatched.
nb_mismatch = int(df_merge['Match'].isna().sum())
nb_match = len(df_merge) - nb_mismatch
print(f'Le dataset contient {df_merge.shape[0]} valeurs dont {nb_match} correspondent aux fichiers dat.')
print(f'Il reste {nb_mismatch} valeurs à matcher.')

Le dataset contient 71 valeurs dont 67 correspondent aux fichiers dat.
Il reste 4 valeurs à matcher.


In [592]:
# Dat files still unmatched: right-join the third-round matches onto the dat table,
# keep the dat rows that received no match, and drop the columns left entirely empty.
df_final['Match_similar'] = df_final['Match'].copy()
df_dat_left = df_final.merge(
    df_dat_left, how='right', on='Match_similar', suffixes=('_match', '_dat')
)
df_dat_left = df_dat_left.loc[df_dat_left['Match'].isna()].dropna(axis=1, how='all')
print(f"Unmatched values left in the dat folder : {df_dat_left.shape}")

# Big-table names still unmatched after the previous merge.
df_big_left = df_merge.loc[df_merge['Match'].isna()].dropna(axis=1, how='all')
print(f"Unmatched values left in big table : {df_big_left.shape}")

# Keep only the identifying columns of each side.
df_big_left = df_big_left[['Name_big', 'Family', 'Name_modified', 'Name_similar', 'parenthesis_content']]
df_dat_left = df_dat_left[['Match_similar', 'Name_modified', 'Name_dat', 'parenthesis_content']]

Unmatched values left in the dat folder : (46, 4)
Unmatched values left in big table : (4, 5)


In [593]:
# Display the big-table names that received no match in the last merge.
df_big_left

Unnamed: 0,Name_big,Family,Name_modified,Name_similar,parenthesis_content
14,BOEING-VERTOL VR-9,Boeing,boeing-vertol vr-9,boeingvertolvr9,[]
19,HUGHES HELICOPTERS HH-02,Uncategorized,hughes helicopters hh-02,hugheshelicoptershh02,[]
30,JWL068 9.5/1.7,Leinauer,jwl068 9.5/1.7,jwl0689517,[]
44,NACA 5-H-10,NACA,naca 5-h-10,naca5h10,[]
