In [761]:
import os
import pandas as pd
import numpy as np
import regex as re

# Set Up

### Get CSV and check duplicates

In [762]:
pd.set_option('display.max_row', 8353 )

In [763]:
df_dat = pd.read_csv('data/dat_files_index.csv', usecols=('name', 'size'))
df_dat.rename(columns={'name':'Name', 'size':'Size'}, inplace=True)
print(df_dat.shape)
df_dat.head()

(8353, 2)


Unnamed: 0,Name,Size
0,a18.DAT,945
1,a18sm.DAT,1795
2,A6014-S.DAT,6116
3,A6016-S.DAT,5914
4,A6018-S.DAT,5889


In [764]:
# Checking if the dataset contains duplicates
doublon_datfile = df_dat['Name'].value_counts().index[df_dat['Name'].value_counts().values > 1]
print(f'Nombre de doublons dans la bigtable : {len(doublon_datfile)}')
print([i for i in doublon_datfile])

Nombre de doublons dans la bigtable : 0
[]


In [765]:
df_bigtable = pd.read_csv('data/ailes_avion.csv', usecols=('Name', 'Family'))
print(df_bigtable.shape)
df_bigtable.head()

(6324, 2)


Unnamed: 0,Name,Family
0,63A108 MOD C,NASA
1,A18,Uncategorized
2,A18 (SMOOTHED),Uncategorized
3,A6014-S,Ayers
4,A6016-S,Ayers


In [766]:
# Checking if the dataset contains duplicates
doublon_bigtable = df_bigtable['Name'].value_counts().index[df_bigtable['Name'].value_counts().values > 1]
print(f'Nombre de doublons dans la bigtable : {len(doublon_bigtable)}')
print([i for i in doublon_bigtable])

Nombre de doublons dans la bigtable : 2
['FX 66-17AII-182', 'BOEING 737 MIDSPAN']


### Execute Basic Regex and exceptions

In [767]:
# Create a column Name_modified with all names in lowercase and without the .DAT ending 
df_dat['Name_modified'] =  df_dat['Name'].apply(lambda x : (re.split(".DAT$", str(x)))[0])
df_dat['Name_modified'] = df_dat['Name_modified'].apply(lambda x : str(x).lower())
# Create a column Name_modified with all names in lowercase
df_bigtable['Name_modified'] = df_bigtable['Name'].apply(lambda x : str(x).lower())

In [768]:
# Manual affectation of the duplicates after verification
df_bigtable.loc[596, ['Name_modified']] = 'fx6617ai'
df_bigtable.loc[597, ['Name_modified']] = 'fx6617a2'
df_bigtable.loc[154, ['Name_modified']] = 'b737c'
df_bigtable.loc[155, ['Name_modified']] = 'b737b'

In [769]:
# Family exceptions beginning by 'g'
df_bigtable.loc[df_bigtable['Name_modified'] == 'gu25-5(11)8', 'Name_modified'] = 'gu255118'
df_bigtable.loc[df_bigtable['Name_modified'] == 'gs-1', 'Name_modified'] = 'gs1'
df_bigtable.loc[df_bigtable['Name_modified'] == 'griffith 30% suction airfoil', 'Name_modified'] = 'griffith30symsuction'
df_bigtable.loc[df_bigtable['Name_modified'] == 'goe 167 (v.karman prop.2)', 'Name_modified'] = 'goe167'
df_bigtable.loc[df_bigtable['Name_modified'] == 'glenn martin 2', 'Name_modified'] = 'glennmartin2'
df_bigtable.loc[df_bigtable['Name_modified'] == 'glenn martin 3', 'Name_modified'] = 'glennmartin3'
df_bigtable.loc[df_bigtable['Name_modified'] == 'glenn martin 4', 'Name_modified'] = 'glennmartin4'

In [770]:
df_bigtable.loc[df_bigtable['Name_modified'] == 'eppler 377mod', 'Name_modified'] = 'e377m'
df_bigtable.loc[df_bigtable['Name_modified'] == 'eppler e193gu-k24', 'Name_modified'] = 'e193gu-k24'
df_bigtable.loc[df_bigtable['Name_modified'] == 'eppler ste 87(-3)-914', 'Name_modified'] = 'ste87391'
df_bigtable.loc[df_bigtable['Name_modified'] == 'eppler ste 871-514', 'Name_modified'] = 'ste87151'
df_bigtable.loc[df_bigtable['Name_modified'] == 'eppler stf 863-615', 'Name_modified'] = 'stf86361'

In [771]:
df_bigtable.loc[df_bigtable['Name_modified'] == 'aquila 9.3% smoothed', 'Name_modified'] = 'aquilasm'
df_bigtable.loc[df_bigtable['Name_modified'] == 'bergey bw-3 (smoothed)', 'Name_modified'] = 'bw3'
df_bigtable.loc[df_bigtable['Name_modified'] == 'clark-y 11.7% smoothed', 'Name_modified'] = 'clarkysm'
df_bigtable.loc[df_bigtable['Name_modified'] == 'cody robertson cr 001 r/c hand-launch low reynolds number airfoil (smoothed)', 'Name_modified'] = 'cr001sm'
df_bigtable.loc[df_bigtable['Name_modified'] == 'dale house dh4009 (smoothed) used on the storm r/c aerobatic aircraft', 'Name_modified'] = 'dh4009sm'
df_bigtable.loc[df_bigtable['Name_modified'] == 'fx 60-100 10.0% smoothed', 'Name_modified'] = 'fx60100sm'
df_bigtable.loc[df_bigtable['Name_modified'] == 'fx 63-137 13.7% smoothed', 'Name_modified'] = 'fx63137sm'
df_bigtable.loc[df_bigtable['Name_modified'] == 'fx 74-cl5-140 mod (smoothed)', 'Name_modified'] = 'fx74modsm'
df_bigtable.loc[df_bigtable['Name_modified'] == 'mb253515 15.0% smoothed', 'Name_modified'] = 'mb253515sm'
df_bigtable.loc[df_bigtable['Name_modified'] == 'r140 12.04% (smoothed)', 'Name_modified'] = 'r140sm'
df_bigtable.loc[df_bigtable['Name_modified'] == 'spica 11.73% smoothed', 'Name_modified'] = 'spicasm'
df_bigtable.loc[df_bigtable['Name_modified'] == 'usnps4 (smoothed)', 'Name_modified'] = 'usnps4'
df_bigtable.loc[df_bigtable['Name_modified'] == 'wb-135/35 13.5% smoothed', 'Name_modified'] = 'wb13535sm'

In [772]:
df_bigtable.loc[df_bigtable['Name_modified'] == 'boeing 707 .08 span', 'Name_modified'] = 'b707a'
df_bigtable.loc[df_bigtable['Name_modified'] == 'boeing 707 .19 span', 'Name_modified'] = 'b707b'
df_bigtable.loc[df_bigtable['Name_modified'] == 'boeing 707 .40 span', 'Name_modified'] = 'b707c'
df_bigtable.loc[df_bigtable['Name_modified'] == 'boeing 707 .54 span', 'Name_modified'] = 'b707d'
df_bigtable.loc[df_bigtable['Name_modified'] == 'boeing 707 .99 span', 'Name_modified'] = 'b707e'
df_bigtable.loc[df_bigtable['Name_modified'] == 'boeing 103', 'Name_modified'] = 'boe103'
df_bigtable.loc[df_bigtable['Name_modified'] == 'boeing 106', 'Name_modified'] = 'boe106'
df_bigtable.loc[df_bigtable['Name_modified'] == 'boeing 737 root', 'Name_modified'] = 'b737a'
df_bigtable.loc[df_bigtable['Name_modified'] == 'boeing 737 outboard', 'Name_modified'] = 'b737d'
df_bigtable.loc[df_bigtable['Name_modified'] == 'kc-135 bl124.32', 'Name_modified'] = 'kc135b'
df_bigtable.loc[df_bigtable['Name_modified'] == 'kc-135 bl200.76', 'Name_modified'] = 'kc135c'
df_bigtable.loc[df_bigtable['Name_modified'] == 'kc-135 bl351.6', 'Name_modified'] = 'kc135d'
df_bigtable.loc[df_bigtable['Name_modified'] == 'kc-135 bl52.44', 'Name_modified'] = 'kc135a'

In [773]:
df_bigtable.loc[df_bigtable['Name_modified'] == 'naca m6 (65%)', 'Name_modified'] = 'm6_65'
df_bigtable.loc[df_bigtable['Name_modified'] == 'naca m6 (85%)', 'Name_modified'] = 'm6_85'

# Functions

### Basic Functions

In [774]:
# Function to merge both dataset and return the df merged
def merge_df(df_bigtable, df_dat, on_column):
    df_merge = pd.merge(df_bigtable, df_dat, on=on_column, how='left', suffixes=('_big', '_dat'))
    nb_mismatch = df_merge[df_merge['Name_dat'].isna()].shape[0]
    nb_match = df_merge.shape[0] - nb_mismatch
    print(f'Le dataset contient {df_merge.shape[0]} valeurs dont {nb_match} correspondent aux fichiers dat.')
    print(f'Il reste {nb_mismatch} valeurs à matcher.')
    df_merge.head()
    return df_merge


# Function to get the list of family fully matched
def get_family_done(df_merge, nb):
    family = df_merge.groupby(['Family']).count()
    mask_family = (family['Name_big'] == family['Name_dat'])
    res = family[mask_family].Name_big.sort_values(ascending=False).head(nb)
    print(res)
    return res


# Function to get the list of missing values per family aka mismatch
def groupna_family(df_merge, nb):
    family = df_merge.groupby(['Family']).count()
    mask_family = (family['Name_big'] != family['Name_dat'])
    family_na = family[mask_family].copy()
    family_na['nb_na'] = family_na['Name_big'] - family_na['Size']
    # We are interested by the 'nb' families with the most of na
    print(family_na['nb_na'].sort_values(ascending=False).head(nb))


# Function to iinitialize df_big et df_dat
# We want to look through the wings left in each dataframe in order to avoid corrupting good matchs
def initiate_df_left(df_bigtable, df_dat, df_merge):
    # Create df_dat_left and df_big_left
    df_dat_left = pd.merge(df_bigtable, df_dat, on='Name_modified', how='right', suffixes=('_big', '_dat'))
    df_dat_left = df_dat_left[df_dat_left['Name_big'].isna()].copy()
    print(f"Unmatched values left in the dat folder : {df_dat_left.shape}")
    df_big_left = df_merge[df_merge['Name_dat'].isna()].copy()
    print(f"Unmatched values left in big table : {df_big_left.shape}")

    # Create a new column for each df containing the first letter of each wing
    df_dat_left['First letter'] = [x[0] for x in df_dat_left['Name_modified'].values]
    df_big_left['First letter'] = [x[0] for x in df_big_left['Name_modified'].values]

    # Create a new column for each df with z copy of the modified name to further work on
    df_dat_left['Name_modified_by_family'] = df_dat_left['Name_modified'].copy()
    df_big_left['Name_modified_by_family'] = df_big_left['Name_modified'].copy()

    # Create a new column for each df with z copy of the modified name to further work on
    df_dat_left['Name_modified_by_pattern'] = df_dat_left['Name_modified'].copy()
    df_big_left['Name_modified_by_pattern'] = df_big_left['Name_modified'].copy()


    # df_dat_left ----> ['Name_big', 'Family', 'Name_modified', 'Name_dat', 'Size', 'First letter', 'Name_modified_by_family']
    # Name_big and Family are NaN
    df_dat_left.drop(columns=['Name_big', 'Family'], inplace=True)
    # df_big_left ----> ['Name_big', 'Family', 'Name_modified', 'Name_dat', 'Size', 'First letter', 'Name_modified_by_family']
    # Name_dat and Size are NaN
    df_big_left.drop(columns=['Name_dat', 'Size'], inplace=True)

    return df_dat_left, df_big_left

### Functions to match using specifig family pattern

In [775]:
# Function to create filter by first letter
def get_letter_group(df_dat_left, df_big_left, letter):
    big_letter = df_big_left[df_big_left['First letter'] == letter]
    dat_letter = df_dat_left[df_dat_left['First letter'] == letter]
    print(f'Big table number of wings left beginning by {letter} : {big_letter.shape}')
    print(f'Dat folder number of wings left beginning by {letter} : {dat_letter.shape}')
    print(f'Difference : {big_letter.shape[0] - dat_letter.shape[0]}')
    return big_letter, dat_letter


# Function to pass through pattern
def try_pattern_family(big_letter, family, pattern_list):
    big_letter = big_letter.copy()
    for pattern in pattern_list:
        # Apply pattern
        big_letter.loc[big_letter['Family'] == family, 'Name_modified_by_family'] = big_letter.loc[big_letter['Family'] == family, 'Name_modified_by_family'].apply(pattern)
    return big_letter


# Function to incorporate the new name found (Name_modified_by_family) into the dataset with the bigtable wings left to match
def incorporate_family_pattern(df_big_left, big_letter):
    df_big_left = pd.merge(df_big_left, big_letter[['Name_big', 'Name_modified_by_family']], on='Name_big', how='left', suffixes=('_left', '_big'))
    df_big_left['Name_modified_by_family_big'] = df_big_left['Name_modified_by_family_big'].fillna(df_big_left['Name_modified_by_family_left'])
    df_big_left.drop(["Name_modified_by_family_left"], inplace=True, axis=1)
    df_big_left.rename(columns={'Name_modified_by_family_big':'Name_modified_by_family'}, inplace=True)
    return df_big_left

### Functions to match using global pattern

In [776]:
def set_df_left(df_dat_left, df_merge, on_column):
    # Create df_dat_left and df_big_left
    df_dat_left = pd.merge(df_merge, df_dat_left, on=on_column, how='right', suffixes=('_big', '_dat'))
    df_dat_left = df_dat_left[df_dat_left['Name_big'].isna()]
    df_dat_left.dropna(axis=1, how='all', inplace=True)
    df_dat_left = df_dat_left.set_axis([re.sub('_dat', "", str(col)) for col in df_dat_left.columns], axis=1)
    df_dat_left.rename(columns={'Name':'Name_dat'}, inplace=True)
    print(f"Unmatched values left in the dat folder : {df_dat_left.shape}")
    df_big_left = df_merge[df_merge['Name_dat'].isna()].copy()
    df_big_left.dropna(axis=1, how='all', inplace=True)
    df_big_left = df_big_left.set_axis([re.sub('_big', "", str(col)) for col in df_big_left.columns], axis=1)
    df_big_left.rename(columns={'Name':'Name_big'}, inplace=True)
    print(f"Unmatched values left in big table : {df_big_left.shape}")
    return df_dat_left, df_big_left


def try_pattern(df_big_left, df_dat_left, pattern_list):
    df_big_left['Name_modified_by_pattern'] = df_big_left['Name_modified'].copy()
    df_big_left['Name_modified_by_pattern'] = df_big_left['Name_modified'].copy()

    for pattern in pattern_list:
        df_big_left['Name_modified_by_pattern'] = df_big_left.loc[:, 'Name_modified_by_pattern'].apply(pattern)
        df_dat_left['Name_modified_by_pattern'] = df_dat_left.loc[:, 'Name_modified_by_pattern'].apply(pattern)

    return df_dat_left, df_big_left

# Application

In [777]:
df_merge = merge_df(df_bigtable, df_dat, 'Name_modified')
print('Famille complètes : ')
done_list = get_family_done(df_merge,10)
print("Nombre de familles total : ", len(df_bigtable['Family'].unique()))

Le dataset contient 6324 valeurs dont 4949 correspondent aux fichiers dat.
Il reste 1375 valeurs à matcher.
Famille complètes : 
Family
NACA 65-series       608
NACA 66-series       604
NACA 67-series       548
NACA 5-digit         401
Habbe                137
Horten               133
Joukowsky            132
Riblett 30-series    114
Riblett 40-series    110
Riblett 37-series    110
Name: Name_big, dtype: int64
Nombre de familles total :  68


In [778]:
# DROP only after all matching is done 
df_dat.drop(df_dat[df_dat['Name_modified'].str.contains('horten')].index, inplace=True)
df_dat.drop(df_dat[df_dat['Name_modified'].str.contains('joukowsky')].index, inplace=True)

In [779]:
# Store the match in a dataframe 
df_match = df_merge[df_merge['Name_dat'].isna() == False].copy()
print(df_match.shape)

(4949, 5)


### **1. Create dataframes with wings left to match**

In [780]:
df_dat_left, df_big_left = initiate_df_left(df_bigtable, df_dat, df_merge)

Unmatched values left in the dat folder : (2366, 5)
Unmatched values left in big table : (1375, 5)


### **2. Regex to match the family Yost,Eiffel,Eppler**

In [781]:
groupna_family(df_merge, 5)

Family
Gottingen        381
Eppler           196
Uncategorized    138
Wortmann         107
NASA              54
Name: nb_na, dtype: int64


- Family yost : correspondance if deleting the spaces points and / from big e
- Family eiffel : correspondance if deleting the spaces and content between parenthesis + special case for eiffel 10 (wright) - 1903 wright flyer airfoil	
- Family eppler : correspondance if replacing eppler by e and removing spaces

In [782]:
big_e, dat_e = get_letter_group(df_dat_left, df_big_left, 'e')

Big table number of wings left beginning by e : (216, 6)
Dat folder number of wings left beginning by e : (217, 6)
Difference : -1


In [783]:
yost = [lambda x : (re.sub("\s", "", str(x))), 
        lambda x : (re.sub("\.", "", str(x))), 
        lambda x : (re.sub("/", "", str(x)))]

eiffel = [lambda x : (re.sub('\(.*?\)', "", str(x))), 
          lambda x : (re.sub("\s", "", str(x))), 
          lambda x : (re.split("-", str(x)))[0]]

eppler = [lambda x : (re.sub("eppler", "e", str(x))),
          lambda x : (re.sub("ee", "e", str(x))),
          lambda x : (re.sub("hydrofoil", "", str(x))),
          lambda x : (re.sub("strut", "", str(x))),
          lambda x : (re.sub("-", "", str(x))),
          lambda x : (re.sub("\(", "", str(x))),
          lambda x : (re.sub("\)", "", str(x))),
          lambda x : (re.sub("\s", "", str(x)))]

big_e = try_pattern_family(big_e, 'Yost', yost)
big_e = try_pattern_family(big_e, 'Eiffel', eiffel)
big_e = try_pattern_family(big_e, 'Eppler', eppler)


In [784]:
# Function to incorporate the new name found (Name_modified_by_family) into the dataset with the bigtable wings left to match
df_big_left = incorporate_family_pattern(df_big_left, big_e)
# Merge to match 
df_merge = merge_df(df_big_left, df_dat_left, 'Name_modified_by_family')
# Concat the new match to the storage df 
df_big_match = df_merge[df_merge['Name_dat'].isna() == False].copy()
df_match =  pd.concat([df_match, df_big_match], ignore_index=True,axis=0)

Le dataset contient 1375 valeurs dont 208 correspondent aux fichiers dat.
Il reste 1167 valeurs à matcher.


### **3. Regex to match the family Gottingen**

In [785]:
df_dat_left, df_big_left = set_df_left(df_dat_left, df_merge, 'Name_modified_by_family')
big_g, dat_g = get_letter_group(df_dat_left, df_big_left, 'g')

Unmatched values left in the dat folder : (2158, 6)
Unmatched values left in big table : (1167, 6)
Big table number of wings left beginning by g : (399, 6)
Dat folder number of wings left beginning by g : (401, 6)
Difference : -2


In [786]:
gottingen = [lambda x : (re.sub("\s", "", str(x))), lambda x : (re.sub('\(.*?\)', "", str(x)))]
big_g = try_pattern_family(big_g, 'Gottingen', gottingen)
# Incorporating the new name found (Name_modified_by_family) into the dataset with the bigtable wings left to match
df_big_left = incorporate_family_pattern(df_big_left, big_g)
# Merge to match 
df_merge = merge_df(df_big_left, df_dat_left, 'Name_modified_by_family')
# Concat the new match to the storage df 
df_big_match = df_merge[df_merge['Name_dat'].isna() == False].copy()
df_match =  pd.concat([df_match, df_big_match], ignore_index=True,axis=0)

Le dataset contient 1167 valeurs dont 372 correspondent aux fichiers dat.
Il reste 795 valeurs à matcher.


### **4. Global regex with quick manual verif**

In [787]:
df_dat_left, df_big_left = set_df_left(df_dat_left, df_merge, 'Name_modified_by_family')

Unmatched values left in the dat folder : (1786, 6)
Unmatched values left in big table : (795, 6)


In [788]:
pattern_global =[lambda x : (re.sub("\s", "", str(x))), 
                 lambda x : (re.sub("-", "", str(x))), 
                 lambda x : (re.sub("%", "", str(x)))]
df_dat_left, df_big_left = try_pattern(df_big_left, df_dat_left, pattern_global)
df_merge = merge_df(df_big_left, df_dat_left, 'Name_modified_by_pattern')
# Concat the new match to the storage df 
df_big_match = df_merge[df_merge['Name_dat'].isna() == False].copy()
df_match =  pd.concat([df_match, df_big_match], ignore_index=True,axis=0)

Le dataset contient 795 valeurs dont 245 correspondent aux fichiers dat.
Il reste 550 valeurs à matcher.


In [789]:
groupna_family(df_merge, 10)

Family
Uncategorized    100
NASA              49
Hepperle          49
Selig et. al.     48
Quabeck           42
Leinauer          25
Wortmann          22
NACA/Munk         22
Boeing            19
Barth             16
Name: nb_na, dtype: int64


### **5. keep trying**

In [790]:
df_dat_left, df_big_left = set_df_left(df_dat_left, df_merge, 'Name_modified_by_pattern')

Unmatched values left in the dat folder : (1541, 6)
Unmatched values left in big table : (550, 6)


In [791]:
pattern_global =[lambda x : (re.sub("\s", "", str(x))),
                 lambda x : (re.sub("\.", "", str(x))),
                 lambda x : (re.sub(",", "", str(x))),
                 lambda x : (re.sub("/", "", str(x))),
                 lambda x : (re.sub("-", "", str(x))),
                 lambda x : (re.sub("_", "", str(x))),  
                 lambda x : (re.sub("%", "", str(x))),
                 lambda x : (re.sub("\(", "", str(x))),
                 lambda x : (re.sub("\)", "", str(x)))]
df_dat_left, df_big_left = try_pattern(df_big_left, df_dat_left, pattern_global)
df_merge = merge_df(df_big_left, df_dat_left, 'Name_modified_by_pattern')
# Concat the new match to the storage df 
df_big_match = df_merge[df_merge['Name_dat'].isna() == False].copy()
df_match =  pd.concat([df_match, df_big_match], ignore_index=True,axis=0)

Le dataset contient 550 valeurs dont 126 correspondent aux fichiers dat.
Il reste 424 valeurs à matcher.


In [792]:
groupna_family(df_merge, 10)

Family
Uncategorized    89
Hepperle         49
Selig et. al.    48
Leinauer         25
NACA/Munk        22
NASA             20
Boeing           19
Wortmann         19
Lockheed         14
Gulfstream       14
Name: nb_na, dtype: int64


### **6. Smooth -> sm**

In [793]:
df_dat_left, df_big_left = set_df_left(df_dat_left, df_merge, 'Name_modified_by_pattern')

Unmatched values left in the dat folder : (1415, 6)
Unmatched values left in big table : (424, 6)


In [794]:
pattern_global =[lambda x : (re.sub("\s", "", str(x))),
                 lambda x : (re.sub("\.", "", str(x))),
                 lambda x : (re.sub(",", "", str(x))),
                 lambda x : (re.sub("/", "", str(x))),
                 lambda x : (re.sub("-", "", str(x))),
                 lambda x : (re.sub("_", "", str(x))),  
                 lambda x : (re.sub("%", "", str(x))),
                 lambda x : (re.sub("\(", "", str(x))),
                 lambda x : (re.sub("\)", "", str(x))),
                 lambda x : (re.sub("smoothed", "sm", str(x)))]
df_dat_left, df_big_left = try_pattern(df_big_left, df_dat_left, pattern_global)
df_merge = merge_df(df_big_left, df_dat_left, 'Name_modified_by_pattern')
# Concat the new match to the storage df 
df_big_match = df_merge[df_merge['Name_dat'].isna() == False].copy()
df_match =  pd.concat([df_match, df_big_match], ignore_index=True,axis=0)

Le dataset contient 424 valeurs dont 13 correspondent aux fichiers dat.
Il reste 411 valeurs à matcher.


### **7. Removing whats between parenthesis with manual verif**

In [795]:
df_dat_left, df_big_left = set_df_left(df_dat_left, df_merge, 'Name_modified_by_pattern')

Unmatched values left in the dat folder : (1402, 6)
Unmatched values left in big table : (411, 6)


In [796]:
pattern_global =[lambda x : (re.sub('\(.*?\)', "", str(x))),
                 lambda x : (re.sub("\s", "", str(x)))]
df_dat_left, df_big_left = try_pattern(df_big_left, df_dat_left, pattern_global)

df_big_left['parenthesis_content'] = df_big_left['Name_modified'].copy()
df_big_left['parenthesis_content'] = df_big_left.loc[:, 'parenthesis_content'].apply(lambda x : (re.findall('\(.*?\)', str(x))))
df_dat_left['parenthesis_content'] = df_dat_left['Name_modified'].copy()
df_dat_left['parenthesis_content'] = df_dat_left.loc[:, 'parenthesis_content'].apply(lambda x : (re.findall('\(.*?\)', str(x))))

df_merge = merge_df(df_big_left, df_dat_left, 'Name_modified_by_pattern')
# Concat the new match to the storage df 
df_big_match = df_merge[df_merge['Name_dat'].isna() == False].copy()
df_match =  pd.concat([df_match, df_big_match], ignore_index=True,axis=0)

Le dataset contient 411 valeurs dont 29 correspondent aux fichiers dat.
Il reste 382 valeurs à matcher.


In [797]:
groupna_family(df_merge, 5)

Family
Uncategorized    70
Hepperle         49
Selig et. al.    28
Leinauer         25
NACA/Munk        22
Name: nb_na, dtype: int64


### **8. Family Hepperle**

In [798]:
df_dat_left, df_big_left = set_df_left(df_dat_left, df_merge, 'Name_modified_by_pattern')
big_m, dat_m = get_letter_group(df_dat_left, df_big_left, 'm')

Unmatched values left in the dat folder : (1373, 7)
Unmatched values left in big table : (382, 7)
Big table number of wings left beginning by m : (57, 7)
Dat folder number of wings left beginning by m : (86, 7)
Difference : -29


In [799]:
hepperle = [lambda x : (((re.split("\s", str(x)))[0])+(re.split("\s", str(x)))[1])]
big_m = try_pattern_family(big_m, 'Hepperle', hepperle)

In [800]:
df_big_left = incorporate_family_pattern(df_big_left, big_m)
df_merge = merge_df(df_big_left, df_dat_left, 'Name_modified_by_family')
# Concat the new match to the storage df 
df_big_match = df_merge[df_merge['Name_dat'].isna() == False].copy()
df_match =  pd.concat([df_match, df_big_match], ignore_index=True,axis=0)

Le dataset contient 382 valeurs dont 49 correspondent aux fichiers dat.
Il reste 333 valeurs à matcher.


### **9. Family NACA/Munk**

In [801]:
df_dat_left, df_big_left = set_df_left(df_dat_left, df_merge, 'Name_modified_by_family')
big_n, dat_n = get_letter_group(df_dat_left, df_big_left, 'n')

Unmatched values left in the dat folder : (1324, 7)
Unmatched values left in big table : (333, 7)
Big table number of wings left beginning by n : (55, 7)
Dat folder number of wings left beginning by n : (1023, 7)
Difference : -968


In [802]:
nacamunk = [lambda x : (re.sub("naca","", str(x))), lambda x : (re.sub("\s", "", str(x)))]
big_n = try_pattern_family(big_n, 'NACA/Munk', nacamunk)
df_big_left = incorporate_family_pattern(df_big_left, big_n)
df_merge = merge_df(df_big_left, df_dat_left, 'Name_modified_by_family')
df_dat_left.drop(df_dat_left[df_dat_left['Name_modified'].str.contains('naca')].index, inplace=True)
# Concat the new match to the storage df 
df_big_match = df_merge[df_merge['Name_dat'].isna() == False].copy()
df_match =  pd.concat([df_match, df_big_match], ignore_index=True,axis=0)

Le dataset contient 333 valeurs dont 22 correspondent aux fichiers dat.
Il reste 311 valeurs à matcher.


### **10. Family Gulfstream**

In [803]:
df_dat_left, df_big_left = set_df_left(df_dat_left, df_merge, 'Name_modified_by_family')

Unmatched values left in the dat folder : (323, 7)
Unmatched values left in big table : (311, 7)


In [804]:
# As much of giii rows in both dataset its the Gulfstream family 
giii_dat = df_dat_left[df_dat_left['Name_modified'].str.contains("giii")]
giii_big = df_big_left[df_big_left['Name_modified'].str.contains("giii")]
giinb = giii_big['Name_modified'].apply(lambda x : (re.findall('\d+', str(x))))
giinb = [int(nb[0]) for nb in giinb.values]
giinb = sorted(giinb)
giiiletter = ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n']
dict_giii = {}
[dict_giii.update({('giiibl'+str(nb)) : ('giii'+str(letter))}) for nb, letter in zip(giinb, giiiletter)]
df_big_left['Name_modified_by_pattern'] = df_big_left['Name_modified_by_pattern'].map(dict_giii).fillna(df_big_left['Name_modified_by_pattern'])

df_merge = merge_df(df_big_left, df_dat_left, 'Name_modified_by_pattern')
# Concat the new match to the storage df 
df_big_match = df_merge[df_merge['Name_dat'].isna() == False].copy()
df_match =  pd.concat([df_match, df_big_match], ignore_index=True,axis=0)

Le dataset contient 311 valeurs dont 14 correspondent aux fichiers dat.
Il reste 297 valeurs à matcher.


### **11. Store results of matchmaking**

In [805]:
df_match = df_match[['Name_big', 'Family', 'Name_modified', 'Name_dat', 'Size']]
df_match.to_csv('data/df_match.csv')
df_match['SimilarFunction'] = False
print(df_match.shape)
df_match.head()

(6027, 6)


Unnamed: 0,Name_big,Family,Name_modified,Name_dat,Size,SimilarFunction
0,A18,Uncategorized,a18,a18.DAT,945.0,False
1,A6014-S,Ayers,a6014-s,A6014-S.DAT,6116.0,False
2,A6016-S,Ayers,a6016-s,A6016-S.DAT,5914.0,False
3,A6018-S,Ayers,a6018-s,A6018-S.DAT,5889.0,False
4,A6020-S,Ayers,a6020-s,A6020-S.DAT,6116.0,False


# Similar test

### Init

In [806]:
# Get the unmatched names of each dataset
df_dat_left, df_big_left = set_df_left(df_dat_left, df_merge, 'Name_modified_by_pattern')

# Choose some regex to uniformize the names left
pattern_global =[lambda x : (re.sub("\s", "", str(x))),
                 lambda x : (re.sub("\.", "", str(x))),
                 lambda x : (re.sub(",", "", str(x))),
                 lambda x : (re.sub("/", "", str(x))),
                 lambda x : (re.sub("-", "", str(x))),
                 lambda x : (re.sub("_", "", str(x))),  
                 lambda x : (re.sub("%", "", str(x))),
                 lambda x : (re.sub("\(", "", str(x))),
                 lambda x : (re.sub("\)", "", str(x))),
                 lambda x : (re.sub("smoothed", "sm", str(x)))]

# Apply the pattern before trying to match by similarities
df_dat_left, df_big_left = try_pattern(df_big_left, df_dat_left, pattern_global)

# Download the dataframe left before the similarity affectation if needed
df_dat_left.to_csv('data/df_dat_left.csv')
df_big_left.to_csv('data/df_big_left.csv')

Unmatched values left in the dat folder : (309, 7)
Unmatched values left in big table : (297, 7)


In [807]:
# Found 1 error with the correction below 
df_big_left.loc[df_big_left['Name_modified_by_pattern'] == 'hq17', 'Name_modified_by_pattern'] = 'hq17'
df_big_left.loc[df_big_left['Name_modified_by_pattern'] == 'qa001sm', 'Name_modified_by_pattern'] =  'qa001sm'
df_big_left.loc[df_big_left['Name_modified_by_pattern'] == 'qa002sm', 'Name_modified_by_pattern'] =  'qa002sm'
df_big_left.loc[df_big_left['Name_modified_by_pattern'] == 's301010384', 'Name_modified_by_pattern'] =  's3010'
df_big_left.loc[df_big_left['Name_modified_by_pattern'] == 's301409585', 'Name_modified_by_pattern'] =  's3014'
df_big_left.loc[df_big_left['Name_modified_by_pattern'] == 's301609587', 'Name_modified_by_pattern'] =  's3016'
df_big_left.loc[df_big_left['Name_modified_by_pattern'] == 's302109584', 'Name_modified_by_pattern'] =  's3021'
df_big_left.loc[df_big_left['Name_modified_by_pattern'] == 'qa002', 'Name_modified_by_pattern'] =  'qa002'
df_big_left.loc[df_big_left['Name_modified_by_pattern'] == 'jh817', 'Name_modified_by_pattern'] =  'jh817'

df_big_left.loc[df_big_left['Name_modified_by_pattern'] == 'du80176v1', 'Name_modified_by_pattern'] = 'du80176v1'
df_big_left.loc[df_big_left['Name_modified_by_pattern'] == 'jh25', 'Name_modified_by_pattern'] = 'jh25'
df_big_left.loc[df_big_left['Name_modified_by_pattern'] == 'qa001', 'Name_modified_by_pattern'] =  'qa001'
df_big_left.loc[df_big_left['Name_modified_by_pattern'] == 'qa003', 'Name_modified_by_pattern'] =  'qa003'
df_big_left.loc[df_big_left['Name_modified_by_pattern'] == 'du80141', 'Name_modified_by_pattern'] =  'du80141'
df_big_left.loc[df_big_left['Name_modified_by_pattern'] == 'hawkertempest61semispan', 'Name_modified_by_pattern'] =  'tempest2'
df_big_left.loc[df_big_left['Name_modified_by_pattern'] == 'hawkertempest375semispan', 'Name_modified_by_pattern'] =  'tempest1'
df_big_left.loc[df_big_left['Name_modified_by_pattern'] == 'hawkertempest9677semispan', 'Name_modified_by_pattern'] =  'tempest3'
df_big_left.loc[df_big_left['Name_modified_by_pattern'] == 'du80176v1alt', 'Name_modified_by_pattern'] =  'du80176v1alt'
df_big_left.loc[df_big_left['Name_modified_by_pattern'] == 'lockheedc141bl0', 'Name_modified_by_pattern'] =  'c141a'
df_big_left.loc[df_big_left['Name_modified_by_pattern'] == 's1010hpvairfoil', 'Name_modified_by_pattern'] =  's1010'

df_big_left.loc[df_big_left['Name_modified_by_pattern'] == 'fage&collins1', 'Name_modified_by_pattern'] = 'fg1'
df_big_left.loc[df_big_left['Name_modified_by_pattern'] == 'fage&collins2', 'Name_modified_by_pattern'] = 'fg2'
df_big_left.loc[df_big_left['Name_modified_by_pattern'] == 'fage&collins3', 'Name_modified_by_pattern'] = 'fg3'
df_big_left.loc[df_big_left['Name_modified_by_pattern'] == 'fage&collins4', 'Name_modified_by_pattern'] = 'fg4'
df_big_left.loc[df_big_left['Name_modified_by_pattern'] == 'defiantcanardbl110', 'Name_modified_by_pattern'] = 'defcnd2'
df_big_left.loc[df_big_left['Name_modified_by_pattern'] == 'defiantcanardbl145', 'Name_modified_by_pattern'] = 'defcnd3'

df_big_left.loc[df_big_left['Name_modified_by_pattern'] == 'marskepioneerianaca2311243012ahybrid', 'Name_modified_by_pattern'] = 'marske2'
df_big_left.loc[df_big_left['Name_modified_by_pattern'] == 'marskepioneeriidrootnaca431012a24112hybrid', 'Name_modified_by_pattern'] = 'marske3'
df_big_left.loc[df_big_left['Name_modified_by_pattern'] == 'marskepioneeriidtipnaca431012a', 'Name_modified_by_pattern'] = 'marske4'
df_big_left.loc[df_big_left['Name_modified_by_pattern'] == 'marskepioneeriidtipnaca431012a*833hybrid', 'Name_modified_by_pattern'] = 'marske4'
df_big_left.loc[df_big_left['Name_modified_by_pattern'] == 'marskemonarchnaca43012a', 'Name_modified_by_pattern'] = 'marske5'
df_big_left.loc[df_big_left['Name_modified_by_pattern'] == 'lockheedgeorgiasupercritical', 'Name_modified_by_pattern'] = 'lg10sc'
df_big_left.loc[df_big_left['Name_modified_by_pattern'] == 'be656865tc75cambersuitableforf1atowlinegliderc', 'Name_modified_by_pattern'] = 'be6568'
df_big_left.loc[df_big_left['Name_modified_by_pattern'] == 'be669966maxtc99maxcamberclmax21atre100000trans', 'Name_modified_by_pattern'] = 'be6699'
df_big_left.loc[df_big_left['Name_modified_by_pattern'] == 'sikorskysc1094r8', 'Name_modified_by_pattern'] = 'sc1094r8'
df_big_left.loc[df_big_left['Name_modified_by_pattern'] == 'dragonflycanard', 'Name_modified_by_pattern'] = 'drgnfly'

df_big_left.loc[df_big_left['Name_modified_by_pattern'] == 'davisbasicb24wing', 'Name_modified_by_pattern'] = 'davis_corrected'
df_big_left.loc[df_big_left['Name_modified_by_pattern'] == 'lockheedc141bl1136', 'Name_modified_by_pattern'] = 'c141b'
df_big_left.loc[df_big_left['Name_modified_by_pattern'] == 'lockheedc141bl42657', 'Name_modified_by_pattern'] = 'c141c'
df_big_left.loc[df_big_left['Name_modified_by_pattern'] == 'lockheedc141bl61061', 'Name_modified_by_pattern'] = 'c141d'
df_big_left.loc[df_big_left['Name_modified_by_pattern'] == 'lockheedc141bl76111', 'Name_modified_by_pattern'] = 'c141e'
df_big_left.loc[df_big_left['Name_modified_by_pattern'] == 'lockheedc141bl95889', 'Name_modified_by_pattern'] = 'c141f'
df_big_left.loc[df_big_left['Name_modified_by_pattern'] == 'lockheedc5abl1256', 'Name_modified_by_pattern'] = 'c5e'
df_big_left.loc[df_big_left['Name_modified_by_pattern'] == 'lockheedc5abl576', 'Name_modified_by_pattern'] = 'c5c'
df_big_left.loc[df_big_left['Name_modified_by_pattern'] == 'lockheedc5abl7586', 'Name_modified_by_pattern'] = 'c5d'
df_big_left.loc[df_big_left['Name_modified_by_pattern'] == 'marskexm1df1430', 'Name_modified_by_pattern'] = 'marske1'
df_big_left.loc[df_big_left['Name_modified_by_pattern'] == 'naca001034a=08cli=02', 'Name_modified_by_pattern'] = 'naca001034a08cli0'
df_big_left.loc[df_big_left['Name_modified_by_pattern'] == 'ronczlowdragflyingwing', 'Name_modified_by_pattern'] = 'marske7'

In [808]:
# Merge the verified result 
df_merge = merge_df(df_big_left, df_dat_left, 'Name_modified_by_pattern')
# Concat the new match to the storage df 
df_big_match = df_merge[df_merge['Name_dat'].isna() == False].copy()
df_match =  pd.concat([df_match, df_big_match], ignore_index=True,axis=0)
df_match = df_match[['Name_big', 'Family', 'Name_modified', 'Name_dat', 'Size', 'SimilarFunction']]
df_match.loc[df_match['SimilarFunction'].isna(), 'SimilarFunction'] = True
df_match['checked'] = True
print(df_match.shape)
print(df_match.columns)
df_dat_left, df_big_left = set_df_left(df_dat_left, df_merge, 'Name_modified_by_pattern')

Le dataset contient 297 valeurs dont 33 correspondent aux fichiers dat.
Il reste 264 valeurs à matcher.
(6060, 7)
Index(['Name_big', 'Family', 'Name_modified', 'Name_dat', 'Size',
       'SimilarFunction', 'checked'],
      dtype='object')
Unmatched values left in the dat folder : (277, 7)
Unmatched values left in big table : (264, 7)


In [809]:
# Get list of names to try matching
list_big_modified = df_big_left['Name_modified'].values
list_dat_modified = df_dat_left['Name_modified'].values

list_big_modified_by_pattern = df_big_left['Name_modified_by_pattern'].values
list_dat_modified_by_pattern = df_dat_left['Name_modified_by_pattern'].values

list_big_modified_by_family = df_big_left['Name_modified_by_family'].values
list_dat_modified_by_family = df_dat_left['Name_modified_by_family'].values

### Function

In [810]:
from difflib import SequenceMatcher
def similar(a, b):
    return SequenceMatcher(None, a, b).ratio()

from fuzzywuzzy import fuzz
def similar_fuzz(string1, string2):
    return fuzz.partial_ratio(string1, string2)

def find_match(list_big, list_dat):
    ratio = []
    word = []
    match = []
    partial_match = []
    partial_ratio = []

    for i in list_big:
        maxscore = 0
        maxpartial = 0
        storeword = "temp"
        storeword_partial = "temp"
        for j in list_dat:
            score = similar(i,j)
            partial_score = similar_fuzz(i,j)
            if (score > maxscore):
                maxscore = score
                storeword = j
            # Equal score are not handled by this function
            # It could be a great improvement 
            #if (score == maxscore):
            if (partial_score > maxpartial):
                maxpartial = partial_score
                storeword_partial = j
        word.append(i)
        match.append(storeword)
        ratio.append(maxscore)
        partial_match.append(storeword_partial)
        partial_ratio.append(maxpartial)
        df_similar = pd.DataFrame(data={'Name' : word, 'Match' : match, 'Score' : ratio, 'Match_2' : partial_match, 'Score_2' : partial_ratio})
    return df_similar

In [811]:
def manual_verif_by_score(df, score):
    max_born = score
    min_born = score - 0.1
    mask_score = (df['Score'] < max_born) & (df['Score'] >= min_born)
    return df[mask_score].sort_values(by=['Score'], ascending=False)

def manual_validation(df, score):
    max_born = score
    min_born = score - 0.1
    mask_score = (df['Score'] < max_born) & (df['Score'] >= min_born)
    df.loc[df[mask_score].index, 'checked'] = True
    return df

### Test

In [812]:
df_similar_modified = find_match(list_big_modified, list_dat_modified)
df_similar_modified_by_pattern = find_match(list_big_modified_by_pattern, list_dat_modified_by_pattern)
df_similar_modified_by_family = find_match(list_big_modified_by_family, list_dat_modified_by_family)
print(df_similar_modified['Score'].max())
print(df_similar_modified_by_pattern['Score'].max())
print(df_similar_modified_by_family['Score'].max())

0.875
0.9655172413793104
0.9411764705882353


We will work with df_big_left since it contains the best score among the three.

In [813]:
# We create some columns to store the result of basic verif
df_similar_modified_by_pattern['checked'] = False
df_similar_modified_by_pattern['same'] = False
df_similar_modified_by_pattern['unique'] = False

# We check if the match found are unique and if the 2 match found are the same
mask_unique = df_similar_modified_by_pattern['Match'].value_counts() == 1
index_unique = mask_unique[mask_unique == True].index.to_list()
name_all = df_similar_modified_by_pattern['Match'].to_list()
unique_col = []
for name in name_all:
    if(name in index_unique):
        unique_col.append(True)
    else:
        unique_col.append(False)
df_similar_modified_by_pattern['unique'] = unique_col
df_similar_modified_by_pattern.loc[(df_similar_modified_by_pattern['Match'] == df_similar_modified_by_pattern['Match_2']), 'same'] = True

### Result where the match is unique

In [814]:
df = df_similar_modified_by_pattern[df_similar_modified_by_pattern['unique'] == True].copy()
print(df.shape)
df.head()

(174, 8)


Unnamed: 0,Name,Match,Score,Match_2,Score_2,checked,same,unique
1,ag44ct02f,ag44ct02r,0.888889,ag44ct02r,89,False,True,True
2,ag455ct02frot,ag455ct02r,0.869565,ag455ct02r,90,False,True,True
3,ag45c03f,ag45c03,0.933333,ag45c03,100,False,True,True
4,ag45ct02f,ag45ct02r,0.888889,ag455ct02r,89,False,False,True
5,ag46c03f,ag46c03,0.933333,ag46c03,100,False,True,True


In [815]:
[print(f'{x/10} : {manual_verif_by_score(df, x/10).shape}') for x in range(10,1,-1)]

1.0 : (26, 8)
0.9 : (49, 8)
0.8 : (39, 8)
0.7 : (28, 8)
0.6 : (10, 8)
0.5 : (12, 8)
0.4 : (5, 8)
0.3 : (1, 8)
0.2 : (0, 8)


[None, None, None, None, None, None, None, None, None]

In [816]:
manual_verif_by_score(df, 1)
df = manual_validation(df, 1)

In [817]:
manual_verif_by_score(df, 0.9)
df = manual_validation(df, 0.9)

In [818]:
# Found 1 error with the correction below 
df.loc[df['Name'] == 'jwl0438821', 'Match'] =  'jwl043'
manual_verif_by_score(df, 0.8)
df = manual_validation(df, 0.8)

In [819]:
# Found 1 error with the correction below 
df.loc[df['Name'] == 'ssv2316vansrv10', 'Match'] =  'ssv2316'
manual_verif_by_score(df, 0.7)
df = manual_validation(df, 0.7)

In [820]:
# Found 4 error with the correction below 
df.loc[df['Name'] == 'uag92170sf', 'Match'] =  'uag92170sf'
df.loc[df['Name'] == 'grummank2', 'Match'] =  'k2'
df.loc[df['Name'] == 'grummank3', 'Match'] =  'k3'
df.loc[df['Name'] == 'jh35', 'Match'] =  'jh35'

manual_verif_by_score(df, 0.6)
df = manual_validation(df, 0.6)

In [821]:
manual_verif_by_score(df, 0.5)
df = manual_validation(df, 0.5)

In [822]:
# Found 1 error with the correction below 
df.loc[df['Name'] == 'lockheedc5abl4882', 'Match'] =  'c5b'

manual_verif_by_score(df, 0.4)
df = manual_validation(df, 0.4)

In [823]:
# Found 1 error with the correction below 
df.loc[df['Name'] == 'flyingwingpeewee30', 'Match'] =  'flyingwingpeeWee30'

manual_verif_by_score(df, 0.3)
df = manual_validation(df, 0.3)

In [824]:
df.loc[df['checked'] != True, 'checked'] =  True

In [825]:
df.head()

Unnamed: 0,Name,Match,Score,Match_2,Score_2,checked,same,unique
1,ag44ct02f,ag44ct02r,0.888889,ag44ct02r,89,True,True,True
2,ag455ct02frot,ag455ct02r,0.869565,ag455ct02r,90,True,True,True
3,ag45c03f,ag45c03,0.933333,ag45c03,100,True,True,True
4,ag45ct02f,ag45ct02r,0.888889,ag455ct02r,89,True,False,True
5,ag46c03f,ag46c03,0.933333,ag46c03,100,True,True,True


In [826]:
df_big_left.head()

Unnamed: 0,Name_big,Family,Name_modified,First letter,Name_modified_by_pattern,parenthesis_content,Name_modified_by_family
0,63A108 MOD C,NASA,63a108 mod c,6,63a108modc,[],63a108 mod c
1,AG44CT -02F,Drela,ag44ct -02f,a,ag44ct02f,[],ag44ct -02f
2,AG455CT -02F ROT.,Drela,ag455ct -02f rot.,a,ag455ct02frot,[],ag455ct -02f rot.
3,AG45C -03F,Drela,ag45c -03f,a,ag45c03f,[],ag45c -03f
4,AG45CT -02F,Drela,ag45ct -02f,a,ag45ct02f,[],ag45ct -02f


### Store the result

In [827]:
df.shape

(174, 8)

In [828]:
# Merge the verified result 
df['Name_similar'] = df['Name'].copy()
df_big_left['Name_similar'] = df_big_left['Name_modified_by_pattern'].copy()

df_merge = pd.merge(df_big_left, df, on='Name_similar', how='left', suffixes=('_big', '_match'))
nb_mismatch = df_merge[df_merge['Match'].isna()].shape[0]
nb_match = df_merge.shape[0] - nb_mismatch
print(f'Le dataset contient {df_merge.shape[0]} valeurs dont {nb_match} correspondent aux fichiers dat.')
print(f'Il reste {nb_mismatch} valeurs à matcher.')

Le dataset contient 264 valeurs dont 174 correspondent aux fichiers dat.
Il reste 90 valeurs à matcher.


In [829]:
df_big_match = df_merge[df_merge['Match'].isna() == False].copy()
df_big_match = df_big_match[['Name_big', 'Family', 'Name_similar', 'Match', 'checked']]
df_big_match.rename(columns={"Match": "Name_modified_by_pattern"},inplace=True)
df_match =  pd.concat([df_match, df_big_match], ignore_index=True,axis=0)
print(df_match.shape)
df_match.head()

(6234, 9)


Unnamed: 0,Name_big,Family,Name_modified,Name_dat,Size,SimilarFunction,checked,Name_similar,Name_modified_by_pattern
0,A18,Uncategorized,a18,a18.DAT,945.0,False,True,,
1,A6014-S,Ayers,a6014-s,A6014-S.DAT,6116.0,False,True,,
2,A6016-S,Ayers,a6016-s,A6016-S.DAT,5914.0,False,True,,
3,A6018-S,Ayers,a6018-s,A6018-S.DAT,5889.0,False,True,,
4,A6020-S,Ayers,a6020-s,A6020-S.DAT,6116.0,False,True,,


In [830]:
# Complete with Name dat file
df_match = pd.merge(df_match, df_dat_left[['Name_modified_by_pattern', 'Name_dat', 'Size']], on='Name_modified_by_pattern', how='left', suffixes=('_match', '_left'))
df_match['Name_dat'] = df_match['Name_dat_match'].fillna(df_match['Name_dat_left'])
df_match['Size'] = df_match['Size_match'].fillna(df_match['Size_left'])
df_match.drop(columns=['Name_dat_match', 'Size_match', 'Name_modified_by_pattern', 'Name_dat_left', 'Size_left'], inplace=True)

In [831]:
##### Exceptions !!!!!!
df_match[df_match['Name_dat'].isna()]

Unnamed: 0,Name_big,Family,Name_modified,SimilarFunction,checked,Name_similar,Name_dat,Size
6132,JH35,Hammond,,,True,jh35,,
6229,UAG92 170/SF,Uncategorized,,,True,uag92170sf,,


### Result not unique

In [832]:
# Get the dataset with the unmatched values left
# Merge the verified result 
df['Match_similar'] = df['Match'].copy()
df_dat_left['Match_similar'] = df_dat_left['Name_modified_by_pattern'].copy()
df_dat_left = pd.merge(df, df_dat_left, on='Match_similar', how='right', suffixes=('_match', '_dat'))
df_dat_left = df_dat_left[df_dat_left['Match'].isna()]
df_dat_left.dropna(axis=1, how='all', inplace=True)
print(f"Unmatched values left in the dat folder : {df_dat_left.shape}")

df_big_left = df_merge[df_merge['Match'].isna()].copy()
df_big_left.dropna(axis=1, how='all', inplace=True)
print(f"Unmatched values left in big table : {df_big_left.shape}")

df_big_left = df_big_left[['Name_big', 'Family', 'Name_modified', 'Name_similar', 'parenthesis_content']]
df_dat_left = df_dat_left[['Match_similar', 'Name_modified', 'Name_dat', 'parenthesis_content']]

Unmatched values left in the dat folder : (105, 8)
Unmatched values left in big table : (90, 8)


In [833]:
# relaunch similar on df left
# Get list of names to try matching
list_big_left = df_big_left['Name_similar'].values
error_list = ['hq17', 'qa001sm', 'qa002sm', 'qa001', 'qa002', 'qa003',
              'jh817', 'jh25', 'du80176v1', 'du80141','naca001034a08cli0']
list_big_left = [name for name in list_big_left if name not in error_list]
list_dat_left = df_dat_left['Match_similar'].values
df_similar_left = find_match(list_big_left, list_dat_left)

In [834]:
# error_list = ['hq17', 'qa001sm', 'qa002sm', 'qa001', 'qa002', 'qa003',
#               'jh817', 'jh25', 'jh35', 'du8017v1alt', 'du80176v1', 'du80141',
#               'uag9217sf', 'naca001034a08cli0']

In [835]:
# error not in list_big_left : du8017v1alt, uag9217sf, jh35   du80176v1alt
error_list = ['hq17', 'qa001sm', 'qa002sm', 'qa001', 'qa002', 'qa003',
              'jh817', 'jh25', 'du80176v1', 'du80141','naca001034a08cli0', 'du80176v1alt']

In [836]:
[print(f'{x/10} : {manual_verif_by_score(df_similar_left, x/10).shape}') for x in range(10,1,-1)]

1.0 : (6, 5)
0.9 : (11, 5)
0.8 : (25, 5)
0.7 : (3, 5)
0.6 : (6, 5)
0.5 : (10, 5)
0.4 : (15, 5)
0.3 : (3, 5)
0.2 : (0, 5)


[None, None, None, None, None, None, None, None, None]

In [837]:
manual_verif_by_score(df_similar_left, 1)
df_similar_left = manual_validation(df_similar_left, 1)

In [838]:
# Found 1 error with the correction below 
manual_verif_by_score(df_similar_left, 0.9)
# match 2 is better than 1
mask_score = (df_similar_left['Score'] >= 0.8) & (df_similar_left['Score'] <= 0.9)
df_similar_left.loc[df_similar_left[mask_score].index, 'Match'] = df_similar_left.loc[df_similar_left[mask_score].index, 'Match_2']
df_similar_left = manual_validation(df_similar_left, 0.9)

In [839]:
manual_verif_by_score(df_similar_left, 0.8)
# match 2 is better than 1
mask_score = (df_similar_left['Score'] >= 0.7) & (df_similar_left['Score'] <= 0.8)
df_similar_left.loc[df_similar_left[mask_score].index, 'Match'] = df_similar_left.loc[df_similar_left[mask_score].index, 'Match_2']
df_similar_left = manual_validation(df_similar_left, 0.8)

In [840]:
manual_verif_by_score(df_similar_left, 0.7)
df_similar_left = manual_validation(df_similar_left, 0.7)

In [841]:
manual_verif_by_score(df_similar_left, 0.6)

Unnamed: 0,Name,Match,Score,Match_2,Score_2,checked
5,ah217version,ah217,0.588235,ah217,100,
8,amsoil1rutancanard,amsoil1,0.56,amsoil1,100,
23,deesokay230,oa206,0.5,oa206,60,
24,du80176v1alt,du861372,0.5,vr1,67,
34,jhsym10,n5h10,0.5,m12,67,
76,swallowp30,jwl030,0.5,jwl030,50,


In [842]:
manual_verif_by_score(df_similar_left, 0.5)

Unnamed: 0,Name,Match,Score,Match_2,Score_2,checked
56,naca66218155a=6p51root,p51droot,0.466667,p51droot,93,
12,boeingvertolv43012158,v43012,0.444444,v43012,100,
13,boeingvertolv43015248,v43012,0.444444,v43015,100,
21,cyranoiip30,raf30md,0.444444,m3,50,
30,horstmannandquasthq300gdmod2,hq300gd2,0.444444,hq300gd2,88,
55,naca6621812a=6p51tip,p51dtip,0.444444,p51dtip,92,
57,naca001264a=08cli=02,a63a108c,0.428571,c5a,67,
67,republics3,rhodesg30,0.421053,s3,100,
58,nplfromarccp1372,hsnlf213,0.416667,nplx,75,
69,roncz1046voyagercanard,oneradsharpte,0.4,r1046,80,


In [843]:
df_similar_left.loc[df_similar_left['checked'].isna(), 'Match'] = None

### Store the result

In [844]:
# Merge the verified result 
df_similar_left['Name_similar'] = df_similar_left['Name'].copy()
df_merge = pd.merge(df_big_left, df_similar_left, on='Name_similar', how='left', suffixes=('_big', '_match'))
nb_mismatch = df_merge[df_merge['Match'].isna()].shape[0]
nb_match = df_merge.shape[0] - nb_mismatch
print(f'Le dataset contient {df_merge.shape[0]} valeurs dont {nb_match} correspondent aux fichiers dat.')
print(f'Il reste {nb_mismatch} valeurs à matcher.')

Le dataset contient 90 valeurs dont 45 correspondent aux fichiers dat.
Il reste 45 valeurs à matcher.


In [845]:
df_big_match = df_merge[df_merge['Match'].isna() == False].copy()
df_big_match = df_big_match[['Name_big', 'Family', 'Name_similar', 'Match', 'checked']]
df_big_match.rename(columns={"Match": "Match_similar"},inplace=True)
df_match =  pd.concat([df_match, df_big_match], ignore_index=True,axis=0)
print(df_match.shape)

(6279, 9)


In [846]:
# Complete with Name dat file
df_match = pd.merge(df_match, df_dat_left[['Match_similar', 'Name_modified', 'Name_dat']], on='Match_similar', how='left', suffixes=('_match', '_left'))
df_match['Name_dat'] = df_match['Name_dat_match'].fillna(df_match['Name_dat_left'])
df_match['Name_modified'] = df_match['Name_modified_match'].fillna(df_match['Name_modified_left'])
df_match.drop(columns=['Name_dat_match', 'Name_dat_left', 'Name_modified_match', 'Name_modified_left'], inplace=True)

In [847]:
df_match.tail(5)

Unnamed: 0,Name_big,Family,SimilarFunction,checked,Name_similar,Size,Match_similar,Name_dat,Name_modified
6274,SD7032-099-88,Selig et. al.,,True,sd703209988,,sd7032,sd7032.DAT,sd7032
6275,SD7037-092-88,Selig et. al.,,True,sd703709288,,sd7037,sd7037.DAT,sd7037
6276,SD8000-089-88,Selig et. al.,,True,sd800008988,,sd8000,sd8000.DAT,sd8000
6277,SD8020-010-88,Selig et. al.,,True,sd802001088,,sd8020,sd8020.DAT,sd8020
6278,TH 25816 HALE,Uncategorized,,True,th25816hale,,th25816,th25816.DAT,th25816


In [848]:
##### Exceptions !!!!!!
df_match[df_match['Name_dat'].isna()]

Unnamed: 0,Name_big,Family,SimilarFunction,checked,Name_similar,Size,Match_similar,Name_dat,Name_modified
6132,JH35,Hammond,,True,jh35,,,,
6229,UAG92 170/SF,Uncategorized,,True,uag92170sf,,,,


In [849]:
# df_match.to_csv('index_90left.csv')

### Last wings

In [850]:
# Get the dataset with the unmatched values left
# Merge the verified result 
df_similar_left['Match_similar'] = df_similar_left['Match'].copy()
df_dat_left = pd.merge(df_similar_left, df_dat_left, on='Match_similar', how='right', suffixes=('_match', '_dat'))
df_dat_left = df_dat_left[df_dat_left['Match'].isna()]
df_dat_left.dropna(axis=1, how='all', inplace=True)
print(f"Unmatched values left in the dat folder : {df_dat_left.shape}")

df_big_left = df_merge[df_merge['Match'].isna()].copy()
df_big_left.dropna(axis=1, how='all', inplace=True)
print(f"Unmatched values left in big table : {df_big_left.shape}")

df_big_left = df_big_left[['Name_big', 'Family', 'Name_modified', 'Name_similar', 'parenthesis_content']]
df_dat_left = df_dat_left[['Match_similar', 'Name_modified', 'Name_dat', 'parenthesis_content']]

Unmatched values left in the dat folder : (60, 4)
Unmatched values left in big table : (45, 9)


In [851]:
# Get list of names to try matching
list_big_left = df_big_left['Name_similar'].values
error_list = ['hq17', 'qa001sm', 'qa002sm', 'qa001', 'qa002', 'qa003',
              'jh817', 'jh25', 'du80176v1', 'du80141','naca001034a08cli0', 
              'cyranoiip30', 'du80176v1alt', 'flyingwingpeewee30']
list_big_left = [name for name in list_big_left if name not in error_list]
list_dat_left = df_dat_left['Match_similar'].values

df_similar_left2 = find_match(list_big_left, list_dat_left)

In [852]:
[print(f'{x/10} : {manual_verif_by_score(df_similar_left2, x/10).shape}') for x in range(10,0,-1)]

1.0 : (0, 5)
0.9 : (0, 5)
0.8 : (0, 5)
0.7 : (0, 5)
0.6 : (2, 5)
0.5 : (6, 5)
0.4 : (19, 5)
0.3 : (3, 5)
0.2 : (1, 5)
0.1 : (0, 5)


[None, None, None, None, None, None, None, None, None, None]

In [853]:
df_similar_left2['unique'] = False
df_similar_left2['checked'] = False
mask_unique = df_similar_left2['Match'].value_counts() == 1
index_unique = mask_unique[mask_unique == True].index.to_list()
name_all = df_similar_left2['Match'].to_list()
unique_col = []
for name in name_all:
    if(name in index_unique):
        unique_col.append(True)
    else:
        unique_col.append(False)
df_similar_left2['unique'] = unique_col
#df_similar_left2.drop(index=[14,19,30,44], axis=0, inplace=True)
df_unique = df_similar_left2[df_similar_left2['unique'] == True].copy()
df_duplicates = df_similar_left2[df_similar_left2['unique'] == False].copy()
print(df_unique.shape)
print(df_duplicates.shape)

(17, 7)
(14, 7)


In [854]:
# df_unique seems all good 
df_unique.loc[df_unique['Name'] == 'nyugrummank1', 'Match'] = 'k1'
df_similar_left2.loc[df_similar_left2['unique'] == True, 'checked'] = True
df_unique['checked'] = True

In [856]:
df_duplicates.loc[df_duplicates['Name'] == 'boeingvertolvr1', 'Match'] = 'vr1'
df_duplicates.loc[df_duplicates['Name'] == 'nplfromarccp1372', 'Match'] = 'nplx'
df_duplicates.loc[df_duplicates['Name'] == 'amsoil1rutancanard', 'Match'] = 'amsoil1'
df_duplicates.loc[df_duplicates['Name'] == 'republics3', 'Match'] = 's3'
df_duplicates.loc[df_duplicates['Name'] == 'boeingvertolv43012158', 'Match'] = 'v43012'
df_duplicates.loc[df_duplicates['Name'] == 'boeingvertolv43015248', 'Match'] = 'v43015'
df_duplicates.loc[df_duplicates['Name'] == 'naca66218155a=6p51root', 'Match'] = 'p51hroot'
df_duplicates.loc[df_duplicates['Name'] == 'roncz1082voyagerrootouteraftwing', 'Match'] = 'r1082'
df_duplicates.loc[df_duplicates['Name'] == 'roncz1082tvoyagertipouteraftwing', 'Match'] = 'r1082t'

In [857]:
df_duplicates.loc[:, 'checked'] = True
df_duplicates.loc[df_duplicates['Name'] == 'deesokay230', 'checked'] = False
df_duplicates.loc[df_duplicates['Name'] == 'naca001264a=08cli=02', 'checked'] = False
df_duplicates.loc[df_duplicates['Name'] == 'x35lowdragbody', 'checked'] = False
df_duplicates.loc[df_duplicates['Name'] == 'jhsym10', 'checked'] = False
df_duplicates.loc[df_duplicates['Name'] == 'swallowp30', 'checked'] = False
df_final = pd.concat([df_duplicates[df_duplicates['checked'] == True], df_unique], ignore_index=True,axis=0)

In [859]:
# Merge the verified result 
df_final['Name_similar'] = df_final['Name'].copy()
df_merge = pd.merge(df_big_left, df_final, on='Name_similar', how='left', suffixes=('_big', '_match'))
nb_mismatch = df_merge[df_merge['Match'].isna()].shape[0]
nb_match = df_merge.shape[0] - nb_mismatch
print(f'Le dataset contient {df_merge.shape[0]} valeurs dont {nb_match} correspondent aux fichiers dat.')
print(f'Il reste {nb_mismatch} valeurs à matcher.')

Le dataset contient 45 valeurs dont 26 correspondent aux fichiers dat.
Il reste 19 valeurs à matcher.


In [860]:
df_big_match = df_merge[df_merge['Match'].isna() == False].copy()
df_big_match = df_big_match[['Name_big', 'Family', 'Match', 'checked']]
df_big_match.rename(columns={"Match": "Name_modified"},inplace=True)
df_match =  pd.concat([df_match, df_big_match], ignore_index=True,axis=0)
print(df_match.shape)
df_match.head()

(6305, 9)


Unnamed: 0,Name_big,Family,SimilarFunction,checked,Name_similar,Size,Match_similar,Name_dat,Name_modified
0,A18,Uncategorized,False,True,,945.0,,a18.DAT,a18
1,A6014-S,Ayers,False,True,,6116.0,,A6014-S.DAT,a6014-s
2,A6016-S,Ayers,False,True,,5914.0,,A6016-S.DAT,a6016-s
3,A6018-S,Ayers,False,True,,5889.0,,A6018-S.DAT,a6018-s
4,A6020-S,Ayers,False,True,,6116.0,,A6020-S.DAT,a6020-s


In [861]:
df_match.tail(20)

Unnamed: 0,Name_big,Family,SimilarFunction,checked,Name_similar,Size,Match_similar,Name_dat,Name_modified
6285,BOEING VERTOL V43015-2.48,Boeing,,True,,,,,v43015
6286,BOEING-VERTOL VR-1,Boeing,,True,,,,,vr1
6287,BOEING-VERTOL VR-5,Boeing,,True,,,,,vr5
6288,BOEING-VERTOL VR-7,Boeing,,True,,,,,vr7
6289,BOEING-VERTOL VR-7 WITH TAB,Boeing,,True,,,,,vr7b
6290,BOEING-VERTOL VR-8,Boeing,,True,,,,,vr8
6291,BOEING-VERTOL VR-8 WITH TAB,Boeing,,True,,,,,vr8b
6292,BOEING-VERTOL VR-9,Boeing,,True,,,,,vr9
6293,HORSTMANN AND QUAST HQ-300 GD(MOD 2),Uncategorized,,True,,,,,hq300gd2
6294,HUGHES HELICOPTERS HH-02,Uncategorized,,True,,,,,hh02


In [862]:
6279+17

6296

In [863]:
# Get the dataset with the unmatched values left
# Merge the verified result 
df_final['Match_similar'] = df_final['Match'].copy()
df_dat_left = pd.merge(df_final, df_dat_left, on='Match_similar', how='right', suffixes=('_match', '_dat'))
df_dat_left = df_dat_left[df_dat_left['Match'].isna()]
df_dat_left.dropna(axis=1, how='all', inplace=True)
print(f"Unmatched values left in the dat folder : {df_dat_left.shape}")

df_big_left = df_merge[df_merge['Match'].isna()].copy()
df_big_left.dropna(axis=1, how='all', inplace=True)
print(f"Unmatched values left in big table : {df_big_left.shape}")

df_big_left = df_big_left[['Name_big', 'Family', 'Name_modified', 'Name_similar', 'parenthesis_content']]
df_dat_left = df_dat_left[['Match_similar', 'Name_modified', 'Name_dat', 'parenthesis_content']]

Unmatched values left in the dat folder : (34, 4)
Unmatched values left in big table : (19, 5)


In [867]:
df_big_left[['Name_big', 'Name_similar']]

Unnamed: 0,Name_big,Name_similar
14,CYRANO II P-30,cyranoiip30
15,DEES OKAY230,deesokay230
16,DU 80-141,du80141
17,DU 80-176 V1,du80176v1
18,DU 80-176 V1 ALT,du80176v1alt
19,FLYING WING PEE WEE 30,flyingwingpeewee30
21,HQ-17,hq17
23,JH25,jh25
24,JH817,jh817
25,JHSYM-10,jhsym10


In [874]:
df_match.head()

Unnamed: 0,Name_big,Family,SimilarFunction,Size,Name_dat,Name_modified
0,A18,Uncategorized,False,945.0,a18.DAT,a18
1,A6014-S,Ayers,False,6116.0,A6014-S.DAT,a6014-s
2,A6016-S,Ayers,False,5914.0,A6016-S.DAT,a6016-s
3,A6018-S,Ayers,False,5889.0,A6018-S.DAT,a6018-s
4,A6020-S,Ayers,False,6116.0,A6020-S.DAT,a6020-s


In [875]:
df_dat_left.head()

Unnamed: 0,Match_similar,Name_modified,Name_dat,parenthesis_content
1,ah93w480b,ah93w480b,ah93w480b.DAT,[]
7,dga1182,dga1182,dga1182.DAT,[]
8,e664ex,e664ex,e664ex.DAT,[]
9,fx71l150,fx71l150,fx71l150.DAT,[]
10,fx79w470a,fx79w470a,fx79w470a.DAT,[]


In [868]:
# Complete with Name dat file
df_match = pd.merge(df_match, df_dat_left[['Match_similar', 'Name_modified', 'Name_dat']], on='Match_similar', how='left', suffixes=('_match', '_left'))
df_match['Name_dat'] = df_match['Name_dat_match'].fillna(df_match['Name_dat_left'])
df_match['Name_modified'] = df_match['Name_modified_match'].fillna(df_match['Name_modified_left'])
df_match.drop(columns=['Name_dat_match', 'Name_dat_left', 'Name_modified_match', 'Name_modified_left'], inplace=True)

In [869]:
df_match.shape

(6305, 9)

In [870]:
df_match.to_csv('index_save.csv')

In [871]:
df_match.head()

Unnamed: 0,Name_big,Family,SimilarFunction,checked,Name_similar,Size,Match_similar,Name_dat,Name_modified
0,A18,Uncategorized,False,True,,945.0,,a18.DAT,a18
1,A6014-S,Ayers,False,True,,6116.0,,A6014-S.DAT,a6014-s
2,A6016-S,Ayers,False,True,,5914.0,,A6016-S.DAT,a6016-s
3,A6018-S,Ayers,False,True,,5889.0,,A6018-S.DAT,a6018-s
4,A6020-S,Ayers,False,True,,6116.0,,A6020-S.DAT,a6020-s


In [872]:
df_match['Name_modified'] = df_match['Name_modified'].fillna(df_match['Name_similar'])
df_match.drop(columns=['checked', 'Match_similar', 'Name_similar'], inplace=True)

In [873]:
df_match.to_csv('index_final.csv')

In [865]:
# df_dat_left[['Match_similar', 'Name_dat']]

In [866]:
# Name_big
# CYRANO II P-30
# FLYING WING PEE WEE 30	
# NACA0012-64 A=0.8 C(LI)=0.2	
# X-35 LOW DRAG BODY	
# JH35
# UAG92 170/SF	
# Cyrano II P-30 cyranoiip30
# jhsym10 JHSYM-10
# naca6621812a=6p51tip	 not found
# swallowp30 SwallowP-30
# deesokay230 surement problème