In [133]:
import pandas as pd
import numpy as np
import regex as re

# Set Up

### Get CSV and check duplicates

In [134]:
pd.set_option('display.max_row', 8353 )

In [135]:
df_dat = pd.read_csv('dat_files_index.csv', usecols=('name', 'size'))
df_dat.rename(columns={'name':'Name', 'size':'Size'}, inplace=True)
print(df_dat.shape)
df_dat.head()

(8353, 2)


Unnamed: 0,Name,Size
0,a18.DAT,945
1,a18sm.DAT,1795
2,A6014-S.DAT,6116
3,A6016-S.DAT,5914
4,A6018-S.DAT,5889


In [136]:
# Checking if the dataset contains duplicates
doublon_datfile = df_dat['Name'].value_counts().index[df_dat['Name'].value_counts().values > 1]
print(f'Nombre de doublons dans la bigtable : {len(doublon_datfile)}')
print([i for i in doublon_datfile])

Nombre de doublons dans la bigtable : 0
[]


In [137]:
df_bigtable = pd.read_csv('data/ailes_avion.csv', usecols=('Name', 'Family'))
print(df_bigtable.shape)
df_bigtable.head()

(6324, 2)


Unnamed: 0,Name,Family
0,63A108 MOD C,NASA
1,A18,Uncategorized
2,A18 (SMOOTHED),Uncategorized
3,A6014-S,Ayers
4,A6016-S,Ayers


In [138]:
# Checking if the dataset contains duplicates
doublon_bigtable = df_bigtable['Name'].value_counts().index[df_bigtable['Name'].value_counts().values > 1]
print(f'Nombre de doublons dans la bigtable : {len(doublon_bigtable)}')
print([i for i in doublon_bigtable])

Nombre de doublons dans la bigtable : 2
['FX 66-17AII-182', 'BOEING 737 MIDSPAN']


### Execute Basic Regex and exceptions

In [139]:
# Create a column Name_modified with all names in lowercase and without the .DAT ending 
df_dat['Name_modified'] =  df_dat['Name'].apply(lambda x : (re.split(".DAT$", str(x)))[0])
df_dat['Name_modified'] = df_dat['Name_modified'].apply(lambda x : str(x).lower())
# Create a column Name_modified with all names in lowercase
df_bigtable['Name_modified'] = df_bigtable['Name'].apply(lambda x : str(x).lower())

In [140]:
# Manual affectation of the duplicates after verification
df_bigtable.loc[596, ['Name_modified']] = 'fx6617ai'
df_bigtable.loc[597, ['Name_modified']] = 'fx6617a2'
df_bigtable.loc[154, ['Name_modified']] = 'b737c'
df_bigtable.loc[155, ['Name_modified']] = 'b737b'

In [141]:
# Family exceptions beginning by 'g'
df_bigtable.loc[df_bigtable['Name_modified'] == 'gu25-5(11)8', 'Name_modified'] = 'gu255118'
df_bigtable.loc[df_bigtable['Name_modified'] == 'gs-1', 'Name_modified'] = 'gs1'
df_bigtable.loc[df_bigtable['Name_modified'] == 'griffith 30% suction airfoil', 'Name_modified'] = 'griffith30symsuction'
df_bigtable.loc[df_bigtable['Name_modified'] == 'goe 167 (v.karman prop.2)', 'Name_modified'] = 'goe167'
df_bigtable.loc[df_bigtable['Name_modified'] == 'glenn martin 2', 'Name_modified'] = 'glennmartin2'
df_bigtable.loc[df_bigtable['Name_modified'] == 'glenn martin 3', 'Name_modified'] = 'glennmartin3'
df_bigtable.loc[df_bigtable['Name_modified'] == 'glenn martin 4', 'Name_modified'] = 'glennmartin4'

In [142]:
df_bigtable.loc[df_bigtable['Name_modified'] == 'eppler 377mod', 'Name_modified'] = 'e377m'
df_bigtable.loc[df_bigtable['Name_modified'] == 'eppler e193gu-k24', 'Name_modified'] = 'e193gu-k24'
df_bigtable.loc[df_bigtable['Name_modified'] == 'eppler ste 87(-3)-914', 'Name_modified'] = 'ste87391'
df_bigtable.loc[df_bigtable['Name_modified'] == 'eppler ste 871-514', 'Name_modified'] = 'ste87151'
df_bigtable.loc[df_bigtable['Name_modified'] == 'eppler stf 863-615', 'Name_modified'] = 'stf86361'

In [143]:
df_bigtable.loc[df_bigtable['Name_modified'] == 'aquila 9.3% smoothed', 'Name_modified'] = 'aquilasm'
df_bigtable.loc[df_bigtable['Name_modified'] == 'bergey bw-3 (smoothed)', 'Name_modified'] = 'bw3'
df_bigtable.loc[df_bigtable['Name_modified'] == 'clark-y 11.7% smoothed', 'Name_modified'] = 'clarkysm'
df_bigtable.loc[df_bigtable['Name_modified'] == 'cody robertson cr 001 r/c hand-launch low reynolds number airfoil (smoothed)', 'Name_modified'] = 'cr001sm'
df_bigtable.loc[df_bigtable['Name_modified'] == 'dale house dh4009 (smoothed) used on the storm r/c aerobatic aircraft', 'Name_modified'] = 'dh4009sm'
df_bigtable.loc[df_bigtable['Name_modified'] == 'fx 60-100 10.0% smoothed', 'Name_modified'] = 'fx60100sm'
df_bigtable.loc[df_bigtable['Name_modified'] == 'fx 63-137 13.7% smoothed', 'Name_modified'] = 'fx63137sm'
df_bigtable.loc[df_bigtable['Name_modified'] == 'fx 74-cl5-140 mod (smoothed)', 'Name_modified'] = 'fx74modsm'
df_bigtable.loc[df_bigtable['Name_modified'] == 'mb253515 15.0% smoothed', 'Name_modified'] = 'mb253515sm'
df_bigtable.loc[df_bigtable['Name_modified'] == 'r140 12.04% (smoothed)', 'Name_modified'] = 'r140sm'
df_bigtable.loc[df_bigtable['Name_modified'] == 'spica 11.73% smoothed', 'Name_modified'] = 'spicasm'
df_bigtable.loc[df_bigtable['Name_modified'] == 'usnps4 (smoothed)', 'Name_modified'] = 'usnps4'
df_bigtable.loc[df_bigtable['Name_modified'] == 'wb-135/35 13.5% smoothed', 'Name_modified'] = 'wb13535sm'

In [144]:
df_bigtable.loc[df_bigtable['Name_modified'] == 'boeing 707 .08 span', 'Name_modified'] = 'b707a'
df_bigtable.loc[df_bigtable['Name_modified'] == 'boeing 707 .19 span', 'Name_modified'] = 'b707b'
df_bigtable.loc[df_bigtable['Name_modified'] == 'boeing 707 .40 span', 'Name_modified'] = 'b707c'
df_bigtable.loc[df_bigtable['Name_modified'] == 'boeing 707 .54 span', 'Name_modified'] = 'b707d'
df_bigtable.loc[df_bigtable['Name_modified'] == 'boeing 707 .99 span', 'Name_modified'] = 'b707e'
df_bigtable.loc[df_bigtable['Name_modified'] == 'boeing 103', 'Name_modified'] = 'boe103'
df_bigtable.loc[df_bigtable['Name_modified'] == 'boeing 106', 'Name_modified'] = 'boe106'
df_bigtable.loc[df_bigtable['Name_modified'] == 'boeing 737 root', 'Name_modified'] = 'b737a'
df_bigtable.loc[df_bigtable['Name_modified'] == 'boeing 737 outboard', 'Name_modified'] = 'b737d'
df_bigtable.loc[df_bigtable['Name_modified'] == 'kc-135 bl124.32', 'Name_modified'] = 'kc135b'
df_bigtable.loc[df_bigtable['Name_modified'] == 'kc-135 bl200.76', 'Name_modified'] = 'kc135c'
df_bigtable.loc[df_bigtable['Name_modified'] == 'kc-135 bl351.6', 'Name_modified'] = 'kc135d'
df_bigtable.loc[df_bigtable['Name_modified'] == 'kc-135 bl52.44', 'Name_modified'] = 'kc135a'

In [145]:
df_bigtable.loc[df_bigtable['Name_modified'] == 'naca m6 (65%)', 'Name_modified'] = 'm6_65'
df_bigtable.loc[df_bigtable['Name_modified'] == 'naca m6 (85%)', 'Name_modified'] = 'm6_85'

# Functions

In [146]:
# Function to merge both dataset, return merge
def merge_df(df_bigtable, df_dat, on_column):
    df_merge = pd.merge(df_bigtable, df_dat, on=on_column, how='left', suffixes=('_big', '_dat'))
    nb_mismatch = df_merge[df_merge['Name_dat'].isna()].shape[0]
    nb_match = df_merge.shape[0] - nb_mismatch
    print(f'Le dataset contient {df_merge.shape[0]} valeurs dont {nb_match} correspondent aux fichiers dat.')
    print(f'Il reste {nb_mismatch} valeurs à matcher.')
    df_merge.head()
    return df_merge

def get_family_done(df_merge, nb):
    family = df_merge.groupby(['Family']).count()
    mask_family = (family['Name_big'] == family['Name_dat'])
    res = family[mask_family].Name_big.sort_values(ascending=False).head(nb)
    print(res)
    return res

# Function to evaluate the missing values per family
def groupna_family(df_merge, nb):
    family = df_merge.groupby(['Family']).count()
    mask_family = (family['Name_big'] != family['Name_dat'])
    family_na = family[mask_family].copy()
    family_na['nb_na'] = family_na['Name_big'] - family_na['Size']
    # We are interested by the 5 families with the most of na
    print(family_na['nb_na'].sort_values(ascending=False).head(nb))

# Function to iinitialize df_big et df_dat
# We want to look through the wings left in each dataframe in order to avoid corrupting good matchs

def initiate_df_left(df_bigtable, df_dat, df_merge):
    # Create df_dat_left and df_big_left
    df_dat_left = pd.merge(df_bigtable, df_dat, on='Name_modified', how='right', suffixes=('_big', '_dat'))
    df_dat_left = df_dat_left[df_dat_left['Name_big'].isna()].copy()
    print(f"Unmatched values left in the dat folder : {df_dat_left.shape}")
    df_big_left = df_merge[df_merge['Name_dat'].isna()].copy()
    print(f"Unmatched values left in big table : {df_big_left.shape}")

    # Create a new column for each df containing the first letter of each wing
    df_dat_left['First letter'] = [x[0] for x in df_dat_left['Name_modified'].values]
    df_big_left['First letter'] = [x[0] for x in df_big_left['Name_modified'].values]

    # Create a new column for each df with z copy of the modified name to further work on
    df_dat_left['Name_modified_by_family'] = df_dat_left['Name_modified'].copy()
    df_big_left['Name_modified_by_family'] = df_big_left['Name_modified'].copy()

    # Create a new column for each df with z copy of the modified name to further work on
    df_dat_left['Name_modified_by_pattern'] = df_dat_left['Name_modified'].copy()
    df_big_left['Name_modified_by_pattern'] = df_big_left['Name_modified'].copy()


    # df_dat_left ----> ['Name_big', 'Family', 'Name_modified', 'Name_dat', 'Size', 'First letter', 'Name_modified_by_family']
    # Name_big and Family are NaN
    df_dat_left.drop(columns=['Name_big', 'Family'], inplace=True)
    # df_big_left ----> ['Name_big', 'Family', 'Name_modified', 'Name_dat', 'Size', 'First letter', 'Name_modified_by_family']
    # Name_dat and Size are NaN
    df_big_left.drop(columns=['Name_dat', 'Size'], inplace=True)

    return df_dat_left, df_big_left

# Function to create filter by letter
def get_letter_group(df_dat_left, df_big_left, letter):
    big_letter = df_big_left[df_big_left['First letter'] == letter]
    dat_letter = df_dat_left[df_dat_left['First letter'] == letter]
    print(f'Big table number of wings left beginning by {letter} : {big_letter.shape}')
    print(f'Dat folder number of wings left beginning by {letter} : {dat_letter.shape}')
    print(f'Difference : {big_letter.shape[0] - dat_letter.shape[0]}')
    return big_letter, dat_letter

# Function to pass through pattern
def try_pattern_family(big_letter, family, pattern_list):
    big_letter = big_letter.copy()
    for pattern in pattern_list:
        # Apply pattern
        big_letter.loc[big_letter['Family'] == family, 'Name_modified_by_family'] = big_letter.loc[big_letter['Family'] == family, 'Name_modified_by_family'].apply(pattern)
    return big_letter

# Function to incorporate the new name found (Name_modified_by_family) into the dataset with the bigtable wings left to match
def incorporate_family_pattern(df_big_left, big_letter):
    df_big_left = pd.merge(df_big_left, big_letter[['Name_big', 'Name_modified_by_family']], on='Name_big', how='left', suffixes=('_left', '_big'))
    df_big_left['Name_modified_by_family_big'] = df_big_left['Name_modified_by_family_big'].fillna(df_big_left['Name_modified_by_family_left'])
    df_big_left.drop(["Name_modified_by_family_left"], inplace=True, axis=1)
    df_big_left.rename(columns={'Name_modified_by_family_big':'Name_modified_by_family'}, inplace=True)
    print(df_big_left.columns)
    return df_big_left

def set_df_left(df_dat_left, df_merge, on_column):
    # Create df_dat_left and df_big_left
    df_dat_left = pd.merge(df_merge, df_dat_left, on=on_column, how='right', suffixes=('_big', '_dat'))
    df_dat_left = df_dat_left[df_dat_left['Name_big'].isna()]
    df_dat_left.dropna(axis=1, how='all', inplace=True)
    df_dat_left = df_dat_left.set_axis([re.sub('_dat', "", str(col)) for col in df_dat_left.columns], axis=1)
    df_dat_left.rename(columns={'Name':'Name_dat'}, inplace=True)
    print(f"Unmatched values left in the dat folder : {df_dat_left.shape}")
    df_big_left = df_merge[df_merge['Name_dat'].isna()].copy()
    df_big_left.dropna(axis=1, how='all', inplace=True)
    df_big_left = df_big_left.set_axis([re.sub('_big', "", str(col)) for col in df_big_left.columns], axis=1)
    df_big_left.rename(columns={'Name':'Name_big'}, inplace=True)
    print(f"Unmatched values left in big table : {df_big_left.shape}")
    return df_dat_left, df_big_left

def try_pattern(df_big_left, df_dat_left, pattern_list):
    df_big_left['Name_modified_by_pattern'] = df_big_left['Name_modified'].copy()
    df_big_left['Name_modified_by_pattern'] = df_big_left['Name_modified'].copy()

    for pattern in pattern_list:
        df_big_left['Name_modified_by_pattern'] = df_big_left.loc[:, 'Name_modified_by_pattern'].apply(pattern)
        df_dat_left['Name_modified_by_pattern'] = df_dat_left.loc[:, 'Name_modified_by_pattern'].apply(pattern)

    return df_dat_left, df_big_left

# Application

In [147]:
df_merge = merge_df(df_bigtable, df_dat, 'Name_modified')
print('Famille complètes : ')
done_list = get_family_done(df_merge,10)

Le dataset contient 6324 valeurs dont 4949 correspondent aux fichiers dat.
Il reste 1375 valeurs à matcher.
Famille complètes : 
Family
NACA 65-series       608
NACA 66-series       604
NACA 67-series       548
NACA 5-digit         401
Habbe                137
Horten               133
Joukowsky            132
Riblett 30-series    114
Riblett 40-series    110
Riblett 37-series    110
Name: Name_big, dtype: int64


In [148]:
# DROP only after all matching is done 
df_dat.drop(df_dat[df_dat['Name_modified'].str.contains('horten')].index, inplace=True)
df_dat.drop(df_dat[df_dat['Name_modified'].str.contains('joukowsky')].index, inplace=True)

### **1. Create dataframes with wings left to match**

In [149]:
df_dat_left, df_big_left = initiate_df_left(df_bigtable, df_dat, df_merge)

Unmatched values left in the dat folder : (2366, 5)
Unmatched values left in big table : (1375, 5)


### **2. Regex to match the family Yost,Eiffel,Eppeler**

In [150]:
groupna_family(df_merge, 5)

Family
Gottingen        381
Eppler           196
Uncategorized    138
Wortmann         107
NASA              54
Name: nb_na, dtype: int64


In [151]:
big_e, dat_e = get_letter_group(df_dat_left, df_big_left, 'e')

Big table number of wings left beginning by e : (216, 6)
Dat folder number of wings left beginning by e : (217, 6)
Difference : -1


In [152]:
yost = [lambda x : (re.sub("\s", "", str(x))), 
        lambda x : (re.sub("\.", "", str(x))), 
        lambda x : (re.sub("/", "", str(x)))]

eiffel = [lambda x : (re.sub('\(.*?\)', "", str(x))), 
          lambda x : (re.sub("\s", "", str(x))), 
          lambda x : (re.split("-", str(x)))[0]]

eppler = [lambda x : (re.sub("eppler", "e", str(x))), 
          lambda x : (re.sub("\s", "", str(x)))]

big_e = try_pattern_family(big_e, 'Yost', yost)
big_e = try_pattern_family(big_e, 'Eiffel', eiffel)
big_e = try_pattern_family(big_e, 'Eppler', eppler)


In [153]:
# Function to incorporate the new name found (Name_modified_by_family) into the dataset with the bigtable wings left to match
df_big_left = incorporate_family_pattern(df_big_left, big_e)

Index(['Name_big', 'Family', 'Name_modified', 'First letter',
       'Name_modified_by_pattern', 'Name_modified_by_family'],
      dtype='object')


In [154]:
# we already did a merge so maybe just add an argument precising the on merge column
# By merging using 'left', we obtain 4905 correspondances
df_merge = merge_df(df_big_left, df_dat_left, 'Name_modified_by_family')

Le dataset contient 1375 valeurs dont 199 correspondent aux fichiers dat.
Il reste 1176 valeurs à matcher.


In [155]:
# eppler_res = df_merge[df_merge['Family'] == 'Eppler']
# eppler_res[eppler_res['Name_modified_dat'].isna()].Name_modified_big

eppler = [lambda x : (re.sub("eppler", "e", str(x))),
          lambda x : (re.sub("ee", "e", str(x))),
          lambda x : (re.sub("hydrofoil", "", str(x))),
          lambda x : (re.sub("strut", "", str(x))),
          lambda x : (re.sub("-", "", str(x))),
          lambda x : (re.sub("\(", "", str(x))),
          lambda x : (re.sub("\)", "", str(x))),
          lambda x : (re.sub("\s", "", str(x)))]

big_e = try_pattern_family(big_e, 'Eppler', eppler)

# Function to incorporate the new name found (Name_modified_by_family) into the dataset with the bigtable wings left to match
df_big_left = incorporate_family_pattern(df_big_left, big_e)

# we already did a merge so maybe just add an argument precising the on merge column
# By merging using 'left', we obtain 4905 correspondances
df_merge = merge_df(df_big_left, df_dat_left, 'Name_modified_by_family')

Index(['Name_big', 'Family', 'Name_modified', 'First letter',
       'Name_modified_by_pattern', 'Name_modified_by_family'],
      dtype='object')
Le dataset contient 1375 valeurs dont 213 correspondent aux fichiers dat.
Il reste 1162 valeurs à matcher.


### **3. Regex to match the family Gottingen**

In [156]:
df_dat_left, df_big_left = set_df_left(df_dat_left, df_merge, 'Name_modified_by_family')
big_g, dat_g = get_letter_group(df_dat_left, df_big_left, 'g')

Unmatched values left in the dat folder : (2153, 6)
Unmatched values left in big table : (1162, 6)
Big table number of wings left beginning by g : (399, 6)
Dat folder number of wings left beginning by g : (401, 6)
Difference : -2


In [157]:
gottingen = [lambda x : (re.sub("\s", "", str(x))), lambda x : (re.sub('\(.*?\)', "", str(x)))]
big_g = try_pattern_family(big_g, 'Gottingen', gottingen)

In [158]:
df_big_left = incorporate_family_pattern(df_big_left, big_g)

Index(['Name_big', 'Family', 'Name_modified', 'First letter',
       'Name_modified_by_pattern', 'Name_modified_by_family'],
      dtype='object')


In [159]:
# we already did a merge so maybe just add an argument precising the on merge column
# By merging using 'left', we obtain 4905 correspondances
df_merge = merge_df(df_big_left, df_dat_left, 'Name_modified_by_family')

Le dataset contient 1162 valeurs dont 372 correspondent aux fichiers dat.
Il reste 790 valeurs à matcher.


### **4. Global regex with quick manual verif**

In [160]:
df_dat_left, df_big_left = set_df_left(df_dat_left, df_merge, 'Name_modified_by_family')

Unmatched values left in the dat folder : (1781, 6)
Unmatched values left in big table : (790, 6)


In [161]:
pattern_global =[lambda x : (re.sub("\s", "", str(x))), lambda x : (re.sub("-", "", str(x))), lambda x : (re.sub("%", "", str(x)))]
df_dat_left, df_big_left = try_pattern(df_big_left, df_dat_left, pattern_global)

In [162]:
df_merge = merge_df(df_big_left, df_dat_left, 'Name_modified_by_pattern')

Le dataset contient 790 valeurs dont 245 correspondent aux fichiers dat.
Il reste 545 valeurs à matcher.


In [163]:
groupna_family(df_merge, 30)

Family
Uncategorized            100
NASA                      49
Hepperle                  49
Selig et. al.             48
Quabeck                   42
Leinauer                  25
Wortmann                  22
NACA/Munk                 22
Boeing                    19
Barth                     16
Gulfstream                14
Lockheed                  14
NACA                      13
Pflug                     12
Althaus                   11
Gottingen                  9
Drela                      9
Sikorsky                   8
Girsberger                 7
Onera                      7
Marske                     6
Scherrer                   6
Siegmann                   5
NACA 64-series             5
Delft                      5
NACA 63-series             4
Hammond                    4
TsAGI                      3
RAF                        2
NACA 4-digit modified      2
Name: nb_na, dtype: int64


### **5. keep trying**

In [164]:
df_dat_left, df_big_left = set_df_left(df_dat_left, df_merge, 'Name_modified_by_pattern')

Unmatched values left in the dat folder : (1536, 6)
Unmatched values left in big table : (545, 6)


In [165]:
pattern_global =[lambda x : (re.sub("\s", "", str(x))),
                 lambda x : (re.sub("\.", "", str(x))),
                 lambda x : (re.sub(",", "", str(x))),
                 lambda x : (re.sub("/", "", str(x))),
                 lambda x : (re.sub("-", "", str(x))),
                 lambda x : (re.sub("_", "", str(x))),  
                 lambda x : (re.sub("%", "", str(x))),
                 lambda x : (re.sub("\(", "", str(x))),
                 lambda x : (re.sub("\)", "", str(x)))]
df_dat_left, df_big_left = try_pattern(df_big_left, df_dat_left, pattern_global)

In [166]:
# sb 96 12.7/3.0	 pas fonctionné df_big_left sb9612730  sb96127_30

In [167]:
df_merge = merge_df(df_big_left, df_dat_left, 'Name_modified_by_pattern')

Le dataset contient 545 valeurs dont 126 correspondent aux fichiers dat.
Il reste 419 valeurs à matcher.


In [168]:
groupna_family(df_merge, 30)

Family
Uncategorized            89
Hepperle                 49
Selig et. al.            48
Leinauer                 25
NACA/Munk                22
NASA                     20
Boeing                   19
Wortmann                 19
Lockheed                 14
Gulfstream               14
NACA                     13
Althaus                  11
Drela                     9
Gottingen                 9
Sikorsky                  8
Onera                     7
Girsberger                6
Marske                    6
NACA 64-series            5
Delft                     4
Hammond                   4
NACA 63-series            4
RAF                       2
NACA 4-digit modified     2
Quabeck                   2
Egglestone                2
Clark                     2
NPL                       1
TsAGI                     1
USA                       1
Name: nb_na, dtype: int64


### **8. Smooth -> sm**

In [169]:
df_dat_left, df_big_left = set_df_left(df_dat_left, df_merge, 'Name_modified_by_pattern')

Unmatched values left in the dat folder : (1410, 6)
Unmatched values left in big table : (419, 6)


In [170]:
list_smooth = df_big_left.loc[df_big_left['Name_modified'].str.contains('smooth'), 'Name_modified'].to_list()

In [171]:
pattern_global =[lambda x : (re.sub("\s", "", str(x))),
                 lambda x : (re.sub("smoothed", "sm", str(x))),
                 lambda x : (re.sub("\.", "", str(x))),
                 lambda x : (re.sub(",", "", str(x))),
                 lambda x : (re.sub("/", "", str(x))),
                 lambda x : (re.sub("-", "", str(x))), 
                 lambda x : (re.sub("%", "", str(x))),
                 lambda x : (re.sub("\(", "", str(x))),
                 lambda x : (re.sub("\)", "", str(x)))]
df_dat_left, df_big_left = try_pattern(df_big_left, df_dat_left, pattern_global)

In [172]:
df_merge = merge_df(df_big_left, df_dat_left, 'Name_modified_by_pattern')

Le dataset contient 419 valeurs dont 13 correspondent aux fichiers dat.
Il reste 406 valeurs à matcher.


In [173]:
# Find exceptions
# list_smoothed = df_merge.loc[df_merge['Name_modified_dat'].isna()==False, 'Name_modified_big'].to_list()
# nasmooth = [name for name in list_smooth if name not in list_smoothed]

### **6. Removing parenthesis while keeping the inside content**

In [174]:
df_dat_left, df_big_left = set_df_left(df_dat_left, df_merge, 'Name_modified_by_pattern')

Unmatched values left in the dat folder : (1397, 6)
Unmatched values left in big table : (406, 6)


In [175]:
# New columns to be created normally !!!!
pattern_global =[lambda x : (re.sub("\(", "", str(x))),
                 lambda x : (re.sub("\)", "", str(x)))
                 ]
df_dat_left, df_big_left = try_pattern(df_big_left, df_dat_left, pattern_global)

In [176]:
df_merge = merge_df(df_big_left, df_dat_left, 'Name_modified_by_pattern')

Le dataset contient 406 valeurs dont 0 correspondent aux fichiers dat.
Il reste 406 valeurs à matcher.


In [177]:
groupna_family(df_merge, 5)

Family
Uncategorized    78
Hepperle         49
Selig et. al.    48
Leinauer         25
NACA/Munk        22
Name: nb_na, dtype: int64


### **7. Removing whats between parenthesis with manual verif**

In [178]:
df_dat_left, df_big_left = set_df_left(df_dat_left, df_merge, 'Name_modified_by_pattern')

Unmatched values left in the dat folder : (1397, 6)
Unmatched values left in big table : (406, 6)


In [179]:
# New columns to be created normally !!!!
pattern_global =[lambda x : (re.sub('\(.*?\)', "", str(x))),
                 lambda x : (re.sub("\s", "", str(x)))]
df_dat_left, df_big_left = try_pattern(df_big_left, df_dat_left, pattern_global)

df_big_left['parenthesis_content'] = df_big_left['Name_modified'].copy()
df_big_left['parenthesis_content'] = df_big_left.loc[:, 'parenthesis_content'].apply(lambda x : (re.findall('\(.*?\)', str(x))))
df_dat_left['parenthesis_content'] = df_dat_left['Name_modified'].copy()
df_dat_left['parenthesis_content'] = df_dat_left.loc[:, 'parenthesis_content'].apply(lambda x : (re.findall('\(.*?\)', str(x))))

In [180]:
df_merge = merge_df(df_big_left, df_dat_left, 'Name_modified_by_pattern')

Le dataset contient 406 valeurs dont 29 correspondent aux fichiers dat.
Il reste 377 valeurs à matcher.


In [181]:
groupna_family(df_merge, 10)

Family
Uncategorized    70
Hepperle         49
Selig et. al.    28
Leinauer         25
NACA/Munk        22
NASA             20
Wortmann         19
Boeing           19
Gulfstream       14
Lockheed         14
Name: nb_na, dtype: int64


In [182]:
# df_merge[['Name_big', 'Family', 'Name_modified_by_pattern', 'parenthesis_content_big', 'Name_modified_dat', 'Name_dat']]

#### Family Hepperle

In [183]:
df_dat_left, df_big_left = set_df_left(df_dat_left, df_merge, 'Name_modified_by_pattern')
big_m, dat_m = get_letter_group(df_dat_left, df_big_left, 'm')

Unmatched values left in the dat folder : (1368, 7)
Unmatched values left in big table : (377, 7)
Big table number of wings left beginning by m : (57, 7)
Dat folder number of wings left beginning by m : (86, 7)
Difference : -29


In [184]:
hepperle = [lambda x : (((re.split("\s", str(x)))[0])+(re.split("\s", str(x)))[1])]
big_m.loc[big_m['Family'] == "Hepperle", 'Name_modified_by_family'] = big_m.loc[big_m['Family'] == "Hepperle", 'Name_modified'].copy()
big_m = try_pattern_family(big_m, 'Hepperle', hepperle)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  big_m.loc[big_m['Family'] == "Hepperle", 'Name_modified_by_family'] = big_m.loc[big_m['Family'] == "Hepperle", 'Name_modified'].copy()


In [185]:
df_big_left = incorporate_family_pattern(df_big_left, big_m)

Index(['Name_big', 'Family', 'Name_modified', 'First letter',
       'Name_modified_by_pattern', 'parenthesis_content',
       'Name_modified_by_family'],
      dtype='object')


In [186]:
df_merge = merge_df(df_big_left, df_dat_left, 'Name_modified_by_family')

Le dataset contient 377 valeurs dont 49 correspondent aux fichiers dat.
Il reste 328 valeurs à matcher.


### Family NACA/Munk

In [187]:
df_dat_left, df_big_left = set_df_left(df_dat_left, df_merge, 'Name_modified_by_family')
big_n, dat_n = get_letter_group(df_dat_left, df_big_left, 'n')

Unmatched values left in the dat folder : (1319, 7)
Unmatched values left in big table : (328, 7)
Big table number of wings left beginning by n : (55, 7)
Dat folder number of wings left beginning by n : (1023, 7)
Difference : -968


In [188]:
nacamunk = [lambda x : (re.sub("naca","", str(x))), lambda x : (re.sub("\s", "", str(x)))]
big_n = try_pattern_family(big_n, 'NACA/Munk', nacamunk)
df_big_left = incorporate_family_pattern(df_big_left, big_n)
df_merge = merge_df(df_big_left, df_dat_left, 'Name_modified_by_family')
df_dat_left.drop(df_dat_left[df_dat_left['Name_modified'].str.contains('naca')].index, inplace=True)

Index(['Name_big', 'Family', 'Name_modified', 'First letter',
       'Name_modified_by_pattern', 'parenthesis_content',
       'Name_modified_by_family'],
      dtype='object')
Le dataset contient 328 valeurs dont 22 correspondent aux fichiers dat.
Il reste 306 valeurs à matcher.


In [189]:
df_dat_left, df_big_left = set_df_left(df_dat_left, df_merge, 'Name_modified_by_family')

Unmatched values left in the dat folder : (318, 7)
Unmatched values left in big table : (306, 7)


In [190]:
# As much of giii rows in both dataset its the Gulfstream family 
giii_dat = df_dat_left[df_dat_left['Name_modified'].str.contains("giii")]
giii_big = df_big_left[df_big_left['Name_modified'].str.contains("giii")]
giinb = giii_big['Name_modified'].apply(lambda x : (re.findall('\d+', str(x))))
giinb = [int(nb[0]) for nb in giinb.values]
giinb = sorted(giinb)
giiiletter = ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n']
dict_giii = {}
[dict_giii.update({('giiibl'+str(nb)) : ('giii'+str(letter))}) for nb, letter in zip(giinb, giiiletter)]
df_big_left['Name_modified_by_pattern'] = df_big_left['Name_modified_by_pattern'].map(dict_giii).fillna(df_big_left['Name_modified_by_pattern'])

df_merge = merge_df(df_big_left, df_dat_left, 'Name_modified_by_pattern')
df_dat_left, df_big_left = set_df_left(df_dat_left, df_merge, 'Name_modified_by_pattern')

Le dataset contient 306 valeurs dont 14 correspondent aux fichiers dat.
Il reste 292 valeurs à matcher.
Unmatched values left in the dat folder : (304, 7)
Unmatched values left in big table : (292, 7)


# Similar test

In [191]:
# df_big_left['First letter'].value_counts()
# df_dat_left['First letter'].value_counts()
# df_big_left['First letter'].value_counts() - df_dat_left['First letter'].value_counts()

In [192]:
df_dat_left.to_csv('data/df_dat_left.csv')
df_big_left.to_csv('data/df_big_left.csv')

In [193]:
big_s, dat_s = get_letter_group(df_dat_left, df_big_left, 's')
list_big = big_s['Name_modified'].values
list_dat = dat_s['Name_modified'].values

Big table number of wings left beginning by s : (43, 7)
Dat folder number of wings left beginning by s : (44, 7)
Difference : -1


In [194]:
from difflib import SequenceMatcher
def similar(a, b):
    return SequenceMatcher(None, a, b).ratio()

from fuzzywuzzy import fuzz
def similar_fuzz(string1, string2):
    return fuzz.partial_ratio(string1, string2)

def find_match(list_big, list_dat):
    ratio = []
    word = []
    match = []
    partial_match = []
    partial_ratio = []

    for i in list_big:
        maxscore = 0
        maxpartial = 0
        storeword = "temp"
        storeword_partial = "temp"
        for j in list_dat:
            score = similar(i,j)
            partial_score = similar_fuzz(i,j)
            print(i, "----------", j, " : ", score)
            if (score > maxscore):
                maxscore = score
                storeword = j
            if (score == maxscore):
                print("egal score")
            if (partial_score > maxpartial):
                maxpartial = partial_score
                storeword_partial = j
        word.append(i)
        match.append(storeword)
        ratio.append(maxscore)
        partial_match.append(storeword_partial)
        partial_ratio.append(maxpartial)

    return ratio, word, match, partial_match, partial_ratio

In [195]:
ratio, word, match, partial_match, partial_ratio = find_match(list_big, list_dat)

s1010 hpv airfoil ---------- s1010  :  0.45454545454545453
egal score
s1010 hpv airfoil ---------- s102blunt  :  0.3076923076923077
s1010 hpv airfoil ---------- s2050  :  0.2727272727272727
s1010 hpv airfoil ---------- s2060  :  0.2727272727272727
s1010 hpv airfoil ---------- s2062  :  0.18181818181818182
s1010 hpv airfoil ---------- s2091  :  0.18181818181818182
s1010 hpv airfoil ---------- s3  :  0.10526315789473684
s1010 hpv airfoil ---------- s3002  :  0.2727272727272727
s1010 hpv airfoil ---------- s3010  :  0.36363636363636365
s1010 hpv airfoil ---------- s3014  :  0.2727272727272727
s1010 hpv airfoil ---------- s3016  :  0.2727272727272727
s1010 hpv airfoil ---------- s3021  :  0.18181818181818182
s1010 hpv airfoil ---------- s3024  :  0.18181818181818182
s1010 hpv airfoil ---------- s3025  :  0.18181818181818182
s1010 hpv airfoil ---------- s4061  :  0.18181818181818182
s1010 hpv airfoil ---------- s4062  :  0.18181818181818182
s1010 hpv airfoil ---------- s4180  :  0.272727272

In [196]:
print(list_big.shape)
print(list_dat.shape)

(43,)
(44,)


In [197]:
df_similar = pd.DataFrame(data={'Name' : word, 'Match' : match, 'Score' : ratio, 'Match_2' : partial_match, 'Score_2' : partial_ratio})
df_similar

Unnamed: 0,Name,Match,Score,Match_2,Score_2
0,s1010 hpv airfoil,s1010,0.454545,s1010,100
1,s2050 8.93%,s2050,0.625,s2050,100
2,s2060 8%,s2060,0.769231,s2060,100
3,s2062 8%,s2062,0.769231,s2062,100
4,s2091-101-83,s2091,0.588235,s2091,100
5,s3010-103-84,s3010,0.588235,s3,100
6,s3014-095-85,sc1095r8,0.6,s3,100
7,s3016-095-87,sc1095r8,0.6,s3,100
8,s3021-095-84,sc1095r8,0.6,s3,100
9,s3024 9.84%,s3024,0.625,s3,100


In [198]:
df_similar['Match'].value_counts()

Match
sc1095r8     3
sd7032       2
sd8000       2
s3002        2
sipkill      2
sc1094r8     2
s1010        1
sc1095       1
sc1012r8     1
sc2110       1
sd6060       1
ssca07       1
ssca09       1
s102blunt    1
ssv2316      1
stcyr172     1
stcyr234     1
sd7003       1
sd2030       1
s2050        1
s4061        1
s2060        1
s2062        1
s2091        1
s3010        1
s3024        1
s3025        1
s4062        1
s7055        1
s4180        1
s4233        1
s6061        1
s6062        1
s6063        1
s7012        1
stcyr24      1
Name: count, dtype: int64