In [2121]:
import pandas as pd
import numpy as np
import regex as re

# Set Up

### Get CSV and check duplicates

In [2122]:
pd.set_option('display.max_row', 8353 )

In [2123]:
df_dat = pd.read_csv('dat_files_index.csv', usecols=('name', 'size'))
df_dat.rename(columns={'name':'Name', 'size':'Size'}, inplace=True)
print(df_dat.shape)
df_dat.head()

(8353, 2)


Unnamed: 0,Name,Size
0,a18.DAT,945
1,a18sm.DAT,1795
2,A6014-S.DAT,6116
3,A6016-S.DAT,5914
4,A6018-S.DAT,5889


In [2124]:
# Checking if the dataset contains duplicates
doublon_datfile = df_dat['Name'].value_counts().index[df_dat['Name'].value_counts().values > 1]
print(f'Nombre de doublons dans la bigtable : {len(doublon_datfile)}')
print([i for i in doublon_datfile])

Nombre de doublons dans la bigtable : 0
[]


In [2125]:
df_bigtable = pd.read_csv('data/ailes_avion.csv', usecols=('Name', 'Family'))
print(df_bigtable.shape)
df_bigtable.head()

(6324, 2)


Unnamed: 0,Name,Family
0,63A108 MOD C,NASA
1,A18,Uncategorized
2,A18 (SMOOTHED),Uncategorized
3,A6014-S,Ayers
4,A6016-S,Ayers


In [2126]:
# Checking if the dataset contains duplicates
doublon_bigtable = df_bigtable['Name'].value_counts().index[df_bigtable['Name'].value_counts().values > 1]
print(f'Nombre de doublons dans la bigtable : {len(doublon_bigtable)}')
print([i for i in doublon_bigtable])

Nombre de doublons dans la bigtable : 2
['FX 66-17AII-182', 'BOEING 737 MIDSPAN']


### Execute Basic Regex and exceptions

In [2127]:
# Create a column Name_modified with all names in lowercase and without the .DAT ending 
df_dat['Name_modified'] =  df_dat['Name'].apply(lambda x : (re.split(".DAT$", str(x)))[0])
df_dat['Name_modified'] = df_dat['Name_modified'].apply(lambda x : str(x).lower())
# Create a column Name_modified with all names in lowercase
df_bigtable['Name_modified'] = df_bigtable['Name'].apply(lambda x : str(x).lower())

In [2128]:
# Manual affectation of the duplicates after verification
df_bigtable.loc[596, ['Name_modified']] = 'fx6617ai'
df_bigtable.loc[597, ['Name_modified']] = 'fx6617a2'
df_bigtable.loc[154, ['Name_modified']] = 'b737c'
df_bigtable.loc[155, ['Name_modified']] = 'b737b'

In [2129]:
# Family exceptions beginning by 'g'
df_bigtable.loc[df_bigtable['Name_modified'] == 'gu25-5(11)8', 'Name_modified'] = 'gu255118'
df_bigtable.loc[df_bigtable['Name_modified'] == 'gs-1', 'Name_modified'] = 'gs1'
df_bigtable.loc[df_bigtable['Name_modified'] == 'griffith 30% suction airfoil', 'Name_modified'] = 'griffith30symsuction'
df_bigtable.loc[df_bigtable['Name_modified'] == 'goe 167 (v.karman prop.2)', 'Name_modified'] = 'goe167'
df_bigtable.loc[df_bigtable['Name_modified'] == 'glenn martin 2', 'Name_modified'] = 'glennmartin2'
df_bigtable.loc[df_bigtable['Name_modified'] == 'glenn martin 3', 'Name_modified'] = 'glennmartin3'
df_bigtable.loc[df_bigtable['Name_modified'] == 'glenn martin 4', 'Name_modified'] = 'glennmartin4'

# Functions

In [2130]:
# Function to merge both dataset, return merge
def merge_df(df_bigtable, df_dat, on_column):
    df_merge = pd.merge(df_bigtable, df_dat, on=on_column, how='left', suffixes=('_big', '_dat'))
    nb_mismatch = df_merge[df_merge['Name_dat'].isna()].shape[0]
    nb_match = df_merge.shape[0] - nb_mismatch
    print(f'Le dataset contient {df_merge.shape[0]} valeurs dont {nb_match} correspondent aux fichiers dat.')
    print(f'Il reste {nb_mismatch} valeurs à matcher.')
    df_merge.head()
    return df_merge

# Function to evaluate the missing values per family
def groupna_family(df_merge):
    family = df_merge.groupby(['Family']).count()
    mask_family = (family['Name_big'] != family['Name_dat'])
    family_na = family[mask_family].copy()
    family_na['nb_na'] = family_na['Name_big'] - family_na['Size']
    # We are interested by the 5 families with the most of na
    print(family_na['nb_na'].sort_values(ascending=False).head(5))

# Function to iinitialize df_big et df_dat
# We want to look through the wings left in each dataframe in order to avoid corrupting good matchs

def initiate_df_left(df_bigtable, df_dat, df_merge):
    # Create df_dat_left and df_big_left
    df_dat_left = pd.merge(df_bigtable, df_dat, on='Name_modified', how='right', suffixes=('_big', '_dat'))
    df_dat_left = df_dat_left[df_dat_left['Name_big'].isna()].copy()
    print(f"Unmatched values left in the dat folder : {df_dat_left.shape}")
    df_big_left = df_merge[df_merge['Name_dat'].isna()].copy()
    print(f"Unmatched values left in big table : {df_big_left.shape}")

    # Create a new column for each df containing the first letter of each wing
    df_dat_left['First letter'] = [x[0] for x in df_dat_left['Name_modified'].values]
    df_big_left['First letter'] = [x[0] for x in df_big_left['Name_modified'].values]

    # Create a new column for each df with z copy of the modified name to further work on
    df_dat_left['Name_modified_by_family'] = df_dat_left['Name_modified'].copy()
    df_big_left['Name_modified_by_family'] = df_big_left['Name_modified'].copy()

    # Create a new column for each df with z copy of the modified name to further work on
    df_dat_left['Name_modified_by_pattern'] = df_dat_left['Name_modified'].copy()
    df_big_left['Name_modified_by_pattern'] = df_big_left['Name_modified'].copy()


    # df_dat_left ----> ['Name_big', 'Family', 'Name_modified', 'Name_dat', 'Size', 'First letter', 'Name_modified_by_family']
    # Name_big and Family are NaN
    df_dat_left.drop(columns=['Name_big', 'Family'], inplace=True)
    # df_big_left ----> ['Name_big', 'Family', 'Name_modified', 'Name_dat', 'Size', 'First letter', 'Name_modified_by_family']
    # Name_dat and Size are NaN
    df_big_left.drop(columns=['Name_dat', 'Size'], inplace=True)

    return df_dat_left, df_big_left

# Function to create filter by letter
def get_letter_group(df_dat_left, df_big_left, letter):
    big_letter = df_big_left[df_big_left['First letter'] == letter]
    dat_letter = df_dat_left[df_dat_left['First letter'] == letter]
    print(f'Big table number of wings left beginning by {letter} : {big_letter.shape}')
    print(f'Dat folder number of wings left beginning by {letter} : {dat_letter.shape}')
    print(f'Difference : {big_letter.shape[0] - dat_letter.shape[0]}')
    return big_letter, dat_letter

# Function to pass through pattern
def try_pattern_family(big_letter, family, pattern_list):
    big_letter = big_letter.copy()
    for pattern in pattern_list:
        # Apply pattern
        big_letter.loc[big_letter['Family'] == family, 'Name_modified_by_family'] = big_letter.loc[big_letter['Family'] == family, 'Name_modified_by_family'].apply(pattern)
    return big_letter

# Function to incorporate the new name found (Name_modified_by_family) into the dataset with the bigtable wings left to match
def incorporate_family_pattern(df_big_left, big_letter):
    df_big_left = pd.merge(df_big_left, big_letter[['Name_big', 'Name_modified_by_family']], on='Name_big', how='left', suffixes=('_left', '_big'))
    df_big_left['Name_modified_by_family_big'] = df_big_left['Name_modified_by_family_big'].fillna(df_big_left['Name_modified_by_family_left'])
    df_big_left.drop(["Name_modified_by_family_left"], inplace=True, axis=1)
    df_big_left.rename(columns={'Name_modified_by_family_big':'Name_modified_by_family'}, inplace=True)
    print(df_big_left.columns)
    return df_big_left

def set_df_left(df_dat_left, df_merge, on_column):
    # Create df_dat_left and df_big_left
    df_dat_left = pd.merge(df_merge, df_dat_left, on=on_column, how='right', suffixes=('_big', '_dat'))
    df_dat_left = df_dat_left[df_dat_left['Name_big'].isna()]
    df_dat_left.dropna(axis=1, how='all', inplace=True)
    df_dat_left = df_dat_left.set_axis([re.sub('_dat', "", str(col)) for col in df_dat_left.columns], axis=1)
    df_dat_left.rename(columns={'Name':'Name_dat'}, inplace=True)
    print(f"Unmatched values left in the dat folder : {df_dat_left.shape}")
    df_big_left = df_merge[df_merge['Name_dat'].isna()].copy()
    df_big_left.dropna(axis=1, how='all', inplace=True)
    df_big_left = df_big_left.set_axis([re.sub('_big', "", str(col)) for col in df_big_left.columns], axis=1)
    df_big_left.rename(columns={'Name':'Name_big'}, inplace=True)
    print(f"Unmatched values left in big table : {df_big_left.shape}")
    return df_dat_left, df_big_left

def try_pattern(df_big_left, df_dat_left, pattern_list):
    df_big_left['Name_modified_by_pattern'] = df_big_left['Name_modified'].copy()
    df_big_left['Name_modified_by_pattern'] = df_big_left['Name_modified'].copy()

    for pattern in pattern_list:
        df_big_left['Name_modified_by_pattern'] = df_big_left.loc[:, 'Name_modified_by_pattern'].apply(pattern)
        df_dat_left['Name_modified_by_pattern'] = df_dat_left.loc[:, 'Name_modified_by_pattern'].apply(pattern)

    return df_dat_left, df_big_left

# Application

In [2131]:
test_function = merge_df(df_bigtable, df_dat, 'Name_modified')
groupna_family(test_function)

Le dataset contient 6324 valeurs dont 4916 correspondent aux fichiers dat.
Il reste 1408 valeurs à matcher.
Family
Gottingen        381
Eppler           201
Uncategorized    147
Wortmann         110
NASA              54
Name: nb_na, dtype: int64


### **1. Create dataframes with wings left to match**

In [2132]:
df_dat_left, df_big_left = initiate_df_left(df_bigtable, df_dat, test_function)

Unmatched values left in the dat folder : (3437, 5)
Unmatched values left in big table : (1408, 5)


### **2. Regex to match the family Yost,Eiffel,Eppeler**

In [2133]:
big_e, dat_e = get_letter_group(df_dat_left, df_big_left, 'e')

Big table number of wings left beginning by e : (221, 6)
Dat folder number of wings left beginning by e : (219, 6)
Difference : 2


In [2134]:
yost = [lambda x : (re.sub("\s", "", str(x))), lambda x : (re.sub("\.", "", str(x))), lambda x : (re.sub("/", "", str(x)))]
eiffel = [lambda x : (re.sub('\(.*?\)', "", str(x))), lambda x : (re.sub("\s", "", str(x))), lambda x : (re.split("-", str(x)))[0]]
eppler = [lambda x : (re.sub("eppler", "e", str(x))), lambda x : (re.sub("\s", "", str(x)))]
big_e = try_pattern_family(big_e, 'Yost', yost)
big_e = try_pattern_family(big_e, 'Eiffel', eiffel)
big_e = try_pattern_family(big_e, 'Eppler', eppler)


In [2135]:
# Function to incorporate the new name found (Name_modified_by_family) into the dataset with the bigtable wings left to match
df_big_left = incorporate_family_pattern(df_big_left, big_e)

Index(['Name_big', 'Family', 'Name_modified', 'First letter',
       'Name_modified_by_pattern', 'Name_modified_by_family'],
      dtype='object')


In [2136]:
# we already did a merge so maybe just add an argument precising the on merge column
# By merging using 'left', we obtain 4905 correspondances
df_merge = merge_df(df_big_left, df_dat_left, 'Name_modified_by_family')

Le dataset contient 1408 valeurs dont 199 correspondent aux fichiers dat.
Il reste 1209 valeurs à matcher.


### **3. Regex to match the family Gottingen**

In [2137]:
df_dat_left, df_big_left = set_df_left(df_dat_left, df_merge, 'Name_modified_by_family')
big_g, dat_g = get_letter_group(df_dat_left, df_big_left, 'g')

Unmatched values left in the dat folder : (3238, 6)
Unmatched values left in big table : (1209, 6)
Big table number of wings left beginning by g : (399, 6)
Dat folder number of wings left beginning by g : (401, 6)
Difference : -2


In [2138]:
gottingen = [lambda x : (re.sub("\s", "", str(x))), lambda x : (re.sub('\(.*?\)', "", str(x)))]
big_g = try_pattern_family(big_g, 'Gottingen', gottingen)

In [2139]:
df_big_left = incorporate_family_pattern(df_big_left, big_g)

Index(['Name_big', 'Family', 'Name_modified', 'First letter',
       'Name_modified_by_pattern', 'Name_modified_by_family'],
      dtype='object')


In [2140]:
# we already did a merge so maybe just add an argument precising the on merge column
# By merging using 'left', we obtain 4905 correspondances
df_merge = merge_df(df_big_left, df_dat_left, 'Name_modified_by_family')

Le dataset contient 1209 valeurs dont 372 correspondent aux fichiers dat.
Il reste 837 valeurs à matcher.


### **4. Global regex with quick manual verif**

In [2141]:
df_dat_left, df_big_left = set_df_left(df_dat_left, df_merge, 'Name_modified_by_family')


Unmatched values left in the dat folder : (2866, 6)
Unmatched values left in big table : (837, 6)


In [2142]:
pattern_global =[lambda x : (re.sub("\s", "", str(x))), lambda x : (re.sub("-", "", str(x))), lambda x : (re.sub("%", "", str(x)))]
df_dat_left, df_big_left = try_pattern(df_big_left, df_dat_left, pattern_global)

In [2143]:
df_merge = merge_df(df_big_left, df_dat_left, 'Name_modified_by_pattern')

Le dataset contient 837 valeurs dont 245 correspondent aux fichiers dat.
Il reste 592 valeurs à matcher.


### **5. keep trying**

In [2144]:
df_dat_left, df_big_left = set_df_left(df_dat_left, df_merge, 'Name_modified_by_pattern')

Unmatched values left in the dat folder : (2621, 6)
Unmatched values left in big table : (592, 6)


In [2145]:
pattern_global =[lambda x : (re.sub("\s", "", str(x))),
                 lambda x : (re.sub("\.", "", str(x))),
                 lambda x : (re.sub("/", "", str(x))),
                 lambda x : (re.sub("-", "", str(x))), 
                 lambda x : (re.sub("%", "", str(x))),
                 lambda x : (re.sub("\(", "", str(x))),
                 lambda x : (re.sub("\)", "", str(x)))]
df_dat_left, df_big_left = try_pattern(df_big_left, df_dat_left, pattern_global)

In [2146]:
df_merge = merge_df(df_big_left, df_dat_left, 'Name_modified_by_pattern')

Le dataset contient 592 valeurs dont 86 correspondent aux fichiers dat.
Il reste 506 valeurs à matcher.


In [2147]:
groupna_family(df_merge)

Family
Uncategorized    99
Hepperle         49
Selig et. al.    48
Boeing           32
Leinauer         25
Name: nb_na, dtype: int64


### **8. Smooth -> sm**

In [2148]:
df_dat_left, df_big_left = set_df_left(df_dat_left, df_merge, 'Name_modified_by_pattern')

Unmatched values left in the dat folder : (2535, 6)
Unmatched values left in big table : (506, 6)


In [2149]:
list_smooth = df_big_left.loc[df_big_left['Name_modified'].str.contains('smooth'), 'Name_modified']

In [2150]:
pattern_global =[lambda x : (re.sub("\s", "", str(x))),
                 lambda x : (re.sub("smoothed", "sm", str(x))),
                 lambda x : (re.sub("\.", "", str(x))),
                 lambda x : (re.sub("/", "", str(x))),
                 lambda x : (re.sub("-", "", str(x))), 
                 lambda x : (re.sub("%", "", str(x))),
                 lambda x : (re.sub("\(", "", str(x))),
                 lambda x : (re.sub("\)", "", str(x)))]
df_dat_left, df_big_left = try_pattern(df_big_left, df_dat_left, pattern_global)

In [2151]:
df_merge = merge_df(df_big_left, df_dat_left, 'Name_modified_by_pattern')

Le dataset contient 506 valeurs dont 13 correspondent aux fichiers dat.
Il reste 493 valeurs à matcher.


In [2152]:
list_smoothed = df_merge.loc[df_merge['Name_modified_dat'].isna()==False, 'Name_modified_big'].to_list()
list_smoothed

['a18 (smoothed)',
 'be50 (smoothed)',
 'ch10 (smoothed)',
 'davis (smoothed)',
 'gemini (smoothed)',
 'gm15 (smoothed)',
 'goe 795 smoothed',
 'k3311 (smoothed)',
 'ma409 (smoothed)',
 'pmc19 smoothed',
 'tsagi p-ii (15.5%) smoothed',
 'ua(2)-180 smoothed',
 'wasp (smoothed)']

### **6. Removing parenthesis while keeping the inside content**

In [2153]:
df_dat_left, df_big_left = set_df_left(df_dat_left, df_merge, 'Name_modified_by_pattern')

Unmatched values left in the dat folder : (2522, 6)
Unmatched values left in big table : (493, 6)


In [2154]:
# New columns to be created normally !!!!
pattern_global =[lambda x : (re.sub("\(", "", str(x))),
                 lambda x : (re.sub("\)", "", str(x)))
                 ]
df_dat_left, df_big_left = try_pattern(df_big_left, df_dat_left, pattern_global)

In [2155]:
df_merge = merge_df(df_big_left, df_dat_left, 'Name_modified_by_pattern')

Le dataset contient 493 valeurs dont 0 correspondent aux fichiers dat.
Il reste 493 valeurs à matcher.


In [2156]:
groupna_family(df_merge)

Family
Uncategorized    88
Hepperle         49
Selig et. al.    48
Boeing           32
Leinauer         25
Name: nb_na, dtype: int64


### **7. Removing whats between parenthesis with manual verif**

In [2157]:
df_dat_left, df_big_left = set_df_left(df_dat_left, df_merge, 'Name_modified_by_pattern')

Unmatched values left in the dat folder : (2522, 6)
Unmatched values left in big table : (493, 6)


In [2158]:
# New columns to be created normally !!!!
pattern_global =[lambda x : (re.sub('\(.*?\)', "", str(x))),
                 lambda x : (re.sub("\s", "", str(x)))]
df_dat_left, df_big_left = try_pattern(df_big_left, df_dat_left, pattern_global)

df_big_left['parenthesis_content'] = df_big_left['Name_modified'].copy()
df_big_left['parenthesis_content'] = df_big_left.loc[:, 'parenthesis_content'].apply(lambda x : (re.findall('\(.*?\)', str(x))))
df_dat_left['parenthesis_content'] = df_dat_left['Name_modified'].copy()
df_dat_left['parenthesis_content'] = df_dat_left.loc[:, 'parenthesis_content'].apply(lambda x : (re.findall('\(.*?\)', str(x))))

In [2159]:
df_big_left.loc[2,:]

Name_big                    AG44CT -02F
Family                            Drela
Name_modified               ag44ct -02f
First letter                          a
Name_modified_by_pattern     ag44ct-02f
Name_modified_by_family     ag44ct -02f
parenthesis_content                  []
Name: 2, dtype: object

In [2160]:
df_merge = merge_df(df_big_left, df_dat_left, 'Name_modified_by_pattern')

Le dataset contient 493 valeurs dont 30 correspondent aux fichiers dat.
Il reste 463 valeurs à matcher.


In [2161]:
groupna_family(df_merge)

Family
Uncategorized    79
Hepperle         49
Boeing           32
Selig et. al.    28
Leinauer         25
Name: nb_na, dtype: int64


In [2162]:
# df_merge[['Name_big', 'Family', 'Name_modified_by_pattern', 'parenthesis_content_big', 'Name_modified_dat', 'Name_dat']]

In [2163]:
# get duplicates in Name_dat of df_merge
duplicates_without_parenthesis = df_merge['Name_modified_dat'].value_counts().index[df_merge['Name_modified_dat'].value_counts().values > 1].to_list()
print(duplicates_without_parenthesis)
# Get their index
index_duplicates_without_parenthesis = [df_merge[df_merge['Name_modified_dat'] == name].index for name in duplicates_without_parenthesis]
index_list = []
for index in index_duplicates_without_parenthesis:
    index_list.append(index[0])
    index_list.append(index[1])
print(index_list)

[]
[]


In [2164]:
df_merge.loc[index_list, :]

Unnamed: 0,Name_big,Family,Name_modified_big,First letter_big,Name_modified_by_pattern,Name_modified_by_family_big,parenthesis_content_big,Name_modified_by_family_dat,Name_modified_dat,Name_dat,Size,First letter_dat,parenthesis_content_dat


In [2165]:
df_dat_left[df_dat_left['Name_modified'].str.contains(duplicates_without_parenthesis[0])]

IndexError: list index out of range

In [None]:
df_dat_left[df_dat_left['Name_modified'].str.contains(duplicates_without_parenthesis[1])]

Unnamed: 0,Name_modified_by_pattern,Name_modified_by_family,Name_modified,Name_dat,Size,First letter,parenthesis_content
1252,k3311,k3311,k3311,k3311.DAT,1140,k,[]
1253,k3311sm,k3311sm,k3311sm,k3311sm.DAT,1814,k,[]


In [None]:
df_dat_left[df_dat_left['Name_modified'].str.contains(duplicates_without_parenthesis[2])]

Unnamed: 0,Name_modified_by_pattern,Name_modified_by_family,Name_modified,Name_dat,Size,First letter,parenthesis_content
1299,ma409,ma409,ma409,ma409.DAT,962,m,[]
1300,ma409sm,ma409sm,ma409sm,ma409sm.DAT,1794,m,[]
