In [3002]:
import pandas as pd
import numpy as np
import regex as re

# Get CSV

In [3003]:
# Allow up to 8353 rows to be rendered, which matches the row count of the .DAT index
# loaded below. The full option name is spelled out: pandas resolves abbreviated names
# by pattern matching, which can become ambiguous if a similarly named option is added.
pd.set_option('display.max_rows', 8353)

In [3004]:
# Load the index of .DAT files, keeping only the file name and size columns,
# and capitalise the headers so they line up with the big table's 'Name' column.
df_dat = (
    pd.read_csv('dat_files_index.csv', usecols=['name', 'size'])
    .rename(columns={'name': 'Name', 'size': 'Size'})
)
print(df_dat.shape)
df_dat.head()

(8353, 2)


Unnamed: 0,Name,Size
0,a18.DAT,945
1,a18sm.DAT,1795
2,A6014-S.DAT,6116
3,A6016-S.DAT,5914
4,A6018-S.DAT,5889


In [3005]:
# Load the reference table of wings (the "big table"); only the wing name and
# its family are used in this notebook.
df_bigtable = pd.read_csv(
    'data/ailes_avion.csv',
    usecols=['Name', 'Family'],
)
print(df_bigtable.shape)
df_bigtable.head()

(6324, 2)


Unnamed: 0,Name,Family
0,63A108 MOD C,NASA
1,A18,Uncategorized
2,A18 (SMOOTHED),Uncategorized
3,A6014-S,Ayers
4,A6016-S,Ayers


# Basic Regex and merge

In [3006]:
# Build the merge key `Name_modified` on both tables.
#
# .DAT index: strip the literal ".DAT" extension, then lower-case.
# The dot is escaped so that only a real ".DAT" suffix is removed; an unescaped "."
# matches any character, so a name such as "fooXDAT" would have been cut to "foo".
df_dat['Name_modified'] = (
    df_dat['Name']
    .astype(str)
    .str.replace(r'\.DAT$', '', regex=True)
    .str.lower()
)
# Big table: the names carry no extension, so lower-casing is enough.
df_bigtable['Name_modified'] = df_bigtable['Name'].astype(str).str.lower()

In [3007]:
# Count the number of wings per family in the big table
#df_bigtable['Family'].value_counts()

In [3008]:
# Left-join the big table onto the .DAT index using the normalised name.
# A NaN in `Name_dat` means the wing found no .DAT file (4905 wings match at this stage).
df_merge = df_bigtable.merge(
    df_dat,
    how='left',
    on='Name_modified',
    suffixes=('_big', '_dat'),
)
nb_mismatch = int(df_merge['Name_dat'].isna().sum())
nb_match = len(df_merge) - nb_mismatch
print(f'Le dataset contient {df_merge.shape[0]} valeurs dont {nb_match} correspondent aux fichiers dat.')
print(f'Il reste {nb_mismatch} valeurs à matcher.')
df_merge.head()

Le dataset contient 6324 valeurs dont 4905 correspondent aux fichiers dat.
Il reste 1419 valeurs à matcher.


Unnamed: 0,Name_big,Family,Name_modified,Name_dat,Size
0,63A108 MOD C,NASA,63a108 mod c,,
1,A18,Uncategorized,a18,a18.DAT,945.0
2,A18 (SMOOTHED),Uncategorized,a18 (smoothed),,
3,A6014-S,Ayers,a6014-s,A6014-S.DAT,6116.0
4,A6016-S,Ayers,a6016-s,A6016-S.DAT,5914.0


# Matchmaking using families

Plus tard, il faudrait ajouter une valeur qui indique à quel point on est sûr du matchmaking

In [3009]:
# Count, per family, how many big-table wings have no matching .DAT file.
# groupby().count() counts non-null cells per column, so for each family
# `Name_big` is the number of wings and `Name_dat` the number of matched wings.
family = df_merge.groupby(['Family']).count()
mask_family = (family['Name_big'] != family['Name_dat'])
family_na = family[mask_family].copy()
# Subtract the same column the mask is built on (`Name_dat`). The previous version
# subtracted the `Size` count, which would disagree with the mask if a matched
# .DAT row ever had a missing size.
family_na['nb_na'] = family_na['Name_big'] - family_na['Name_dat']
# We are interested in the 5 families with the most unmatched wings
family_na['nb_na'].sort_values(ascending=False).head(5)

Family
Gottingen        382
Eppler           201
Uncategorized    153
Wortmann         112
NASA              54
Name: nb_na, dtype: int64

### **1. Create dataframes with wings left to match**

In [3010]:
# Create df_dat_left and df_big_left: the rows of each table that found no partner.
# .DAT side: right-join so every .DAT file is kept, then keep the files without a
# big-table name. The explicit .copy() makes the result an independent frame, so the
# column assignments below do not trigger SettingWithCopyWarning.
df_dat_left = pd.merge(df_bigtable, df_dat, on='Name_modified', how='right', suffixes=('_big', '_dat'))
df_dat_left = df_dat_left[df_dat_left['Name_big'].isna()].copy()
print(f"Unmatched values left in the dat folder : {df_dat_left.shape}")
# Big-table side: rows of the left merge that have no .DAT file.
df_big_left = df_merge[df_merge['Name_dat'].isna()].copy()
print(f"Unmatched values left in big table : {df_big_left.shape}")

# Create a new column for each df containing the first letter of each wing.
# .str[0] yields NaN for an empty name, where indexing x[0] would raise IndexError.
df_dat_left['First letter'] = df_dat_left['Name_modified'].str[0]
df_big_left['First letter'] = df_big_left['Name_modified'].str[0]

# Create a new column for each df with a copy of the modified name to further work on
df_dat_left['Name_modified_by_family'] = df_dat_left['Name_modified'].copy()
df_big_left['Name_modified_by_family'] = df_big_left['Name_modified'].copy()

Unmatched values left in the dat folder : (3448, 5)
Unmatched values left in big table : (1419, 5)


In [3011]:
# Both frames currently hold the columns
# ['Name_big', 'Family', 'Name_modified', 'Name_dat', 'Size', 'First letter', 'Name_modified_by_family'].
# In df_dat_left the big-table columns (Name_big, Family) are NaN on every row: remove them.
df_dat_left = df_dat_left.drop(columns=['Name_big', 'Family'])
# In df_big_left the .DAT columns (Name_dat, Size) are NaN on every row: remove them.
df_big_left = df_big_left.drop(columns=['Name_dat', 'Size'])

### **2. Regex to match the families Yost, Eiffel, Eppler**

- Family Yost: names match after deleting the spaces, dots and `/` from the big-table names (`big_e`)
- Family Eiffel: names match after deleting the spaces and the content between parentheses, plus a special case for `eiffel 10 (wright) - 1903 wright flyer airfoil` (everything from the `-` onwards is dropped)
- Family Eppler: names match after replacing `eppler` by `e` and removing the spaces

In [3012]:
# Restrict both "left to match" frames to the names starting with "e".
# big_e is taken as an explicit copy because the next cell edits it with .loc;
# without the copy pandas treats it as a slice of df_big_left and emits
# SettingWithCopyWarning (the "g" pass does the same with big_g.copy()).
big_e = df_big_left[df_big_left['First letter'] == 'e'].copy()
dat_e = df_dat_left[df_dat_left['First letter'] == 'e']
print(f'Big table number of wings left beginning by e : {big_e.shape}')
print(f'Dat folder number of wings left beginning by e : {dat_e.shape}')
print(f'Difference : {big_e.shape[0] - dat_e.shape[0]}')

Big table number of wings left beginning by e : (221, 5)
Dat folder number of wings left beginning by e : (219, 5)
Difference : 2


In [3013]:
# Family-specific clean-up of `Name_modified_by_family` inside big_e.
# All patterns are raw strings: "\s", "\." and "\(" in a normal string literal are
# invalid escape sequences, which Python reports as a warning.

# Family Yost: remove whitespace, dots and slashes.
is_yost = big_e['Family'] == 'Yost'
big_e.loc[is_yost, 'Name_modified_by_family'] = (
    big_e.loc[is_yost, 'Name_modified_by_family']
    .astype(str)
    .str.replace(r"[\s./]", "", regex=True)
)

# Family Eiffel: remove any "(...)" group, then whitespace, then keep only what
# precedes the first "-" (handles "eiffel 10 (wright) - 1903 wright flyer airfoil").
is_eiffel = big_e['Family'] == 'Eiffel'
big_e.loc[is_eiffel, 'Name_modified_by_family'] = (
    big_e.loc[is_eiffel, 'Name_modified_by_family']
    .astype(str)
    .str.replace(r"\(.*?\)", "", regex=True)
    .str.replace(r"\s", "", regex=True)
    .str.split("-")
    .str[0]
)

# Family Eppler: abbreviate "eppler" to "e", then remove whitespace.
is_eppler = big_e['Family'] == 'Eppler'
big_e.loc[is_eppler, 'Name_modified_by_family'] = (
    big_e.loc[is_eppler, 'Name_modified_by_family']
    .astype(str)
    .str.replace("eppler", "e", regex=True)
    .str.replace(r"\s", "", regex=True)
)

In [3014]:
# Bring the family-specific names computed on big_e back into df_big_left.
# Both frames own a `Name_modified_by_family` column, so the merge suffixes them:
# `_left` is the untouched copy from df_big_left, `_big` the value coming from big_e
# (NaN for every wing that is not part of big_e).
df_big_left = df_big_left.merge(
    big_e[['Name_big', 'Name_modified_by_family']],
    how='left',
    on='Name_big',
    suffixes=('_left', '_big'),
)
print(df_big_left.columns)
# Keep the untouched copy as `..._original`; the big_e value becomes the working column.
df_big_left = df_big_left.rename(
    columns={
        'Name_modified_by_family_left': 'Name_modified_by_family_original',
        'Name_modified_by_family_big': 'Name_modified_by_family',
    }
)

Index(['Name_big', 'Family', 'Name_modified', 'First letter',
       'Name_modified_by_family_left', 'Name_modified_by_family_big'],
      dtype='object')


In [3015]:
# Preview the result: wings that were not part of big_e have NaN in
# `Name_modified_by_family`, since the column now comes from the left merge with big_e.
df_big_left.head()

Unnamed: 0,Name_big,Family,Name_modified,First letter,Name_modified_by_family_original,Name_modified_by_family
0,63A108 MOD C,NASA,63a108 mod c,6,63a108 mod c,
1,A18 (SMOOTHED),Uncategorized,a18 (smoothed),a,a18 (smoothed),
2,AG03 (FLAT AFT BOTTOM),Drela,ag03 (flat aft bottom),a,ag03 (flat aft bottom),
3,AG44CT -02F,Drela,ag44ct -02f,a,ag44ct -02f,
4,AG455CT -02F ROT.,Drela,ag455ct -02f rot.,a,ag455ct -02f rot.,


In [3016]:
# Left-join the unmatched big-table wings onto the unmatched .DAT files using the
# family-specific name. A NaN in `Name_dat` means the wing is still unmatched
# (199 additional wings match at this stage).
df_merge_e = df_big_left.merge(
    df_dat_left,
    how='left',
    on='Name_modified_by_family',
    suffixes=('_big', '_dat'),
)
print(df_merge_e.shape)
nb_mismatch = int(df_merge_e['Name_dat'].isna().sum())
nb_match = len(df_merge_e) - nb_mismatch
print(f'Le dataset contient {df_merge_e.shape[0]} valeurs dont {nb_match} correspondent aux fichiers dat.')
print(f'Il reste {nb_mismatch} valeurs à matcher.')
df_merge_e.head()

(1419, 10)
Le dataset contient 1419 valeurs dont 199 correspondent aux fichiers dat.
Il reste 1220 valeurs à matcher.


Unnamed: 0,Name_big,Family,Name_modified_big,First letter_big,Name_modified_by_family_original,Name_modified_by_family,Name_modified_dat,Name_dat,Size,First letter_dat
0,63A108 MOD C,NASA,63a108 mod c,6,63a108 mod c,,,,,
1,A18 (SMOOTHED),Uncategorized,a18 (smoothed),a,a18 (smoothed),,,,,
2,AG03 (FLAT AFT BOTTOM),Drela,ag03 (flat aft bottom),a,ag03 (flat aft bottom),,,,,
3,AG44CT -02F,Drela,ag44ct -02f,a,ag44ct -02f,,,,,
4,AG455CT -02F ROT.,Drela,ag455ct -02f rot.,a,ag455ct -02f rot.,,,,,


### **3. Regex to match the family Gottingen**

In [3017]:
# Rebuild df_dat_left and df_big_left with what is still unmatched after the "e" pass.

# .DAT side: right-join against df_merge_e and keep the files no wing was matched to,
# then discard the columns that are NaN on every remaining row.
df_dat_left = pd.merge(df_merge_e, df_dat_left, on='Name_modified_by_family', how='right', suffixes=('_big', '_dat'))
df_dat_left = df_dat_left[df_dat_left['Name_big'].isna()].dropna(axis=1, how='all')
# Remove every "_dat" from the column labels.
# NOTE(review): this strips all occurrences, not only a trailing suffix — confirm the
# resulting labels are the ones the later merges expect.
df_dat_left = df_dat_left.rename(columns=lambda col: re.sub('_dat', "", str(col)))
print(f"Unmatched values left in the dat folder : {df_dat_left.shape}")

# Big-table side: wings of df_merge_e with no .DAT file, same column clean-up,
# this time removing "_big" from the labels.
df_big_left = df_merge_e[df_merge_e['Name_dat'].isna()].dropna(axis=1, how='all')
df_big_left = df_big_left.rename(columns=lambda col: re.sub('_big', "", str(col)))
print(f"Unmatched values left in big table : {df_big_left.shape}")

Unmatched values left in the dat folder : (3249, 5)
Unmatched values left in big table : (1220, 6)


In [3018]:
# Restrict both "left to match" frames to the names starting with "g"
# and compare how many candidates each side has.
big_g = df_big_left[df_big_left['First letter'].eq('g')]
dat_g = df_dat_left[df_dat_left['First letter'].eq('g')]
print(f'Big table number of wings left beginning by g : {big_g.shape}')
print(f'Dat folder number of wings left beginning by g : {dat_g.shape}')
print(f'Difference : {len(big_g) - len(dat_g)}')

Big table number of wings left beginning by g : (406, 6)
Dat folder number of wings left beginning by g : (408, 5)
Difference : -2


In [3019]:
# big_g was obtained by filtering df_big_left; take an explicit copy so the .loc
# assignments in the next cell operate on an independent frame.
big_g = big_g.copy()

In [3020]:
# Hand-written exceptions for names beginning with 'g':
# lower-cased big-table name -> name to look up on the .DAT side.
g_name_exceptions = {
    'gu25-5(11)8': 'gu255118',
    'gs-1': 'gs1',
    'griffith 30% suction airfoil': 'griffith30symsuction',
    'goe 167 (v.karman prop.2)': 'goe167',
    'glenn martin 2': 'glennmartin2',
    'glenn martin 3': 'glennmartin3',
    'glenn martin 4': 'glennmartin4',
}
for big_name, dat_name in g_name_exceptions.items():
    big_g.loc[big_g['Name_modified'] == big_name, 'Name_modified_by_family'] = dat_name

# Family Gottingen: start again from `Name_modified`, remove whitespace, then remove
# any "(...)" group. This runs after the exceptions, so it overwrites them for rows
# whose family is Gottingen.
# Patterns are raw strings: "\s" and "\(" in a normal string literal are invalid
# escape sequences, which Python reports as a warning.
is_gottingen = big_g['Family'] == 'Gottingen'
big_g.loc[is_gottingen, 'Name_modified_by_family'] = (
    big_g.loc[is_gottingen, 'Name_modified']
    .astype(str)
    .str.replace(r"\s", "", regex=True)
    .str.replace(r"\(.*?\)", "", regex=True)
)

In [3021]:
# Inspect the column labels of big_g before merging it back into df_big_left
big_g.columns

Index(['Name', 'Family', 'Name_modified', 'First letter',
       'Name_modified_by_family_original', 'Name_modified_by_family'],
      dtype='object')

In [3022]:
# Inspect the column labels of df_big_left (the merge below joins the two frames on 'Name')
df_big_left.columns

Index(['Name', 'Family', 'Name_modified', 'First letter',
       'Name_modified_by_family_original', 'Name_modified_by_family'],
      dtype='object')

In [3023]:
# Bring the names computed on big_g back into df_big_left.
# Both frames own a `Name_modified_by_family` column, so the merge suffixes them:
# `_left` is the value left over from the "e" pass, `_big` the value coming from big_g
# (NaN for every wing that is not part of big_g).
df_big_left = df_big_left.merge(
    big_g[['Name', 'Name_modified_by_family']],
    how='left',
    on='Name',
    suffixes=('_left', '_big'),
)
print(df_big_left.columns)
# Archive the "e" pass value as `..._e`; the big_g value becomes the working column.
df_big_left = df_big_left.rename(
    columns={
        'Name_modified_by_family_left': 'Name_modified_by_family_e',
        'Name_modified_by_family_big': 'Name_modified_by_family',
    }
)

Index(['Name', 'Family', 'Name_modified', 'First letter',
       'Name_modified_by_family_original', 'Name_modified_by_family_left',
       'Name_modified_by_family_big'],
      dtype='object')


In [3024]:
# Left-join the unmatched big-table wings onto the unmatched .DAT files using the
# name produced by the "g" pass, then discard columns that are NaN on every row.
# A NaN in `Name_dat` means the wing is still unmatched
# (379 additional wings match at this stage).
df_merge_g = (
    df_big_left
    .merge(df_dat_left, how='left', on='Name_modified_by_family', suffixes=('_big', '_dat'))
    .dropna(axis=1, how='all')
)
print(df_merge_g.shape)
nb_mismatch = int(df_merge_g['Name_dat'].isna().sum())
nb_match = len(df_merge_g) - nb_mismatch
print(f'Le dataset contient {df_merge_g.shape[0]} valeurs dont {nb_match} correspondent aux fichiers dat.')
print(f'Il reste {nb_mismatch} valeurs à matcher.')
df_merge_g.head()

(1220, 11)
Le dataset contient 1220 valeurs dont 379 correspondent aux fichiers dat.
Il reste 841 valeurs à matcher.


Unnamed: 0,Name_big,Family,Name_modified_big,First letter_big,Name_modified_by_family_original,Name_modified_by_family_e,Name_modified_by_family,Name_modified_dat,Name_dat,Size,First letter_dat
0,63A108 MOD C,NASA,63a108 mod c,6,63a108 mod c,,,,,,
1,A18 (SMOOTHED),Uncategorized,a18 (smoothed),a,a18 (smoothed),,,,,,
2,AG03 (FLAT AFT BOTTOM),Drela,ag03 (flat aft bottom),a,ag03 (flat aft bottom),,,,,,
3,AG44CT -02F,Drela,ag44ct -02f,a,ag44ct -02f,,,,,,
4,AG455CT -02F ROT.,Drela,ag455ct -02f rot.,a,ag455ct -02f rot.,,,,,,


### **4. Autre famille**

In [3031]:
# Rebuild df_dat_left and df_big_left with what is still unmatched after the "g" pass.

# .DAT side: right-join against df_merge_g and keep the files no wing was matched to,
# then discard the columns that are NaN on every remaining row.
df_dat_left = pd.merge(df_merge_g, df_dat_left, on='Name_modified_by_family', how='right', suffixes=('_big', '_dat'))
df_dat_left = df_dat_left[df_dat_left['Name_big'].isna()].dropna(axis=1, how='all')
# Remove every "_dat" from the column labels (all occurrences, not only a trailing suffix).
df_dat_left = df_dat_left.rename(columns=lambda col: re.sub('_dat', "", str(col)))
print(f"Unmatched values left in the dat folder : {df_dat_left.shape}")

# Big-table side: wings of df_merge_g with no .DAT file, same column clean-up,
# this time removing "_big" from the labels.
df_big_left = df_merge_g[df_merge_g['Name_dat'].isna()].dropna(axis=1, how='all')
df_big_left = df_big_left.rename(columns=lambda col: re.sub('_big', "", str(col)))
print(f"Unmatched values left in big table : {df_big_left.shape}")

Unmatched values left in the dat folder : (2870, 5)
Unmatched values left in big table : (841, 7)


In [None]:
# df_big_left3.dropna(axis=1, how='all', inplace=True)
# df_dat_left3.dropna(axis=1, how='all', inplace=True)
# big_w = df_big_left3[df_big_left3['First letter_big2'] == 'w']
# dat_w = df_dat_left3[df_dat_left3['First letter'] == 'w']
# print(f'Big table number of wings left beginning by g : {big_w.shape}')
# print(f'Dat folder number of wings left beginning by g : {dat_w.shape}')
# print(f'Difference : {big_w.shape[0] - dat_w.shape[0]}')