This code：
1. removes the country names in ORBIS data 
2. conducts fuzzy matching the company names in "Organization" column in ORBIS data by calculating the word similarity in GDELT data. Some companies' names are similar but not the same, this code helps them to figure out the exact company names



In [1]:
import pandas as pd
import numpy as np 
from cleanco import basename
from sklearn.feature_extraction import text
import pycountry
from fuzzywuzzy import fuzz
import py_stringsimjoin as ssj
from difflib import SequenceMatcher
from collections import Counter
import string
from cleanco import basename
import regex as re
import py_stringsimjoin as ssj
import pandas as pd
from fuzzywuzzy import fuzz
import jellyfish
import pyarrow as pa
import string
import unicodedata
import regex as re
import py_stringmatching as sm


### 1. read data

In [14]:

ORBIS_INPUT = './input/NG_firm_names_lms.parquet' 
GDELT_INPUT = './input/Niger_GDELT_100.csv'
indata_orbis = pd.read_parquet(ORBIS_INPUT).head(1000)
indata_gdelt = pd.read_csv(GDELT_INPUT).head(100)

print('debug: read Orbis data parquet successfully')



debug: read Orbis data parquet successfully


In [15]:
indata_orbis.columns
indata_orbis.head()

Unnamed: 0,bvdid,category_of_company,name_internat,name_native,ctryiso
0,NG110012R,SMALL COMPANY,Bodion Nigeria Limited,Bodion Nigeria Limited,NG
1,NG110013R,SMALL COMPANY,Himak Construction Nigeria Limited,Himak Construction Nigeria Limited,NG
2,NG110014R,SMALL COMPANY,Commodity Trading Nig. Ltd,Commodity Trading Nig. Ltd,NG
3,NG110015R,SMALL COMPANY,Kolawole Ajayi Akinsulire Nig. Ltd,Kolawole Ajayi Akinsulire Nig. Ltd,NG
4,NG1100169R,SMALL COMPANY,A. K Building Limited,A. K Building Limited,NG


### 2. process Orbis data

In [16]:
indata_orbis = indata_orbis.iloc[:,2:3].dropna()
indata_orbis['name_original'] = indata_orbis['name_internat']
indata_orbis['name'] = pd.DataFrame(indata_orbis['name_internat'].apply(str.lower))
outdata_orbis = indata_orbis[['name_original', 'name']]
outdata_orbis.head()
# indata_orbis.head()

Unnamed: 0,name_original,name
0,Bodion Nigeria Limited,bodion nigeria limited
1,Himak Construction Nigeria Limited,himak construction nigeria limited
2,Commodity Trading Nig. Ltd,commodity trading nig. ltd
3,Kolawole Ajayi Akinsulire Nig. Ltd,kolawole ajayi akinsulire nig. ltd
4,A. K Building Limited,a. k building limited


In [17]:
stop = text.ENGLISH_STOP_WORDS

def preprocess_nlp(row):
        row = row.lower()
        row = row.strip()
        row = re.sub(r'\(.*\)', '', row)
        row = row.translate(str .maketrans('', '', string.punctuation))
        row =' '.join(row.split())
        row = unicodedata.normalize('NFKD', row).encode('ASCII', 'ignore').decode()
        row =  ' '.join(word.lower() for word in row.split() if word not in stop)
        return row

outdata_orbis['name_clean']=outdata_orbis['name'].apply(preprocess_nlp)

outdata_orbis.to_csv('./output/orbis_list.csv')


In [18]:
indata_gdelt = indata_gdelt[['organizations']].dropna()

orgs_unextracted_gdelt = []

for index, row in indata_gdelt.iterrows():
    # row is a single-item list with a string surrounded
    # by curly braces. Extract the single item and remove
    # the surrounding curly braces.
    orgs_unextracted_gdelt.append(row[0][1:-1])

# The rows are json-like formatted strings that contain non-quoted
# information which includes company names, each of which can be extracted 
# via regex and be treated as a subrow.
orgs_extracted_gdelt = []

# The rows are json-like formatted strings that contain non-quoted
# information which includes company names, each of which can be extracted 
# via regex and be treated as a subrow.
for row in orgs_unextracted_gdelt:
    row = row.split('},')
    for subrow in row:
        match = re.findall(r'(?:n=)(.*)(?:,)', subrow)
        orgs_extracted_gdelt.append(match[0])

outdata_gdelt = pd.DataFrame(orgs_extracted_gdelt)
outdata_gdelt.rename(columns={0: 'name_gdelt'}, inplace=True)
outdata_gdelt['name_original'] = outdata_gdelt['name_gdelt']
outdata_gdelt['name_gdelt']=outdata_gdelt['name_gdelt'].apply(preprocess_nlp)


### 3. remove country names

In [19]:
file = open("./input/country_spellings.txt")
lines = file.readlines()

country_ls=[]
for l in lines:
    line=l.strip()
    line_2=line.replace('"',"")
    pat = re.compile("sname(?:==|:)(.*?);")
    # print(line_2)
    for i in pat.findall(line_2):
        country_ls.append(i.lower())
    # break


for i in pycountry.countries:
    country_ls.append(i.alpha_2.lower())
    country_ls.append(i.alpha_3.lower())
    country_ls.append(i.name.lower())
len(country_ls)

country_ls.sort(reverse=True)
country_df=pd.DataFrame(data=country_ls,columns=['country_name'])
country_df.drop_duplicates(subset ='country_name',inplace=True)

outdata_gdelt=outdata_gdelt[~outdata_gdelt['name_gdelt'].isin(country_df['country_name'])]
print('debug: country names have been removed ------------------')
outdata_gdelt[outdata_gdelt['name_gdelt'].isin(country_df['country_name'])].to_csv('./output/removed_country_name.csv')


debug: country names have been removed ------------------


In [20]:
outdata_orbis.reset_index(inplace=True)
outdata_gdelt.reset_index(inplace=True)

### 4. Join 2 tables using various similarity measure


In [23]:
ws = sm.WhitespaceTokenizer(return_set=True)

# distance join
output_pairs_distance_join = ssj.edit_distance_join(outdata_orbis, outdata_gdelt,
                                      'index', 'index', 
                                      'name_clean', 'name_gdelt', 
                                      50,
                                      l_out_attrs=['name_clean'], 
                                      r_out_attrs=['name_gdelt'],
                                      n_jobs =-1)

# Jaccard Join 
output_pairs_jaccard_join = ssj.jaccard_join(outdata_orbis, outdata_gdelt, 
                                             'index', 'index', 
                                             'name_clean', 'name_gdelt', 
                                             ws, 0.1, 
                                             l_out_attrs=['name_clean'], 
                                             r_out_attrs=['name_gdelt'],
                                             n_jobs=-1)
# Cosine Join 
output_pairs_cosine_join = ssj.cosine_join(outdata_orbis, outdata_gdelt, 
                                             'index', 'index', 
                                             'name_clean', 'name_gdelt', 
                                             ws, 0.1, 
                                             l_out_attrs=['name_clean'], 
                                             r_out_attrs=['name_gdelt'],
                                             n_jobs=-1)
# Dice Join 
output_pairs_dice_join = ssj.dice_join(outdata_orbis, outdata_gdelt, 
                                             'index', 'index', 
                                             'name_clean', 'name_gdelt', 
                                             ws, 0.1, 
                                             l_out_attrs=['name_clean'], 
                                             r_out_attrs=['name_gdelt'],
                                             n_jobs=-1)

 # overlap join 
output_pairs_overlap_join = ssj.overlap_join(outdata_orbis, outdata_gdelt, 
                                             'index', 'index', 
                                             'name_clean', 'name_gdelt', 
                                             ws, 0.1, 
                                             l_out_attrs=['name_clean'], 
                                             r_out_attrs=['name_gdelt'],
                                             n_jobs=-1)
# overlap coefficient join 
output_pairs_overlap_coefficient_join = ssj.overlap_coefficient_join(outdata_orbis, outdata_gdelt, 
                                             'index', 'index', 
                                             'name_clean', 'name_gdelt', 
                                             ws, 0.1, 
                                             l_out_attrs=['name_clean'], 
                                             r_out_attrs=['name_gdelt'],
                                             n_jobs=-1)
# master list
# To cross join, merge on a temporary key and then drop it.
outdata_gdelt['key'] = 1
outdata_orbis['key'] = 1

master_list = pd.merge(outdata_gdelt, outdata_orbis, on='key').drop('key', 1)
master_list.rename(columns={'name_x': 'name_gdelt', 
                             'name_original_x': 'name_original_gdelt', 
                             'name': 'name_orbis', 
                             'name_clean': 'name_clean_orbis', 
                             'name_original_y': 'name_original_orbis'}, 
                    inplace=True)
master_list.to_csv('./output/master_list.csv')

  projected_dataframe = dataframe[proj_attrs].dropna(0,
  projected_dataframe = dataframe[proj_attrs].dropna(0,
  projected_dataframe = dataframe[proj_attrs].dropna(0,
  projected_dataframe = dataframe[proj_attrs].dropna(0,
  projected_dataframe = dataframe[proj_attrs].dropna(0,
  projected_dataframe = dataframe[proj_attrs].dropna(0,
  projected_dataframe = dataframe[proj_attrs].dropna(0,
  projected_dataframe = dataframe[proj_attrs].dropna(0,
  projected_dataframe = dataframe[proj_attrs].dropna(0,
  projected_dataframe = dataframe[proj_attrs].dropna(0,
  projected_dataframe = dataframe[proj_attrs].dropna(0,
  projected_dataframe = dataframe[proj_attrs].dropna(0,
  master_list = pd.merge(outdata_gdelt, outdata_orbis, on='key').drop('key', 1)


In [24]:
try:
    data = master_list
except:
    data = pd.read_csv('./output/master_list.csv')
    data.drop(columns='Unnamed: 0', inplace=True)
    
data = data.dropna() # To prevent errors processing matches.
# Get matches of names as well as meta information.
# This is where the heavy lifting happens.

display('Match processing will take some time...')
display(str(len(data)) + ' rows...')

# !pip install tqdm
from tqdm import tqdm
tqdm.pandas() # Introduces pd.apply_progress() for progress bars.

# Name comparisons. Run an apply() on two columns.
display('Calculating fuzz ratio for names...')
data['fuzz_ratio'] = data.progress_apply(lambda x: fuzz.ratio(x.name_gdelt, x.name_clean_orbis), axis=1)
display('Calculating fuzz partial ratio for names...')
data['fuzz_partial_ratio'] = data.progress_apply(lambda x: fuzz.partial_ratio(x.name_gdelt, x.name_clean_orbis), axis=1)
display('Calculating token sort ratio for names...')
data['fuzz_token_sort_ratio'] = data.progress_apply(lambda x: fuzz.token_sort_ratio(x.name_gdelt, x.name_clean_orbis), axis=1)
display('Calculating jaro distance for names...')
data['jaro_distance'] = data.progress_apply(lambda x: jellyfish.jaro_distance(x.name_gdelt, x.name_clean_orbis), axis=1)

# Metaphone generation.
display('Generating metaphones for uncleaned orbis names...')
data['metaphone_unclean_orbis'] = data['name_orbis'].progress_apply(jellyfish.metaphone)
display('Generating metaphones for cleaned orbis names...')
data['metaphone_clean_orbis'] = data['name_clean_orbis'].progress_apply(jellyfish.metaphone)
display('Generating metaphones for gdelt names...')
data['metaphone_gdelt'] = data['name_gdelt'].progress_apply(jellyfish.metaphone)

# Metaphone comparisons. Run an apply() on two columns.
display('Calculating fuzz ratio for metaphones...')
data['metaphone_fuzz_ratio'] = data.progress_apply(lambda x: fuzz.ratio(x.metaphone_gdelt, x.metaphone_clean_orbis), axis=1)
display('Calculating fuzz partial ratio for metaphones...')
data['metaphone_fuzz_partial_ratio'] = data.progress_apply(lambda x: fuzz.partial_ratio(x.metaphone_gdelt, x.metaphone_clean_orbis), axis=1)
display('Calculating token sort ratio for metaphones...')
data['metaphone_fuzz_token_sort_ratio'] = data.progress_apply(lambda x: fuzz.token_sort_ratio(x.metaphone_gdelt, x.metaphone_clean_orbis), axis=1)
display('Calculating jaro distance for metaphones...')
data['metaphone_jaro_distance'] = data.progress_apply(lambda x: jellyfish.jaro_distance(x.metaphone_gdelt, x.metaphone_clean_orbis), axis=1)

display('Done.')

'Match processing will take some time...'

'749000 rows...'

'Calculating fuzz ratio for names...'

100%|██████████| 749000/749000 [00:53<00:00, 14127.29it/s]


'Calculating fuzz partial ratio for names...'

100%|██████████| 749000/749000 [01:24<00:00, 8874.40it/s] 


'Calculating token sort ratio for names...'

100%|██████████| 749000/749000 [01:07<00:00, 11031.59it/s]


'Calculating jaro distance for names...'

100%|██████████| 749000/749000 [00:43<00:00, 17027.22it/s]


'Generating metaphones for uncleaned orbis names...'

100%|██████████| 749000/749000 [00:06<00:00, 118194.12it/s]


'Generating metaphones for cleaned orbis names...'

100%|██████████| 749000/749000 [00:06<00:00, 122101.14it/s]


'Generating metaphones for gdelt names...'

100%|██████████| 749000/749000 [00:05<00:00, 143143.34it/s]


'Calculating fuzz ratio for metaphones...'

100%|██████████| 749000/749000 [00:48<00:00, 15416.09it/s]


'Calculating fuzz partial ratio for metaphones...'

100%|██████████| 749000/749000 [01:10<00:00, 10663.27it/s]


'Calculating token sort ratio for metaphones...'

100%|██████████| 749000/749000 [01:04<00:00, 11551.92it/s]


'Calculating jaro distance for metaphones...'

100%|██████████| 749000/749000 [00:44<00:00, 16928.52it/s]


'Done.'

In [25]:

#### py_stringsimjoin
# Edit distance join
data = pd.merge(data, 
                output_pairs_distance_join, 
                how='outer', 
                left_on=['index_x', 'index_y'], 
                right_on=['r_index', 'l_index'])

data.rename(columns={'_sim_score': 'sim_score_distance'}, inplace=True)

#### py_stringmatching
# Jaccard join
data = pd.merge(data, 
                output_pairs_jaccard_join, 
                how='outer', 
                left_on=['index_x', 'index_y'], 
                right_on=['r_index', 'l_index'])

data.rename(columns={'_sim_score': 'sim_score_jaccard'}, inplace=True)

# Cosine Join 
data = pd.merge(data, 
                output_pairs_cosine_join, 
                how='outer', 
                left_on=['index_x', 'index_y'], 
                right_on=['r_index', 'l_index'])

data.rename(columns={'_sim_score': 'sim_score_cosine'}, inplace=True)

# Dice Join 
data = pd.merge(data, 
                output_pairs_dice_join, 
                how='outer', 
                left_on=['index_x', 'index_y'], 
                right_on=['r_index', 'l_index'])

data.rename(columns={'_sim_score': 'sim_score_dice'}, inplace=True)

data = pd.merge(data, output_pairs_overlap_join, 
                how='outer', 
                left_on=['index_x', 'index_y'], 
                right_on=['r_index', 'l_index'])

data.rename(columns={'_sim_score': 'sim_score_overlap'}, inplace=True)

# Overlap coefficient join 
data = pd.merge(data, 
                output_pairs_overlap_coefficient_join, 
                how='outer', 
                left_on=['index_x', 'index_y'], 
                right_on=['r_index', 'l_index'])

data.rename(columns={'_sim_score': 'sim_score_overlap_coefficient'}, inplace=True)

data.to_csv('./output/matches_raw.csv')


  data = pd.merge(data,
  data = pd.merge(data,


In [26]:
try:
    indata = data
except:
    indata = pd.read_csv('./output/matches_raw.csv')
    indata.drop(columns=['Unnamed: 0'], inplace=True)


#Sort match data in a multindex and sort by name and score.
df_sorted = indata.set_index(['name_original_orbis', 'name_original_gdelt'])
df_sorted = df_sorted.sort_values(by=['name_original_orbis', 
                                      'fuzz_ratio', 
                                      'fuzz_partial_ratio', 
                                      'fuzz_token_sort_ratio'], 
                                  ascending=False)
df_sorted = df_sorted.sort_index()

df_sorted.to_csv('./output/matches_sorted.csv')

try:
    df_sorted
except:
    indata = pd.read_csv('./output/matches_sorted.csv')
    df_sorted = indata.set_index(['name_original_orbis', 'name_original_gdelt'])

df_sorted.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,index_x,name_gdelt,index_y,name_orbis,name_clean_orbis,fuzz_ratio,fuzz_partial_ratio,fuzz_token_sort_ratio,jaro_distance,metaphone_unclean_orbis,...,r_index_x,l_name_clean_x,r_name_gdelt_x,sim_score_overlap,_id_y,l_index_y,r_index_y,l_name_clean_y,r_name_gdelt_y,sim_score_overlap_coefficient
name_original_orbis,name_original_gdelt,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
A & A Super Market Limited,Absa Bank Ltd,273,absa bank,230,a & a super market limited,super market limited,28,33,21,0.464815,A A SPR MRKT LMTT,...,,,,,,,,,,
A & A Super Market Limited,Absa Group,259,absa group,230,a & a super market limited,super market limited,20,20,27,0.522222,A A SPR MRKT LMTT,...,,,,,,,,,,
A & A Super Market Limited,Afghanistan Analysts Network,232,afghanistan analysts network,230,a & a super market limited,super market limited,25,30,33,0.442328,A A SPR MRKT LMTT,...,,,,,,,,,,
A & A Super Market Limited,Africa Center,51,africa center,230,a & a super market limited,super market limited,30,38,36,0.534249,A A SPR MRKT LMTT,...,,,,,,,,,,
A & A Super Market Limited,Africa Department Of International Relations,14,africa department international relations,230,a & a super market limited,super market limited,33,45,33,0.605081,A A SPR MRKT LMTT,...,,,,,,,,,,


In [27]:
# Just in case we want to look at the df
# we should have the columns in a nice order.

df_unscored = df_sorted[[
    # 'acronym_gdelt', 
    # 'freq_gdelt', 
    'fuzz_ratio', 
    'fuzz_partial_ratio', 
    'fuzz_token_sort_ratio', 
    'jaro_distance', 
    'metaphone_unclean_orbis', 
    'metaphone_clean_orbis', 
    'metaphone_gdelt',
    'metaphone_jaro_distance',
    'metaphone_fuzz_ratio',
    'metaphone_fuzz_partial_ratio',
    'metaphone_fuzz_token_sort_ratio',
    'sim_score_distance',
    'sim_score_jaccard',
    'sim_score_cosine',
    'sim_score_dice',
    'sim_score_overlap',
    'sim_score_overlap_coefficient',
]]

df_scored = df_unscored

# An approach called "fuzz similarity"
# https://www.analyticsinsight.net/company-names-standardization-using-a-fuzzy-nlp-approach/
df_scored['fuzz_similarity'] = (2 * df_scored['fuzz_partial_ratio'] * df_scored['fuzz_token_sort_ratio']) / (df_scored['fuzz_partial_ratio'] + df_scored['fuzz_token_sort_ratio'])

# Cumulative scores.
df_scored['total_score_name'] = df_scored['fuzz_ratio'] + df_scored['fuzz_partial_ratio'] + df_scored['fuzz_token_sort_ratio']
df_scored['total_score_metaphone'] = df_scored['metaphone_fuzz_ratio'] + df_scored['metaphone_fuzz_partial_ratio'] + df_scored['metaphone_fuzz_token_sort_ratio']

# Save progress here to allow fast manipulation of matching below.
df_matches = df_scored


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_scored['fuzz_similarity'] = (2 * df_scored['fuzz_partial_ratio'] * df_scored['fuzz_token_sort_ratio']) / (df_scored['fuzz_partial_ratio'] + df_scored['fuzz_token_sort_ratio'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_scored['total_score_name'] = df_scored['fuzz_ratio'] + df_scored['fuzz_partial_ratio'] + df_scored['fuzz_token_sort_ratio']
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: ht

In [28]:
df_scored = df_unscored
# An approach called "fuzz similarity"
# https://www.analyticsinsight.net/company-names-standardization-using-a-fuzzy-nlp-approach/
df_scored['fuzz_similarity'] = (2 * df_scored['fuzz_partial_ratio'] * df_scored['fuzz_token_sort_ratio']) / (df_scored['fuzz_partial_ratio'] + df_scored['fuzz_token_sort_ratio'])

# Cumulative scores.
df_scored['total_score_name'] = df_scored['fuzz_ratio'] + df_scored['fuzz_partial_ratio'] + df_scored['fuzz_token_sort_ratio']
df_scored['total_score_metaphone'] = df_scored['metaphone_fuzz_ratio'] + df_scored['metaphone_fuzz_partial_ratio'] + df_scored['metaphone_fuzz_token_sort_ratio']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_scored['fuzz_similarity'] = (2 * df_scored['fuzz_partial_ratio'] * df_scored['fuzz_token_sort_ratio']) / (df_scored['fuzz_partial_ratio'] + df_scored['fuzz_token_sort_ratio'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_scored['total_score_name'] = df_scored['fuzz_ratio'] + df_scored['fuzz_partial_ratio'] + df_scored['fuzz_token_sort_ratio']
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: ht

In [21]:
df_matches = df_scored

In [29]:
# Filter matches.
df_matches = df_matches[((df_matches['total_score_name'] > 280.0) & (df_matches['jaro_distance'] > 0.9))]
df_matches.to_csv('./output/matches_filtered.csv')


In [23]:
try:
    indata = df_matches
except:
    indata = pd.read_csv('./output/matches_filtered.csv')
    indata = indata.set_index(['name_original_orbis', 'name_original_gdelt'])

In [25]:
# Clean up the final output.
dataout = indata[['fuzz_similarity', 
                  'total_score_name', 
                  'total_score_metaphone', 
                #   'freq_gdelt', 
                  'jaro_distance', 
                  'metaphone_jaro_distance', 
                  'sim_score_distance',
                  'sim_score_jaccard',
                  'sim_score_cosine',
                  'sim_score_dice',
                  'sim_score_overlap',
                  'sim_score_overlap_coefficient',
                 ]]

In [26]:
dataout.to_csv('./output/OUTPUT.csv')

In [27]:
dataout

Unnamed: 0_level_0,Unnamed: 1_level_0,fuzz_similarity,total_score_name,total_score_metaphone,jaro_distance,metaphone_jaro_distance,sim_score_distance,sim_score_distance,sim_score_distance,sim_score_jaccard,sim_score_jaccard,...,sim_score_cosine,sim_score_cosine,sim_score_cosine,sim_score_dice,sim_score_dice,sim_score_dice,sim_score_overlap,sim_score_overlap_coefficient,sim_score_overlap_coefficient,sim_score_overlap_coefficient
name_original_orbis,name_original_gdelt,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
