In [5]:
##### Takes a search query as input and get the vectors from the whole dataset to compare.
import spacy
import pickle
import numpy as np
from scipy import spatial
import sys
import unidecode
import pandas as pd
#from sklearn.decomposition import PCA
#QUERY  Neighbours Ids_and_Score_bool
directory='../'
argv=sys.argv
nlp = spacy.load("fr_core_news_lg")
pca = pickle.load(open(directory+'models/pca_30.pkl','rb'))
pca_space= np.load(directory+'models/vectors_pca_30.npy', allow_pickle=True)
id_table=list(np.load(directory+'../data/id_table.npy', allow_pickle=True))
tree = spatial.KDTree(pca_space)
from spacy.lang.fr.stop_words import STOP_WORDS
from spacy.lang.fr import French
parser=French()
stopwords = list(STOP_WORDS)

def process_query(search_query):
    query=str(search_query).lower()
    clean_query = unidecode.unidecode(query)
    tokens=parser(clean_query)
    tokens = [ word.lower_ for word in tokens ]
    tokens = [ word for word in tokens if word not in stopwords]
    tokens = " ".join([i for i in tokens])
    return (tokens)

def query2vec(search_query):
    x=nlp(search_query).vector #spacy 300d
    y=pca.transform([x])[0] #pca 30d
    return(y)

def get_id(idx):
    dataset_id=id_table[idx]
    return(dataset_id)

def get_idx(ids):
    dataset_idx=id_table.index(ids)
    return(dataset_idx)

def id2vec(ids):
    return(pca_space[get_idx(ids)])

def neighbours(vector, n):
    n_ids=[]
    score=[]
    dist, pos=tree.query(vector, k=n)
    for j in range(len(pos)):
        n_ids.append(get_id(pos[j]))
        score.append(1-dist[j]/50) ##very approximate metric 
    return(n_ids, score)

def Search(search_query, n):
    n_ids, score=neighbours(query2vec(process_query(search_query)), n)
    #print(n_ids, score)
    return(n_ids, score)

def Similarity(ids, n):
    n_ids, score=neighbours(id2vec(ids), n)
    return(n_ids, score)



In [6]:
with open('../../data/google_results_ids.json') as json_data:
    data = json.load(json_data)

google_df=pd.DataFrame(dict([ (k,pd.Series(v)) for k,v in data.items() ]))

In [7]:
def load_queries(path='../../data/querys.csv'):
    df=pd.read_csv('../../data/querys.csv', sep=',',error_bad_lines=False, encoding='latin-1')
    ids=df['expected']
    queries=np.array(df['query'], dtype=str)
    return queries, ids

In [8]:
queries, ids = load_queries()

In [9]:
queries

array(['siren', 'sirene', 'entreprise', 'entreprises', 'siret',
       'open damir', 'opendamir', 'damir', 'contours départements',
       'emissions polluantes', 'géofla départements',
       'effectifs police municipale', 'marchés public bourgogne',
       'Liste gares SNCF', 'contours départements français',
       'loi de finance 2016', 'lolf 2016', 'formations pas de calais',
       'accidents de la circulation', 'accidents de la route',
       'risque de décès un an après accident', 'COG',
       'code officiel géographique', 'contour commune',
       'contours communes', 'contour communes', 'code postal',
       'codes postaux', 'prénoms', 'association', 'associations', 'RNA',
       'nan', 'répertoire des associations',
       'répertoire national des associations', 'waldec',
       'organismes de formation', 'organisme de formation',
       'bibliothèques', "annuaire de l'éducation", 'grand débat',
       'vie-publique répertoire'], dtype='<U36')

In [40]:
queries.remove('vie-publique répertoire')

In [41]:
for i in range(len(queries)):
    print(google_df[queries[i]])

0     5b7ffc618b4c4169d30727e0
1     5e719db34444483bd0a1a2b9
2     58ef2cfda3a7293d49c4e17a
3     58d27679a3a7297c157fe3ae
4     5c37848906e3e77914d00e71
5     5971fbe588ee38522976e8f0
6     5dd651758b4c410aeb6582e6
7     5369956ba3a729239d2046fc
8     5a097ae9c751df643bf1b500
9     57092f51a3a72944fcc6ab40
10    54f63501c751df466f882844
11    5530fbacc751df5ff937dddb
12                         NaN
13                         NaN
14                         NaN
15                         NaN
16                         NaN
17                         NaN
18                         NaN
19                         NaN
Name: siren, dtype: object
0     5b7ffc618b4c4169d30727e0
1     5e719db34444483bd0a1a2b9
2     58ef2cfda3a7293d49c4e17a
3     58d27679a3a7297c157fe3ae
4     5971fbe588ee38522976e8f0
5     5dd651758b4c410aeb6582e6
6     5c37848906e3e77914d00e71
7     5eed7bbeb63de854684fc1c2
8     5a22644f88ee3848529af925
9     54f63501c751df466f882844
10    593457c6a3a72968e2876bf3
11    5530fb

In [11]:
import opendatascience_module as opd

In [12]:
all_results=[]
for i in range(len(queries)):
        results=opd.Search(queries[i],20)[0]
        all_results.append(results)

In [13]:
all_results

[['5cc8ccf8634f415c2e02a868',
  '5b7ffc618b4c4169d30727e0',
  '58aeede688ee38212d0209b6',
  '56eedf1e88ee380d24908574',
  '5971fbe588ee38522976e8f0',
  '5ad7207ba3a7292e3ab95b2d',
  '576b13eca3a72927ec4f8575',
  '593457c6a3a72968e2876bf3',
  '5883cbd788ee3810c89b81c1',
  '56eedf15c751df4588d6e93c',
  '558a8518c751df75e4a453c7',
  '5a098079c751df6654626da4',
  '5c5a6f709ce2e7579ccc53f4',
  '5883d53d88ee3809bb9b81a6',
  '59591e1ea3a7291dd09c8156',
  '5ba4a1998b4c410d279d68a4',
  '5ba4a1708b4c410d1a44e797',
  '5ba4a16a8b4c410d3557041b',
  '5e3221ed06e3e70507320baf',
  '5a980555b59508758f84c010'],
 ['5cc8ccf8634f415c2e02a868',
  '5b7ffc618b4c4169d30727e0',
  '58aeede688ee38212d0209b6',
  '5ad7207ba3a7292e3ab95b2d',
  '5971fbe588ee38522976e8f0',
  '593457c6a3a72968e2876bf3',
  '5bd89dba06e3e738f68c0eda',
  '576b13e9a3a72927ec4f856f',
  '5ca0320e9ce2e70830b50dda',
  '5e3221ed06e3e70507320baf',
  '576b13eda3a72927ec4f8577',
  '576b13eca3a72927ec4f8575',
  '5c5a6f709ce2e7579ccc53f4',
  '5a9805

In [65]:
google_df

Unnamed: 0,siren,sirene,entreprises,siret,open damir,opendamir,contours départements,emissions polluantes,géofla départements,effectifs police municipale,...,RNA,Unnamed: 13,répertoire des associations,répertoire national des associations,waldec,organismes de formation,organisme de formation,bibliothèques,annuaire de l'éducation,grand débat
0,5b7ffc618b4c4169d30727e0,5b7ffc618b4c4169d30727e0,5e9dbdbe71589194c8f7b42f,5b7ffc618b4c4169d30727e0,54de1e8fc751df388646738b,54de1e8fc751df388646738b,536991b0a3a729239d203d13,53ba4c07a3a729219b7bead3,5809e241c751df2646c562c5,5369986ba3a729239d204f55,...,58e53811c751df03df38f42d,53699041a3a729239d20396a,58e53811c751df03df38f42d,53699233a3a729239d203e69,58e53811c751df03df38f42d,5c926a7a634f410578005c68,582c8978c751df788ec0bb7e,5ddd3cd0634f412acd60b195,5889d03fa3a72974cbf0d5b1,5c5c3236634f4155110aa4ea
1,5e719db34444483bd0a1a2b9,5e719db34444483bd0a1a2b9,53699569a3a729239d2046eb,5e719db34444483bd0a1a2b9,54ca9d19c751df7e99467389,54ca9d19c751df7e99467389,536991b2a3a729239d203d1a,565c615fc751df0248aad371,59f12582a3a729792d414507,53698f4ca3a729239d2036df,...,53ca2be2a3a7294a1ddd7847,5d13a8b6634f41070a43dff3,53ca2be2a3a7294a1ddd7847,5e0deb87634f410630f1d044,53ca2be2a3a7294a1ddd7847,582c8978c751df788ec0bb7e,5e6125318b4c412192d9143d,5f031d2b73d414ed46d05ffd,5c6b575b9ce2e74a605908de,5c90a4459ce2e77b2ee27c1e
2,58ef2cfda3a7293d49c4e17a,58ef2cfda3a7293d49c4e17a,5c6adbae634f4114a5c41776,5a970f31c751df6d181a6c2a,53699271a3a729239d203f1c,55095017c751df68c9882844,5cc103f906e3e7402d6fc4fc,53699489a3a729239d204488,536995f5a3a729239d20487f,58e53811c751df03df38f42d,...,5d36f81006e3e7531be7501d,5a546923a3a7295c2417f21f,5bbb5400634f41350719b01e,5dfa4d268b4c416bb6270186,5d36f81006e3e7531be7501d,555b5673c751df4821190c78,5e166f7c06e3e76950ce8d52,59cc758ba3a72921191db6c4,5889d042a3a72974cbf0d5b8,5eafea7019df8efd86ded503
3,58d27679a3a7297c157fe3ae,58d27679a3a7297c157fe3ae,536995aba3a729239d2047ad,58ef2cfda3a7293d49c4e17a,537893d3a3a7295dd332d9e0,53699271a3a729239d203f1c,55717121c751df588de5726c,58b7990ba3a7293affefb38d,53699233a3a729239d203e69,,...,53d04837a3a72970fff91f17,5878ee5aa3a7291484cac804,5d36f81006e3e7531be7501d,5448d3e0c751df01f85d0572,53cdfd2ba3a7292987895b2b,5e6125318b4c412192d9143d,5369993aa3a729239d2051bf,5b12086db5950870b30303f1,58481d2388ee384953c65bb3,5c34c4d1634f4173183a64f1
4,5c37848906e3e77914d00e71,5971fbe588ee38522976e8f0,53698f16a3a729239d203648,5e7201d522f2a43e9f736a9a,53699d0ea3a729239d205b2e,,5eded3a7c4bc16480e179ef3,5c9df4b68b4c415e1d76aac6,54ad3c47c751df3070de6536,,...,5eb2c639949dda18c97e9677,5a944adc88ee3849d5288fb7,53698f1aa3a729239d203653,5a980555b59508758f84c010,53ca2e62a3a7294a1ddd784b,5c4ae9a206e3e725ac1b2dce,595918a4a3a7291dcf9c80d7,5890bf78a3a72974c1f0dc8f,58481d2688ee384953c65bb4,53699fb3a3a729239d2061bc
5,5971fbe588ee38522976e8f0,5dd651758b4c410aeb6582e6,53ca2be2a3a7294a1ddd7847,58d27679a3a7297c157fe3ae,,,5726ef67c751df48e1fcca0d,5369945ba3a729239d204413,57868e81a3a7295d371adce0,,...,582c8978c751df788ec0bb7e,5c5c3236634f4155110aa4ea,59590cf1a3a7291dcf9c7ffe,538071a9a3a7297e4d35d6ce,,555c431bc751df5800190c78,59591afda3a7291dcf9c811e,5369903fa3a729239d203966,53699359a3a729239d204183,5465f7fec751df09436c8d07
6,5dd651758b4c410aeb6582e6,5c37848906e3e77914d00e71,5ecf5df0eb87ae4934709e30,5971fbe588ee38522976e8f0,,,56ba2ce488ee382c284b6d5b,5c3678d59ce2e747493938ab,55717121c751df588de5726c,,...,5e32231adee7e71547f5c8ee,56cc6d6988ee385864fa79d0,58ef2cf7a3a7293d49c4e179,5be560f19ce2e72d6683100d,,5369993aa3a729239d2051bf,5c4ae9a206e3e725ac1b2dce,5f054551211dbd0958d05ffd,536998f1a3a729239d2050d4,545b55e1c751df52de9b6045
7,5369956ba3a729239d2046fc,5eed7bbeb63de854684fc1c2,5ebc1ab7e5d3010d32ca8260,59591a8ca3a7291dcf9c810f,,,53699233a3a729239d203e69,5b4cb383c751df4acc48b254,5c16d8dd634f4166797a2f7d,,...,536995aba3a729239d2047ad,5bf42c958b4c4144b0110ce8,53ca2e62a3a7294a1ddd784b,5369a160a3a729239d2065bc,,5e166f7c06e3e76950ce8d52,59591bcba3a7291dcf9c8139,59591df1a3a7291dcf9c8176,5be055ac634f41318d7c18b7,5aec09dfc751df576c13dda8
8,5a097ae9c751df643bf1b500,5a22644f88ee3848529af925,5668697e88ee381d74af0bf4,5c37848906e3e77914d00e71,,,536998c8a3a729239d205051,5b4cb052c751df44a431418a,5808de39c751df1e0679df72,,...,5d0d24af634f411c05d9ca9b,5b51825fa3a72936dfe1c340,5dc0bc70dee7e71661936949,53699569a3a729239d2046eb,,59591afea3a7291dd09c80fb,53699058a3a729239d2039a2,582dbacdc751df5570c0bb7e,5889d042a3a72974c1f0d608,5d1b7c576f44417a5d88e05e
9,57092f51a3a72944fcc6ab40,54f63501c751df466f882844,5cd57bf68b4c4179299eb0e9,53699569a3a729239d2046eb,,,536991b0a3a729239d203d11,5c37849406e3e77917e6226a,536997ffa3a729239d204e36,,...,5e166f789ce2e71756a48bf1,5efa23afd85990251fc9a197,536995aba3a729239d2047ad,,,5bc0d6e5634f4142b4581861,58481d21c751df76c4c0bb7e,5c3538da9ce2e7459c6d765a,5e3aecb2634f4102f1094a7f,5bc6cf3a9ce2e72581422d3f


In [56]:
list(google_df['siren'])

['5b7ffc618b4c4169d30727e0',
 '5e719db34444483bd0a1a2b9',
 '58ef2cfda3a7293d49c4e17a',
 '58d27679a3a7297c157fe3ae',
 '5c37848906e3e77914d00e71',
 '5971fbe588ee38522976e8f0',
 '5dd651758b4c410aeb6582e6',
 '5369956ba3a729239d2046fc',
 '5a097ae9c751df643bf1b500',
 '57092f51a3a72944fcc6ab40',
 '54f63501c751df466f882844',
 '5530fbacc751df5ff937dddb',
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan]

In [74]:
def response_similarity(all_results, google_df):
    all_common=[]
    for i in range(len(all_results)):
        list1_as_set = set(all_results[i])
        intersection = list1_as_set.intersection(list(google_df.iloc[:,i]))
        intersection_as_list = list(intersection)
        all_common.append(intersection_as_list)
    return(all_common)

In [75]:
all_common=response_similarity(all_results, google_df)

In [77]:
nn=[]
for i in all_common:
    nn.append(len(i))

In [80]:
sum(nn)/len(nn)

1.763157894736842

#### en moyenne 1.7 resultats communs sur 20 premiers resultats mais bon parfois yen pas 20 (souvent)

In [57]:
list1_as_set = set(all_results[0])
intersection = list1_as_set.intersection(list(google_df['siren']))
intersection_as_list = list(intersection)
print(intersection_as_list)

['5971fbe588ee38522976e8f0', '5b7ffc618b4c4169d30727e0']


In [18]:
def first_found_pos(queries, ids):
    positions=[101]*len(queries)
    for i in range(len(queries)):
        results=opd.Search(queries[i],100)[0]
        k=0
        for r in results:
            k=k+1
            if r==ids[i]:
                positions[i]=k
    return positions

In [17]:
queries

array(['siren', 'sirene', 'entreprise', 'entreprises', 'siret',
       'open damir', 'opendamir', 'damir', 'contours départements',
       'emissions polluantes', 'géofla départements',
       'effectifs police municipale', 'marchés public bourgogne',
       'Liste gares SNCF', 'contours départements français',
       'loi de finance 2016', 'lolf 2016', 'formations pas de calais',
       'accidents de la circulation', 'accidents de la route',
       'risque de décès un an après accident', 'COG',
       'code officiel géographique', 'contour commune',
       'contours communes', 'contour communes', 'code postal',
       'codes postaux', 'prénoms', 'association', 'associations', 'RNA',
       'nan', 'répertoire des associations',
       'répertoire national des associations', 'waldec',
       'organismes de formation', 'organisme de formation',
       'bibliothèques', "annuaire de l'éducation", 'grand débat',
       'vie-publique répertoire'], dtype='<U36')

In [19]:
POSs=first_found_pos(queries, ids)

In [22]:
for (i,j) in zip(POSs, queries):
    print(i,j)

2 siren
2 sirene
101 entreprise
101 entreprises
9 siret
78 open damir
101 opendamir
101 damir
101 contours départements
32 emissions polluantes
1 géofla départements
3 effectifs police municipale
3 marchés public bourgogne
1 Liste gares SNCF
101 contours départements français
101 loi de finance 2016
101 lolf 2016
101 formations pas de calais
4 accidents de la circulation
5 accidents de la route
2 risque de décès un an après accident
11 COG
8 code officiel géographique
101 contour commune
101 contours communes
101 contour communes
3 code postal
3 codes postaux
89 prénoms
101 association
101 associations
1 RNA
101 nan
80 répertoire des associations
6 répertoire national des associations
101 waldec
49 organismes de formation
46 organisme de formation
101 bibliothèques
27 annuaire de l'éducation
7 grand débat
101 vie-publique répertoire


In [44]:
bench_score(queries, ids)

20.657894736842106
[1, 1, 101, 1, 101, 1, 101, 101, 16, 101, 101, 101, 101, 101, 101, 101, 101, 101, 101, 101, 101, 101, 101, 101, 101, 101, 101, 101, 101, 1, 1, 101, 1, 101, 101, 101, 101, 101]


20.657894736842106

In [47]:
classify_results(queries, first_found_pos(queries, ids))

([('siren', 1),
  ('sirene', 1),
  ('siret', 1),
  ('opendamir', 1),
  ('RNA', 1),
  ('répertoire des associations', 1),
  ('waldec', 1)],
 [('entreprises', 101),
  ('open damir', 101),
  ('contours départements', 101),
  ('emissions polluantes', 101),
  ('géofla départements', 16),
  ('effectifs police municipale', 101),
  ('marchés public bourgogne', 101),
  ('Liste gares SNCF', 101),
  ('contours départements français', 101),
  ('loi de finance 2016', 101),
  ('lolf 2016', 101),
  ('formations pas de calais', 101),
  ('accidents de la circulation', 101),
  ('accidents de la route', 101),
  ('risque de décès un an après accident', 101),
  ('COG', 101),
  ('code officiel géographique', 101),
  ('contour commune', 101),
  ('contours communes', 101),
  ('contour communes', 101),
  ('code postal', 101),
  ('codes postaux', 101),
  ('prénoms', 101),
  ('association', 101),
  ('associations', 101),
  ('répertoire national des associations', 101),
  ('organismes de formation', 101),
  ('org

In [43]:
def classify_results(queries, positions):
    acceptable=[]
    rejected=[]
    bof=[]
    for q in range(len(queries)):
        if positions[q]==100:
            rejected.append((queries[q], positions[q]))
        else:
            if positions[q]<=5:
                 acceptable.append((queries[q], positions[q]))
            else:
                bof.append((queries[q], positions[q]))
    return acceptable, bof, rejected

def bench_score(queries, ids):
    positions=first_found_pos(queries, ids)
    score=0
    for i in positions:
        score+=101-i
    score=score/len(queries)
    print(score)
    print(positions)
    return(score)

In [11]:
vecss=[1,2,3,4,5]

In [12]:
vecss[:-1]

[1, 2, 3, 4]

In [9]:
vecss.append(list(id2vec(get_id(48))))

In [21]:
n_ids, score=Search('agriculture biologique', 10)

In [23]:
for i in n_ids:
    print('Neighbour  ' + data_np[get_idx(i)]+ '\n')
    print(i)

Neighbour  agriculture biologique productions végétales surfaces par département. agriculture agriculture biologique developpement durable occupation du sol

5baaf08306e3e75ac77347e9
Neighbour  agriculture biologique productions végétales surfaces par département. agriculture agriculture biologique developpement durable occupation du sol

5b935d619ce2e71929bc02c2
Neighbour  agriculture biologique productions végétales surfaces par département. agriculture agriculture biologique developpement durable occupation du sol

59d45f70b59508043de9ba09
Neighbour  agriculture biologique productions végétales surfaces par département. agriculture agriculture biologique developpement durable occupation du sol

59d30df3a3a7291993b1eae0
Neighbour  agriculture biologique productions végétales surfaces par département. agriculture agriculture biologique developpement durable occupation du sol

59cf1a61b59508043de9b878
Neighbour  agriculture biologique productions végétales surfaces par département. agr

In [11]:
import pandas as pd
df=pd.read_csv('../../data/querys.csv', sep=',',error_bad_lines=False, encoding='latin-1') #le fichier original

In [12]:
data=pd.read_pickle("../../data/nodesc_clean.pkl")
data_np=data.to_numpy()

In [13]:
pca_space[get_idx('53ba4c07a3a729219b7bead3')]

array([-1.28434331, -3.45974299, -1.69202476,  3.02181656,  5.08455115,
       -1.23227441,  0.41482335, -1.52394594, -0.62698708, -0.88486056,
       -1.01096387, -1.25736276, -0.67990563, -0.82565908,  0.11153779,
        1.50026546,  1.65519552, -1.23502851,  1.79596791,  0.99266682,
       -0.2531902 ,  1.49539908, -0.4166633 ,  0.39676161,  3.80853666,
       -0.88179272,  0.73311649, -0.26577974, -0.89995632, -0.93473154])

In [14]:
df_array=np.array(df)

In [6]:
for i in range(len(df_array)):
    n_ids, score = Similarity(df_array[i][2], 10)
    print('\n'+df_array[i][0])
    for k in range(len(n_ids)):
        print('Neighbour n°: '+ str(k) + data_np[get_idx(n_ids[k])])


siren
Neighbour n°: 0arrêté du 15 septembre délibération crpmem normandie praires et amandes oc abrog 88 . autres coquillages cantonnement crpm manche normandie
Neighbour n°: 1maisons départementales des solidarités localisation. aide sociale mds solidarite
Neighbour n°: 2stationnements particuliers ville de lorient. deplacements duree limitee emplacement reserve livraison lorient personne mobilite reduite pmr stationnement taxi transports ville de lorient
Neighbour n°: 3données essentielles des marchés publics publiés sur marches publics.info aws . commande publique donnees essentielles
Neighbour n°: 4annuaire géolocalisé. administration adresse annuaire citoyennete courriel email finances publiques gouvernement mail telephone
Neighbour n°: 5température quotidienne régionale depuis janvier . developpement durable meteorologie region territoire territoires et regions
Neighbour n°: 6dette propre au 1er janvier de l'exercice. budget dette propre finances departementales territoire
Neigh


contours départements
Neighbour n°: 0accords mets et vins. alimentation cuisine gastronomie oenologie vin
Neighbour n°: 1contours géographiques des régions – . admin express france geospatial ign region territoires
Neighbour n°: 2périmètres des secteurs de recrutements des collèges publics de gironde. college conseil departemental gironde limites administratives
Neighbour n°: 3plan départemental des itinéraires de promenades et de randonnées pdipr . nan
Neighbour n°: 4résultats du premier tour de la primaire de la belle alliance. election parti socialiste presidentielle primaire raicaux
Neighbour n°: 5collecte des archives municipales. culture loisirs sports
Neighbour n°: 6economie zae grand poitiers données métiers. activite activite economique donnees ouvertes economique economy passerelle inspire zones
Neighbour n°: 7votre commerce roubaix. activites business commercant commerces developpement economique economie emploi hebergement label zero dechet magasin pme restaurant restaurat


contours départements français
Neighbour n°: 0accords mets et vins. alimentation cuisine gastronomie oenologie vin
Neighbour n°: 1contours géographiques des régions – . admin express france geospatial ign region territoires
Neighbour n°: 2périmètres des secteurs de recrutements des collèges publics de gironde. college conseil departemental gironde limites administratives
Neighbour n°: 3plan départemental des itinéraires de promenades et de randonnées pdipr . nan
Neighbour n°: 4résultats du premier tour de la primaire de la belle alliance. election parti socialiste presidentielle primaire raicaux
Neighbour n°: 5collecte des archives municipales. culture loisirs sports
Neighbour n°: 6economie zae grand poitiers données métiers. activite activite economique donnees ouvertes economique economy passerelle inspire zones
Neighbour n°: 7votre commerce roubaix. activites business commercant commerces developpement economique economie emploi hebergement label zero dechet magasin pme restaurant 

ValueError: '53699058a3a729239d2039a3' is not in list

In [38]:
scores=[]
positions=[]
for i in range(len(df_array)):
    n_ids, score = Search(df_array[i][0], 20)
    #print(n_ids)
    #print(df_array[i][2])
    for k in range(len(n_ids)):
        if n_ids[k]==df_array[i][2]:
            score=1-k/len(n_ids)
            pos=k
            print('FOUND :' + str(df_array[i][0])+ ' in pos '+ str(k))
            break
        else:
            score=0
            pos=k
            if k==len(n_ids)-1:
                print('DIDNT FIND:'+str(df_array[i][0]))
    scores.append(score)
    positions.append(k)

DIDNT FIND:siren
DIDNT FIND:sirene
DIDNT FIND:entreprise
DIDNT FIND:entreprises
DIDNT FIND:siret
DIDNT FIND:open damir
DIDNT FIND:opendamir
DIDNT FIND:damir
FOUND :contours départements in pos 6
DIDNT FIND:emissions polluantes
DIDNT FIND:géofla départements
FOUND :effectifs police municipale in pos 10
FOUND :marchés public bourgogne in pos 2
FOUND :Liste gares SNCF in pos 0
DIDNT FIND:contours départements français
FOUND :loi de finance 2016 in pos 9
DIDNT FIND:lolf 2016
DIDNT FIND:formations pas de calais
DIDNT FIND:accidents de la circulation
DIDNT FIND:accidents de la route
FOUND :risque de décès un an après accident in pos 2
DIDNT FIND:COG
FOUND :code officiel géographique in pos 0
DIDNT FIND:contour commune
DIDNT FIND:contours communes
DIDNT FIND:contour communes
FOUND :code postal in pos 7
DIDNT FIND:codes postaux
DIDNT FIND:prénoms
DIDNT FIND:association
DIDNT FIND:associations
DIDNT FIND:RNA
DIDNT FIND:nan
DIDNT FIND:répertoire des associations
DIDNT FIND:répertoire national de

In [52]:
df

Unnamed: 0,query,params,expected
0,siren,,5b7ffc618b4c4169d30727e0
1,sirene,,5b7ffc618b4c4169d30727e0
2,entreprise,,5b7ffc618b4c4169d30727e0
3,entreprises,,5b7ffc618b4c4169d30727e0
4,siret,,5b7ffc618b4c4169d30727e0
5,open damir,,54de1e8fc751df388646738b
6,opendamir,,54de1e8fc751df388646738b
7,damir,,54de1e8fc751df388646738b
8,contours départements,,536991b0a3a729239d203d13
9,emissions polluantes,,53ba4c07a3a729219b7bead3


In [10]:
scores

[0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0.94,
 0,
 0,
 0.9,
 0.98,
 1.0,
 0,
 0.91,
 0,
 0,
 0.55,
 0,
 0.98,
 0,
 1.0,
 0,
 0,
 0,
 0.9299999999999999,
 0,
 0,
 0.61,
 0,
 0,
 0,
 0,
 0.8,
 0,
 0,
 0,
 0,
 0,
 0.8200000000000001,
 0]

In [48]:
df=df.drop(17)

In [17]:
get_id(40989)

'53699489a3a729239d204488'