# Script to find the words of a corpus that are unknown to Kam4D

**Author**: Diane Marquette <br>
**Date created**: 20/10/2020 <br>
**Date last modified**: 10/11/2020 <br>
**Python Version**: 3.7.7

In [6]:
import subprocess

import nltk
from py2neo import Graph

/!\ [Verbiste](http://perso.b2b2c.ca/~sarrazip/dev/verbiste.html) must be downloaded and installed on your computer. This script runs the <code>french-deconjugator</code> command provided by that package, so the command must be available on your <code>PATH</code>.

### Connect to the database

In [7]:
# create the py2neo Graph handle used to query our Neo4j database
NEO4J_URI = "http://localhost:7474/"
kam4d = Graph(NEO4J_URI)

In [8]:
# command to display the query results in a table
# kam4d.run("MATCH (s:Smurf{spelling:'ragondin'})-[:WRITTEN_IN]->(l:Language {code:'FRA'}) RETURN s,l").to_table()

### Import and preprocess the corpus

In [9]:
# location of the corpus index on disk
data_folder = '/Users/dianemarquette/Downloads/'
corpus_file = 'Fre.Freq.2.txt'

# read the index, skipping the 1st line (columns headers)
with open(data_folder + corpus_file, encoding='utf8') as f:
    next(f, None)
    corpus_index = f.readlines()

# one remaining line per distinct word of the corpus
print(f"Number of words in the corpus: {len(corpus_index)}")

Number of words in the corpus: 767872


In [10]:
# display the 1st entry of the corpus index (the columns headers line is already dropped):
# a word followed by its tab-separated numeric columns
print(corpus_index[0])

de	1483286	44995.0859	514548	58.4279	940789	33983.8295	723168	35.7325	1761326	51479.4279	510707	78.7500



In [11]:
# strip leading/trailing whitespace (including the end-of-line character) from every entry
original_vocab = list(map(str.strip, corpus_index))
original_vocab[0]

'de\t1483286\t44995.0859\t514548\t58.4279\t940789\t33983.8295\t723168\t35.7325\t1761326\t51479.4279\t510707\t78.7500'

In [12]:
# function to tokenize a document
def tokenize(text):
    '''
     Tokenize a document with NLTK and lowercase every token
     :parameters:
         - text -> string to tokenize
     :return:
         - string made of the lowercased tokens separated by single spaces
     '''
    lowered_tokens = (token.lower() for token in nltk.word_tokenize(text))
    return " ".join(lowered_tokens)

In [13]:
# tokenize each line of the corpus index and turn it into a list of tokens
corpus_vocab = [tokenized.split() for tokenized in map(tokenize, original_vocab)]
corpus_vocab[0]

['de',
 '1483286',
 '44995.0859',
 '514548',
 '58.4279',
 '940789',
 '33983.8295',
 '723168',
 '35.7325',
 '1761326',
 '51479.4279',
 '510707',
 '78.7500']

In [14]:
# flatten the tokenized lines and only keep the purely alphabetic tokens (i.e. the words)
corpus_words = [token for line_tokens in corpus_vocab for token in filter(str.isalpha, line_tokens)]
corpus_words[0:10]

['de', 'la', 'et', 'le', 'à', 'l', 'les', 'un', 'en', 'd']

### Find words from the corpus that are unknown to the database

In [15]:
def query_word(language_id, database, word):
    '''
     Check if a word exists in the database
     :parameters:
         - language_id -> string such as 'FRA' for French or 'SPA' for Spanish
         - database -> a Neo4j graph database instance
         - word -> string that we want to check
     :return:
         - first value from the first record returned or None (if the word doesn't exist in the database)
     '''

    # The values are sent as Cypher parameters instead of being formatted into
    # the query text: a word containing a quote (e.g. "aujourd'hui") would
    # otherwise produce an invalid query, and arbitrary Cypher could be injected.
    cypher = (
        "MATCH (s:Smurf {lemma_accent: $word})"
        "-[:WRITTEN_IN]->(l:Language {code: $language_id}) "
        "RETURN s,l"
    )
    db_matching = database.run(cypher, {"word": word, "language_id": language_id})

    return db_matching.evaluate()

In [16]:
def _first_infinitive(word):
    '''
     Ask Verbiste's french-deconjugator command for the infinitive of a word
     :parameters:
         - word -> string that may be a conjugated French verb
     :return:
         - lowercased first token of the first non-empty line printed by the command,
           or None when the command prints nothing but blank lines
     :raises:
         - FileNotFoundError if the french-deconjugator command cannot be found
     '''

    # argument list and no shell: the word is never interpreted by a shell
    completed = subprocess.run(
        ['french-deconjugator', word],
        stdout=subprocess.PIPE,
        universal_newlines=True,
        errors='replace',
    )

    # only the first result is used, even if the command proposes several verbs
    for line in completed.stdout.splitlines():
        tokens = tokenize(line).split()
        if tokens:
            return tokens[0]

    return None


def detect_unknown_words(language_id, database, corpus_words, limit=100):
    '''
     Detect words from a corpus that don't exist in the database (conjugated verbs are converted into their infinitive form using Verbiste)
     :parameters:
         - language_id -> string such as 'FRA' for French or 'SPA' for Spanish
                          (NOTE: the french-deconjugator command is run whatever the language)
         - database -> a Neo4j graph database instance
         - corpus_words -> list of strings corresponding to all the distinct words appearing in a corpus
         - limit -> only the first `limit` words of corpus_words are checked (default: 100);
                    pass None to check every word
     :return:
         - unknown_words -> list of words from the corpus that don't exist in the database
     '''

    unknown_words = []

    for word in corpus_words[:limit]:

        # the word is already known as-is: nothing to report
        if query_word(language_id, database, word) is not None:
            continue

        # otherwise it may be a conjugated verb whose infinitive is known
        infinitive = _first_infinitive(word)

        if infinitive is None or query_word(language_id, database, infinitive) is None:
            # neither the word nor its infinitive (if any) exists in the database
            unknown_words.append(word)

    return unknown_words

In [17]:
# list the words of the French corpus that Kam4D doesn't know
unknown_words = detect_unknown_words(language_id='FRA', database=kam4d, corpus_words=corpus_words)
unknown_words

['de',
 'et',
 'le',
 'à',
 'l',
 'les',
 'en',
 'd',
 'des',
 'que',
 'une',
 'qui',
 'je',
 'du',
 'il',
 'dans',
 'ce',
 'qu',
 'ne',
 'au',
 'sur',
 'j',
 'c',
 'n',
 'on',
 'mais',
 'se',
 'avec',
 's',
 'nous',
 'vous',
 'ou',
 'elle',
 'me',
 'cette',
 'sa',
 'mon',
 'aux',
 'ça',
 'ils',
 'm',
 'ses',
 'ces',
 'leur',
 'sans',
 'ma',
 'donc',
 'tous',
 'où',
 'alors',
 'quand',
 'moi',
 'autres',
 'mes',
 'fois',
 'notre']