In [3]:
import gzip
import os
import shutil

import gensim
from keras.utils import get_file

Using TensorFlow backend.


In [2]:
def load_model():
    """Download, decompress and load the GoogleNews word2vec vectors.

    Steps:
      1. Fetch the gzipped archive through ``get_file``.
      2. Decompress it once into ``downloads/`` (skipped when the
         decompressed file already exists).
      3. Load the decompressed binary file with gensim.

    Returns:
        The object returned by
        ``gensim.models.KeyedVectors.load_word2vec_format``.
    """
    filename = 'GoogleNews-vectors-negative300.bin'
    filename_zipped = filename + '.gz'
    url = 'https://s3.amazonaws.com/dl4j-distribution/' + filename_zipped
    target_dir = 'downloads'

    def download_file_from_url_unless_in_cache():
        print('downloading file: {} from url: {}'.format(filename_zipped, url) )
        return get_file(filename_zipped, url)

    model_file_zipped = download_file_from_url_unless_in_cache()
    model_file_unzipped = os.path.join(target_dir, filename)

    def unzip_file_if_not_yet_unzipped():
        if os.path.isfile(model_file_unzipped):
            # Already decompressed on an earlier run; nothing to do.
            return

        print('uncompressing to file: {} from file: {}'.format(model_file_unzipped, model_file_zipped))
        # The target directory does not exist on a fresh checkout.
        os.makedirs(target_dir, exist_ok=True)

        # Decompress into a temporary name and rename at the end, so that an
        # interrupted run never leaves a truncated file that the isfile()
        # check above would mistake for a complete one.
        partial_file = model_file_unzipped + '.part'
        try:
            with gzip.open(model_file_zipped, 'rb') as f_in, open(partial_file, 'wb') as f_out:
                shutil.copyfileobj(f_in, f_out)
            os.replace(partial_file, model_file_unzipped)
        except BaseException:
            if os.path.exists(partial_file):
                os.remove(partial_file)
            raise

    unzip_file_if_not_yet_unzipped()

    # Named differently from the enclosing function to avoid shadowing it.
    def load_vectors():
        print('loading model from file: {}'.format(model_file_unzipped))
        return gensim.models.KeyedVectors.load_word2vec_format(model_file_unzipped, binary=True)

    return load_vectors()



In [3]:
# Load the GoogleNews vectors; the analogy cells that follow query this
# notebook-global `model`.
model = load_model()

downloading file: GoogleNews-vectors-negative300.bin.gz from url: https://s3.amazonaws.com/dl4j-distribution/GoogleNews-vectors-negative300.bin.gz
uncompressing to file: downloads\GoogleNews-vectors-negative300.bin from file: C:\Users\fisch\.keras\datasets\GoogleNews-vectors-negative300.bin.gz
loading model from file: downloads\GoogleNews-vectors-negative300.bin


In [4]:
# Nearest neighbours of 'espresso'; the result is a list of (word, score) pairs.
model.most_similar('espresso')

[('cappuccino', 0.6888186931610107),
 ('mocha', 0.6686208844184875),
 ('coffee', 0.6616826057434082),
 ('latte', 0.6536753177642822),
 ('caramel_macchiato', 0.6491268873214722),
 ('ristretto', 0.6485545635223389),
 ('espressos', 0.6438628435134888),
 ('macchiato', 0.6428250074386597),
 ('chai_latte', 0.6308028697967529),
 ('espresso_cappuccino', 0.6280542612075806)]

In [5]:
# Analogy "A is to B as C is to X", written as vector arithmetic:
# man(A) - woman(B) = king(C) - X

# woman - man = X - king
# X = woman(B) + king(C) - man(A)
# X = B + C - A

In [6]:
# The three known terms of the analogy A : B :: C : X.
a, b, c = 'man', 'woman', 'king'

In [10]:
# X = B + C - A: B and C are the positive terms, A the negative one;
# only the single best match is requested.
model.most_similar(positive=(b,c), negative=(a,), topn=1)

[('queen', 0.7118192911148071)]

In [20]:
def A_is_to_B_as_C_is_to(a, b, c, topn=1, vectors=None):
    """Answer the analogy "a is to b as c is to ?" with word-vector arithmetic.

    Runs ``most_similar(positive=b + c, negative=a, topn=topn)``.

    Args:
        a, b, c: Each is either a single entry or a list of entries. Anything
            that is not a list is wrapped into a one-element list.
        topn: Number of candidates to request.
        vectors: Object providing ``most_similar(positive=, negative=, topn=)``.
            Defaults to the notebook-global ``model`` so existing calls keep
            working; pass it explicitly to avoid depending on whatever
            ``model`` currently refers to.

    Returns:
        ``None`` if there are no candidates, the word itself if there is
        exactly one candidate, otherwise a list of candidate words.
    """
    def to_list(x):
        return x if isinstance(x, list) else [x]

    a, b, c = map(to_list, (a, b, c))

    # Only fall back to the global when the caller did not supply vectors.
    kv = model if vectors is None else vectors
    topn_x = kv.most_similar(positive=b + c, negative=a, topn=topn)

    if len(topn_x) == 0:
        return None
    if len(topn_x) == 1:
        return topn_x[0][0]
    return [word for word, _score in topn_x]

In [21]:
# man : woman :: king : ?  (same query as the direct most_similar call above)
A_is_to_B_as_C_is_to('man', 'woman', 'king')

'queen'

In [25]:
# dog : cat :: man : ?
A_is_to_B_as_C_is_to('dog', 'cat', 'man')

'woman'

In [46]:
# Country -> capital analogies, using Germany -> Berlin as the reference pair.
for country in ('Canada', 'France','Ukraine','Italy'):
    capital = A_is_to_B_as_C_is_to('Germany', 'Berlin', country)
    print('{} is the capital of {}'.format(capital, country))

Ottawa is the capital of Canada
Paris is the capital of France
Kiev is the capital of Ukraine
Rome is the capital of Italy


In [69]:
# Company -> product analogies. Two reference pairs are combined
# (Starbucks -> Starbucks_coffee, Apple -> iPhone) and the three best
# candidates are printed per company.
for company in 'Google', 'IBM','Boeing','Microsoft', 'Samsung', 'VW', 'Mafia':
    products = A_is_to_B_as_C_is_to(['Starbucks','Apple'], ['Starbucks_coffee','iPhone'], company, topn=3)
    print('The product of {} is {}'.format(company, products))

The product of Google is ['personalized_homepage', 'app', 'Gmail']
The product of IBM is ['DB2', 'WebSphere_Portal', 'Tamino_XML_Server']
The product of Boeing is ['Dreamliner', 'airframe', 'aircraft']
The product of Microsoft is ['Windows_Mobile', 'SyncMate', 'Windows']
The product of Samsung is ['MM_A###', 'handset', 'Samsung_SCH_B###']
The product of VW is ['Dacia_Logan', 'Volkswagen', 'roadster']
The product of Mafia is ['mafioso', 'mafia', 'boss_Sam_Giancana']


In [82]:
# Putin : Yeltsin :: Obama : ?  -- request the eight best candidates.
A_is_to_B_as_C_is_to('Putin','Yeltsin','Obama', topn=8)

['Barack_Obama',
 'Barack',
 'Clinton',
 'President_Barack_Obama',
 'Illinois_senator',
 'McCain',
 'Biden',
 'Bill_Clinton']

In [None]:
# Download page of the German word2vec model used below: http://cloud.devmount.de/d2bc5672c523b086

In [80]:
def load_german_model():
    """Fetch the German word2vec model via ``get_file`` and load it with gensim.

    Returns:
        The object returned by
        ``gensim.models.KeyedVectors.load_word2vec_format`` (binary format).
    """
    url = 'http://cloud.devmount.de/d2bc5672c523b086/german.model'
    filename = 'word2vec_german_devmount.model'

    # Step 1: obtain the local path of the model file.
    print('downloading file: {} from url: {}'.format(filename, url) )
    model_file = get_file(filename, url)

    # Step 2: read the binary word2vec file.
    print('loading model from file: {}'.format(model_file))
    german_vectors = gensim.models.KeyedVectors.load_word2vec_format(model_file, binary=True)
    return german_vectors



In [81]:
# Load the German vectors; kept in `german` for the direct queries further down.
german = load_german_model()

downloading file: word2vec_german_devmount.model from url: http://cloud.devmount.de/d2bc5672c523b086/german.model
Downloading data from http://cloud.devmount.de/d2bc5672c523b086/german.model
loading model from file: C:\Users\fisch\.keras\datasets\word2vec_german_devmount.model


In [82]:
class Model:
    """Thin wrapper that runs analogy queries against one set of word vectors."""

    def __init__(self, model):
        """Keep a reference to the vectors object.

        Args:
            model: Object providing
                ``most_similar(positive=, negative=, topn=)``.
        """
        self.model = model

    def A_is_to_B_as_C_is_to(self, a, b, c, topn=1):
        """Answer the analogy "a is to b as c is to ?".

        Runs ``self.model.most_similar(positive=b + c, negative=a, topn=topn)``.

        Args:
            a, b, c: Each is either a single entry or a list of entries.
                Anything that is not a list is wrapped into a one-element list.
            topn: Number of candidates to request.

        Returns:
            ``None`` if there are no candidates, the word itself if there is
            exactly one candidate, otherwise a list of candidate words.
        """
        def to_list(x):
            return x if isinstance(x, list) else [x]

        a, b, c = map(to_list, (a, b, c))

        topn_x = self.model.most_similar(positive=b + c, negative=a, topn=topn)
        if len(topn_x) == 0:
            return None
        if len(topn_x) == 1:
            return topn_x[0][0]
        return [word for word, _score in topn_x]

In [83]:
# NOTE(review): this rebinds the notebook-global `model` (until here the
# object returned by load_model()) to a `Model` wrapper around the German
# vectors. The module-level A_is_to_B_as_C_is_to() calls `.most_similar` on
# that global, which `Model` does not define, so the earlier analogy cells
# cannot be re-run after this cell unless `model = load_model()` is run again.
model = Model(german)

In [84]:
# Haus : Garten :: Mann : ?
model.A_is_to_B_as_C_is_to('Haus', 'Garten', 'Mann')

'Bursche'

In [93]:
# Country -> leader analogies, using Deutschland -> Angela_Merkel as the
# reference pair.
# NOTE(review): the printed label says "Staatsoberhaupt" (head of state) while
# the reference pair appears to name the chancellor -- confirm the intended wording.
for land in ('Kanada', 'Frankreich','Ukraine','Italien'):
    staatsoberhaupt = model.A_is_to_B_as_C_is_to('Deutschland', 'Angela_Merkel', land)
    print('{} ist das Staatsoberhaupt von {}'.format(staatsoberhaupt, land))

Kanzlerin ist das Staatsoberhaupt von Kanada
Merkel ist das Staatsoberhaupt von Frankreich
Janukowitsch ist das Staatsoberhaupt von Ukraine
Monti ist das Staatsoberhaupt von Italien


In [129]:
# (Frau, Maedchen, Oma) : (Mann, Junge, Opa) :: Hebamme : ?
model.A_is_to_B_as_C_is_to(['Frau','Maedchen','Oma'], ['Mann','Junge','Opa'], ['Hebamme'])

'erfahrener'

In [134]:
# Ask the German vectors which of the three words does not fit with the others.
german.doesnt_match(['Mann','Bulle','Kueken'])

  vectors = vstack(self.word_vec(word, use_norm=True) for word in used_words).astype(REAL)


'Kueken'

In [136]:
# Nearest neighbours of 'Hebamme' in the German vectors.
german.similar_by_word('Hebamme')

[('Aerztin', 0.7699185609817505),
 ('Krankenschwester', 0.7651543021202087),
 ('Kinderkrankenschwester', 0.7281628251075745),
 ('Frauenaerztin', 0.7241747379302979),
 ('Kinderaerztin', 0.7229164838790894),
 ('Hausaerztin', 0.720133364200592),
 ('Ergotherapeutin', 0.7135937213897705),
 ('Zahnaerztin', 0.70682692527771),
 ('Krankenpflegerin', 0.7021933197975159),
 ('Arzthelferin', 0.7011792659759521)]

In [172]:
# Nearest neighbours of 'DATEV' in the German vectors.
german.most_similar('DATEV')

[('Nemetschek_AG', 0.6830635070800781),
 ('Spedition_Logistik', 0.6829559803009033),
 ('herstellerunabhaengige', 0.6825460195541382),
 ('IT-Dienstleisters', 0.6813675165176392),
 ('Electronic_Banking', 0.6805779337882996),
 ('bundesweit_taetigen', 0.6782735586166382),
 ('Softwareanbieter', 0.6772146821022034),
 ('Renewables', 0.6726152896881104),
 ('Dienstleistungen_Bereichen', 0.6687685251235962),
 ('MCE', 0.6681452989578247)]

In [178]:
# Company -> product analogies on the German vectors, five candidates each.
# NOTE(review): 'Buecher' is part of the A list here, so it is subtracted
# together with 'Opel' and 'Suhrkamp'. The English product cell pairs its
# lists as [companies], [products]; if the same layout was intended this would
# be ['Opel','Suhrkamp'], ['Auto','Buecher'] -- confirm before relying on the output.
for firma in 'Siemens', 'BMW','Airbus','Telekom', 'DATEV':
    produkte = model.A_is_to_B_as_C_is_to(['Opel','Suhrkamp','Buecher'], ['Auto'], firma, topn=5)
    print('{} produziert {}'.format(firma, produkte))

Siemens produziert ['Bruckneudorf', 'M17', 'Messfahrzeug', 'S31', 'Raststation']
BMW produziert ['Quad', 'Motorrad', 'Fahrzeug', 'Moped', 'Leitschiene']
Airbus produziert ['Flugzeug', 'F-4', 'Super_Puma', 'Lande-', 'Start_Landung']
Telekom produziert ['Mobilfunknetz', 'Basisstation', 'Relaisstation', 'Bruckneudorf', 'Telefonnetz']
DATEV produziert ['Emergency_Response', 'IPS', 'Lage-', 'Messfahrzeug', 'Operations_Center']


In [191]:
# Scratch check: `in` between two strings is a substring test.
"ha" in "aha"

True

In [183]:
# Materialise the vocabulary keys of the German vectors as a plain list.
# NOTE(review): `.vocab` is the pre-4.0 gensim attribute; newer releases expose
# `key_to_index` instead -- confirm the installed gensim version before re-running.
worte = list(german.vocab.keys())

In [192]:
 # Every vocabulary entry that contains the substring "DATEV".
 [w for w in worte if "DATEV" in w]

['DATEV']

In [205]:
# Tarantino : Riefenstahl :: Beckenbauer : ?
model.A_is_to_B_as_C_is_to('Tarantino','Riefenstahl', 'Beckenbauer')

'Franz_Beckenbauer'