In [16]:
%matplotlib inline
#figsize(12, 8)

from sklearn import svm
from keras.utils import get_file
import os
import gensim
import numpy as np
import random
import requests
import geopandas as gpd
from IPython.core.pylabtools import figsize
import csv
import pandas as pd

In [7]:
MODEL = 'GoogleNews-vectors-negative300.bin'
unzipped = os.path.join('generated', MODEL)

In [8]:
if not os.path.isfile(unzipped):
    with open(unzipped, 'wb') as fout:
        zcat = subprocess.Popen(['zcat'],
                          stdin=open(path),
                          stdout=fout
                         )
        zcat.wait()

In [10]:
model = gensim.models.KeyedVectors.load_word2vec_format(unzipped, binary=True)

In [11]:
model.most_similar(positive=['Korea'])

[('South_Korea', 0.8255740404129028),
 ('Korean', 0.7428451180458069),
 ('South_Korean', 0.6742696762084961),
 ('Seoul', 0.6671160459518433),
 ('Japan', 0.6590375304222107),
 ('Korea_ROK', 0.6261441111564636),
 ('Koreans', 0.62441086769104),
 ('Pool_KOREA_OUT', 0.6176227927207947),
 ('Tourism_Organization_KTO', 0.6147845983505249),
 ('SEOUL_NORTH', 0.6106680631637573)]

In [61]:
countries=pd.read_csv('countries.csv')
positive=list(countries['name'])

<p>Since there are more than 3,000,000 words, it unlikey that there will be country names among randomly picked 5000words.</p>

In [26]:
negative=random.sample(model.vocab.keys(), 5000) 
negative[:10]

['cleated',
 'Lentigen_Corporation',
 'Offeror_wholly_owned',
 'jobboardtv@kwtx.com',
 'Sukenick',
 'birds_reptiles_amphibians',
 'Houghten',
 'www.facebook.com_IGT',
 'GDNF_gene',
 'Charles_Musyoki']

<p>Labelling Country and Non-country to 1 and 0 respectively</p>

In [27]:
labelled=[(pos_word, 1) for pos_word in positive]+[(neg_word,0) for neg_word in negative]
random.shuffle(labelled)
labelled[:10]

[('Canada', 1),
 ('Turkmenistan', 1),
 ('Ethiopia', 1),
 ('Swaziland', 1),
 ('Czech_Republic', 1),
 ('Cameroon', 1),
 ('UAE', 1),
 ('Liberia', 1),
 ('Netherlands', 1),
 ('East_Timor', 1)]

In [29]:
x=np.asarray([model[w] for w,i in labelled])
y=np.asarray([i for w,i in labelled])
x

array([[-0.13671875, -0.15429688,  0.26953125, ...,  0.02099609,
         0.28515625, -0.2578125 ],
       [-0.21875   ,  0.11035156,  0.02746582, ...,  0.265625  ,
         0.23339844,  0.34765625],
       [-0.02148438,  0.28125   ,  0.09619141, ..., -0.05517578,
         0.11523438,  0.21582031],
       ...,
       [ 0.02893066,  0.07763672,  0.09228516, ..., -0.04785156,
        -0.06030273,  0.17773438],
       [ 0.03564453,  0.22070312,  0.16601562, ..., -0.04443359,
        -0.38476562,  0.00052261],
       [-0.0859375 ,  0.09228516, -0.06591797, ...,  0.09082031,
        -0.15429688,  0.02148438]], dtype=float32)

<p>70% of data will be used as training data</p>

In [35]:
training_fraction=0.7 
cut_off=int(training_fraction*len(labelled))

<p>With given data, separating sections of positive and negative data</p>

In [32]:
clf=svm.SVC(kernel='linear')
clf.fit(x[:cut_off], y[:cut_off])

SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='linear',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)

<p>Making prediction on rest of 30% of data</p>

In [33]:
result=clf.predict(x[cut_off:])

In [37]:
missed=[country for (pred, truth, country) in zip(result, y[cut_off:], labelled[cut_off:]) if pred!=truth]

In [39]:
100-100*float(len(missed))/len(result), missed

(99.93573264781492, [('Venezuela', 0)])

In [43]:
all_predictions = clf.predict(model.syn0)
res=[]
for w,prediction in zip(model.index2word, all_predictions):
    if prediction:
        res.append(w)
        if len(res)==150:
            break
random.sample(res,50)

  """Entry point for launching an IPython kernel.


['Shanghai',
 'United_States',
 'Thailand',
 'Afghanistan',
 'Kansas',
 'UK',
 'Iraqi',
 'France',
 'Idaho',
 'Pennsylvania',
 'Netherlands',
 'Iowa',
 'America',
 'England',
 'Queensland',
 'Arkansas',
 'Middle_East',
 'Gaza',
 'Tennessee',
 'Dutch',
 'North_Carolina',
 'Sweden',
 'Tehran',
 'Texas',
 'Taiwan',
 'Nebraska',
 'Sri_Lanka',
 'Zimbabwe',
 'Hong_Kong',
 'Pakistan',
 'Massachusetts',
 'African',
 'China',
 'U.S.',
 'Oklahoma',
 'overseas',
 'Spain',
 'Alaska',
 'Europe',
 'Wales',
 'India',
 'Cuba',
 'EU',
 'Switzerland',
 'Korea',
 'Vermont',
 'Bangladesh',
 'Venezuela',
 'Brazil',
 'California']

<h3>Finding semantic distance among words</h3>

In [74]:
#country_to_idx={country['name']:idx for idx, country in enumerate(countries)}
country_to_idx={}
for idx in range(len(countries)):
    country_to_idx[countries.at[idx,'name']]=idx

In [81]:
country_vecs=np.asarray([model[countries.at[c,'name']]for c in range(len(countries))])

In [83]:
country_vecs.shape

(184, 300)

In [84]:
dists=np.dot(country_vecs, country_vecs[country_to_idx['Canada']])
for idx in reversed(np.argsort(dists)[-10:]):
    print(countries.at[idx, 'name'], dists[idx])

Canada 7.5440245
New_Zealand 3.9619699
Finland 3.9392405
Puerto_Rico 3.838145
Jamaica 3.8102934
Sweden 3.8042789
Slovakia 3.7038739
Australia 3.6711009
Bahamas 3.6240416
United_States 3.5374336
