# Importing the Data

In [1]:
import os
import pandas as pd

In [2]:
# Creates the dictionary to store the texts and other information
def create_dict():
    data_dict = {'filename':[], 'year':[], 'dir':[], 'text':[]}
    
    for subdir, dirs, files in os.walk('text_files'):
        for f in files:
            data_dict['filename'].append(f)
            data_dict['year'].append(subdir[-4:])
            data_dict['dir'].append(os.path.join(subdir, f))
        
    return data_dict

# Reads the text files and storing in the dictionary
def read_txt(data_dict):
    for dirs in data_dict['dir']:
        with open(dirs, 'r') as f:
            text = f.read()
        data_dict['text'].append(text)
    
    return data_dict

data_dict = create_dict()
data_dict = read_txt(data_dict)
data = pd.DataFrame(data_dict)

data.head()

Unnamed: 0,filename,year,dir,text
0,mart-2015.txt,2015,/home/emrecan/kave/Final/text_files/2015/mart-...,Bilim\nTeknikve\n\nAylık Popüler Bilim Dergi...
1,ocak-2015.txt,2015,/home/emrecan/kave/Final/text_files/2015/ocak-...,Bilim\nTeknikve\n\nAylık Popüler Bilim Dergi...
2,nisan-2015.txt,2015,/home/emrecan/kave/Final/text_files/2015/nisan...,Bilim\nTeknikve\n\nAylık Popüler Bilim Dergi...
3,temmuz-2015.txt,2015,/home/emrecan/kave/Final/text_files/2015/temmu...,Bilim\nTeknikve\n\nAylık Popüler Bilim Dergi...
4,mayis-2015.txt,2015,/home/emrecan/kave/Final/text_files/2015/mayis...,Bilim\nTeknikve\n\nAylık Popüler Bilim Dergi...


# Preprocessing

- Fixing the Turkish characters*
- Lowercase
- Stemming*
- Tokenization
- Removing stopwords


In [3]:
from nltk.tokenize import sent_tokenize, word_tokenize
from TurkishStemmer import TurkishStemmer
from nltk.corpus import stopwords
from tqdm import tqdm as tqdm

import string
import time
import nltk
import re

In [4]:
# pdfminer3 did not recognize Turkish characters
# and the documentation is not well written
# manually fixing them
def preprocess(text):
    text = text.str.replace('‹', 'İ')
    text = text.str.replace('¤', 'ğ')
    text = text.str.replace('›', 'ı')
    text = text.str.replace('ﬂ', 'ş')
    text = text.str.replace('ß', 'ş')
    text = text.str.replace('(cid:159)', 'ü')
    text = text.str.replace('(cid:221)', 'ı')
    text = text.str.replace('(cid:223)', 'ş')
    text = text.str.replace('(cid:141)', 'ç')
    text = text.str.replace('(cid:154)', 'ö')
    text = text.str.replace('(cid:219)', 'ğ')
    text = text.str.replace('(cid:222)', 'Ş')
    text = text.str.replace('(cid:220)', 'İ')
    text = text.str.replace('(cid:133)', 'Ö')
    text = text.str.replace('(cid:213)', "'")
    text = text.str.replace('(cid:134)', 'Ü')
    text = text.str.replace('(cid:130)', 'Ç')
    text = text.str.replace('(cid:212)', '')
    text = text.str.replace('(cid:210)', '')
    text = text.str.replace('(cid:211)', '')
    text = text.str.replace('(cid:201)', '')
    text = text.str.replace('(cid:158)', 'ü')
    text = text.str.replace('Ý', 'ı')
    text = text.str.replace('Û', 'ğ')
    text = text.str.replace('Õ', "'")
    text = text.str.replace('-\n', '')
    text = text.str.replace('\n', ' ')
    text = text.str.replace('\x02', ' ')
    text = text.str.replace('\x0c', ' ')
    text = text.str.lower() # Converting to lowercase
    text = text.str.replace('doç.', 'doç')
    text = text.str.replace('dr.', 'dr')
    text = text.str.replace('prof.', 'prof')
    text = text.str.replace('yrd.', 'yrd')
    text = text.str.replace('i̇', 'i')
    
    return text

data['text'] = preprocess(data['text'])

In [5]:
"""
# Stemming
stemmer = TurkishStemmer()

stemmed_lists = []
for index in tqdm(data.index):
    mini_l = []
    for text in data.loc[index]['text'].split(" "):
        mini_l.append(stemmer.stem(text))
        
    big_text = " "
    for char in mini_l:
        big_text = big_text + " " + char
    stemmed_lists.append(big_text)

data['stemmed'] = stemmed_lists
"""

data.head()

Unnamed: 0,filename,year,dir,text
0,mart-2015.txt,2015,/home/emrecan/kave/Final/text_files/2015/mart-...,bilim teknikve aylık popüler bilim dergisi ...
1,ocak-2015.txt,2015,/home/emrecan/kave/Final/text_files/2015/ocak-...,bilim teknikve aylık popüler bilim dergisi ...
2,nisan-2015.txt,2015,/home/emrecan/kave/Final/text_files/2015/nisan...,bilim teknikve aylık popüler bilim dergisi ...
3,temmuz-2015.txt,2015,/home/emrecan/kave/Final/text_files/2015/temmu...,bilim teknikve aylık popüler bilim dergisi ...
4,mayis-2015.txt,2015,/home/emrecan/kave/Final/text_files/2015/mayis...,bilim teknikve aylık popüler bilim dergisi ...


In [6]:
# Stopwords
def read_stopwords():
    sw = stopwords.words('turkish')
    with open('turkish-stopwords.txt') as f:
        text = f.read()
        sw.extend(text.split())
        sw.extend([*string.punctuation])
    return list(set(sw))

# Sentence extraction
def extract_sentences(data):
    sent_tokenizer = nltk.tokenize.PunktSentenceTokenizer()
    sentence_all = []
    
    print('Extracting sentences.')
    time.sleep(1)
    
    for text in tqdm(data['text'].tolist()):
        s_list = sent_tokenizer.tokenize(text)
        sentence_all.append(s_list)
    
    data['sentence'] = sentence_all

    return data

# Tokenizer
# https://github.com/apdullahyayik/Turkish-Word-Tokenizer/blob/master/word_tokenize.py
def word_tokenize_turkish(sentence, sw):
    """
    Args:
        sentence (str): any sentence.
    Returns:
        list: each item is a word.
    """
    
    
    acronym_each_dot = r"(?:[a-zğçşöüı]\.){2,}"
    acronym_end_dot = r"\b[a-zğçşöüı]{2,3}\."
    suffixes = r"[a-zğçşöüıi̇]{3,}' ?[a-zğçşöüıi̇]{0,3}"
    numbers = r"\d+[.,:\d]+"
    any_word = r"[a-zğçşöüıi̇]+"
    punctuations = r"[a-zğçşöüıi̇]*[.,!?;:]"
    word_regex = "|".join([acronym_each_dot,
                           acronym_end_dot,
                           suffixes,
                           numbers,
                           any_word,
                           punctuations])
    tokens = re.compile("%s"%word_regex, re.I).findall(sentence)
    filtered_tokens = []

    for token in tokens:
        if token not in sw:
            if len(token) != 1:
                filtered_tokens.append(token)

    
    return filtered_tokens 

# Tokenizes all sentences
def tokenize_sentences(data, sw):
    token_list_all = []
    sw = read_stopwords()
    
    print('Tokenizing sentences.')
    time.sleep(1)
    
    for sentence_list in tqdm(data['sentence'].tolist()):
        token_list_temp = []
        for sentence in sentence_list:
            token = word_tokenize_turkish(sentence, sw)
            token_list_temp.append(token)
        token_list_all.append(token_list_temp)

    data['sentence_tokenized'] = token_list_all  
    
    return data

In [7]:
sw = read_stopwords()

data = extract_sentences(data)
data = tokenize_sentences(data, sw)

Extracting sentences.


100%|██████████| 257/257 [00:31<00:00,  8.09it/s]


Tokenizing sentences.


100%|██████████| 257/257 [01:29<00:00,  2.87it/s]


In [8]:
data.head()

Unnamed: 0,filename,year,dir,text,sentence,sentence_tokenized
0,mart-2015.txt,2015,/home/emrecan/kave/Final/text_files/2015/mart-...,bilim teknikve aylık popüler bilim dergisi ...,[ bilim teknikve aylık popüler bilim dergisi...,"[[bilim, teknikve, aylık, popüler, bilim, derg..."
1,ocak-2015.txt,2015,/home/emrecan/kave/Final/text_files/2015/ocak-...,bilim teknikve aylık popüler bilim dergisi ...,[ bilim teknikve aylık popüler bilim dergisi...,"[[bilim, teknikve, aylık, popüler, bilim, derg..."
2,nisan-2015.txt,2015,/home/emrecan/kave/Final/text_files/2015/nisan...,bilim teknikve aylık popüler bilim dergisi ...,[ bilim teknikve aylık popüler bilim dergisi...,"[[bilim, teknikve, aylık, popüler, bilim, derg..."
3,temmuz-2015.txt,2015,/home/emrecan/kave/Final/text_files/2015/temmu...,bilim teknikve aylık popüler bilim dergisi ...,[ bilim teknikve aylık popüler bilim dergisi...,"[[bilim, teknikve, aylık, popüler, bilim, derg..."
4,mayis-2015.txt,2015,/home/emrecan/kave/Final/text_files/2015/mayis...,bilim teknikve aylık popüler bilim dergisi ...,[ bilim teknikve aylık popüler bilim dergisi...,"[[bilim, teknikve, aylık, popüler, bilim, derg..."


In [9]:
# Ran into an error when converting years to int
data.loc[(data['year'] == 'ints')]

Unnamed: 0,filename,year,dir,text,sentence,sentence_tokenized
24,agustos-1999-checkpoint.txt,ints,/home/emrecan/kave/Final/text_files/1999/.ipyn...,ii. ulusal gökyüzü gözlem şenliği’ne doğru......,"[ ii., ulusal gökyüzü gözlem şenliği’ne doğru....","[[ii.], [ulusal, gökyüzü, gözlem, şenliği, doğ..."
208,aralik-2004-checkpoint.txt,ints,/home/emrecan/kave/Final/text_files/2004/.ipyn...,a y l i k p o p ü l e r b ...,[ a y l i k p o p ü l e r b ...,"[[bilim, veteknik, nevi, mirasım, ilim, akıldı..."


In [10]:
# Fixing manually
data.loc[208, 'filename'] = 'aralik-2004.txt'
data.loc[208, 'year'] = 2004
data.loc[208, 'dir'] = '/home/emrecan/kave/Final/text_files/2004/aralik-2004.txt'

data.loc[24, 'filename'] = 'agustos-1999.txt'
data.loc[24, 'year'] = 1999
data.loc[24, 'dir'] = '/home/emrecan/kave/Final/text_files/1999/agustos-1999.txt'

data['year'] = data['year'].astype('int64') # Convert year values to numerical
data.sort_values(by='year',inplace=True) # Sort by year

# Model Training

## Seperating Time Intervals

In [11]:
data_99_09 = data.loc[(data['year'] >= 1999) & (data['year'] <= 2009)] # Extract a time interval
data_10_20 = data.loc[(data['year'] >= 2010) & (data['year'] <= 2020)] # Extract a time interval

In [12]:
data_99_09

Unnamed: 0,filename,year,dir,text,sentence,sentence_tokenized
23,temmuz-1999.txt,1999,/home/emrecan/kave/Final/text_files/1999/temmu...,selçuk alsan - raşit gürdilek atomaltı dünya...,[ selçuk alsan - raşit gürdilek atomaltı düny...,"[[selçuk, alsan, raşit, gürdilek, atomaltı, dü..."
24,agustos-1999.txt,1999,/home/emrecan/kave/Final/text_files/1999/agust...,ii. ulusal gökyüzü gözlem şenliği’ne doğru......,"[ ii., ulusal gökyüzü gözlem şenliği’ne doğru....","[[ii.], [ulusal, gökyüzü, gözlem, şenliği, doğ..."
22,eylul-1999.txt,1999,/home/emrecan/kave/Final/text_files/1999/eylul...,dünya’da binlerce insan depremler yüzünde...,[ dünya’da binlerce insan depremler yüzünd...,"[[dünya, binlerce, insan, depremler, yüzünden,..."
21,nisan-1999.txt,1999,/home/emrecan/kave/Final/text_files/1999/nisan...,bilim ve teknoloji haberleri selçuk alsan - ...,[ bilim ve teknoloji haberleri selçuk alsan -...,"[[bilim, teknoloji, haberleri, selçuk, alsan, ..."
20,ekim-1999.txt,1999,/home/emrecan/kave/Final/text_files/1999/ekim-...,h l a e b r e i r r a ş i t g ü r d i...,[ h l a e b r e i r r a ş i t g ü r d ...,"[[nasa, mars, sondasını, yitirdi, yanlışlığın,..."
...,...,...,...,...,...,...
168,subat-2009.txt,2009,/home/emrecan/kave/Final/text_files/2009/subat...,aylık popüler bilim dergisi şubat 2009 yıl 4...,[aylık popüler bilim dergisi şubat 2009 yıl ...,"[[aylık, popüler, bilim, dergisi, şubat, 2009,..."
169,kasim-2009.txt,2009,/home/emrecan/kave/Final/text_files/2009/kasim...,b i l i m v e t e k n i k ...,[b i l i m v e t e k n i k ...,"[[gökadalar, posteri, derginizle, birlikte, bi..."
170,mayis-2009.txt,2009,/home/emrecan/kave/Final/text_files/2009/mayis...,aylık popüler bilim dergisi mayıs 2009 yıl 4...,[aylık popüler bilim dergisi mayıs 2009 yıl ...,"[[aylık, popüler, bilim, dergisi, mayıs, 2009,..."
161,eylul-2009.txt,2009,/home/emrecan/kave/Final/text_files/2009/eylul...,aylık popüler bilim dergisi eylül 2009 yıl 4...,[aylık popüler bilim dergisi eylül 2009 yıl ...,"[[aylık, popüler, bilim, dergisi, eylül, 2009,..."


In [13]:
data_10_20

Unnamed: 0,filename,year,dir,text,sentence,sentence_tokenized
187,kasim-2010.txt,2010,/home/emrecan/kave/Final/text_files/2010/kasim...,b i l i m v e t e k n i k ...,[b i l i m v e t e k n i k ...,"[[türkiye, biyoçeşitliliği, posteri, derginizl..."
188,haziran-2010.txt,2010,/home/emrecan/kave/Final/text_files/2010/hazir...,“kök hücreler” posteri derginizle birlikte... ...,[“kök hücreler” posteri derginizle birlikte......,"[[kök, hücreler, posteri, derginizle, birlikte..."
189,temmuz-2010.txt,2010,/home/emrecan/kave/Final/text_files/2010/temmu...,“türkiye’nin jeolojik oluşumları” posteri derg...,[“türkiye’nin jeolojik oluşumları” posteri der...,"[[türkiye, jeolojik, oluşumları, posteri, derg..."
186,agustos-2010.txt,2010,/home/emrecan/kave/Final/text_files/2010/agust...,aylık popüler bilim dergisi ağustos 2010 yıl...,[aylık popüler bilim dergisi ağustos 2010 yı...,"[[aylık, popüler, bilim, dergisi, ağustos, 201..."
185,mayis-2010.txt,2010,/home/emrecan/kave/Final/text_files/2010/mayis...,türkiye’den bilimcilerin gözüyle ve katkılarıy...,[türkiye’den bilimcilerin gözüyle ve katkıları...,"[[türkiye, den, bilimcilerin, gözüyle, katkıla..."
...,...,...,...,...,...,...
152,ocak-2019.txt,2019,/home/emrecan/kave/Final/text_files/2019/ocak-...,aylık popüler bilim dergisi ocak 2019 yıl 52...,[aylık popüler bilim dergisi ocak 2019 yıl 5...,"[[aylık, popüler, bilim, dergisi, ocak, 2019, ..."
158,kasim-2019.txt,2019,/home/emrecan/kave/Final/text_files/2019/kasim...,aylık popüler bilim dergisi kasım 2019 yıl 5...,[aylık popüler bilim dergisi kasım 2019 yıl ...,"[[aylık, popüler, bilim, dergisi, kasım, 2019,..."
75,mart-2020.txt,2020,/home/emrecan/kave/Final/text_files/2020/mart-...,aylık popüler bilim dergisi mart 2020 yıl 53...,[aylık popüler bilim dergisi mart 2020 yıl 5...,"[[aylık, popüler, bilim, dergisi, mart, 2020, ..."
74,ocak-2020.txt,2020,/home/emrecan/kave/Final/text_files/2020/ocak-...,228 yerine 130 72 yerine 50 156 yerine 90...,[228 yerine 130 72 yerine 50 156 yerine 9...,"[[228, 130, 72, 50, 156, 90, bilim, yaşta, biz..."


In [14]:
# Merging the years of data
def merge_sentences(df):
    merged = []
    for sentence_list in df['sentence_tokenized'].tolist():
        merged.extend(sentence_list)
    return merged

merged_99_09 = merge_sentences(data_99_09)
merged_10_20 = merge_sentences(data_10_20)

In [15]:
len(data['sentence_tokenized'][0]), len(merged_99_09)

(1722, 550088)

In [16]:
!pip install ipython-autotime
%load_ext autotime

Defaulting to user installation because normal site-packages is not writeable


In [17]:
from gensim.models import Word2Vec

# Model trained with texts from 1999 to 2009

## Word2Vec model trained with skip-gram and negative sampling
model_99_09 = Word2Vec(size=150, window=4, min_count=5, sg=1, negative=10, workers=4, seed=7)

print('Building vocab...')
model_99_09.build_vocab(merged_99_09)

print('Training the model...')
model_99_09.train(merged_99_09,
                  total_examples=model_99_09.corpus_count,
                  epochs=5)

Building vocab...
Training the model...


(27599756, 29909640)

time: 3min 39s


In [18]:
# Model trained with texts from 2009 to 2020

## Word2Vec model trained with skip-gram and negative sampling
model_10_20 = Word2Vec(size=150, window=4, min_count=5, sg=1, negative=10, workers=4, seed=7)

print('Building vocab...')
model_10_20.build_vocab(merged_10_20)

print('Training the model...')
model_10_20.train(merged_10_20,
                  total_examples=model_10_20.corpus_count,
                  epochs=5)

Building vocab...
Training the model...


(15008576, 16364980)

time: 2min 3s


In [19]:
if not os.path.exists('models'):
    os.mkdir('models')
    
model_99_09.save('models/model_99_09.model')
model_10_20.save('models/model_10_20.model')

time: 1.66 s


# Model Alignment

- Source: https://gist.github.com/quadrismegistus/09a93e219a6ffc4f216fb85235535faf

In [20]:
import gensim
import numpy as np
import pandas as pd
from gensim.models import Word2Vec
from scipy.spatial.distance import cosine

model_99_09 = Word2Vec.load('models/model_99_09.model')
model_10_20 = Word2Vec.load('models/model_10_20.model')

time: 1.69 s


In [21]:
def smart_procrustes_align_gensim(base_embed, other_embed, words=None):
    """Procrustes align two gensim word2vec models (to allow for comparison between same word across models).
    Code ported from HistWords <https://github.com/williamleif/histwords> by William Hamilton <wleif@stanford.edu>.
        (With help from William. Thank you!)
    First, intersect the vocabularies (see `intersection_align_gensim` documentation).
    Then do the alignment on the other_embed model.
    Replace the other_embed model's syn0 and syn0norm numpy matrices with the aligned version.
    Return other_embed.
    If `words` is set, intersect the two models' vocabulary with the vocabulary in words (see `intersection_align_gensim` documentation).
    """
    
    # patch by Richard So [https://twitter.com/richardjeanso) (thanks!) to update this code for new version of gensim
    base_embed.init_sims()
    other_embed.init_sims()

    # make sure vocabulary and indices are aligned
    in_base_embed, in_other_embed = intersection_align_gensim(base_embed, other_embed, words=words)
    
    # get the embedding matrices
    base_vecs = in_base_embed.wv.vectors_norm
    other_vecs = in_other_embed.wv.vectors_norm

    # just a matrix dot product with numpy
    m = other_vecs.T.dot(base_vecs) 
    # SVD method from numpy
    u, _, v = np.linalg.svd(m)
    # another matrix operation
    ortho = u.dot(v) 
    # Replace original array with modified one
    # i.e. multiplying the embedding matrix (syn0norm/vectors_norm)by "ortho"
    other_embed.wv.vectors_norm = other_embed.wv.vectors = (other_embed.wv.vectors_norm).dot(ortho)
    return other_embed
	
def intersection_align_gensim(m1,m2, words=None):
    """
    Intersect two gensim word2vec models, m1 and m2.
    Only the shared vocabulary between them is kept.
    If 'words' is set (as list or set), then the vocabulary is intersected with this list as well.
    Indices are re-organized from 0..N in order of descending frequency (=sum of counts from both m1 and m2).
    These indices correspond to the new syn0 and syn0norm objects in both gensim models:
        -- so that Row 0 of m1.syn0 will be for the same word as Row 0 of m2.syn0
        -- you can find the index of any word on the .index2word list: model.index2word.index(word) => 2
    The .vocab dictionary is also updated for each model, preserving the count but updating the index.
    """

    # Get the vocab for each model
    vocab_m1 = set(m1.wv.vocab.keys())
    vocab_m2 = set(m2.wv.vocab.keys())

    # Find the common vocabulary
    common_vocab = vocab_m1&vocab_m2
    if words: common_vocab&=set(words)

    # If no alignment necessary because vocab is identical...
    if not vocab_m1-common_vocab and not vocab_m2-common_vocab:
        return (m1,m2)

    # Otherwise sort by frequency (summed for both)
    common_vocab = list(common_vocab)
    common_vocab.sort(key=lambda w: m1.wv.vocab[w].count + m2.wv.vocab[w].count,reverse=True)

    # Then for each model...
    for m in [m1,m2]:
        # Replace old syn0norm array with new one (with common vocab)
        indices = [m.wv.vocab[w].index for w in common_vocab]
        old_arr = m.wv.vectors_norm
        new_arr = np.array([old_arr[index] for index in indices])
        m.wv.vectors_norm = m.wv.vectors = new_arr

        # Replace old vocab dictionary with new one (with common vocab)
        # and old index2word with new one
        m.wv.index2word = common_vocab
        old_vocab = m.wv.vocab
        new_vocab = {}
        for new_index,word in enumerate(common_vocab):
            old_vocab_obj=old_vocab[word]
            new_vocab[word] = gensim.models.word2vec.Vocab(index=new_index, count=old_vocab_obj.count)
        m.wv.vocab = new_vocab

    return (m1,m2)

time: 3.24 ms


In [22]:
# Function aligned the second parameter (model_09_20) to the first parameter
# and returned the aligned model
model_10_20_aligned = smart_procrustes_align_gensim(model_99_09, model_10_20)

time: 754 ms


# Detecting Shifts

In [23]:
# KeyedVectors of both models
wv_0 = model_99_09.wv
wv_1 = model_10_20_aligned.wv

time: 504 µs


In [24]:
def find_shifts(wv0, wv1):
    shifts = []

    for word in wv0.vocab.keys():
        distance = cosine(wv0[word], wv1[word]) # Computing cosine distance
        shifts.append((word, distance))
  
    # Sorting shifts with respect to distance values
    shifts = sorted(shifts, key=lambda tup: tup[1])
    shifts.reverse()

    return shifts

time: 5.04 ms


In [25]:
# Finding the largest shifts
shifts = find_shifts(wv_0, wv_1)

# Largest 50 shifts
largest_50 = shifts[:50]
largest_50 = pd.DataFrame(largest_50, columns=['Word', 'Distance'])
largest_50

Unnamed: 0,Word,Distance
0,ücretsizdir,0.723232
1,137,0.651829
2,havuzu,0.640438
3,5.0,0.634531
4,eo,0.630934
5,4.0,0.629119
6,bt,0.620677
7,btd,0.620294
8,cas,0.620146
9,basım,0.617757


time: 2.77 s


# Remarkable Shifts

In [26]:
words = [
         'artırılmış', 'tesla', 'tabletler',
         'çekirdekli', 'sancar',
         'cinsiyet', 'google',
         'sony', 'nükleer'
        ]

time: 497 µs


In [27]:
movements = {'Word':[], 'Shift':[], 'Moving Away':[], 'Moving Towards':[]}

def create_movements_df(words, wv_0, wv_1):
    for word in words:
        shift = cosine(wv_0[word], wv_1[word])
        similar_99_09 = wv_0.most_similar(word)
        similar_10_20 = wv_1.most_similar(word)
        
        # Eliminating intersections
        for index0, element0 in enumerate(similar_99_09):
            for index1, element1 in enumerate(similar_10_20):
                if element0[0] == element1[0]:
                    similar_99_09.pop(index0)
                    similar_10_20.pop(index1)

        movements['Word'].append(word)
        movements['Shift'].append(shift)
        movements['Moving Away'].append(similar_99_09)
        movements['Moving Towards'].append(similar_10_20)

    movements_df = pd.DataFrame(movements)
    movements_df.sort_values(by=['Shift'], inplace=True, ascending=False)
    
    return movements_df

movements_df = create_movements_df(words, wv_0, wv_1)

time: 192 ms


In [28]:
from IPython.display import display

def show_movements(movements_df, n):
    for index, row in movements_df.iterrows():
        if n >=0:
            away = row['Moving Away']
            towards = row['Moving Towards']
            word = [row['Word'] for i in range(len(away))]
            
            away_df = pd.DataFrame(data=away, columns=['Away From', 'Similarity'])
            towards_df = pd.DataFrame(data=towards, columns=['Towards', 'Similarity'])
            word_df = pd.DataFrame(data=word, columns=['Word'])
            
            result = pd.concat([word_df, away_df, towards_df], axis=1)
            display(result)
            print('-'*50)
            
show_movements(movements_df, len(words))

Unnamed: 0,Word,Away From,Similarity,Towards,Similarity.1
0,artırılmış,gıdalardaki,0.876354,gerçeklik,0.856493
1,artırılmış,maddelerine,0.875508,gözlükleri,0.771263
2,artırılmış,maddelerinden,0.86794,gerçeklikte,0.764975
3,artırılmış,kazandırılmış,0.86548,konsolu,0.756608
4,artırılmış,gübrelerin,0.863842,tabletler,0.75457
5,artırılmış,uygulanma,0.862435,antivirüs,0.750129
6,artırılmış,tatlandırıcılar,0.858777,televizyonlara,0.749654
7,artırılmış,ekosistemlerdeki,0.856568,barkodlar,0.749114
8,artırılmış,çeşitlerinin,0.855688,uygulamalarını,0.745955
9,artırılmış,değerlendirilebiliyor,0.855544,gözlüklerinin,0.745525


--------------------------------------------------


Unnamed: 0,Word,Away From,Similarity,Towards,Similarity.1
0,tesla,gauss,0.765724,edison,0.791748
1,tesla,skaler,0.756988,motors,0.785748
2,tesla,mile,0.752726,westinghouse,0.715885
3,tesla,gücünde,0.75224,otomobil,0.709292
4,tesla,josephson,0.745121,rakipleri,0.70662
5,tesla,gücündeki,0.744325,ford,0.704569
6,tesla,amper,0.741855,sürücüsüz,0.699385
7,tesla,terazisi,0.735567,bmw,0.692724
8,tesla,siklotron,0.734514,kamyon,0.689952


--------------------------------------------------


Unnamed: 0,Word,Away From,Similarity,Towards,Similarity.1
0,tabletler,papirüs,0.82435,telefonlar,0.909046
1,tabletler,çivi,0.809836,televizyonlara,0.896342
2,tabletler,oyularak,0.80914,telefonlarda,0.882236
3,tabletler,kabartma,0.80769,konsolu,0.876114
4,tabletler,tabletlerin,0.800547,televizyonu,0.875934
5,tabletler,kilden,0.794885,telefonlarında,0.874517
6,tabletler,kazınmış,0.791354,tabletlerden,0.874375
7,tabletler,yazısıyla,0.789542,tabletlerde,0.869887
8,tabletler,yontulmuş,0.788397,telefonları,0.867399


--------------------------------------------------


Unnamed: 0,Word,Away From,Similarity,Towards,Similarity.1
0,çekirdekli,ökaryotik,0.852352,terabyte,0.806461
1,çekirdekli,ökaryot,0.839876,sata,0.800955
2,çekirdekli,rrna,0.788468,ram,0.798457
3,çekirdekli,zigot,0.785787,işlemciye,0.792231
4,çekirdekli,prokaryotik,0.782794,diske,0.781086
5,çekirdekli,sentezleyen,0.777308,inç,0.780741
6,çekirdekli,asitlerden,0.776264,inch,0.77923
7,çekirdekli,sarmallı,0.773783,crt,0.779015
8,çekirdekli,ribozomlar,0.772064,ekranlı,0.77472
9,çekirdekli,oluşturabilen,0.77136,işlemcisi,0.773859


--------------------------------------------------


Unnamed: 0,Word,Away From,Similarity,Towards,Similarity.1
0,film,filmler,0.669394,filmler,0.73031
1,film,filmi,0.641436,ayrıntısına,0.700359
2,film,filmin,0.639304,katlanabilir,0.669935
3,film,filmleri,0.636229,kumaştan,0.665466
4,film,filmle,0.628323,sumo,0.658969
5,film,filmlerle,0.597604,baskılı,0.657875
6,film,gösterimleri,0.588276,ince,0.657704
7,film,saydam,0.584659,kaplamalar,0.656605


--------------------------------------------------


Unnamed: 0,Word,Away From,Similarity,Towards,Similarity.1
0,sancar,neşet,0.941386,aziz,0.933845
1,sancar,meral,0.938108,tomas,0.801561
2,sancar,arat,0.937179,aaron,0.732118
3,sancar,niyazi,0.93492,kandel,0.728966
4,sancar,kamil,0.932834,wiles,0.706544
5,sancar,baykara,0.93236,ralph,0.702709
6,sancar,erdener,0.931466,keşifleriyle,0.701468
7,sancar,gülkan,0.929259,paylaştı,0.690204
8,sancar,mecit,0.928249,azim,0.686333
9,sancar,handan,0.925732,ig,0.682156


--------------------------------------------------


Unnamed: 0,Word,Away From,Similarity,Towards,Similarity.1
0,sony,kodak,0.887899,asus,0.904479
1,sony,canon,0.870056,lg,0.898438
2,sony,cyber,0.86357,playstation,0.892083
3,sony,eos,0.862856,lenovo,0.887697
4,sony,nikon,0.850297,nokia,0.876621
5,sony,panasonic,0.841522,toshiba,0.873041
6,sony,828,0.837615,netbook,0.863701
7,sony,fuji,0.830885,touch,0.859858


--------------------------------------------------


Unnamed: 0,Word,Away From,Similarity,Towards,Similarity.1
0,google,earth,0.80889,microsoft,0.793035
1,google,adobe,0.805591,chrome,0.767096
2,google,msn,0.778999,glass,0.763991
3,google,iphone,0.770721,skype,0.753286
4,google,photoshop,0.768679,play,0.744941
5,google,firefox,0.765674,2.0,0.741576
6,google,sayfasına,0.760022,ebay,0.736741


--------------------------------------------------


Unnamed: 0,Word,Away From,Similarity,Towards,Similarity.1
0,nükleer,silahlar,0.65925,reaktörlerde,0.681916
1,nükleer,füzyon,0.658015,reaktörlerin,0.670999
2,nükleer,santralde,0.656731,toryum,0.669645
3,nükleer,silah,0.653785,reaktör,0.663419
4,nükleer,termonükleer,0.652217,reaktörler,0.660937
5,nükleer,bombalar,0.652184,santral,0.660136
6,nükleer,santrallerden,0.651826,santralin,0.655674
7,nükleer,santralinde,0.649334,reaktörün,0.644579
8,nükleer,bombanın,0.640644,felaketinden,0.643118


--------------------------------------------------


Unnamed: 0,Word,Away From,Similarity,Towards,Similarity.1
0,cinsiyet,cinsiyete,0.772544,farklılıkların,0.828849
1,cinsiyet,ırk,0.769105,ayrımcılığı,0.817888
2,cinsiyet,cinsiyetin,0.748884,şizofreniye,0.802715
3,cinsiyet,eşey,0.745203,bireye,0.789519
4,cinsiyet,gruplarındaki,0.739265,faktörleri,0.784914
5,cinsiyet,erkeklik,0.732854,hormonları,0.780028
6,cinsiyet,bireylere,0.731224,genindeki,0.772793
7,cinsiyet,cinsiyetler,0.72982,farklılığın,0.771215
8,cinsiyet,sinestezi,0.729192,dönemindeki,0.763822
9,cinsiyet,çocuklarında,0.728848,genetiğin,0.761909


--------------------------------------------------
time: 156 ms


# References
- William L. Hamilton, Jure Leskovec, and Dan Jurafsky. ACL 2016. Diachronic Word Embeddings Reveal Statistical Laws of Semantic Change. https://nlp.stanford.edu/projects/histwords/