In [5]:
from zipfile import ZipFile 
import shutil
import os
import urllib.request
import pandas as pd


DATA_DIR = "data"

# Source archive of the dataset and the local paths derived from DATA_DIR.
url = "https://zenodo.org/record/6626316/files/Greek%20Parliament%20Proceedings%20Dataset_Support%20Files_Word%20Usage%20Change%20Computations.zip"
zip_path = os.path.join(DATA_DIR, "greek.zip")
old_dir = os.path.join(DATA_DIR, "Greek Parliament Proceedings Dataset_Support Files_Word Usage Change Computations")
new_dir = os.path.join(DATA_DIR, "greek")

# The target directory has to exist before the archive can be written into it.
os.makedirs(DATA_DIR, exist_ok=True)

# Only download/extract when the organized directory is missing, so the cell
# can be re-run (Restart & Run All) without failing on the rename below.
if not os.path.isdir(new_dir):
    if not os.path.isfile(zip_path):
        print("Downloading dataset...")
        # Stream the response to disk instead of holding the whole archive in
        # memory; write to a temporary name first so that an interrupted
        # download is never mistaken for a complete archive on the next run.
        partial_path = zip_path + ".part"
        with urllib.request.urlopen(url) as response, open(partial_path, "wb") as file:
            shutil.copyfileobj(response, file)
        os.replace(partial_path, zip_path)

    # unzip saved archive
    print("Unzipping...")
    with ZipFile(zip_path, 'r') as zfile:
        zfile.extractall(path=DATA_DIR)

    # delete MACOSX directory (not an error if the archive did not contain one)
    shutil.rmtree(os.path.join(DATA_DIR, "__MACOSX"), ignore_errors=True)
    # rename directory to a sensible name
    os.rename(old_dir, new_dir)


# load dataset
print("Loading...")
df = pd.read_csv(os.path.join(new_dir, "tell_all_cleaned.csv"), encoding="utf-8")
df

Downloading dataset...
Unzipping...
Loading...


Unnamed: 0,member_name,sitting_date,parliamentary_period,parliamentary_session,parliamentary_sitting,political_party,government,member_region,roles,member_gender,speaker_info,speech
0,κρητικος νικολαου παναγιωτης,03/07/1989,period 5,session 1,sitting 1,πανελληνιο σοσιαλιστικο κινημα,['τζαννετακη τζαννη(02/07/1989-12/10/1989)'],β' πειραιως,['δ αντιπροεδρος βουλης(07/03/1989-21/11/1989)'],male,προεδρευων,παρακαλειται @sw γραμματεας βουλγαρακης @sw συ...
1,κρητικος νικολαου παναγιωτης,03/07/1989,period 5,session 1,sitting 1,πανελληνιο σοσιαλιστικο κινημα,['τζαννετακη τζαννη(02/07/1989-12/10/1989)'],β' πειραιως,['δ αντιπροεδρος βουλης(07/03/1989-21/11/1989)'],male,προεδρευων,παρακαλειται @sw κυριος γραμματεας @sw συνοδευ...
2,κρητικος νικολαου παναγιωτης,03/07/1989,period 5,session 1,sitting 1,πανελληνιο σοσιαλιστικο κινημα,['τζαννετακη τζαννη(02/07/1989-12/10/1989)'],β' πειραιως,['δ αντιπροεδρος βουλης(07/03/1989-21/11/1989)'],male,προεδρευων,κυριοι συναδελφοι παρακαλω @sw βουλη @sw εξουσ...
3,,03/07/1989,period 5,session 1,sitting 1,βουλη,['τζαννετακη τζαννη(02/07/1989-12/10/1989)'],,,,βουλευτης/ες,@sw @sw
4,κρητικος νικολαου παναγιωτης,03/07/1989,period 5,session 1,sitting 1,πανελληνιο σοσιαλιστικο κινημα,['τζαννετακη τζαννη(02/07/1989-12/10/1989)'],β' πειραιως,['δ αντιπροεδρος βουλης(07/03/1989-21/11/1989)'],male,προεδρευων,@sw βουλη παρεσχε @sw ζητηθεισα εξουσιοδοτηση....
...,...,...,...,...,...,...,...,...,...,...,...,...
1280913,κωνσταντινοπουλος κωνσταντινου οδυσσεας,24/07/2020,period 18 review 9,session 1,sitting 187,κινημα αλλαγης,['μητσοτακη κυριακου(08/07/2019-28/07/2020)'],αρκαδιας,['ε αντιπροεδρος βουλης(18/07/2019-28/07/2020)'],male,προεδρευων,κυριες @sw κυριοι συναδελφοι παρακαλω @sw σωμα...
1280914,,24/07/2020,period 18 review 9,session 1,sitting 187,βουλη,['μητσοτακη κυριακου(08/07/2019-28/07/2020)'],,,,βουλευτης/ες,@sw @sw
1280915,κωνσταντινοπουλος κωνσταντινου οδυσσεας,24/07/2020,period 18 review 9,session 1,sitting 187,κινημα αλλαγης,['μητσοτακη κυριακου(08/07/2019-28/07/2020)'],αρκαδιας,['ε αντιπροεδρος βουλης(18/07/2019-28/07/2020)'],male,προεδρευων,@sw σωμα παρεσχε @sw ζητηθεισα εξουσιοδοτηση κ...
1280916,,24/07/2020,period 18 review 9,session 1,sitting 187,βουλη,['μητσοτακη κυριακου(08/07/2019-28/07/2020)'],,,,βουλευτης/ες,@sw @sw


We replaced all references to political parties with the symbol “@” followed by an abbreviation of the
party name. We removed accents, strings with length less than 2 characters, all punctuation except
full stops, and replaced stopwords with “@sw”.

In [36]:
from tqdm.notebook import tqdm_notebook


# enable progress bar functionality (adds `progress_apply` to pandas objects).
# `pandas()` is called on the class itself: instantiating `tqdm_notebook()` first
# creates and renders an empty progress bar that is never updated.
tqdm_notebook.pandas()

0it [00:00, ?it/s]

In [51]:
# Keep only the date and text columns, parse the day-first dates, index the
# frame by date and order it chronologically (sorting is required to avoid
# wrong slicing and warnings when selecting a date range below).
speech_df = (
    df[["sitting_date", "speech"]]
    .assign(sitting_date=lambda frame: pd.to_datetime(frame["sitting_date"], dayfirst=True))
    .set_index("sitting_date")
    .sort_index()
)

# Drop rows without any speech text.
speech_df = speech_df[speech_df["speech"].notnull()]

# Restrict the corpus to sittings from 2010 onwards.
speech_df = speech_df.loc["2010-01-01":]
speech_df

Unnamed: 0_level_0,speech
sitting_date,Unnamed: 1_level_1
2010-01-11,κυριες @sw κυριοι συναδελφοι @sw @sw ευχηθω @s...
2010-01-11,ευχαριστουμε κυριε συναδελφε.κυριες @sw κυριοι...
2010-01-11,@sw @sw
2010-01-11,@sw βουλη ενεκρινε @sw ζητηθεισα αδεια.@sw @sw...
2010-01-11,@sw @sw
...,...
2020-07-24,κυριες @sw κυριοι συναδελφοι παρακαλω @sw σωμα...
2020-07-24,@sw @sw
2020-07-24,@sw σωμα παρεσχε @sw ζητηθεισα εξουσιοδοτηση κ...
2020-07-24,@sw @sw


In [52]:
# substantially faster than regex
def remove_placeholders(x: str) -> str:
    """Strip placeholder tokens from a speech.

    The text is split on whitespace, every token that begins with "@" is
    discarded, and the remaining tokens are re-joined with single spaces.

    Args:
        x: The text to clean.

    Returns:
        The text without "@"-prefixed tokens; an empty string if nothing is left.
    """
    kept = []
    for token in x.split():
        if token.startswith("@"):
            continue
        kept.append(token)
    return " ".join(kept)


print("Reformatting words with periods...")
# Turn every full stop into a space so that words glued together by a period
# become separate tokens.
speech_df.speech = speech_df.speech.progress_apply(lambda text: text.replace(".", " "))

print("Removing placeholders...")
speech_df.speech = speech_df.speech.progress_apply(remove_placeholders)

# Discard speeches that are empty (or whitespace-only) after cleaning.
has_text = speech_df.speech.apply(lambda text: text.strip() != "")
speech_df = speech_df.loc[has_text]
speech_df

Reformatting words with periods...


  0%|          | 0/539230 [00:00<?, ?it/s]

Removing placeholders...


  0%|          | 0/539230 [00:00<?, ?it/s]

Unnamed: 0_level_0,speech
sitting_date,Unnamed: 1_level_1
2010-01-11,κυριες κυριοι συναδελφοι ευχηθω κυριες κυριους...
2010-01-11,ευχαριστουμε κυριε συναδελφε κυριες κυριοι συν...
2010-01-11,βουλη ενεκρινε ζητηθεισα αδεια βουλευτης αικατ...
2010-01-11,βουλη ενεκρινε ζητηθεισα αδεια εισελθουμε ημερ...
2010-01-11,κυριες κυριοι συναδελφοι εισερχομαστε ημερησια...
...,...
2020-07-24,θεσεις κομματων αποτυπωθηκαν ψηφιση ηλεκτρονικ...
2020-07-24,ολοκληρωση ψηφοφοριας ηλεκτρονικο συστημα σχεδ...
2020-07-24,κυριες κυριοι συναδελφοι παρακαλω σωμα εξουσιο...
2020-07-24,σωμα παρεσχε ζητηθεισα εξουσιοδοτηση κυριοι συ...


In [53]:
import logging
import gensim
from nltk.tokenize import sent_tokenize, word_tokenize
from gensim.models.callbacks import CallbackAny2Vec


# https://stackoverflow.com/questions/77096387/how-to-get-a-progess-bar-for-gensim-models-fasttext-train
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', 
                    level=logging.INFO)


class EpochLossLogger(CallbackAny2Vec):
    """Training callback that prints the model's reported loss after each epoch.

    Defined here because the training call below needs a callback and no
    definition is present elsewhere in the notebook, which would raise a
    NameError on a fresh kernel.
    """

    def __init__(self):
        # Number of epochs completed so far; used to label the printed line.
        self.epoch = 0

    def on_epoch_end(self, model):
        """Print the latest training loss reported by `model` and advance the epoch counter."""
        # NOTE(review): the recorded output shows 0.0 after every epoch, so this
        # value may not actually be tracked for FastText models -- verify.
        loss = model.get_latest_training_loss()
        print(f"Loss after epoch {self.epoch}: {loss}")
        self.epoch += 1


# One token list per speech; this is the corpus the model is trained on.
docs = speech_df.speech.apply(word_tokenize).to_list()
model = gensim.models.FastText(docs, 
                               vector_size=100, 
                               window=5, 
                               min_count=3,
                               workers=8,
                               callbacks=[EpochLossLogger()])

2023-12-09 14:28:49,343 : INFO : collecting all words and their counts
2023-12-09 14:28:49,348 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2023-12-09 14:28:49,771 : INFO : PROGRESS: at sentence #10000, processed 852790 words, keeping 60179 word types
2023-12-09 14:28:50,186 : INFO : PROGRESS: at sentence #20000, processed 1672178 words, keeping 81559 word types
2023-12-09 14:28:50,455 : INFO : PROGRESS: at sentence #30000, processed 2389021 words, keeping 94475 word types
2023-12-09 14:28:50,736 : INFO : PROGRESS: at sentence #40000, processed 3155198 words, keeping 106175 word types
2023-12-09 14:28:51,009 : INFO : PROGRESS: at sentence #50000, processed 4040750 words, keeping 122331 word types
2023-12-09 14:28:51,281 : INFO : PROGRESS: at sentence #60000, processed 4744397 words, keeping 129977 word types
2023-12-09 14:28:51,492 : INFO : PROGRESS: at sentence #70000, processed 5431045 words, keeping 136917 word types
2023-12-09 14:28:51,781 : INFO : PRO

Loss after epoch 0: 0.0


2023-12-09 14:32:49,565 : INFO : EPOCH 1 - PROGRESS: at 0.12% examples, 43471 words/s, in_qsize 13, out_qsize 2
2023-12-09 14:32:50,665 : INFO : EPOCH 1 - PROGRESS: at 0.47% examples, 91405 words/s, in_qsize 16, out_qsize 0
2023-12-09 14:32:51,770 : INFO : EPOCH 1 - PROGRESS: at 0.81% examples, 107403 words/s, in_qsize 16, out_qsize 0
2023-12-09 14:32:52,952 : INFO : EPOCH 1 - PROGRESS: at 1.19% examples, 113438 words/s, in_qsize 15, out_qsize 0
2023-12-09 14:32:53,977 : INFO : EPOCH 1 - PROGRESS: at 1.59% examples, 120233 words/s, in_qsize 15, out_qsize 0
2023-12-09 14:32:55,038 : INFO : EPOCH 1 - PROGRESS: at 1.99% examples, 125569 words/s, in_qsize 16, out_qsize 0
2023-12-09 14:32:56,173 : INFO : EPOCH 1 - PROGRESS: at 2.30% examples, 126965 words/s, in_qsize 14, out_qsize 1
2023-12-09 14:32:57,253 : INFO : EPOCH 1 - PROGRESS: at 2.79% examples, 130959 words/s, in_qsize 15, out_qsize 0
2023-12-09 14:32:58,462 : INFO : EPOCH 1 - PROGRESS: at 3.23% examples, 131468 words/s, in_qsize 1

Loss after epoch 1: 0.0


2023-12-09 14:36:07,939 : INFO : EPOCH 2 - PROGRESS: at 0.23% examples, 91135 words/s, in_qsize 15, out_qsize 0
2023-12-09 14:36:08,981 : INFO : EPOCH 2 - PROGRESS: at 0.60% examples, 123663 words/s, in_qsize 15, out_qsize 0
2023-12-09 14:36:10,007 : INFO : EPOCH 2 - PROGRESS: at 1.00% examples, 135583 words/s, in_qsize 15, out_qsize 0
2023-12-09 14:36:11,086 : INFO : EPOCH 2 - PROGRESS: at 1.39% examples, 139653 words/s, in_qsize 16, out_qsize 0
2023-12-09 14:36:12,115 : INFO : EPOCH 2 - PROGRESS: at 1.86% examples, 145263 words/s, in_qsize 15, out_qsize 0
2023-12-09 14:36:13,125 : INFO : EPOCH 2 - PROGRESS: at 2.23% examples, 152300 words/s, in_qsize 15, out_qsize 0
2023-12-09 14:36:14,225 : INFO : EPOCH 2 - PROGRESS: at 2.79% examples, 155611 words/s, in_qsize 16, out_qsize 0
2023-12-09 14:36:15,254 : INFO : EPOCH 2 - PROGRESS: at 3.19% examples, 155979 words/s, in_qsize 15, out_qsize 0
2023-12-09 14:36:16,371 : INFO : EPOCH 2 - PROGRESS: at 3.64% examples, 154785 words/s, in_qsize 

Loss after epoch 2: 0.0


2023-12-09 14:39:24,696 : INFO : EPOCH 3 - PROGRESS: at 0.26% examples, 111708 words/s, in_qsize 15, out_qsize 0
2023-12-09 14:39:25,783 : INFO : EPOCH 3 - PROGRESS: at 0.59% examples, 117920 words/s, in_qsize 15, out_qsize 0
2023-12-09 14:39:26,850 : INFO : EPOCH 3 - PROGRESS: at 0.98% examples, 130016 words/s, in_qsize 15, out_qsize 0
2023-12-09 14:39:27,939 : INFO : EPOCH 3 - PROGRESS: at 1.36% examples, 132829 words/s, in_qsize 15, out_qsize 0
2023-12-09 14:39:28,971 : INFO : EPOCH 3 - PROGRESS: at 1.70% examples, 134170 words/s, in_qsize 16, out_qsize 0
2023-12-09 14:39:30,087 : INFO : EPOCH 3 - PROGRESS: at 2.05% examples, 133156 words/s, in_qsize 15, out_qsize 0
2023-12-09 14:39:31,143 : INFO : EPOCH 3 - PROGRESS: at 2.36% examples, 134890 words/s, in_qsize 15, out_qsize 0
2023-12-09 14:39:32,176 : INFO : EPOCH 3 - PROGRESS: at 2.80% examples, 136437 words/s, in_qsize 16, out_qsize 2
2023-12-09 14:39:33,202 : INFO : EPOCH 3 - PROGRESS: at 3.41% examples, 143931 words/s, in_qsize

Loss after epoch 3: 0.0


2023-12-09 14:42:36,784 : INFO : EPOCH 4 - PROGRESS: at 0.33% examples, 124784 words/s, in_qsize 16, out_qsize 0
2023-12-09 14:42:37,821 : INFO : EPOCH 4 - PROGRESS: at 0.74% examples, 150101 words/s, in_qsize 15, out_qsize 0
2023-12-09 14:42:38,851 : INFO : EPOCH 4 - PROGRESS: at 1.20% examples, 165215 words/s, in_qsize 15, out_qsize 0
2023-12-09 14:42:39,925 : INFO : EPOCH 4 - PROGRESS: at 1.79% examples, 175614 words/s, in_qsize 15, out_qsize 0
2023-12-09 14:42:40,942 : INFO : EPOCH 4 - PROGRESS: at 2.20% examples, 177865 words/s, in_qsize 15, out_qsize 0
2023-12-09 14:42:41,989 : INFO : EPOCH 4 - PROGRESS: at 2.71% examples, 177221 words/s, in_qsize 15, out_qsize 0
2023-12-09 14:42:43,099 : INFO : EPOCH 4 - PROGRESS: at 3.18% examples, 175319 words/s, in_qsize 16, out_qsize 1
2023-12-09 14:42:44,218 : INFO : EPOCH 4 - PROGRESS: at 3.67% examples, 174799 words/s, in_qsize 15, out_qsize 0
2023-12-09 14:42:45,285 : INFO : EPOCH 4 - PROGRESS: at 4.53% examples, 178460 words/s, in_qsize

Loss after epoch 4: 0.0


2023-12-09 14:45:35,733 : INFO : FastText lifecycle event {'params': 'FastText<vocab=147458, vector_size=100, alpha=0.025>', 'datetime': '2023-12-09T14:45:35.733781', 'gensim': '4.3.0', 'python': '3.11.5 | packaged by conda-forge | (main, Aug 27 2023, 03:23:48) [MSC v.1936 64 bit (AMD64)]', 'platform': 'Windows-10-10.0.19045-SP0', 'event': 'created'}


In [62]:
# Ten vocabulary entries most similar to "ελλαδα" (Greece) in the trained embedding
model.wv.most_similar("ελλαδα", topn=10)

[('χωρα', 0.8639438152313232),
 ('ευρωπη', 0.7733089327812195),
 ('ελλαδι', 0.769740104675293),
 ('ελλαδoς', 0.739547848701477),
 ('χωραγε', 0.7323021292686462),
 ('βαλκανια', 0.7199378609657288),
 ('πατριδα', 0.685839056968689),
 ('χωραω', 0.6809353828430176),
 ('ξεχωρα', 0.6750528812408447),
 ('ενδοχωρα', 0.6686674952507019)]

In [66]:
# Ten vocabulary entries most similar to "κομμουνισμος" (communism) in the trained embedding
model.wv.most_similar("κομμουνισμος", topn=10)

[('αντικομμουνισμος', 0.8807011842727661),
 ('σοσιαλισμος', 0.8284322023391724),
 ('ναζισμος', 0.8013085126876831),
 ('σταλινισμος', 0.7946540713310242),
 ('κομμουνιστης', 0.7941724061965942),
 ('μαρξισμος', 0.7915278673171997),
 ('συριζαισμος', 0.787451982498169),
 ('λενινισμος', 0.7829488515853882),
 ('σοβινισμος', 0.781538724899292),
 ('φασισμος', 0.7795184850692749)]

In [67]:
# Ten vocabulary entries most similar to "ναζισμος" (Nazism) in the trained embedding
model.wv.most_similar("ναζισμος", topn=10)

[('νεοναζισμος', 0.9200035929679871),
 ('φασισμος', 0.9164180755615234),
 ('νατιβισμος', 0.9028304219245911),
 ('ναρκισσισμος', 0.8966827392578125),
 ('ισλαμισμος', 0.8897761702537537),
 ('μαρξισμος', 0.8746734261512756),
 ('ρεβανσισμος', 0.8737204074859619),
 ('ηρωισμος', 0.8714151382446289),
 ('συριζαισμος', 0.8701346516609192),
 ('σεξισμος', 0.8681153059005737)]

In [68]:
# Ten vocabulary entries most similar to "φασισμος" (fascism) in the trained embedding
model.wv.most_similar("φασισμος", topn=10)

[('εκφασισμος', 0.9450567960739136),
 ('ναζισμος', 0.9164181351661682),
 ('νεοναζισμος', 0.9056444764137268),
 ('ναρκισσισμος', 0.8971601128578186),
 ('ρεβανσισμος', 0.8968673944473267),
 ('φανατισμος', 0.8812100887298584),
 ('μαρξισμος', 0.8757075071334839),
 ('σαδισμος', 0.8744021058082581),
 ('σκοταδισμος', 0.8743776679039001),
 ('ηρωισμος', 0.872627854347229)]

In [56]:
# Ten vocabulary entries most similar to "ευρωπη" (Europe) in the trained embedding
model.wv.most_similar("ευρωπη", topn=10)

[('ευρωπης', 0.8100565075874329),
 ('ευρωπαια', 0.7869395613670349),
 ('ευρωπο', 0.7845559120178223),
 ('ευρωζωνη', 0.7788133025169373),
 ('ελλαδα', 0.7733088731765747),
 ('ευρωπας', 0.7317231893539429),
 ('ευρωπαιο', 0.7290972471237183),
 ('βαλκανια', 0.729010820388794),
 ('ευρωπαισμο', 0.7058138251304626),
 ('ευρωπαιστη', 0.7003160119056702)]

In [73]:
# Ten vocabulary entries most similar to "τουρκια" (Turkey) in the trained embedding
model.wv.most_similar("τουρκια", topn=10)

[('τουρκικα', 0.8457058072090149),
 ('αλβανια', 0.8390213251113892),
 ('τουρκοβουνια', 0.8367199301719666),
 ('τουρκολιβυκο', 0.8211721181869507),
 ('τουρκιας', 0.8036004304885864),
 ('τουρκικη', 0.8022465705871582),
 ('τουρκο', 0.7976134419441223),
 ('τουρκοφοβια', 0.7951493263244629),
 ('ελληνοτουρκικα', 0.7902024984359741),
 ('τουρκολαγνεια', 0.7830843329429626)]