# Assignment 1 - Computational Psycholinguistics
By Daan Brugmans

## 1. Imports and Global Constants

In [8]:
import os

import scipy
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from gensim.models import Word2Vec
from tqdm import tqdm


# All locations are resolved relative to the directory the notebook runs in.
_WORKING_DIR = os.getcwd()
_DATA_DIR = os.path.join(_WORKING_DIR, "data")

# Experiment data (CSV) and the text file the Word2Vec models are trained on.
PATH_TO_EXPERIMENT_DATA = os.path.join(_DATA_DIR, "data_naming.csv")
PATH_TO_TRAIN_DATA = os.path.join(_DATA_DIR, "train_en.txt")

# Directory the trained Word2Vec models are saved to and loaded from.
MODEL_DIR = os.path.join(_WORKING_DIR, "models")

# Random seed handed to Word2Vec during training.
SEED = 3131

## 2. Data Loading

In [7]:
# Experiment data; the cell output shows the columns isi, prime, target,
# condition and meanRT (plus an unnamed index column from the CSV).
test_df = pd.read_csv(PATH_TO_EXPERIMENT_DATA)

# Training corpus for Word2Vec, as a list of tokenised sentences. A list (not
# a generator) is used because the corpus is iterated more than once when
# training for several epochs.
# NOTE(review): assumes train_en.txt is UTF-8 with one sentence per line and
# whitespace-separated tokens -- confirm against the actual corpus file.
with open(PATH_TO_TRAIN_DATA, encoding="utf-8") as train_file:
    train_df = [line.split() for line in train_file if line.strip()]

Unnamed: 0,isi,prime,target,condition,meanRT
0,50,capability,ability,strong,580.5
1,1050,capability,ability,strong,577.4
2,50,means,ability,weak,538.0
3,1050,means,ability,weak,571.0
4,50,fuel,ability,unrel_strong,639.3
...,...,...,...,...,...
11035,1050,giraffe,zoo,weak,545.4
11036,50,crisp,zoo,unrel_strong,542.8
11037,1050,crisp,zoo,unrel_strong,685.2
11038,50,bottle,zoo,unrel_weak,545.6


## 3. Model Training
If you want to run this notebook, you do not need to train the models, since the pre-trained models will be loaded later.

In [None]:
# Hyperparameters shared by every model in the grid.
WORD2VEC_VECTOR_SIZE = 300
WORD2VEC_MIN_COUNT = 5
WORD2VEC_EPOCHS = 5
WORD2VEC_WORKERS = 2

# Grid to train: architecture flag (0 = CBOW, 1 = skip-gram, see the naming
# below) crossed with the context window size.
# Tuples are used because set(0, 1) raises TypeError (set() takes a single
# iterable) and because tuples iterate in a fixed order.
WORD2VEC_SG_SPACE = (0, 1)
WORD2VEC_WINDOW_SPACE = (2, 5, 8)
WORD2VEC_SEARCH_SPACE = [(sg, window) for sg in WORD2VEC_SG_SPACE for window in WORD2VEC_WINDOW_SPACE]

# Make sure the output directory exists before any model is saved into it.
os.makedirs(MODEL_DIR, exist_ok=True)

# Iterate over the (sg, window) pairs, not over the bare window sizes.
for sg, window in tqdm(WORD2VEC_SEARCH_SPACE):
    word2vec_model = Word2Vec(
        sentences=train_df,
        sg=sg,
        window=window,
        vector_size=WORD2VEC_VECTOR_SIZE,
        min_count=WORD2VEC_MIN_COUNT,
        epochs=WORD2VEC_EPOCHS,
        seed=SEED,
        workers=WORD2VEC_WORKERS
    )

    # Translate the sg flag into the architecture name used in the file name.
    if sg == 0:
        word2vec_architecture_name = "cbow"
    elif sg == 1:
        word2vec_architecture_name = "skipgram"
    else:
        raise ValueError(f"Invalid value of sg encountered: {sg!r}, was of type {type(sg)}")

    # One file per configuration: "<architecture>_window_<window>.word2vec".
    path_to_word2vec_model = os.path.join(MODEL_DIR, f"{word2vec_architecture_name}_window_{window}.word2vec")
    word2vec_model.save(path_to_word2vec_model)

## 4. Model Loading

In [None]:
# Load the six trained models from MODEL_DIR. The file names follow the
# pattern the training cell saves under:
# "<architecture>_window_<window>.word2vec".
# NOTE(review): the extension was ".wav2vec" here, which does not match the
# ".word2vec" paths built during training -- confirm the names of the
# pre-trained files on disk.
cbow_window_2 = Word2Vec.load(os.path.join(MODEL_DIR, "cbow_window_2.word2vec"))
cbow_window_5 = Word2Vec.load(os.path.join(MODEL_DIR, "cbow_window_5.word2vec"))
cbow_window_8 = Word2Vec.load(os.path.join(MODEL_DIR, "cbow_window_8.word2vec"))

skipgram_window_2 = Word2Vec.load(os.path.join(MODEL_DIR, "skipgram_window_2.word2vec"))
skipgram_window_5 = Word2Vec.load(os.path.join(MODEL_DIR, "skipgram_window_5.word2vec"))
skipgram_window_8 = Word2Vec.load(os.path.join(MODEL_DIR, "skipgram_window_8.word2vec"))

## 5. Model Testing

In [None]:
# TODO: For every word2vec model, calculate the cosine similarities between test_df prime and target using `model.wv.similarity(prime, target)`

# TODO: Store the cosine similarities, group by semantic relatedness (strong, weak, unrelated) and calculate the mean per group.
# Check if mean similarities match the three conditions: strong should have the highest mean similarity (smallest semantic distance), then weak, then unrelated

# TODO: Investigate how well the sizes of the priming effects for each target are predicted by cosine similarities
# Build seaborn regression plot where x = cosine_similarity, y = meanRT, hue = isi, seed = SEED
# Use SciPy to get linear regression slope, intercept, r from seaborn plot and add it to plot

### 5.1 CBOW Window 2

### 5.2 CBOW Window 5

### 5.3 CBOW Window 8

### 5.4 Skip-Gram Window 2

### 5.5 Skip-Gram Window 5

### 5.6 Skip-Gram Window 8