# Parsing the xml files for words

### This loads all the message events into a pandas DataFrame

In [None]:
# Notebook setup: draw matplotlib figures inline and turn on IPython's
# autoreload extension in mode 2, so edits to imported modules are picked up
# without restarting the kernel.
%matplotlib inline
%load_ext autoreload
%autoreload 2
import sys
# Make the project's source directory importable from this notebook.
sys.path.append('../src/')
# NOTE(review): the star import presumably provides get_events_from_xml,
# which the next cell calls — confirm against src/parse_xml.py.
from parse_xml import *

In [None]:
# Build a pandas DataFrame describing the messages sent during sleep.
# NOTE(review): it is expected to hold the time, word and type of each
# message; later cells also read a `night` column — confirm against parse_xml.
events = get_events_from_xml()
events.head()

In [None]:
# Print the words recorded for one night, followed by how many there are.
night = 78
night_mask = events.night == night
# Index labels of that night's rows, kept around for interactive inspection.
times = list(events.loc[night_mask].index)
# Select the column with a single boolean mask instead of one label lookup
# per row: with a repeated index label `events.loc[t]` returns several rows,
# which would put a Series (not a word) into the list.
words = events.loc[night_mask, 'word'].tolist()
print(words)
print(len(words))

# Events from raw files

In [None]:
import sys
sys.path.append('../src/')
from events_parser import *

In [None]:
# Time a call to table_night for night 78, leaving every other argument at its default.
# NOTE(review): table_night presumably comes from `from events_parser import *` — confirm.
%time table_night(night=78)

In [None]:
# Open the EGI .mff recording with MNE using the GSN-HydroCel-256 montage.
# preload=False: do not load the signal data into memory up front.
# NOTE(review): `mne` is not imported in any visible cell — presumably it
# leaks in through `from events_parser import *`; confirm.
raw = mne.io.read_raw_egi("../data/raw/EEG/Nathalie-78_20171118_123017.mff",
                          montage='GSN-HydroCel-256',
                          preload=False)

In [None]:
%time events = mne.find_events(raw)

In [None]:
%time table_words_eeg = table_night(night=78, obj={'raw':raw, 'events':events})

In [None]:
# Peek at the word stored in the row labelled 0 of the EEG-derived table.
table_words_eeg.loc[0].word

# Load the dreams.csv

In [None]:
import os.path
import pandas as pd

dreams_fname = '../data/derived/dreams.csv'
# Stop right away, naming the script that generates the file, when the
# derived CSV has not been produced yet.
if not os.path.isfile(dreams_fname):
    raise RuntimeError('you should pre-compute dreams.csv. Please run src/dream_parser.py')
dreams = pd.read_csv(dreams_fname)

dreams.head()

Start pre-processing the events

In [None]:
# Merge the per-event rows and the per-night dream reports into one frame,
# matching them on `night`.
#
# The merged frame is not really used afterwards: working with `dreams` and
# `events` separately is more convenient.

# No night may appear more than once in `dreams`.
assert dreams.night.nunique(dropna=False) == len(dreams.night)
df = (
    events.set_index('night')
    .join(dreams.set_index('night'))
    .reset_index()
)
df

Collect each night's event words and dream-report words, then intersect them

In [None]:
def get_event_words(night_id):
    """Return the words of the 'auto' events belonging to `night_id`.

    Reads the notebook-level `events` frame and returns the matching `word`
    values (the column's `.values`).
    """
    is_auto = events.type == 'auto'
    same_night = events.night == night_id
    return events.loc[is_auto & same_night, 'word'].values

def get_dream_report_words(night_id):
    """Return the set of lower-cased words in the dream report of `night_id`.

    Reads the notebook-level `dreams` frame, takes the `text` of every row
    whose `night` equals `night_id`, lower-cases it and splits it on
    whitespace. Missing texts are skipped, and a night without any report
    text yields an empty set.
    """
    reports = dreams.loc[dreams['night'] == night_id, 'text'].dropna()
    # Join the raw cell values instead of calling Series.to_string(): the
    # rendered form also contains the index labels (and "Series([], )" for an
    # empty selection), which would end up in the set as fake words.
    return set(' '.join(reports.astype(str)).lower().split())

def intersect_set_list(my_set, my_list):
    """Return the set of elements that occur both in `my_set` and in `my_list`.

    `my_list` may be any iterable; `my_set` must offer `.intersection`.
    """
    common = my_set.intersection(my_list)
    return common


In [None]:
# One row per night: the played words, the dream-report words and their overlap.
# Rows are gathered in a plain list and turned into a DataFrame once:
# DataFrame.append copied the whole frame on every call and was removed in
# pandas 2.0.
rows = []
for n in events.night.unique():
    event_words = get_event_words(n)
    report_words = get_dream_report_words(n)
    intersection = intersect_set_list(report_words, event_words)
    rows.append({'night': n, 'event_words': event_words, 'report_words': report_words, 'intersection': intersection})

# Passing `columns` keeps the expected layout even when there are no nights.
xx = pd.DataFrame(rows, columns=["night", "event_words", "report_words", "intersection"])
xx.set_index('night', drop=True, inplace=True)

# Show only the nights where at least one played word appears in the report.
# Testing each set's size avoids comparing a NumPy object array with a bare
# `set()`, whose conversion to an array is ambiguous.
xx[xx['intersection'].map(len) > 0]

# Getting the similarity between the two groups of words: dream-report words and played words

In [None]:
# NOTE(review): `gensim.models.wrappers` is only available in gensim releases
# before 4.0 — confirm the pinned gensim version before upgrading.
from gensim.models.wrappers import FastText

# Load the pre-trained fastText model stored under ../data/raw/BIN/wiki.simple.
# NOTE(review): the path is given without a file extension — confirm which
# files the loader expects to find next to it.
model = FastText.load_fasttext_format('../data/raw/BIN/wiki.simple')

In [None]:
def compute_cosdistance(groupslist):
    """Score how close a night's played words are to its dream-report words.

    `groupslist[0]` holds the event (played) words and `groupslist[1]` the
    report words. Words missing from the model vocabulary are dropped, and
    the two remaining lists are handed to `model.n_similarity`, whose result
    is returned unchanged.

    NOTE(review): gensim describes n_similarity as a cosine *similarity*
    between the two word sets, so "distance" in the name looks like a
    misnomer — confirm before interpreting the values.
    """
    played, reported = groupslist[0], groupslist[1]
    vocabulary = model.wv.vocab

    def in_vocabulary(words):
        # Keep only the words the embedding model knows.
        return [word for word in words if word in vocabulary]

    return model.n_similarity(in_vocabulary(played), in_vocabulary(reported))

In [None]:
# Score each night's played words against that same night's dream report
# (one call to compute_cosdistance per row). The result is kept both under
# the name `cos_distance` and as a new `cos_distance` column of `xx`.
cos_distance = xx[["event_words", "report_words"]].apply(func=compute_cosdistance, axis=1)
xx["cos_distance"] = cos_distance

In [None]:
# `np` is used below but is not imported in any visible cell; import it here
# so the cell does not depend on a star import leaking it.
import numpy as np

# Reference distribution: shuffle the dream reports across nights and
# re-score, 1000 times, so each night's played words meet a random report.
bs_distance = []
for _ in range(1000):
    # Only the two word columns are needed; copying them leaves `xx` untouched.
    yy = xx[["event_words", "report_words"]].copy()
    yy["report_words"] = yy["report_words"].values[np.random.permutation(len(yy))]
    # Store the scores directly: nothing reads them back from `yy`, and the
    # `cos_distance` name stays bound to the scores of the real pairs.
    bs_distance.append(yy.apply(func=compute_cosdistance, axis=1))

In [None]:
%matplotlib inline
import matplotlib.pyplot as plt
import scipy.stats as st

In [None]:
# Compare the scores of the shuffled pairings (A) with the scores of the
# real night/report pairs (B) as two overlaid, normalised histograms.
plt.figure(figsize=[10, 6])
# Pool the scores of every permutation into one flat sample.
A = np.array(bs_distance).flatten()
B = xx.cos_distance.values
# `density=True` replaces the `normed` keyword, which Matplotlib removed in 3.1.
plt.hist(A, bins=100, range=[0, 1], alpha=0.5, density=True, label='Bootstrap')
plt.hist(B, bins=100, range=[0, 1], color='red', alpha=0.5, density=True, label='Reality')
plt.xlabel('Cosine distance')
plt.ylabel('Probability')
plt.legend()
# The title shows the Mann-Whitney U test result for the two samples.
plt.title(st.mannwhitneyu(A,B))
plt.show()