In [1]:
%load_ext autoreload
%autoreload 2
import os
import matplotlib.pyplot as plt
from os.path import join
from tqdm import tqdm
import pandas as pd
from os.path import expanduser
import sys
from typing import List
import numpy as np
import joblib
from pprint import pprint
import imodelsx.util
from os.path import dirname
import pickle as pkl
import json
from copy import deepcopy
from numpy.linalg import norm
from math import ceil
from imodelsx.qaemb.qaemb import QAEmb, get_sample_questions_and_examples
from neuro.treebank.config import STORIES_POPULAR, STORIES_UNPOPULAR, ECOG_DIR

In [2]:
# Load the per-row feature table for a single story transcript.
story_fname = 'cars-2'
features_path = join(ECOG_DIR, 'data', 'transcripts',
                     story_fname, 'features.csv')
features_df = pd.read_csv(features_path)

# For each row, gather the `text` of every row whose `end` value lies in the
# closed interval [end - sec_window, end] (the row itself is always included).
sec_window = 3
end_times = features_df['end']
ngram_list = []
for window_end in tqdm(end_times):
    # between() is inclusive on both sides, i.e. >= lower and <= upper
    in_window = end_times.between(window_end - sec_window, window_end)
    ngram_list.append(features_df.loc[in_window, 'text'].values.tolist())
features_df['ngram'] = ngram_list

100%|██████████| 11407/11407 [00:04<00:00, 2742.28it/s]


### Look at story

In [107]:
# Build a readable script from features_df: one entry per sentence, prefixed
# with a speaker tag whenever the speaker differs from the previous row.

# remove duplicate consecutive values (keep the first row of each run of
# identical sentences). Rows with a missing sentence survive this step because
# NaN != NaN; they are dropped further down.
df = features_df.loc[features_df['sentence'].shift() !=
                     features_df['sentence']]

# Flag rows whose speaker equals the previous row's speaker. This is computed
# BEFORE the missing-sentence rows are dropped, so "previous row" can be one of
# those rows.
# NOTE(review): if the intent is "previous *printed* sentence", compute this
# after the notna() filter instead -- confirm.
duplicate_speaker = df['speaker'].shift() == df['speaker']

# Drop rows without a sentence and filter the mask identically so both share
# the same index. The explicit .copy() makes df an independent frame, so the
# .loc assignments below do not raise SettingWithCopyWarning.
has_sentence = df['sentence'].notna()
df = df[has_sentence].copy()
duplicate_speaker = duplicate_speaker[has_sentence]

# set speaker to '' for duplicate consecutive values
df.loc[duplicate_speaker, 'speaker'] = ''

# numbered sentences
# df.loc[~duplicate_speaker, 'speaker'] = '<' + df['speaker'] + '>:\n'
# df['sentence_idx'] = np.arange(len(df)) + 1
# df['script'] = df['speaker'] + \
# df['sentence_idx'].astype(str) + '. ' + df['sentence']
# story = '\n'.join(df['script'].iloc[:50])

# unnumbered: every speaker change starts a new line tagged "<speaker>: "
df.loc[~duplicate_speaker, 'speaker'] = '\n<' + df['speaker'] + '>: '
df['script'] = df['speaker'] + df['sentence']

# only the first max_sentences entries are joined into the preview
max_sentences = 500
story = ' '.join(df['script'].iloc[:max_sentences])

In [None]:
# print (rather than echoing the bare value) so the '\n' placed before each
# speaker tag is rendered as an actual line break
print(story)

### Get popular stories

In [3]:
def _read_title(path):
    """Return the 'title' field of the JSON document stored at `path`."""
    # The context manager closes the file right after parsing instead of
    # leaking one handle per metadata file; binary mode lets json detect the
    # text encoding itself.
    with open(path, 'rb') as fp:
        return json.load(fp)['title']


def _subject_id(filename):
    """Parse the subject number, e.g. 'sub_9_trial000_metadata.json' -> 9."""
    return int(filename.split('_trial')[0].split('_')[-1])


# Map every file in the subject-metadata directory to the title it records.
metadata_dir = join(ECOG_DIR, 'data', 'subject_metadata')
subject_metadata_files = os.listdir(metadata_dir)
jsons = {f: _read_title(join(metadata_dir, f))
         for f in subject_metadata_files}

# One row per metadata file: filename, title, and the subject parsed from the
# filename.
df = pd.DataFrame({'filename': list(jsons.keys()),
                   'title': list(jsons.values())})
df['subject'] = df['filename'].apply(_subject_id).astype(int)
# df = df.sort_values(by='subject')
df = df.sort_values(by='title')

# A title is "popular" if at least one subject in common_subjs has it, and
# "unpopular" if at least one subject outside common_subjs has it. The two
# arrays are computed independently, so a title can appear in both.
common_subjs = {3, 4, 6, 7, 10}
stories_popular = df[df.subject.isin(common_subjs)].title.unique()
stories_unpopular = df[~df.subject.isin(common_subjs)].title.unique()
print(f'{len(stories_popular)=} {len(stories_unpopular)=}')
stories_popular, stories_unpopular

len(stories_popular)=9 len(stories_unpopular)=12


(array(['Cars 2', 'Coraline', 'Lord Of The Rings 1', 'Lord Of The Rings 2',
        'Megamind', 'Shrek The Third', 'Spiderman Far From Home',
        'The Incredibles', 'Toy Story'], dtype=object),
 array(['Antman', 'Aquaman', 'Avengers Infinity War', 'Black Panther',
        'Fantastic Mr. Fox', 'Guardians Of The Galaxy 2',
        'Guardians Of the Galaxy', 'Sesame Street Episode 3990',
        'Spiderman Homecoming', 'The Martian', 'Thor Ragnarok', 'venom'],
       dtype=object))

In [6]:
# Number of metadata files (rows of df) recorded for each title.
df['title'].value_counts()

title
Cars 2                        3
Megamind                      3
Fantastic Mr. Fox             2
Antman                        1
Aquaman                       1
Black Panther                 1
Avengers Infinity War         1
Guardians Of The Galaxy 2     1
Coraline                      1
Guardians Of the Galaxy       1
Lord Of The Rings 1           1
Lord Of The Rings 2           1
Sesame Street Episode 3990    1
Shrek The Third               1
Spiderman Far From Home       1
Spiderman Homecoming          1
The Incredibles               1
The Martian                   1
Thor Ragnarok                 1
Toy Story                     1
venom                         1
Name: count, dtype: int64

In [7]:
# Show the metadata table (filename, title, subject), sorted by title.
df

Unnamed: 0,filename,title,subject
25,sub_9_trial000_metadata.json,Antman,9
11,sub_2_trial006_metadata.json,Aquaman,2
9,sub_2_trial004_metadata.json,Avengers Infinity War,2
10,sub_2_trial005_metadata.json,Black Panther,2
22,sub_7_trial000_metadata.json,Cars 2,7
0,sub_10_trial000_metadata.json,Cars 2,10
12,sub_3_trial000_metadata.json,Cars 2,3
21,sub_6_trial004_metadata.json,Coraline,6
2,sub_1_trial000_metadata.json,Fantastic Mr. Fox,1
18,sub_5_trial000_metadata.json,Fantastic Mr. Fox,5
