In [133]:
import os
import scipy
import pandas as pd
import numpy as np
from tqdm.auto import tqdm

In [2]:
# !pip install pandas

In [3]:
mdata = scipy.io.loadmat(os.path.join("data", "mitchell", f"data-science-P{1}.mat"))

In [4]:
#TODO load all participants

### Create voxels-nouns matrix (21764 x 60)

In [5]:
arrays = {}
for item in mdata['info'][0]:
    word = item[2][0]
    if word in arrays:
        arrays[word].append(mdata['data'][np.where(mdata['info'][0] == item)][0][0][0])
    else:
        arrays[word] = [mdata['data'][np.where(mdata['info'][0] == item)][0][0][0]]

In [6]:
averaged_arrays = {}
for word in arrays.keys():
    averaged_arrays[word] = np.mean(np.stack(arrays[word]), axis=0)

In [7]:
df_voxel_noun = pd.DataFrame.from_dict(averaged_arrays)

In [8]:
df_voxel_noun.head()

Unnamed: 0,refrigerator,barn,bell,key,window,screwdriver,church,desk,cow,knife,...,door,leg,carrot,watch,table,chimney,bed,fly,dog,telephone
0,-0.119733,-0.19919,0.23034,-0.139996,0.022133,-0.112634,0.353272,-0.042343,-0.189063,-0.072357,...,0.276345,-0.140542,0.094865,-0.540518,-0.051653,-0.213026,0.045836,-0.050042,-0.281979,-0.194655
1,0.249838,-0.189012,0.154014,-0.064165,0.002591,0.137915,0.358341,0.143465,-0.304287,0.051566,...,0.252659,-0.099127,0.15101,-0.521764,-0.21917,-0.09775,0.12719,-0.162893,-0.130701,0.070468
2,0.622086,-0.208923,0.283291,0.10046,-0.197094,0.070147,0.186019,0.318936,-0.328637,0.028104,...,0.037496,-0.326009,0.203993,-0.307184,-0.072724,0.283097,0.055711,-0.17357,-0.028404,0.307875
3,0.042476,-0.221981,0.303526,0.301961,-0.179251,-0.358278,-0.209994,-0.044467,-0.310875,-0.081622,...,0.058031,-0.527249,0.06953,-0.235786,0.00548,0.153367,-0.044356,-0.054538,-0.023114,0.301743
4,0.008095,-0.751286,-0.086668,-0.12038,-0.293255,-0.342001,-0.292562,-0.080478,-0.086134,-0.020103,...,-0.185362,-0.575305,0.118295,-0.653277,-0.558018,0.0364,-0.057903,0.331068,-0.040789,0.281461


### Create nouns-sem_featuers matrix (60 x 25)

In [9]:
raw_file = os.path.join("data","mitchell_semantic_raw.txt")
semantic_features = {}
with open(raw_file, 'r') as f:
    lines = f.readlines()
    word = None
    for line in lines:
        if len(line) >= 5:
            if "Features for" in line:
                if word and len(semantic_features[word]['features']) < 25: del semantic_features[word] 
                word = line.split("<a name=\"")[1].split("\"")[0]
                semantic_features[word] = { "features": [], "values": []}
            elif word:
                feature_name = line.split()[0]
                val = float(line.split("(")[1].split(")")[0])
                semantic_features[word]["features"].append(feature_name)
                semantic_features[word]["values"].append(val)

In [134]:
df_noun_feature = pd.DataFrame()
for noun, info in semantic_features.items():
    temp_df = pd.DataFrame({
        'feature': info['features'],
        'value': info['values'],
        'noun': noun})
    df_noun_feature = pd.concat([df_noun_feature, temp_df], ignore_index=True)

df_noun_feature = df_noun_feature.pivot(columns='feature', values='value', index='noun')

In [135]:
df_noun_feature.head()

feature,approach,break,clean,drive,eat,enter,fear,fill,hear,lift,...,push,ride,rub,run,say,see,smell,taste,touch,wear
noun,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
airplane,0.048,0.028,0.019,0.023,0.024,0.056,0.019,0.032,0.066,0.072,...,0.024,0.918,0.0,0.052,0.201,0.221,0.005,0.0,0.048,0.001
ant,0.023,0.017,0.132,0.01,0.062,0.016,0.005,0.005,0.016,0.016,...,0.0,0.015,0.006,0.944,0.156,0.198,0.0,0.006,0.005,0.003
apartment,0.009,0.064,0.238,0.07,0.012,0.21,0.002,0.08,0.021,0.024,...,0.002,0.007,0.0,0.075,0.093,0.2,0.036,0.0,0.002,0.0
arch,0.06,0.047,0.185,0.075,0.035,0.032,0.024,0.054,0.012,0.053,...,0.004,0.018,0.007,0.129,0.244,0.474,0.0,0.0,0.127,0.015
arm,0.019,0.432,0.021,0.071,0.008,0.024,0.005,0.026,0.007,0.548,...,0.084,0.01,0.03,0.103,0.45,0.132,0.001,0.002,0.206,0.048


### Regression

In [136]:
from sklearn.linear_model import LinearRegression

In [137]:
model = LinearRegression()

In [138]:
X = df_noun_feature
y = df_voxel_noun

In [139]:
df_coefficients = pd.DataFrame(index=range(y.shape[0]), columns=X.columns)
df_coefficients.index.name = 'voxel'

In [140]:
for i in tqdm(range(y.shape[0])):
    model.fit(X, y.iloc[i])
    df_coefficients.iloc[i] = model.coef_

100%|██████████| 21764/21764 [00:12<00:00, 1715.44it/s]


In [141]:
df_coefficients

feature,approach,break,clean,drive,eat,enter,fear,fill,hear,lift,...,push,ride,rub,run,say,see,smell,taste,touch,wear
voxel,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,0.260812,0.105105,0.500662,0.293935,0.191403,0.35884,0.585917,0.218998,1.529033,-0.018389,...,0.387297,0.081411,-2.23057,-0.018933,0.000773,-0.248814,-1.779865,0.727951,0.090013,0.431795
1,0.184131,0.117589,0.408119,0.304118,0.295056,0.080284,0.907594,0.196824,2.367746,0.058339,...,-0.261515,0.257804,-2.298005,0.003899,0.137707,-0.32663,-1.965451,0.405446,0.202994,0.3405
2,0.148509,-0.093776,0.082635,0.103735,0.233522,0.077332,0.868233,0.084488,2.242578,0.091085,...,-0.430838,0.232747,-1.082552,-0.087093,0.179793,-0.255823,-1.587466,-0.029408,0.195619,0.05746
3,0.514401,-0.203586,-0.241387,-0.047837,0.070978,-0.170892,0.67227,0.529699,1.611576,-0.078078,...,-0.290343,0.060858,-0.361332,-0.088612,0.011306,-0.111014,-0.368961,-0.565793,0.169746,0.012113
4,0.63947,0.084219,0.257235,0.383435,0.224725,0.174844,1.876852,0.451996,0.84066,0.401875,...,-0.247435,0.17755,-2.627525,-0.385282,0.211791,-0.136762,0.494599,-0.429638,0.069512,0.186496
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
21759,1.145533,-0.120977,1.107511,-0.054432,-0.035677,0.441035,1.938828,-0.197387,-0.713837,0.070168,...,1.419665,0.193706,-4.712016,-0.327759,0.340546,0.570977,1.002522,-0.390908,-0.072698,0.451768
21760,0.923167,-0.147661,0.740758,-0.026015,0.063978,0.479314,1.615127,-0.07971,-0.492357,0.115354,...,0.419213,0.104511,-1.984352,-0.325976,0.192832,0.263424,-1.110307,0.148901,0.111535,0.216366
21761,0.145882,-0.117167,0.13604,-0.048419,0.023393,0.247763,1.284126,0.142481,-0.1075,-0.017001,...,-0.381434,-0.036894,-0.593367,-0.194915,-0.054167,0.151622,-1.442088,0.232574,0.172297,-0.066002
21762,-0.083794,-0.059522,-0.007396,0.05835,-0.144096,-0.021163,1.422818,0.097488,0.132928,-0.230144,...,-1.038207,-0.0863,-0.180394,-0.15823,-0.090911,0.202125,-1.158284,0.054619,0.161161,-0.276968
