In [72]:
# Importing packages

import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer

In [73]:
# Loading data and converting to panda dataframe (and changing/defining the names)

data_file = '/Users/christianstenbro/Programming/ripley_project/Data_files/original_paper_data_frame_cleaned_v1.csv'

data = pd.read_csv(data_file, sep=',', names=['ID', 'bib', 'abstract', 'rep_score'])
data.head()

Unnamed: 0,ID,bib,abstract,rep_score
0,original_paper_ID,original_paper_bib,abstract,replication_score
1,O1,"Vitevitch, M. S. and Stamer, M. K. 2006. The c...",In previous studies in English examining the i...,yes
2,O2,"Cutler, A. , Mehler, J. , Noh, D. , and Seguí,...",Infants acquire whatever language is spoken in...,partial
3,O2,"Seguí, J. , 1986. The syllable's differing rol...",Speech segmentation procedures may differ in s...,partial
4,O3,"Braun , B. , & Tagliapietra , L. 2009 The role...",Sentences with a contrastive intonation contou...,yes


In [74]:
# Tfidf vectorizing the text and saving in a (sparse?) matrix

tfidf = TfidfVectorizer()
matrix = tfidf.fit_transform(data['abstract'].values.astype('U')) # the text needs to be converted to unicode strings (see https://stackoverflow.com/questions/39303912/tfidfvectorizer-in-scikit-learn-valueerror-np-nan-is-an-invalid-document)
matrix.shape

(98, 2595)

In [75]:
# Checking the vocabulary
tfidf.vocabulary_ # is the number the overall corpus frequency associated with each word?

{'abstract': 79,
 'in': 1151,
 'previous': 1786,
 'studies': 2251,
 'english': 795,
 'examining': 837,
 'the': 2342,
 'influence': 1187,
 'of': 1585,
 'phonological': 1704,
 'neighbourhood': 1527,
 'density': 643,
 'spoken': 2195,
 'word': 2566,
 'production': 1817,
 'words': 2567,
 'with': 2562,
 'many': 1397,
 'similar': 2133,
 'sounding': 2174,
 'or': 1614,
 'dense': 642,
 'were': 2540,
 'produced': 1815,
 'more': 1479,
 'quickly': 1865,
 'and': 185,
 'accurately': 97,
 'than': 2340,
 'few': 931,
 'sparse': 2179,
 'on': 1594,
 'process': 1809,
 'spanish': 2178,
 'was': 2528,
 'examined': 835,
 'picture': 1715,
 'naming': 1507,
 'task': 2320,
 'results': 2005,
 'showed': 2119,
 'that': 2341,
 'pictures': 1716,
 'names': 1506,
 'from': 990,
 'neighbourhoods': 1528,
 'named': 1504,
 'present': 1776,
 'pattern': 1668,
 'is': 1257,
 'opposite': 1611,
 'what': 2542,
 'has': 1051,
 'been': 309,
 'previously': 1787,
 'found': 975,
 'speech': 2188,
 'we': 2532,
 'hypothesise': 1105,
 'differ

In [76]:
row = 5
col = tfidf.vocabulary_['syllabifying'] # how is the tfidf matrix structured?

print('Abstract: "%s"' % data.loc[row, 'ID'])
print('TF-IDF score: %f' % matrix[row, col])

print(matrix)
matrix.shape

print("Column number: ", col)

# So clearly, the data is there now . . . we just need to extract the right data in the matrix to do the next step

Abstract: "O4"
TF-IDF score: 0.103853
  (0, 79)	1.0
  (1, 1989)	0.07840805166052696
  (1, 2029)	0.07840805166052696
  (1, 150)	0.053784483766245235
  (1, 197)	0.07840805166052696
  (1, 33)	0.07840805166052696
  (1, 1912)	0.07085446474511206
  (1, 608)	0.07840805166052696
  (1, 1851)	0.07840805166052696
  (1, 1293)	0.05956249563789154
  (1, 2419)	0.036964566106560875
  (1, 1576)	0.057941522623005705
  (1, 1812)	0.04150310848436469
  (1, 556)	0.08376740686721841
  (1, 1413)	0.055069782157893064
  (1, 1581)	0.07425101280376649
  (1, 2375)	0.040810025213235276
  (1, 2329)	0.09132099378263332
  (1, 1529)	0.09132099378263332
  (1, 2545)	0.06330087782969718
  (1, 1366)	0.09132099378263332
  (1, 1487)	0.07840805166052696
  (1, 678)	0.1309902190768412
  (1, 1105)	0.09132099378263332
  (1, 2532)	0.034603253120371824
  :	:
  (97, 1595)	0.17374025638184107
  (97, 2530)	0.24392810360149558
  (97, 1557)	0.02990264013947055
  (97, 2357)	0.04121770025030963
  (97, 1236)	0.05007834638626516
  (97, 2419

In [77]:
matrix.shape

(98, 2595)

## ====== pt. 2: importing the word2vec model and working with the dictionary file from the tf-idf =======


In [78]:
# Importing the word2vec model as dataframe:

model = '/Users/christianstenbro/AU/Applied_Cognitive_Science/Rep_rep_project/prediction_models/data and code/mag_200d_psy_eco_word2vec'

model = pd.read_csv(model, sep=' ', skiprows = 1, header=None)

print(model)

             0         1         2         3         4         5         6     
0            the -0.242396  0.136115  0.176884 -0.106608 -0.111945  0.008947  \
1             of -0.191138  0.168684  0.132903 -0.099730 -0.113432 -0.022978   
2            and -0.216545  0.155390  0.071324 -0.070429 -0.159644 -0.038712   
3             in -0.227537  0.125394  0.097702 -0.155055 -0.133572  0.052193   
4             to -0.257741  0.174983  0.165620 -0.139889 -0.098801 -0.049469   
...          ...       ...       ...       ...       ...       ...       ...   
275556  workover  0.394763 -0.138997 -0.243322 -0.157992 -0.094953  0.127140   
275557  condotel -0.091486  0.510353  0.115290 -0.514122 -0.335069  0.128298   
275558    kuntey -0.232415  0.054326  0.415971 -0.029960  0.161247  0.246369   
275559     houga  0.211834 -0.092607 -0.048265  0.064677  0.064417 -0.145476   
275560    gp-stn -0.207188  0.077137  0.205872 -0.182772  0.243247  0.688170   

             7         8         9    .

In [79]:
print(model.iloc[:,0]) # this is the column of works

0              the
1               of
2              and
3               in
4               to
            ...   
275556    workover
275557    condotel
275558      kuntey
275559       houga
275560      gp-stn
Name: 0, Length: 275561, dtype: object


In [80]:
# Defining dictionary as a data frame

# First, use dict.items() to get a group of the key-value pairs in the dictionary:

items = dict.items(tfidf.vocabulary_)

# Then, having this group as an object, use list(obj) to convert it to a list:

items = list(items)

# Finally, using this list as data, call numpy.array(data) to convert it to an array. But actually I want it to be a core pd data frame:

dict_df = pd.DataFrame(items)

dict_df.head()


Unnamed: 0,0,1
0,abstract,79
1,in,1151
2,previous,1786
3,studies,2251
4,english,795


In [81]:
# Renaming columns:

dict_df.columns = ['keys', 'values']

print(dict_df)


             keys  values
0        abstract      79
1              in    1151
2        previous    1786
3         studies    2251
4         english     795
...           ...     ...
2590   phonologic    1703
2591          yes    2584
2592         _obe      67
2593  constitutes     527
2594   misleading    1453

[2595 rows x 2 columns]


In [82]:
# . . . and flipping the order:

column_titles = ['values', 'keys']
dict_df = dict_df.reindex(columns = column_titles)
print(dict_df)

      values         keys
0         79     abstract
1       1151           in
2       1786     previous
3       2251      studies
4        795      english
...      ...          ...
2590    1703   phonologic
2591    2584          yes
2592      67         _obe
2593     527  constitutes
2594    1453   misleading

[2595 rows x 2 columns]


In [83]:
# Now we can index from the dataframe:

dict_df['keys'] # either extracting the entire column
dict_df['keys'][4] # or a single entry in a column
dict_df[['keys', 'values']] # how would I extract and entire row with data from both columns though?

# Link to indexing tips in Python: https://www.dataquest.io/blog/tutorial-indexing-dataframes-in-pandas/

Unnamed: 0,keys,values
0,abstract,79
1,in,1151
2,previous,1786
3,studies,2251
4,english,795
...,...,...
2590,phonologic,1703
2591,yes,2584
2592,_obe,67
2593,constitutes,527


In [84]:
# Examining some attribrutes of the data:

print(dict_df.columns)
print(dict_df.index) # this is useful knowledge when wanting to construct the for loop


Index(['values', 'keys'], dtype='object')
RangeIndex(start=0, stop=2595, step=1)


The next task is to remove the words from the word2vec model (model) that do not appear in the dictionary (dict_df) and create a new model data-frame.

- One way to do this is by making a for-loop which goes through each row of our model dataframe, checking if there is a match with any of the words in the dictionaries. 

- If there is a match, the **word and its associated vector of dimensional values** should be added to a new two-column data-frame. 

- Ultimately, this data-frame will be as long as the dictionary, which should mean that it will be commensurable with the tf-idf matrix (elaborate).

> Make a plan for how to concretely design the for loop. Remember that we need to loop through the word2vec model for each word entry (row) in our dict_df!

In [85]:
dict_df.shape
model.shape

model.iloc[:,0]

0              the
1               of
2              and
3               in
4               to
            ...   
275556    workover
275557    condotel
275558      kuntey
275559       houga
275560      gp-stn
Name: 0, Length: 275561, dtype: object

In [86]:
# Trying out a for loop to extract terms from the model: 

# data 1 = dict_df 
# data 2 = model

# First we create an empty data frame to store the matched rows from the model
matched_df = pd.DataFrame(columns = model.columns)

# Then we create an empty dataframe to store the missing matches
missing_matches = pd.DataFrame(columns=['Term', 'Value'])

# We iterate through each term in the model:
for i, term in enumerate(dict_df['keys']):
    # Checking if the terms exists in the dictionary
    if term in model.iloc[:,0].values:
        # If there is a match, we extract the entire row from the model and add it to the matched_df
        row = model.loc[model.iloc[:,0] == term] # is this extracting the row?
        row.index = [i]
        matched_df = pd.concat([matched_df, row]) # how does this line work?
    else:
        # If no match is found, add the term to missing_matches
        missing_matches.loc[len(missing_matches)] = [term, dict_df.iloc[i, 0]]

matched_df = matched_df.sort_index()    

# Print missing matches
print("Missing Matches:")
print(missing_matches)

# Print the matched data frame
print("Matched Data Frame:")
print(matched_df)

Missing Matches:
                   Term  Value
0          syllabifying   2297
1              grosjean   1037
2       whethersubjects   2549
3              thiscase   2360
4             wordswere   2568
..                  ...    ...
80                 _eap     63
81  feedbackconsistency    923
82                  _ip     65
83                  _ob     66
84                 _obe     67

[85 rows x 2 columns]
Matched Data Frame:
              0         1         2         3         4         5         6     
0        abstract -0.164419  0.233158  0.253387 -0.060318  0.008594 -0.342084  \
1              in -0.227537  0.125394  0.097702 -0.155055 -0.133572  0.052193   
2        previous -0.346380  0.006304  0.305970 -0.139675 -0.182512  0.049250   
3         studies -0.249250  0.010476  0.088974 -0.260289 -0.054414 -0.030271   
4         english -0.157916  0.097971  0.137283 -0.171298  0.103821  0.091728   
...           ...       ...       ...       ...       ...       ...       ...   
2

In [87]:
dict_df.iloc[:, 0]

0         79
1       1151
2       1786
3       2251
4        795
        ... 
2590    1703
2591    2584
2592      67
2593     527
2594    1453
Name: values, Length: 2595, dtype: int64

In [88]:
matched_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,191,192,193,194,195,196,197,198,199,200
0,abstract,-0.164419,0.233158,0.253387,-0.060318,0.008594,-0.342084,0.072384,-0.158849,-0.076478,...,0.127368,0.031234,-0.28687,0.123983,0.043852,0.073327,-0.301613,0.008358,-0.154836,-0.144531
1,in,-0.227537,0.125394,0.097702,-0.155055,-0.133572,0.052193,0.143991,-0.084266,0.066472,...,0.125955,0.08906,-0.21787,-0.011854,-0.141282,0.020798,-0.082806,-0.056006,-0.044313,-0.163796
2,previous,-0.34638,0.006304,0.30597,-0.139675,-0.182512,0.04925,0.182775,-0.317705,0.238295,...,0.125932,-0.114897,-0.025431,0.251213,-0.250936,-0.024032,0.100205,0.017044,0.066197,0.045374
3,studies,-0.24925,0.010476,0.088974,-0.260289,-0.054414,-0.030271,0.194103,-0.325225,0.160901,...,0.087359,-0.13002,-0.146982,0.124166,-0.426162,-0.074545,0.017245,-0.096232,-0.001212,-0.059817
4,english,-0.157916,0.097971,0.137283,-0.171298,0.103821,0.091728,0.227996,-0.237087,0.038814,...,0.054863,0.129875,-0.239072,0.025619,0.082328,0.073301,0.082712,0.122293,0.14735,-0.081697


In [89]:
dict_df.head()

Unnamed: 0,values,keys
0,79,abstract
1,1151,in
2,1786,previous
3,2251,studies
4,795,english


## ========== next step: remove missing words from TF_IDF matrix ========

In [145]:
# Checking some dimensions of the objects of interest:

print("Missing matches list dimensions: ", missing_matches.shape)
print("Dimensions of the new model data frame: ", matched_df.shape)
print("Dimensions of the TF-IDF matrix: ", matrix.shape)

df = matched_df.iloc[:, 1:]
df = df.reset_index(drop = True)
df.columns = range(200)

print(df)


Missing matches list dimensions:  (85, 2)
Dimensions of the new model data frame:  (2510, 201)
Dimensions of the TF-IDF matrix:  (98, 2595)
           0         1         2         3         4         5         6     
0    -0.164419  0.233158  0.253387 -0.060318  0.008594 -0.342084  0.072384  \
1    -0.227537  0.125394  0.097702 -0.155055 -0.133572  0.052193  0.143991   
2    -0.346380  0.006304  0.305970 -0.139675 -0.182512  0.049250  0.182775   
3    -0.249250  0.010476  0.088974 -0.260289 -0.054414 -0.030271  0.194103   
4    -0.157916  0.097971  0.137283 -0.171298  0.103821  0.091728  0.227996   
...        ...       ...       ...       ...       ...       ...       ...   
2505 -0.021529 -0.040331  0.093697  0.071710  0.295056 -0.512503  0.023171   
2506  0.166119  0.011583  0.617321  0.059052 -0.167412  0.449738  0.174121   
2507 -0.035224 -0.060090  0.592755 -0.317422 -0.246039  0.318855  0.028493   
2508 -0.191645  0.074492  0.268575 -0.336695 -0.071241 -0.189615  0.052777   
25

In [91]:
print(missing_matches)
# print(missing_matches['Index'])

print(dict_df.iloc[164, :])


                   Term  Value
0          syllabifying   2297
1              grosjean   1037
2       whethersubjects   2549
3              thiscase   2360
4             wordswere   2568
..                  ...    ...
80                 _eap     63
81  feedbackconsistency    923
82                  _ip     65
83                  _ob     66
84                 _obe     67

[85 rows x 2 columns]
values            2297
keys      syllabifying
Name: 164, dtype: object


In [92]:
# Saving the matching_df as csv and sending to emil

from pathlib import Path  
filepath = Path('/Users/christianstenbro/Programming/ripley_project/Data_files/matching_df.csv')  
filepath.parent.mkdir(parents=True, exist_ok=True)  
matched_df.to_csv(filepath)

Reflection pad:

- We want to modify the for loop creating the matched_df to also output the index for the missing matches

- Then, we can use this information to remove the corresponding columns from the TF-IDF

**FINALLY** we are ready to somehow (?) multiply the two matrices. 

- We want the output to have one number for each abstract - that's all!

In [93]:
missing_matches['Value']

0     2297
1     1037
2     2549
3     2360
4     2568
      ... 
80      63
81     923
82      65
83      66
84      67
Name: Value, Length: 85, dtype: int64

In [98]:
# Converting the TF-IDF matrix to panda pd

tf_idf_df = pd.DataFrame.sparse.from_spmatrix(matrix)

#print(tf_idf_df.shape)

# Removing the columns contained in the missing_matches['Value']

# Converting missing_matches['Value']

type(missing_matches['Value'])

missing_matches_list = list(missing_matches['Value'])
print(missing_matches_list)

# Sort tf-idf matrix and remove the columns corresponding to the values contained in the missing_matches list:

tf_idf_matched_df = tf_idf_df[tf_idf_df.columns[~tf_idf_df.columns.isin(missing_matches_list)]]


[2297, 1037, 2549, 2360, 2568, 411, 2373, 1762, 1079, 1858, 1666, 34, 53, 62, 1702, 929, 2296, 1963, 1266, 2478, 1264, 2515, 1495, 800, 1484, 1452, 1341, 2202, 235, 237, 973, 2586, 2495, 2383, 658, 904, 1087, 1625, 2491, 1588, 758, 757, 2492, 756, 161, 322, 1101, 1102, 1508, 2263, 2452, 2059, 1867, 2198, 1078, 1294, 880, 2498, 1323, 1545, 1597, 2343, 346, 2077, 2197, 1027, 344, 604, 726, 2196, 1671, 2387, 1342, 2235, 2236, 1100, 1099, 1643, 1586, 64, 63, 923, 65, 66, 67]


<bound method NDFrame.head of            1         2         3         4         5         6         7     
0    -0.164419  0.233158  0.253387 -0.060318  0.008594 -0.342084  0.072384  \
1    -0.227537  0.125394  0.097702 -0.155055 -0.133572  0.052193  0.143991   
2    -0.346380  0.006304  0.305970 -0.139675 -0.182512  0.049250  0.182775   
3    -0.249250  0.010476  0.088974 -0.260289 -0.054414 -0.030271  0.194103   
4    -0.157916  0.097971  0.137283 -0.171298  0.103821  0.091728  0.227996   
...        ...       ...       ...       ...       ...       ...       ...   
2589 -0.021529 -0.040331  0.093697  0.071710  0.295056 -0.512503  0.023171   
2590  0.166119  0.011583  0.617321  0.059052 -0.167412  0.449738  0.174121   
2591 -0.035224 -0.060090  0.592755 -0.317422 -0.246039  0.318855  0.028493   
2593 -0.191645  0.074492  0.268575 -0.336695 -0.071241 -0.189615  0.052777   
2594 -0.356981 -0.131151  0.376909 -0.188344 -0.202470  0.300605  0.007481   

           8         9         10

In [95]:
tf_idf_matched_df.shape

#print(tf_idf_matched_df)

# Checking some dimensions of the objects of interest:

print("Dimensions of the new model data frame: ", matched_df.shape)
print("Dimensions of the new TF-IDF matrix: ", tf_idf_matched_df.shape)

type(matched_df.shape)


Dimensions of the new model data frame:  (2510, 201)
Dimensions of the new TF-IDF matrix:  (98, 2510)


tuple

In [None]:
# Making the matrices allign

# Removing word column from matched dict file

matched_df_removed_col = matched_df.iloc[: , 1:]

matched_df_removed_col.head

In [118]:
# Re-index the dataframes to start with 0:

tf_idf_matched_df_reindex = tf_idf_matched_df.reset_index(drop = True)
matched_df_reindex = matched_df_removed_col.reset_index(drop = True)


In [134]:

matched_df_removed_col.reindex(columns=range(len(matched_df_removed_col.columns)))


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,190,191,192,193,194,195,196,197,198,199
0,,-0.164419,0.233158,0.253387,-0.060318,0.008594,-0.342084,0.072384,-0.158849,-0.076478,...,-0.045325,0.127368,0.031234,-0.286870,0.123983,0.043852,0.073327,-0.301613,0.008358,-0.154836
1,,-0.227537,0.125394,0.097702,-0.155055,-0.133572,0.052193,0.143991,-0.084266,0.066472,...,0.006355,0.125955,0.089060,-0.217870,-0.011854,-0.141282,0.020798,-0.082806,-0.056006,-0.044313
2,,-0.346380,0.006304,0.305970,-0.139675,-0.182512,0.049250,0.182775,-0.317705,0.238295,...,-0.189315,0.125932,-0.114897,-0.025431,0.251213,-0.250936,-0.024032,0.100205,0.017044,0.066197
3,,-0.249250,0.010476,0.088974,-0.260289,-0.054414,-0.030271,0.194103,-0.325225,0.160901,...,-0.150480,0.087359,-0.130020,-0.146982,0.124166,-0.426162,-0.074545,0.017245,-0.096232,-0.001212
4,,-0.157916,0.097971,0.137283,-0.171298,0.103821,0.091728,0.227996,-0.237087,0.038814,...,0.091245,0.054863,0.129875,-0.239072,0.025619,0.082328,0.073301,0.082712,0.122293,0.147350
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2589,,-0.021529,-0.040331,0.093697,0.071710,0.295056,-0.512503,0.023171,0.033605,0.056809,...,0.176993,0.131217,0.100842,-0.334557,0.247142,0.097800,-0.080029,-0.135338,-0.150630,0.090382
2590,,0.166119,0.011583,0.617321,0.059052,-0.167412,0.449738,0.174121,0.070267,0.134964,...,-0.155802,-0.254067,-0.242696,0.105087,0.125844,-0.480237,-0.319941,-0.040487,-0.627847,-0.033238
2591,,-0.035224,-0.060090,0.592755,-0.317422,-0.246039,0.318855,0.028493,0.371991,0.457179,...,0.017718,-0.386318,0.015517,0.234608,0.019007,-0.049388,-0.146896,0.577466,0.029020,0.323428
2593,,-0.191645,0.074492,0.268575,-0.336695,-0.071241,-0.189615,0.052777,-0.034520,-0.013880,...,-0.016186,0.228042,0.233316,-0.016832,0.101470,-0.230295,0.158561,-0.253239,-0.000294,0.102868


In [119]:
print(tf_idf_matched_df_reindex.head)
print(matched_df_removed_col.head)

<bound method NDFrame.head of     0     1     2     3     4     5     6     7     8     9     ...      2584   
0    0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0  ...  0.000000  \
1    0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0  ...  0.000000   
2    0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0  ...  0.000000   
3    0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0  ...  0.000000   
4    0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0  ...  0.000000   
..   ...   ...   ...   ...   ...   ...   ...   ...   ...   ...  ...       ...   
93   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0  ...  0.000000   
94   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0  ...  0.000000   
95   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0  ...  0.000000   
96   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0  ...  0.000000   
97   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0  ...  0.085028  

The problem could occur because of different indices. How can we check this?

In [150]:
df@tf_idf_matched_df

ValueError: matrices are not aligned