In [56]:
# Importing packages

import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer 
from sklearn.feature_extraction.text import CountVectorizer

In [108]:
# Loading data and converting to panda dataframe (and changing/defining the names)

data_file = '/Users/christianstenbro/Programming/ripley_project/Data_files/original_paper_data_frame_cleaned_v1.csv'

data = pd.read_csv(data_file, sep=',', names=['ID', 'bib', 'abstract', 'rep_score'])
data.head()

Unnamed: 0,ID,bib,abstract,rep_score
0,original_paper_ID,original_paper_bib,abstract,replication_score
1,O1,"Vitevitch, M. S. and Stamer, M. K. 2006. The c...",In previous studies in English examining the i...,yes
2,O2,"Cutler, A. , Mehler, J. , Noh, D. , and Seguí,...",Infants acquire whatever language is spoken in...,partial
3,O2,"Seguí, J. , 1986. The syllable's differing rol...",Speech segmentation procedures may differ in s...,partial
4,O3,"Braun , B. , & Tagliapietra , L. 2009 The role...",Sentences with a contrastive intonation contou...,yes


In [4]:
# Tfidf vectorizing the text and saving in a (sparse?) matrix

tfidf = TfidfVectorizer()
matrix = tfidf.fit_transform(data['abstract'].values.astype('U')) # the text needs to be converted to unicode strings (see https://stackoverflow.com/questions/39303912/tfidfvectorizer-in-scikit-learn-valueerror-np-nan-is-an-invalid-document)
matrix.shape

(98, 2595)

In [None]:
# Checking the vocabulary
tfidf.vocabulary_ # is the number the overall corpus frequency associated with each word?

In [None]:
row = 5
col = tfidf.vocabulary_['syllabifying'] # how is the tfidf matrix structured?

print('Abstract: "%s"' % data.loc[row, 'ID'])
print('TF-IDF score: %f' % matrix[row, col])

print(matrix)
matrix.shape

print("Column number: ", col)

# So clearly, the data is there now . . . we just need to extract the right data in the matrix to do the next step

In [7]:
matrix.shape

(98, 2595)

## ====== pt. 2: importing the word2vec model and working with the dictionary file from the tf-idf =======


In [None]:
# Importing the word2vec model as dataframe:

model = '/Users/christianstenbro/AU/Applied_Cognitive_Science/Rep_rep_project/prediction_models/data and code/mag_200d_psy_eco_word2vec'

model = pd.read_csv(model, sep=' ', skiprows = 1, header=None)

print(model)

In [None]:
print(model.iloc[:,0]) # this is the column of works

In [None]:
# Defining dictionary as a data frame

# First, use dict.items() to get a group of the key-value pairs in the dictionary:

items = dict.items(tfidf.vocabulary_)

# Then, having this group as an object, use list(obj) to convert it to a list:

items = list(items)

# Finally, using this list as data, call numpy.array(data) to convert it to an array. But actually I want it to be a core pd data frame:

dict_df = pd.DataFrame(items)

print(dict_df)

print(type(dict_df))


In [None]:
# Renaming columns:

dict_df.columns = ['keys', 'values']

print(dict_df)


In [None]:
# . . . and flipping the order:

column_titles = ['values', 'keys']
dict_df = dict_df.reindex(columns = column_titles)
print(dict_df)

In [None]:
# Now we can index from the dataframe:

dict_df['keys'] # either extracting the entire column
dict_df['keys'][4] # or a single entry in a column
dict_df[['keys', 'values']] # how would I extract and entire row with data from both columns though?

# Link to indexing tips in Python: https://www.dataquest.io/blog/tutorial-indexing-dataframes-in-pandas/

In [None]:
# Examining some attribrutes of the data:

print(dict_df.columns)
print(dict_df.index) # this is useful knowledge when wanting to construct the for loop


The next task is to remove the words from the word2vec model (model) that do not appear in the dictionary (dict_df) and create a new model data-frame.

- One way to do this is by making a for-loop which goes through each row of our model dataframe, checking if there is a match with any of the words in the dictionaries. 

- If there is a match, the **word and its associated vector of dimensional values** should be added to a new two-column data-frame. 

- Ultimately, this data-frame will be as long as the dictionary, which should mean that it will be commensurable with the tf-idf matrix (elaborate).

> Make a plan for how to concretely design the for loop. Remember that we need to loop through the word2vec model for each word entry (row) in our dict_df!

In [None]:
dict_df.shape
model.shape

model.iloc[:,0]

In [None]:
# Trying out a for loop to extract terms from the model: 

# data 1 = dict_df 
# data 2 = model

# First we create an empty data frame to store the matched rows from the model
matched_df = pd.DataFrame(columns = model.columns)

# Then we create an empty dataframe to store the missing matches
missing_matches = pd.DataFrame(columns=['Term', 'Value'])

# We iterate through each term in the model:
for i, term in enumerate(dict_df['keys']):
    # Checking if the terms exists in the dictionary
    if term in model.iloc[:,0].values:
        # If there is a match, we extract the entire row from the model and add it to the matched_df
        row = model.loc[model.iloc[:,0] == term] # is this extracting the row?
        row.index = [i]
        matched_df = pd.concat([matched_df, row]) # how does this line work?
    else:
        # If no match is found, add the term to missing_matches
        missing_matches.loc[len(missing_matches)] = [term, dict_df.iloc[i, 0]]

matched_df = matched_df.sort_index()    

# Print missing matches
print("Missing Matches:")
print(missing_matches)

# Print the matched data frame
print("Matched Data Frame:")
print(matched_df)

type(matched_df)

In [None]:
dict_df.iloc[:, 0]

In [None]:
matched_df.head()

In [None]:
dict_df.head()

## ========== next step: remove missing words from TF_IDF matrix ========

In [20]:
# Checking some dimensions of the objects of interest:

print("Missing matches list dimensions: ", missing_matches.shape)
print("Dimensions of the new model data frame: ", matched_df.shape)
print("Dimensions of the TF-IDF matrix: ", matrix.shape)

Missing matches list dimensions:  (85, 2)
Dimensions of the new model data frame:  (2510, 201)
Dimensions of the TF-IDF matrix:  (98, 2595)


In [21]:
print(missing_matches)
# print(missing_matches['Index'])

print(dict_df.iloc[164, :])

                   Term  Value
0          syllabifying   2297
1              grosjean   1037
2       whethersubjects   2549
3              thiscase   2360
4             wordswere   2568
..                  ...    ...
80                 _eap     63
81  feedbackconsistency    923
82                  _ip     65
83                  _ob     66
84                 _obe     67

[85 rows x 2 columns]
values            2297
keys      syllabifying
Name: 164, dtype: object


In [22]:
# Saving the matching_df as csv and sending to emil

from pathlib import Path  
filepath = Path('/Users/christianstenbro/Programming/ripley_project/Data_files/matching_df.csv')  
filepath.parent.mkdir(parents=True, exist_ok=True)  
matched_df.to_csv(filepath)

Reflection pad:

- We want to modify the for loop creating the matched_df to also output the index for the missing matches

- Then, we can use this information to remove the corresponding columns from the TF-IDF

**FINALLY** we are ready to somehow (?) multiply the two matrices. 

- We want the output to have one number for each abstract - that's all!

In [23]:
missing_matches['Value']

0     2297
1     1037
2     2549
3     2360
4     2568
      ... 
80      63
81     923
82      65
83      66
84      67
Name: Value, Length: 85, dtype: int64

In [42]:
# Converting the TF-IDF matrix to panda pd

tf_idf_df = pd.DataFrame.sparse.from_spmatrix(matrix)

#print(tf_idf_df.shape)

# Removing the columns contained in the missing_matches['Value']

# Converting missing_matches['Value']

type(missing_matches['Value'])

missing_matches_list = list(missing_matches['Value'])
print(missing_matches_list)

# Sort tf-idf matrix and remove the columns corresponding to the values contained in the missing_matches list:

tf_idf_matched_df = tf_idf_df[tf_idf_df.columns[~tf_idf_df.columns.isin(missing_matches_list)]]


[2297, 1037, 2549, 2360, 2568, 411, 2373, 1762, 1079, 1858, 1666, 34, 53, 62, 1702, 929, 2296, 1963, 1266, 2478, 1264, 2515, 1495, 800, 1484, 1452, 1341, 2202, 235, 237, 973, 2586, 2495, 2383, 658, 904, 1087, 1625, 2491, 1588, 758, 757, 2492, 756, 161, 322, 1101, 1102, 1508, 2263, 2452, 2059, 1867, 2198, 1078, 1294, 880, 2498, 1323, 1545, 1597, 2343, 346, 2077, 2197, 1027, 344, 604, 726, 2196, 1671, 2387, 1342, 2235, 2236, 1100, 1099, 1643, 1586, 64, 63, 923, 65, 66, 67]


In [43]:
type(tf_idf_df)

pandas.core.frame.DataFrame

# ===== Reindexing and alligning matrices ======

In [None]:
# Reindexing the 'matched' word2vec dataframe

word2vec_rindx = matched_df.iloc[:, 1:] # Removing the first column (with the actual words)
word2vec_rindx = word2vec_rindx.reset_index(drop = True) # Reindexing
word2vec_rindx.columns = range(len(word2vec_rindx.columns)) # . . . and changing the index range to 0-199

print(word2vec_rindx)
type(word2vec_rindx)

In [None]:
# Doing the same for the tf_idf.matched dataframe:

tf_idf_rindx = tf_idf_matched_df.reset_index(drop = True) # Reindexing
tf_idf_rindx.reset_index(drop = True, inplace = True) # . . . and changing the index range to 0-199

print(tf_idf_rindx)
type(tf_idf_rindx)

In [47]:
# Checking type and dimensions of our two dataframes:

print("Dimensions of the TF-IDF dataframe: ", tf_idf_rindx.shape)
print("Type of TF-IDF dataframe: ", type(tf_idf_rindx))
print(" ")
print("Dimensions of the word2vec dataframe: ", word2vec_rindx.shape)
print("Type of word2vec dataframe: ", type(word2vec_rindx))


Dimensions of the TF-IDF dataframe:  (98, 2510)
Type of TF-IDF dataframe:  <class 'pandas.core.frame.DataFrame'>
 
Dimensions of the word2vec dataframe:  (2510, 200)
Type of word2vec dataframe:  <class 'pandas.core.frame.DataFrame'>


# ======= Multiplying the matrices: TF-IDF x Word2Vec ======

In [52]:
# For matrices to be commensurable in a multiplication: 
# X * Y, the number of columns of X needs to correspond to the number of rows in Y

# Fortunately, the number of colums in the TF-IDF dataframe (98, 2510) is the same as the
# as the number of rows in the word2vec dataframe (2510, 200). 

# Mathematically, it should not be a problem to multiply the two:

#product = tf_idf_rindx.dot(word2vec_rindx) 
# This does not work. We can try to use numpy?

print(type(tf_idf_rindx))

tf_idf_rindx_np = tf_idf_rindx.to_numpy() # Remember this parenthesis!

print(type(tf_idf_rindx_np))

word2vec_rindx_np = word2vec_rindx.to_numpy()

print(type(word2vec_rindx_np))


<class 'pandas.core.frame.DataFrame'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>


In [104]:
# Re-attempting the multiplication with numpy methods: 

tf_idf_w2v_product = np.matmul(tf_idf_rindx_np, word2vec_rindx_np)

print("Product shape: ", tf_idf_w2v_product.shape, "Product type: ", type(tf_idf_w2v_product))

Product shape:  (98, 200) Product type:  <class 'numpy.ndarray'>


# ====== Status update =======

1. Create a similar matrix, using tf instead of tf_idf:

    - Everything should be the same, only the input matrix should contain pure tf's instead of tf-idf's
    
    - We can reuse the script from pt. 1 and change one command?

In [80]:
# 1. Computing term frequencies:

# data_file = '/Users/christianstenbro/Programming/ripley_project/Data_files/original_paper_data_frame_cleaned_v1.csv'
# data = pd.read_csv(data_file, sep=',', names=['ID', 'bib', 'abstract', 'rep_score'])
# data.head()

vectorizer = CountVectorizer() # The CountVectorizer converts a text corpus (here our collection of abstracts in 'data) to a matrix of 'token counts' = term frequencies (tf).
matrix_tf = vectorizer.fit_transform(data['abstract'].values.astype('U'))

print("TF-matrix dimensions: ", matrix_tf.shape, "TF-matrix object type: ", type(matrix_tf))

TF-matrix dimensions:  (98, 2595) TF-matrix object type:  <class 'scipy.sparse._csr.csr_matrix'>


In [90]:
vectorizer.vocabulary_ # And this matches the vocab of the tf_idf matrix:

print("Are the vocabularies of the tf and tfidf matrices identical? Output:", vectorizer.vocabulary_ == tfidf.vocabulary_)

Are the vocabularies of the tf and tfidf matrices identical? Output: True


This means that we can use the exact same methods to process the tf-matrix as we already used for the tf-idf matrix!

In [94]:
# We already have the missing matches list from previously:

missing_matches_list # The column index for the terms in the abstract matrix not present in the word2vec model. 
# These will be extracted from the tf-matrix as well to make it commensurable with the word2vec model matrix:

# First, we convert the tf-matrix from a sparse matrix to a pd.DataFrame:

tf_df = pd.DataFrame.sparse.from_spmatrix(matrix_tf)

# Then we remove the columns correponding to the missing words in the word2vec model:

tf_matched_df = tf_df[tf_df.columns[~tf_df.columns.isin(missing_matches_list)]]

print("Dimensions of tf_matched_df: ", tf_matched_df.shape)

Dimensions of tf_matched_df:  (98, 2510)


In [97]:
# Finally, we reindex the dataframe and convert it to a numpy array:

tf_rindx = tf_matched_df.reset_index(drop = True) # Reindexing
tf_rindx.reset_index(drop = True, inplace = True) # . . . and changing the index range to 0-199

# print(tf_rindx) # One wonders if it actually contains any numbers?
# type(tf_rindx)

tf_rindx_np = tf_rindx.to_numpy()

In [107]:
# Again, we can check if the matrices are commensurable: 

print("Dimensions of the TF dataframe: ", tf_rindx_np.shape)
print("Type of TF dataframe: ", type(tf_rindx_np))
print(" ")
print("Dimensions of the word2vec dataframe: ", word2vec_rindx_np.shape)
print("Type of word2vec dataframe: ", type(word2vec_rindx_np))
print(" ")
print("Are the matrices commensurable? Output:", len(tf_rindx_np[0]) == len(word2vec_rindx_np))

# print(len(tf_rindx_np[0])) # Strange way of finding the column and row numbers, at least compared to R. 
# print(len(word2vec_rindx_np))

# Notice that 'array[0]' is the second dimension of the array while (columns), while 'array' is simply the first dimension (rows).

Dimensions of the TF dataframe:  (98, 2510)
Type of TF dataframe:  <class 'numpy.ndarray'>
 
Dimensions of the word2vec dataframe:  (2510, 200)
Type of word2vec dataframe:  <class 'numpy.ndarray'>
 
Are the matrices commensurable? Output: True


# ====== Multiplying matrices: TF x Word2Vec ========

In [105]:
# With everything in place, we can multiply the matrices using the np.matmul function: 

tf_w2v_product = np.matmul(tf_rindx_np, word2vec_rindx_np)

print("Product shape: ", tf_w2v_product.shape, "Product type: ", type(tf_w2v_product))

Product shape:  (98, 200) Product type:  <class 'numpy.ndarray'>


Finally, we have our two matrices.

But how do we actually conceptualize what these matrices consist of?