### Computing adjectival similarity using gensim

In [1]:
# Load gensim and the word embeddings trained on the Google News corpus.
# The vectors are read from a file in the binary word2vec format (binary=True);
# the path is relative, so the .bin file must be in the current working directory.
# The result is bound to the module-level name `model`, which
# add_similarity_to_df (defined in a later cell) looks up when it runs.

import gensim
model = gensim.models.KeyedVectors.load_word2vec_format('GoogleNews-vectors-negative300.bin', binary=True)

In [2]:
# Add new column to existing dataframe from csv
# Source: https://www.quora.com/How-do-I-add-a-new-column-to-my-already-existing-CSV-file-using-Pandas#

import pandas as pd


def add_similarity_to_df(df, vectors=None):
    """
    Computes the similarity between the weak adjective and the other adjective
    of every row and stores it in a new column.

    Rows are paired positionally, so the dataframe may carry any index
    (it does not have to be the default 0..n-1 range).

    Args:
        df: dataframe containing cleaned corpus data; must have the columns
            'weak_adj' and 'other_adjs_clean'.
        vectors: optional object exposing ``similarity(word_a, word_b)``
            (e.g. a gensim KeyedVectors instance). When omitted, the
            module-level ``model`` is used.

    Returns:
        The same dataframe object (it is modified in place) with a new column
        'adjs_simil' holding the value returned by ``similarity`` for each
        pair, or the string 'NA' when the lookup raised KeyError.

    Raises:
        KeyError: if 'weak_adj' or 'other_adjs_clean' is not a column of df.
    """
    # Fall back to the embeddings loaded at notebook level when none are given.
    if vectors is None:
        vectors = model

    # Collect one similarity value (or 'NA') per row, in row order.
    simil_col = []

    # Walk both columns in lockstep; this avoids label-based row lookups, which
    # only work when the index happens to be 0..n-1.
    for weak, other in zip(df['weak_adj'], df['other_adjs_clean']):

        # KeyError is raised if a word is not in the vocabulary, so if that
        # happens, record NA for this pair instead.
        try:
            value = vectors.similarity(weak, other)
        except KeyError:
            value = 'NA'

        simil_col.append(value)

    # The list has exactly one entry per row, so it is assigned positionally
    # as the new column regardless of the dataframe's index.
    df['adjs_simil'] = simil_col

    return df


In [4]:
# Path of the annotated corpus CSV. It is both the input and the output of this cell.
FILENAME = 'scalar_cxns_sd2014_clean_annotated.csv'

# Load the corpus, attach the 'adjs_simil' column, then overwrite the same file
# (header row kept, dataframe index not written).
data = pd.read_csv(FILENAME)
data_w_simil = add_similarity_to_df(data)
data_w_simil.to_csv(FILENAME, index=False, header=True)