# Similarity Calculation

This code should be run on `Google Colab`, since it is very computationally heavy.

In [None]:
import gensim.downloader as api

# Fetch the pretrained "word2vec-google-news-300" embeddings through gensim's
# downloader API and keep them in a module-level variable for the later cells.
word2vec_model = api.load("word2vec-google-news-300")

# Calculating similarity

Here, using `cosine similarity` and a `word2vec` model, we calculate the similarity between the LM-generated stereotypical tuples and the human-annotated tuples.

We compare those tuples which have the same identity term, and the tuple with the highest similarity score among them is returned.

In [None]:
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity

def calculate_similarity_between_same_religion(model, file1_path, file2_path, output_file, identity_column='Region'):
    """Write, for each stereotype token of file 2, its best match from file 1.

    Both TSV files are inner-joined on ``identity_column`` so that only tokens
    sharing the same identity term are compared. For every distinct
    ``StereoType`` token coming from file 2, the ``StereoType`` token from
    file 1 with the highest cosine similarity is kept. Pairs in which either
    token is missing from the model vocabulary are skipped, and file-2 tokens
    without any scorable pair are left out of the output.

    Args:
        model: Word-vector model exposing ``key_to_index`` for vocabulary
            membership and ``model[token]`` for vector lookup.
        file1_path: Path to the first TSV file; must contain the
            ``identity_column`` and ``StereoType`` columns.
        file2_path: Path to the second TSV file, with the same two columns.
        output_file: Path of the TSV file to write, with the columns
            ``Stereotype Token 1``, ``Stereotype Token 2`` and
            ``Similarity Score``.
        identity_column: Name of the column both files are joined on.
            Defaults to ``'Region'``, the previously hard-coded value.
    """
    file1_df = pd.read_csv(file1_path, sep='\t')
    file2_df = pd.read_csv(file2_path, sep='\t')

    # The merge suffixes the shared 'StereoType' column with _x (file 1) and
    # _y (file 2).
    merged_df = pd.merge(file1_df, file2_df, on=identity_column)
    token_pairs = list(zip(merged_df['StereoType_x'], merged_df['StereoType_y']))

    # Seed every file-2 token up front so the output order follows the order
    # of first appearance in the merged table. -inf (instead of -1) ensures a
    # pair whose similarity is exactly -1 can still be recorded.
    best_match = {
        token2: {'score': float('-inf'), 'token1': None}
        for _, token2 in token_pairs
    }

    vocabulary = model.key_to_index
    for token1, token2 in token_pairs:
        if token1 not in vocabulary or token2 not in vocabulary:
            continue
        score = cosine_similarity([model[token1]], [model[token2]])[0][0]
        if score > best_match[token2]['score']:
            best_match[token2] = {'score': score, 'token1': token1}

    # Collect the rows first and build the frame once: DataFrame.append was
    # removed in pandas 2.0 and copied the whole frame on every call.
    rows = [
        {
            'Stereotype Token 1': info['token1'],
            'Stereotype Token 2': token2,
            'Similarity Score': info['score'],
        }
        for token2, info in best_match.items()
        if info['token1'] is not None
    ]
    results_df = pd.DataFrame(
        rows,
        columns=['Stereotype Token 1', 'Stereotype Token 2', 'Similarity Score'],
    )
    results_df.to_csv(output_file, sep='\t', index=False)


We do this for region and religion identities.

In [None]:
def getIdentityArray(filePath):
    """Return every cell of the TSV file at ``filePath`` as one flat list.

    The first line of the file is consumed as the header by ``read_csv``;
    the remaining cells are returned row by row, left to right.
    """
    table = pd.read_csv(filePath, sep='\t')
    flattened = []
    for record in table.values.tolist():
        flattened.extend(record)
    return flattened

# Read the identity terms (one similarity run is performed per term).
filePathRegionIDterms = "region_idterms.tsv"
regionArray = getIdentityArray(filePathRegionIDterms)
print(regionArray)

# Every per-region file is compared against the same master stereotype file.
masterRegionPath = "region_stereotypes.tsv"
for region in regionArray:
    currRegionPath = region + ".tsv"
    outputFilePath = region + "_similarity_scores.tsv"
    calculate_similarity_between_same_religion(
        word2vec_model, currRegionPath, masterRegionPath, outputFilePath
    )


# file1_path = 'buddhist.tsv'
# file2_path = 'religion_stereotypes.tsv'
# output_file = 'buddhist_similarity_scores.tsv'
# calculate_similarity_between_same_religion(word2vec_model, file1_path, file2_path, output_file)