# stats jurafsky method

Finds the adjectives most strongly associated with each queer keyword, relative to reference word groups for person, women, and men, using per-decade word embeddings.

In [6]:
import numpy as np
import pickle as pickle
import pandas as pd
from numpy import dot
from numpy.linalg import norm
import json
import sys
MAX_INT = sys.maxsize

In [7]:
# Target words whose adjective associations are computed (iterated as `word` in the export loop).
KEYWORDS = ['gay', 'homosexual', 'lesbian', 'bisexual', 'queer', 'transgender']
# Reference word groups: outputNN averages the vectors of whichever of these
# words are present in the vocabulary and compares the keyword against that average.
PERSON = ["person", "human", "somebody"]
FEMALE = ["she", "daughter", "hers", "her", "mother", "woman", "girl", "herself", "female", "sister", 
        "daughters", "mothers", "women", "girls", "females", "sisters", "aunt", "aunts", "niece", "nieces"]
MALE = ["he", "son", "his", "him", "father", "man", "boy", "himself", "male", "brother", 
        "sons", "fathers", "men", "boys", "males", "brothers", "uncle", "uncles", "nephew", "nephews"]

In [8]:
def euclideanDistance(v1, v2):
    """Return the Euclidean (L2) distance between two vectors.

    Parameters
    ----------
    v1, v2 : sequence of numbers or numpy.ndarray
        Vectors of the same length. A scalar ``0`` is accepted as a
        placeholder for a missing vector.

    Returns
    -------
    number
        ``MAX_INT`` when either argument is the scalar ``0``; otherwise the
        square root of the summed squared differences between ``v1`` and ``v2``.
    """
    # Only a *scalar* 0 marks a missing vector. Checking the dimensionality
    # first avoids the element-wise `ndarray == 0` comparison, whose truth
    # value is ambiguous and raises ValueError for multi-element arrays.
    for candidate in (v1, v2):
        if np.ndim(candidate) == 0 and candidate == 0:
            return MAX_INT

    diff = np.asarray(v1) - np.asarray(v2)

    # Single vectorised reduction instead of Python's built-in sum() over the
    # array; axis=0 reduces over the same axis the built-in sum() iterated.
    return np.sqrt(np.sum(np.square(diff), axis=0))

In [5]:
def findNearestNeighbors(vec, dataframe, k):
    """Return the index labels of the `k` rows of `dataframe` closest to `vec`.

    Each row is compared to `vec` with euclideanDistance; rows are ranked by
    (distance, label) in ascending order and the first `k` labels are returned.
    """
    scored = []
    for label, row in dataframe.iterrows():
        scored.append((euclideanDistance(vec, list(row)), label))

    # Tuples sort by distance first, then by label on ties.
    scored.sort()
    return [label for _, label in scored[:k]]

In [214]:
def outputNN(year, word, pairs, embed, regAdj = False):
    """Rank adjectives by how much closer they are to `word` than to a reference group.

    Loads the vocabulary and embedding matrix stored under ``sgns-<embed>/``
    for `year`, averages the vectors of the `pairs` words that are present in
    the vocabulary, and computes for every adjective in the selected list

        bias = distance(word, adjective) - distance(average, adjective)

    using Euclidean distance.

    Parameters
    ----------
    year : int or str
        Prefix of the ``<year>-vocab.pkl`` and ``<year>-w.npy`` files to load.
    word : str
        Keyword whose adjective associations are measured.
    pairs : list of str
        Reference words; entries missing from the vocabulary are ignored.
    embed : {'lemmas', 'words'}
        Selects the embedding directory and the default adjective list.
    regAdj : bool, default False
        If true, adjectives are read from ``adjectives.txt``; otherwise from
        ``coha-adj-<embed>.txt``.

    Returns
    -------
    pandas.DataFrame
        One ``'bias'`` column indexed by adjective, sorted in descending
        order. Positive bias means the adjective is closer to the reference
        average; negative bias means it is closer to `word`. When `word`, or
        every word in `pairs`, is absent from the vocabulary, the placeholder
        ``pd.DataFrame([None, None])`` is returned, which callers can detect
        with ``.isnull().values.any()``.
    """
    assert embed == 'lemmas' or embed == 'words'

    # load dataframe
    prefix = 'sgns-' + embed + '/' + str(year)
    # NOTE: pickle.load can execute arbitrary code; only load vocab files from a trusted source.
    with open(prefix + '-vocab.pkl', 'rb') as f:
        vocab = pickle.load(f)
    vecs = np.load(prefix + '-w.npy')

    df = pd.DataFrame(vecs, index=vocab)
    # Drop vectors that are too small to carry meaning; only the first
    # component is inspected, as in the original filter.
    df = df[df[0] != 0]

    if word not in df.index:
        return pd.DataFrame([None, None])

    # Average vector of the reference words that exist in this vocabulary.
    groupWords = [a for a in pairs if a in df.index]
    if not groupWords:
        # No reference word available: return the same placeholder used for
        # a missing keyword instead of failing with an IndexError.
        return pd.DataFrame([None, None])

    wordVec = df.loc[word].to_numpy()
    avgVec = df.loc[groupWords].to_numpy().mean(axis=0)

    # Read only the adjective list that is actually going to be used.
    adjFile = 'adjectives.txt' if regAdj else 'coha-adj-' + embed + '.txt'
    with open(adjFile) as file:
        adjList = [line.strip() for line in file]

    adjDf = df[df.index.isin(adjList)] # filter so only adjectives remain in dataframe

    # Distance of every adjective to the keyword and to the group average,
    # computed for all rows at once instead of growing a frame row by row.
    adjVecs = adjDf.to_numpy()
    distKey = np.linalg.norm(adjVecs - wordVec, axis=1)
    distAvg = np.linalg.norm(adjVecs - avgVec, axis=1)

    # positive bias = closer to avg, negative bias = closer to key
    dfDist = pd.DataFrame({'bias': distKey - distAvg}, index=adjDf.index)

    return dfDist.sort_values('bias', ascending=False)

In [220]:
# Example: adjective bias ranking for 'homosexual' against the FEMALE
# reference words, using the 1980 lemma embeddings and adjectives.txt.
df = outputNN(year=1980, word='homosexual', pairs=FEMALE, embed='lemmas', regAdj=True)
df


Unnamed: 0,bias
beautiful,0.426447
pretty,0.388674
warm,0.380693
soft,0.364429
quiet,0.329318
...,...
cooperative,-0.240787
intuitive,-0.253985
irresponsible,-0.254890
outgoing,-0.259326


In [218]:
# Write one CSV per (reference group, embedding type, keyword). Each CSV has,
# for every decade, the 10 adjectives with the highest bias and the 10 with
# the lowest bias, next to their bias values.
decades = range(1810, 2001, 10)
colNames = [label for decade in decades for label in (decade, str(decade)+'-bias')]
rowLabels = list(range(1, 11)) + list(range(-10, 0))

for avg in [FEMALE, MALE, PERSON+FEMALE+MALE]:
    for embed in ['lemmas', 'words']:
        for word in KEYWORDS:
            print((word, embed, avg[0]))
            df = pd.DataFrame(index=rowLabels, columns=colNames)
            for year in decades:
                temp = outputNN(year, word, avg, embed, regAdj=True)
                # Placeholder result (keyword missing for this decade): leave the columns empty.
                if temp.isnull().values.any():
                    continue
                adjectives = list(temp.index)
                biases = list(temp['bias'])
                df[year] = adjectives[:10] + adjectives[-10:]
                df[str(year)+'-bias'] = biases[:10] + biases[-10:]
            # The file name uses the first word of the reference group as its label.
            df.to_csv('adj-' + embed + '/'+word+'-'+avg[0]+'.csv')


('gay', 'lemmas', 'she')
('homosexual', 'lemmas', 'she')
('lesbian', 'lemmas', 'she')
('bisexual', 'lemmas', 'she')
('queer', 'lemmas', 'she')
('transgender', 'lemmas', 'she')
('gay', 'words', 'she')
('homosexual', 'words', 'she')
('lesbian', 'words', 'she')
('bisexual', 'words', 'she')
('queer', 'words', 'she')
('transgender', 'words', 'she')
('gay', 'lemmas', 'he')
('homosexual', 'lemmas', 'he')
('lesbian', 'lemmas', 'he')
('bisexual', 'lemmas', 'he')
('queer', 'lemmas', 'he')
('transgender', 'lemmas', 'he')
('gay', 'words', 'he')
('homosexual', 'words', 'he')
('lesbian', 'words', 'he')
('bisexual', 'words', 'he')
('queer', 'words', 'he')
('transgender', 'words', 'he')
('gay', 'lemmas', 'person')
('homosexual', 'lemmas', 'person')
('lesbian', 'lemmas', 'person')
('bisexual', 'lemmas', 'person')
('queer', 'lemmas', 'person')
('transgender', 'lemmas', 'person')
('gay', 'words', 'person')
('homosexual', 'words', 'person')
('lesbian', 'words', 'person')
('bisexual', 'words', 'person')
('

In [219]:
# Spot-check one exported table: keyword 'homosexual', lemma embeddings,
# reference group labelled 'she'. The unnamed first CSV column is used as the index.
pd.read_csv('adj-lemmas/homosexual-she.csv', index_col="Unnamed: 0")

Unnamed: 0,1810,1810-bias,1820,1820-bias,1830,1830-bias,1840,1840-bias,1850,1850-bias,...,1960,1960-bias,1970,1970-bias,1980,1980-bias,1990,1990-bias,2000,2000-bias
1,,,,,,,,,,,...,beautiful,0.43689,beautiful,0.444752,beautiful,0.426447,beautiful,0.455643,beautiful,0.394877
2,,,,,,,,,,,...,soft,0.360192,warm,0.380685,pretty,0.388674,pretty,0.386199,soft,0.392489
3,,,,,,,,,,,...,stern,0.352639,cool,0.3681,warm,0.380693,thin,0.344722,gorgeous,0.390127
4,,,,,,,,,,,...,pretty,0.341877,gentle,0.350339,soft,0.364429,warm,0.336431,slender,0.386277
5,,,,,,,,,,,...,pleasant,0.340213,pretty,0.348855,quiet,0.329318,steady,0.327261,shy,0.373011
6,,,,,,,,,,,...,warm,0.336547,calm,0.347854,calm,0.329128,soft,0.325,pretty,0.362752
7,,,,,,,,,,,...,loud,0.334353,quiet,0.337543,cold,0.323286,shy,0.322942,neat,0.337014
8,,,,,,,,,,,...,thin,0.318289,soft,0.335114,thin,0.321203,dirty,0.320967,delicate,0.330797
9,,,,,,,,,,,...,ugly,0.315068,pleasant,0.318899,gentle,0.320451,quiet,0.318747,silent,0.321543
10,,,,,,,,,,,...,gentle,0.312936,cold,0.318571,pleasant,0.305915,cool,0.317776,quiet,0.313562
