In [1]:
import numpy as np
import pandas as pd
import sys
from langdetect import detect_langs

# Build Bag-of-Words Vectors for Training

In [2]:
# identify all words that either identify "male" or "female"
def get_gendered_words():
    vocab10K=pd.read_csv("../vocab10K.csv")
    male = vocab10K.loc[vocab10K['female'] == 0,:]
    male = male['word'].tolist()
    female = vocab10K.loc[vocab10K['female'] == 1,:]
    female = female['word'].tolist()
    return male, female

In [3]:
# takes either "twitter" or "reddit" as a parameter.
# Goes through each post and identifies the one using only male words as male
# and female words as female.  
# returns a list of unambiguous posts and a vector of male (0) and female (1) words
def identify_gendered_posts(male, female, platform = "twitter"):
    path = "../custom_data/all_tweets.csv"
    if platform == "reddit":
        path = "../custom_data/economics_posts.csv"
    data = pd.read_csv(path)
    vocab = pd.read_csv("../vocab10K.csv")
    num_entries = data.shape[0]
    y = list()
    male_set = set(male)
    female_set = set(female)
    nFemale = 0
    nMale = 0
    unambiguous_posts = list()
    for i in range(num_entries):
        if i % 1000 == 0:
            print(str(i) + " posts scanned")
        post = str(data['text'][i]).lower()
        lang = "en"
        try:
            lang = str(detect_langs(post))
        except Exception as e:
            pass
        if "en" not in lang:
            continue
        i_male, i_female = 0, 0
        for word in post.split():
            if word in male_set:
                i_male += 1
            elif word in female_set:
                i_female += 1
        if i_male > 0 and i_female == 0:
            y.append(0)
            nMale += 1
            unambiguous_posts.append(post)
        if i_female > 0 and i_male == 0:
            y.append(1)
            nFemale += 1
            unambiguous_posts.append(post)
    print("nMale: ", nMale)
    print("nFemale: ", nFemale)
    return unambiguous_posts, y

In [4]:
# converts a matrix of unambiguous posts to a bag of words. 
# Saves the bow and labels to a text file corresponding to the platform
def build_bow(unambiguous_posts, y, platform = "twitter"):
    vocab = pd.read_csv("../vocab10K.csv")
    num_entries = len(unambiguous_posts)
    X = np.zeros((num_entries, 10000))
    vocab_list = vocab['word'].tolist()
    vocab_set = set(vocab_list)
    for i in range(num_entries):
        if i % 100 == 0:
            print(str(i) + " rows populated")
        post = unambiguous_posts[i].split()
        for word in post:
            if word in vocab_set:
                X[i, vocab_list.index(word)] = 1
    if platform == "twitter":
        np.save("twitter_data", X)
        np.save("twitter_labels", y)
    elif platform == "reddit":
        np.save("reddit_data", X)
        np.save("reddit_labels", y)

# Train a LASSO model on the BOW and labels and analyze the results

In [5]:
# train lasso function defined in lasso.py
from lasso import train_lasso

In [8]:
# takes the coefficients learned from lasso and extracts which words were the most predictive
def get_top_words_from_model(platform = "twitter"):
    vocablist = pd.read_csv("../vocab10K.csv")
    coeffs = np.loadtxt("../coef_twitter.txt")
    if platform == "reddit":
        coeffs = np.loadtxt("../coef_reddit.txt")
    women_words = coeffs.argsort()[-15:][::-1]
    men_words = coeffs.argsort()[:15]
    # look at which words correspond to the most important coeff
    word_map = np.loadtxt("../i_keep_columns.txt")
    women_indices = word_map[women_words] + 1
    men_indices = word_map[men_words] + 1
    print("women: ")
    print(vocablist.loc[vocablist['index'].isin(women_indices)]['word'])
    print("men: ")
    print(vocablist.loc[vocablist['index'].isin(men_indices)]['word'])

In [9]:
# running everything in unison
platform = "twitter"
print("identifying gendered words")
male, female = get_gendered_words()
print("identifying gendered posts")
unambiguous_posts, y = identify_gendered_posts(male, female,  platform = platform)
print("building bag of words")
build_bow(unambiguous_posts, y, platform = platform)
print("training model")
train_lasso(platform = platform)
print("running analysis")
get_top_words_from_model(platform = platform)

identifying gendered words
identifying gendered posts
0 posts scanned
1000 posts scanned
2000 posts scanned
3000 posts scanned
4000 posts scanned
5000 posts scanned
6000 posts scanned
7000 posts scanned
8000 posts scanned
9000 posts scanned
10000 posts scanned
11000 posts scanned
12000 posts scanned
13000 posts scanned
14000 posts scanned
15000 posts scanned
nMale:  996
nFemale:  600
building bag of words
0 rows populated
100 rows populated
200 rows populated
300 rows populated
400 rows populated
500 rows populated
600 rows populated
700 rows populated
800 rows populated
900 rows populated
1000 rows populated
1100 rows populated
1200 rows populated
1300 rows populated
1400 rows populated
1500 rows populated
training model
full X:  (1596, 10000)
train:  (1436, 9540)
train:  (1436,)
test:  (160, 9540)


[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:    1.4s remaining:    2.1s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    2.1s finished


22.264828021450747
running analysis
women: 
282          higher
436       including
463           macro
480           black
483          please
486         support
580         talking
667           share
877      colleagues
2040    suggestions
2245      colleague
2579            aea
3512     bargaining
6426       murdered
7286      discusses
Name: word, dtype: object
men: 
153             too
207          theory
221           point
276             pay
383             yet
475       professor
515         harvard
532            mind
966          modern
979           death
1196          types
1368          views
2142          curve
4223    forthcoming
5927    fascinating
Name: word, dtype: object


# Miscellaneous

In [None]:
# count instances of word "Yellen" (case-insensitive)
import pandas as pd
vocab = pd.read_csv("../gendered_posts.csv")
list_of_posts = vocab['raw_post'].tolist()
y_count = 0
for post in list_of_posts:
    p = post.lower()
    if "yellen" in p:
        y_count += 1
print(y_count)

In [None]:
# Reproduction of Wu's results:
# (1) get most important coeffs from lasso-logit-full
coeffs = np.loadtxt("../coef_lasso_logit_full.txt")
top_words = coeffs.argsort()[-10:][::-1]
print(coeffs[top_words])
# (2) look at which words correspond to the most important coeff
vocablist = pd.read_csv("../vocab10K.csv")
word_map = np.loadtxt("../i_keep_columns.txt")
vocab_indices = word_map[top_words] + 1
print(vocablist.loc[vocablist['index'].isin(vocab_indices)])