In [366]:
import pandas as pd
import numpy as np
import os
import nltk
import codecs
from collections import Counter
from nltk.corpus import stopwords
import argparse
from gensim.models import phrases, word2vec
import re
import string
from gensim.models import KeyedVectors
import csv
import statsmodels.formula.api as smf
import pickle

In [11]:
# Download the 'punkt' tokenizer data into the local nltk_data directory.
# Presumably this is what nltk.sent_tokenize (used in tokenize()) relies on — confirm.
nltk.download('punkt')

[nltk_data] Downloading package punkt to /Users/ctoups/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [115]:
# Corpus directory, relative to the kernel's working directory.
data_path = "cscareerquestions.corpus/"
# JSON-lines file with the utterances (read with pd.read_json(..., lines=True)).
utterances_path = os.path.join(data_path, "utterances.jsonl")
# Same location the dataframe is written to with df.to_csv later in the notebook.
csv_path = os.path.join(data_path, "RedditDataframe.csv")

In [118]:
# Loading takes a few minutes, so flip first_run to False to skip the reload
# when `df` is already in the kernel from an earlier execution of this cell.
# NOTE(review): with first_run = False on a fresh kernel, `df` is never defined.
first_run = True 
if first_run:
    # Disabled alternative: load the CSV copy instead of the JSON-lines file.
#     df = pd.read_csv(csv_path, converters={"tokenize": lambda x: x.strip("[]").split(", ")})
    df = pd.read_json(utterances_path, lines=True)


In [119]:
# Preview the loaded dataframe (pandas elides the middle rows in the repr).
df

Unnamed: 0,id,user,root,reply_to,timestamp,text,meta
0,nyv4b,[deleted],nyv4b,,2012-01-01 20:25:05,I've been considering going back to school (I ...,"{'score': 12, 'top_level_comment': None, 'retr..."
1,nytxd,fraincs,nytxd,,2012-01-01 19:52:35,I've studied Graphic Design at school and to m...,"{'score': 4, 'top_level_comment': None, 'retri..."
2,ny58i,Slaughtermatic,ny58i,,2012-01-01 01:33:24,I'm looking at all the internships available f...,"{'score': 1, 'top_level_comment': None, 'retri..."
3,ny45r,[deleted],ny45r,,2012-01-01 01:00:02,I am working in industry and plan to pursue a ...,"{'score': 4, 'top_level_comment': None, 'retri..."
4,o070l,m555,o070l,,2012-01-02 21:13:15,I've been reading about the many ills of most ...,"{'score': 13, 'top_level_comment': None, 'retr..."
...,...,...,...,...,...,...,...
1985074,e8tkc2b,riddleadmiral,9swgn6,e8s07pb,2018-10-31 23:57:09,ask: How responsive are people to emails/Slack...,"{'score': 2, 'top_level_comment': 'e8s07pb', '..."
1985075,e8tkcje,diduxchange,9t06mp,9t06mp,2018-10-31 23:57:22,&gt; work in big-N (in Seattle no less)\n&gt; ...,"{'score': 2, 'top_level_comment': 'e8tkcje', '..."
1985076,e8tke5v,istareatscreens,9t1ufl,9t1ufl,2018-10-31 23:58:08,"""will I be severely stunted in programming and...","{'score': 1, 'top_level_comment': 'e8tke5v', '..."
1985077,e8tkeud,dopkick,9t2gph,9t2gph,2018-10-31 23:58:27,Jobs 2+ have all been via contacts for me.,"{'score': 7, 'top_level_comment': 'e8tkeud', '..."


In [9]:
# Peek at the raw text column that the tokenization loop iterates over.
# NOTE(review): the saved output of this cell reports a different row count than
# the dataframe preview, so one of the two saved outputs is likely stale — re-run to confirm.
df["text"]

0          I've been considering going back to school (I ...
1          I've studied Graphic Design at school and to m...
2          I'm looking at all the internships available f...
3          I am working in industry and plan to pursue a ...
4          I've been reading about the many ills of most ...
                                 ...                        
1985150    ask: How responsive are people to emails/Slack...
1985151    &gt; work in big-N (in Seattle no less)\n&gt; ...
1985152    "will I be severely stunted in programming and...
1985153           Jobs 2+ have all been via contacts for me.
1985154                                               Truth!
Name: text, Length: 1985155, dtype: object

In [363]:
# Create vectors of words
def read_from_txt(filename, encoding='utf-8'):
    """Read a text file into a list with one whitespace-stripped entry per line.

    Args:
        filename: Path of the file to read.
        encoding: Text encoding used to decode the file. Defaults to UTF-8,
            matching the encoding this notebook uses when it writes
            RedditVocab.txt, rather than the platform-dependent default.

    Returns:
        A list of strings in file order. Blank lines are kept as empty strings.
    """
    with open(filename, 'r', encoding=encoding) as file:
        return [line.strip() for line in file]

# Gendered target words for the two groups.
man_words =  ['he', 'him', 'hes', 'his', 'himself','man', 'boy', 'male', 'men', 'boys', 'males']
woman_words = [ 'she', 'shes', 'her', 'hers', 'herself', 'woman', 'girl', 'female', 'women','girls', 'females']
# Both groups together; these words are excluded from the stopword set further down.
combined = man_words + woman_words
# De-duplicated union of the two adjective files.
adjectives_list = set(read_from_txt('adjectives_garg.txt') + read_from_txt('adjectives_princeton.txt'))
personal_list = ['qualified', 'under qualified', 'unqualified']
competence_adjectives = read_from_txt('adjectives_intelligencegeneral.txt')
# NOTE(review): RedditVocab.txt is written by a later cell of this notebook, so this
# read may fail on a fresh checkout where that cell has never been run — confirm run order.
vocab = read_from_txt(os.path.join(data_path, 'RedditVocab.txt'))


In [124]:
# English stopwords minus the gendered target words, so clean_text never drops
# the words in `combined`.
stops = set(stopwords.words('english')).difference(set(combined))
print(stops)
# NOTE(review): in the punct_chars expression that follows, `-` binds tighter than `|`,
# so `A | B - {"'"}` parses as `A | (B - {"'"})`. string.punctuation already contains
# the apostrophe, so "'" stays in the punctuation set and clean_text replaces it with a
# space. If apostrophes were meant to be kept, the union needs its own parentheses —
# confirm intent before changing, because it alters tokenization.
punct_chars = list((set(string.punctuation) | {'»', '–', '—', '-',"­", '\xad', '-', '◾', '®', '©','✓','▲', '◄','▼','►', '~', '|', '“', '”', '…', "'", "`", '_', '•', '*', '■'} - {"'"}))
# Sort in place so the joined string has a stable character order.
punct_chars.sort()
punctuation = ''.join(punct_chars)
# Character class matching any single punctuation character; re.escape keeps
# characters that are special inside brackets literal.
replace = re.compile('[%s]' % re.escape(punctuation))
# English Snowball stemmer; clean_text only applies it when stem=True.
sno = nltk.stem.SnowballStemmer('english')
# Characters that survive clean_text's printable filter.
printable = set(string.printable)

def clean_text(text,
               remove_stopwords=True,
               remove_numeric=True,
               stem=False,
               remove_short=False):
    """Normalize a raw string and split it into a list of word tokens.

    The text is lower-cased, URL-like chunks and punctuation are replaced by
    spaces, whitespace is collapsed, and characters outside ``printable`` are
    dropped before the result is split on whitespace.

    Args:
        text: String to clean.
        remove_stopwords: Drop tokens found in the module-level ``stops`` set.
        remove_numeric: Drop tokens made up only of digits.
        stem: Replace each kept token by its Snowball stem.
        remove_short: Drop tokens shorter than three characters (checked
            after stemming).

    Returns:
        List of cleaned tokens in their original order.
    """
    normalized = text.lower()
    # anything starting with "http" or containing ".com" / "www" becomes a blank
    normalized = re.sub(r'http\S*|\S*\.com\S*|\S*www\S*', ' ', normalized)
    # remaining punctuation becomes a blank as well
    normalized = replace.sub(' ', normalized)
    # squeeze whitespace runs and trim both ends
    normalized = re.sub(r'\s+', ' ', normalized).strip()
    # keep printable characters only
    normalized = ''.join(ch for ch in normalized if ch in printable)

    tokens = []
    for token in normalized.split():
        # Filters run in the same order as the flags are declared: stopwords,
        # digits, stemming, then the length check on the (possibly stemmed) token.
        if remove_stopwords and token in stops:
            continue
        if remove_numeric and token.isdigit():
            continue
        if stem:
            token = sno.stem(token)
        if remove_short and len(token) < 3:
            continue
        tokens.append(token)
    return tokens

def tokenize(text):
    """Split text into sentences and clean each one into a token list.

    Returns a list with one entry per sentence found by nltk.sent_tokenize;
    each entry is the token list produced by clean_text with stopword removal
    on and stemming / short-word removal off.
    """
    cleaned_sentences = []
    for sentence in nltk.sent_tokenize(text):
        cleaned_sentences.append(
            clean_text(sentence, remove_stopwords=True, stem=False, remove_short=False)
        )
    return cleaned_sentences



    


{'mustn', 'down', 'yourself', 'has', 'its', 'y', 'haven', 'until', 'myself', 'only', 'nor', 'couldn', 'any', 'didn', 'our', 'where', 'other', 'ma', 'did', 'by', 'of', 'such', 've', 'off', 'on', "shan't", "don't", 'wasn', 'hadn', 'their', 'do', 'are', 'there', 'should', 'the', 're', "mightn't", 'as', 'and', 'ourselves', 'when', 'yours', 'up', 'your', 'they', 'have', 'after', 'own', 'can', 'me', 'again', 'same', 't', "doesn't", 'been', "needn't", 'over', 'does', 'above', 'needn', "she's", 'here', 'mightn', 'in', "isn't", 'ain', 'most', 'then', 'wouldn', "should've", 'those', 'what', 'than', 'not', 'am', 'into', 'each', 'doing', 'be', 'during', 'o', 'some', 'had', "you'll", 'my', 'through', 'while', 'll', "that'll", 'under', "hasn't", 'we', 'an', 'no', 'isn', "you'd", 'because', 'out', 'all', 'why', 'these', 'but', "aren't", "wasn't", 'that', 'is', 'doesn', 'was', 's', "wouldn't", 'm', 'once', 'about', "haven't", "didn't", 'ours', 'or', 'don', 'now', "mustn't", 'won', "you're", 'shouldn',

In [125]:
# Flat list of tokenized sentences gathered from every row of the text column.
all_sentences = []
for i, text in enumerate(df["text"]):
    # Only exact str values are tokenized; anything else is skipped
    # (presumably missing values — confirm).
    if type(text) == str:
        all_sentences += tokenize(text)
        # Progress marker every 100,000 rows; printed only for rows that were tokenized.
        if i % 100000 == 0:
            print(i)

0
100000
200000
300000
400000
500000
600000
700000
800000
900000
1000000
1100000
1200000
1300000
1400000
1500000
1600000
1700000
1800000
1900000


In [126]:
# Number of tokenized sentences collected from the corpus.
len(all_sentences)

7555245

In [128]:
# Fit a gensim Phrases model on the tokenized sentences; detected phrases are
# joined with a space, and stopwords are passed as common_terms.
# NOTE(review): `common_terms` and a bytes `delimiter` look like the gensim 3.x
# signature — confirm the gensim version this notebook is pinned to.
bigrams = phrases.Phrases(all_sentences, min_count=5, delimiter=b' ', common_terms=stops)


In [129]:
print("Creating vocabulary...")
# Count every token produced by applying the phrase model to the sentences.
# A generator feeds the Counter directly, so the full token stream is never
# materialized as an intermediate list.
vocab_count = Counter(w for sent in bigrams[all_sentences] for w in sent)
# Keep tokens seen at least 5 times, most frequent first.
# (This line previously read from `counted`, a name that is never defined.)
vocab = [w for w, count in vocab_count.most_common() if count >= 5]




Creating vocabulary...


In [367]:
# Persist the token frequency table. The context manager closes the file even
# if pickling raises.
with open("vocab_count.pkl", "wb") as f:
    pickle.dump(vocab_count, f)


In [130]:
# Write the vocabulary as UTF-8, one token per line
vocab_file = os.path.join(data_path, 'RedditVocab.txt')
with codecs.open(vocab_file, 'w', encoding='utf-8') as f:
    f.write('\n'.join(vocab))

# Dump the loaded dataframe to CSV inside the corpus directory
dataframe_csv = os.path.join(data_path, 'RedditDataframe.csv')
df.to_csv(dataframe_csv)

In [131]:
# Word2Vec training configuration.
num_runs = 1       # number of models to train; run k is saved as "<k>.wv"
dims = 100         # embedding dimensionality
bootstrap = False  # NOTE(review): not read anywhere in this cell
window = 5         # context window size
# Sentences with detected bigrams merged into single tokens.
data = bigrams[all_sentences]
# `run_idx` was previously used without being defined; the per-run loop (whose
# "Run #0" message is still visible in the saved output) is restored here.
for run_idx in range(num_runs):
    print('Run #%d' % run_idx)
    # sg=1 selects the skip-gram architecture; words seen fewer than 5 times are ignored.
    model = word2vec.Word2Vec(data, size=dims, window=window, sg=1, min_count=5)
    model.wv.save(os.path.join(data_path, str(run_idx) + '.wv'))
    # Keep the vectors of the most recent run available to later cells.
    vectors = model.wv
print('FINISHED')


Run #0
