In [None]:
from gensim.test.utils import datapath
from gensim import utils
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import re
import json
import string 

# Load the preprocessed documents (a JSON object whose values are the document texts).
# NOTE(review): datapath() is meant for gensim's bundled test data; with an
# absolute path it presumably returns the path unchanged -- confirm and consider
# passing the path to open() directly.
corpus_path = datapath('/Users/kodymoodley/Documents/nlesc-projects/disaster-capitalism/embeddings/processed_ngram_ner_data.json')
with open(corpus_path, encoding='utf-8') as f:
    datajson = json.load(f)

# Concatenate every document, each followed by a single space, into one string.
# A single join avoids the quadratic cost of growing a string with += in a loop.
corpus = ''.join(text + ' ' for text in datajson.values())

# Mapping of n-gram surface forms to their single-token replacements.
# Opened with a context manager so the file handle is closed after reading.
with open('ngram_replacements.json', encoding='utf-8') as f:
    ngram_replacements = json.load(f)
        
def replace_all(text, dic):
    """Apply every ``old -> new`` substitution in ``dic`` to ``text``.

    Substitutions are applied one after another in the mapping's iteration
    order, so a later entry can rewrite text produced by an earlier one.

    Args:
        text: the string to rewrite.
        dic: mapping from substring to its replacement.

    Returns:
        The rewritten string.
    """
    result = text
    for needle in dic:
        result = result.replace(needle, dic[needle])
    return result

def get_preprocessed_corpus(corpus):
    """Convert raw corpus text into lemmatized token lists, one list per sentence.

    The text is split into sentences with ``sent_tokenize``. Each sentence is
    then cleaned (digit runs removed, n-grams from the module-level
    ``ngram_replacements`` mapping collapsed via ``replace_all``, URLs removed,
    ``/`` and ``-`` turned into ``_``), tokenized with ``word_tokenize``,
    lower-cased, filtered for punctuation and stop words, and lemmatized.

    Args:
        corpus: the full corpus as a single string.

    Returns:
        A list with one entry per sentence; each entry is the list of
        surviving lemmatized tokens and may be empty.
    """
    # Symbols that disqualify a token in addition to ASCII punctuation.
    # NOTE(review): the last two entries look like characters lost to an
    # encoding problem; as written they appear to be an empty string and a
    # two-character string, neither of which can equal a single character.
    # Confirm against the original notebook.
    banned_symbols = ['©','θ','•','','��']
    # Curly quotes are ignored when deciding whether a token is empty.
    curly_quotes = ['”', '“', '‘', '’']

    irrelevant_tokens = ['et', 'al.', 'x', 'pdf', 'yes', 'abbrev','also','fe',
                         'page', 'pp', 'p', 'er', 'doi', 'can', 'b', 'c', 'd', 'e',
                         'f', 'g', 'h', 'j', 'k', 'l', 'm', 'n', 'o', 'q', 'r', 's', 'herein', 'furthermore',
                         't', 'u', 'v', 'w', 'y', 'z', 'www', 'com', 'org', 'de', 'dx', 'th', 'ii', 'le']

    # These do not depend on the sentence, so build them once instead of
    # once per sentence.
    stop_words = set(stopwords.words('english')).union(set(irrelevant_tokens))
    lemmatizer = WordNetLemmatizer()

    output_sentences = []
    for sentence in sent_tokenize(corpus):
        # remove numeric tokens
        # NOTE(review): this runs before the n-gram replacement, so any
        # replacement key containing a digit can no longer match.
        sentence = re.sub(r'\d+', '', sentence)

        # replace ngrams with single tokens
        sentence = replace_all(sentence, ngram_replacements)

        # remove URLs: first well-formed http(s):// links, then anything else
        # that still starts with "http" (raw strings keep \S a regex escape)
        sentence = re.sub(r'https?://\S+', '', sentence)
        sentence = re.sub(r'http\S+', '', sentence)

        # replace slashes and dashes used for concatenation with underscores
        sentence = sentence.replace("/", "_").replace("-", "_")

        tokens_minus_punctuation = []
        for token in word_tokenize(sentence):
            token = token.lower()

            # drop tokens containing punctuation; underscores are allowed
            if any((chara in string.punctuation and chara != '_') or chara in banned_symbols
                   for chara in token):
                continue

            # drop tokens made up only of spaces and curly quotes; a token
            # that merely contains a curly quote is kept unchanged
            remainder = token.replace(' ', '')
            for quote in curly_quotes:
                remainder = remainder.replace(quote, '')
            if remainder:
                # No hyphen stripping is needed here: '-' is ASCII punctuation,
                # so any token containing it was already rejected above.
                tokens_minus_punctuation.append(token.strip())

        # remove stop words
        cleaned_tokens = [w for w in tokens_minus_punctuation if w not in stop_words]

        # lemmatization
        output_sentences.append([lemmatizer.lemmatize(w) for w in cleaned_tokens])

    return output_sentences

In [None]:
# Run the preprocessing pipeline once; the resulting list of token lists is the
# `sentences` input passed to every Word2Vec model trained in this notebook.
processed_corpus = get_preprocessed_corpus(corpus)

In [None]:
import gensim.models


def train_word2vec(sentences, vector_size, window, epochs=25):
    """Train a Word2Vec model with the settings shared by all runs in this notebook.

    Only the embedding size, context window and epoch count vary between
    runs; ``sg=1``, ``workers=4`` and ``min_count=2`` are fixed.

    Args:
        sentences: list of token lists to train on.
        vector_size: dimensionality of the word vectors.
        window: context window size.
        epochs: number of training passes over ``sentences``.

    Returns:
        The trained ``gensim.models.Word2Vec`` instance.
    """
    return gensim.models.Word2Vec(
        sentences=sentences,
        sg=1,
        vector_size=vector_size,
        window=window,
        workers=4,
        min_count=2,
        epochs=epochs,
    )


# Naming scheme: model_<vector_size>_<window>[_<epochs>] (epochs omitted when 25).
model_200_10 = train_word2vec(processed_corpus, vector_size=200, window=10)
model_200_20 = train_word2vec(processed_corpus, vector_size=200, window=20)
model_200_30 = train_word2vec(processed_corpus, vector_size=200, window=30)
model_100_10 = train_word2vec(processed_corpus, vector_size=100, window=10)
model_100_20 = train_word2vec(processed_corpus, vector_size=100, window=20)
model_200_40_30 = train_word2vec(processed_corpus, vector_size=200, window=40, epochs=30)
model_100_30 = train_word2vec(processed_corpus, vector_size=100, window=30)
model_100_40_30 = train_word2vec(processed_corpus, vector_size=100, window=40, epochs=30)

In [None]:
import tempfile  # no longer used by this cell; kept in case another cell relies on it

# The models are written straight to their final paths. The previous
# NamedTemporaryFile(delete=False) wrapper created an unused temporary file on
# every run and never removed it, so it has been dropped.
MODEL_DIR = '/Users/kodymoodley/Documents/nlesc-projects/disaster-capitalism/embeddings/models/'

# File names follow gensim-oecd-word2vec-<vector_size>-<window>[-<epochs>].model
filepath_200_10 = MODEL_DIR + 'gensim-oecd-word2vec-200-10.model'
filepath_200_20 = MODEL_DIR + 'gensim-oecd-word2vec-200-20.model'
filepath_200_30 = MODEL_DIR + 'gensim-oecd-word2vec-200-30.model'
filepath_100_10 = MODEL_DIR + 'gensim-oecd-word2vec-100-10.model'
filepath_100_20 = MODEL_DIR + 'gensim-oecd-word2vec-100-20.model'
filepath_100_30 = MODEL_DIR + 'gensim-oecd-word2vec-100-30.model'
filepath_200_40_30 = MODEL_DIR + 'gensim-oecd-word2vec-200-40-30.model'
filepath_100_40_30 = MODEL_DIR + 'gensim-oecd-word2vec-100-40-30.model'

model_200_10.save(filepath_200_10)
model_200_20.save(filepath_200_20)
model_200_30.save(filepath_200_30)
model_100_10.save(filepath_100_10)
model_100_20.save(filepath_100_20)
model_100_30.save(filepath_100_30)
model_200_40_30.save(filepath_200_40_30)
model_100_40_30.save(filepath_100_40_30)

In [None]:
# vec_water = model.wv['water']

In [None]:
# print(vec_water)

In [None]:
# model.wv.most_similar(positive=['adaptation'], topn=10)

In [None]:
# for index, word in enumerate(model.wv.index_to_key):
#     if index == 10:
#         break
#     print(f"word #{index}/{len(model.wv.index_to_key)} is {word}")

In [None]:
# Reload the two window-10 models from the paths they were saved to:
# a = 100-dimensional vectors, b = 200-dimensional vectors.
a_testmodel = gensim.models.Word2Vec.load(filepath_100_10)
b_testmodel = gensim.models.Word2Vec.load(filepath_200_10)

In [None]:
# 20 most similar tokens to 'finance' in the 100-dimensional, window-10 model.
a_testmodel.wv.most_similar(positive=['finance'], topn=20)

In [None]:
# Same 'finance' query against the 200-dimensional, window-10 model.
b_testmodel.wv.most_similar(positive=['finance'], topn=20)

In [None]:
# Reload the two window-20 models from the paths they were saved to:
# c = 100-dimensional vectors, d = 200-dimensional vectors.
c_testmodel = gensim.models.Word2Vec.load(filepath_100_20)
d_testmodel = gensim.models.Word2Vec.load(filepath_200_20)

In [None]:
# 20 most similar tokens to 'finance' in the 100-dimensional, window-20 model.
c_testmodel.wv.most_similar(positive=['finance'], topn=20)

In [None]:
# Same 'finance' query against the 200-dimensional, window-20 model.
d_testmodel.wv.most_similar(positive=['finance'], topn=20)

In [None]:
# Reload the two window-30 models from the paths they were saved to:
# e = 100-dimensional vectors, f = 200-dimensional vectors.
e_testmodel = gensim.models.Word2Vec.load(filepath_100_30)
f_testmodel = gensim.models.Word2Vec.load(filepath_200_30)

In [None]:
# 20 most similar tokens to the single-token entity 'rockefeller_foundation'
# in the 100-dimensional, window-30 model.
e_testmodel.wv.most_similar(positive=['rockefeller_foundation'], topn=20)

In [None]:
# Same 'rockefeller_foundation' query against the 200-dimensional, window-30 model.
f_testmodel.wv.most_similar(positive=['rockefeller_foundation'], topn=20)

In [None]:
# Reload the two window-40, 30-epoch models from the paths they were saved to.
# Note the order is reversed relative to the earlier pairs:
# g = 200-dimensional vectors, h = 100-dimensional vectors.
g_testmodel = gensim.models.Word2Vec.load(filepath_200_40_30)
h_testmodel = gensim.models.Word2Vec.load(filepath_100_40_30)

In [None]:
# 20 most similar tokens to 'rockefeller_foundation' in the 200-dimensional,
# window-40, 30-epoch model.
g_testmodel.wv.most_similar(positive=['rockefeller_foundation'], topn=20)

In [None]:
# 20 most similar tokens to 'inei' in the 100-dimensional, window-40, 30-epoch
# model. This uses a different query word from the g_testmodel cell, so the two
# outputs are not a like-for-like comparison.
h_testmodel.wv.most_similar(positive=['inei'], topn=20)

In [None]:
def get_more_info_about_entity(entity):
    """Collect the types, documents and short context snippets recorded for an entity.

    Reads the module-level DataFrame ``df``, which must provide the columns
    ``entity``, ``model``, ``entity_as_single_token``, ``entity_type``,
    ``docid``, ``sentence`` and ``span``. ``span`` is a ``"start:end"`` string
    whose two integers are used as slice offsets into ``sentence``.

    For each matching row the sentence is trimmed to at most three words on
    either side of the entity; sentences with three or fewer words on both
    sides are returned unchanged.

    Args:
        entity: value to match against the ``entity_as_single_token`` column.

    Returns:
        A dict with keys ``name`` (the entity), ``types`` (distinct entity
        types), ``docs`` (distinct document ids) and ``contexts`` (one
        snippet per matching row, in row order).
    """
    context_words = 3

    rows = df.drop(['entity', 'model'], axis=1)
    rows = rows[rows['entity_as_single_token'] == entity].reset_index(drop=True)

    types = list(set(rows['entity_type'].tolist()))
    docs = list(set(rows['docid'].tolist()))

    new_contexts = []
    for sentence, span in zip(rows['sentence'].tolist(), rows['span'].tolist()):
        span_parts = span.split(':')
        l_span = int(span_parts[0])
        r_span = int(span_parts[1])

        # Text strictly before and after the entity, skipping the character
        # adjacent to it on each side. The left bound is clamped at 0: for an
        # entity at the very start of the sentence, l_span - 1 would be -1 and
        # the slice would wrongly return almost the whole sentence.
        left_words = sentence[:max(l_span - 1, 0)].split()
        right_words = sentence[r_span + 1:].split()

        left_is_short = len(left_words) <= context_words
        right_is_short = len(right_words) <= context_words

        if left_is_short and right_is_short:
            # Nothing to trim on either side.
            new_contexts.append(sentence)
        elif left_is_short:
            # Keep everything up to the entity's end, then three words after it.
            new_contexts.append(sentence[:r_span] + ' ' + ' '.join(right_words[:context_words]))
        elif right_is_short:
            # Keep three words before the entity, then everything from its start.
            new_contexts.append(' '.join(left_words[-context_words:]) + ' ' + sentence[l_span:])
        else:
            new_contexts.append(
                ' '.join(left_words[-context_words:])
                + ' ' + sentence[l_span:r_span]
                + ' ' + ' '.join(right_words[:context_words])
            )

    return {'name' : entity, 'types' : types, 'docs' : docs, 'contexts' : new_contexts}

def pretty_print_entity_info(entity_info):
    """Print an entity summary as an aligned, human-readable listing.

    Args:
        entity_info: dict with keys ``name``, ``types``, ``docs`` and
            ``contexts`` (a list of strings, which may be empty).

    Returns:
        None. The listing is written to standard output.
    """
    print()
    print("name\t\t:\t", entity_info['name'])
    print()
    print("types\t\t:\t", entity_info['types'])
    print()
    print("documents\t:\t", entity_info['docs'])
    print()
    print("contexts\t:\t", end="")

    contexts = entity_info['contexts']
    if not contexts:
        # No contexts: just end the label line instead of indexing an empty list.
        print()
    for number, context in enumerate(contexts, start=1):
        # The first context continues the label line; later ones are indented
        # with tabs so they line up underneath it.
        lead = " " if number == 1 else "\t\t\t "
        print(lead + str(number) + ". " + context)
    print()
    

In [None]:
result = get_more_info_about_entity('imta')
# pretty_print_entity_info writes to stdout itself and returns nothing, so it is
# called directly; wrapping it in print() would append a stray "None".
pretty_print_entity_info(result)