read

In [1]:
import timeit
import glob, os
from os import listdir
import string
import re
import json
import pandas as pd
from gensim.utils import SaveLoad
from gensim.models.word2vec import Word2Vec
from gensim.models import Phrases
from gensim.models.phrases import Phraser
from itertools import islice
from gensim.models import KeyedVectors
import csv
from tqdm import tqdm
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import wordnet as wn
from nltk.stem import PorterStemmer, WordNetLemmatizer
#from nltk import pos_tag, word_tokenize
from nltk.tag import PerceptronTagger
from nltk.corpus import stopwords  # Import the stop word list

In [2]:
# Pywsd's Lemmatizer.
porter = PorterStemmer()
wnl = WordNetLemmatizer()
tagger = PerceptronTagger()
pos_tag = tagger.tag
tokenizer = RegexpTokenizer(r'\w+')


def lemmatize(ambiguous_word, pos=None, neverstem=True, 
              lemmatizer=wnl, stemmer=porter):
    """
    Tries to convert a surface word into lemma, and if lemmatize word is not in
    wordnet then try and convert surface word into its stem.
    This is to handle the case where users input a surface word as an ambiguous 
    word and the surface word is a not a lemma.
    """
    if pos:
        lemma = lemmatizer.lemmatize(ambiguous_word, pos=pos)
    else:
        lemma = lemmatizer.lemmatize(ambiguous_word)
    stem = stemmer.stem(ambiguous_word)
    # Ensure that ambiguous word is a lemma.
    if not wn.synsets(lemma):
        if neverstem:
            return ambiguous_word
        if not wn.synsets(stem):
            return ambiguous_word
        else:
            return stem
    else:
        return lemma

def penn2morphy(penntag, returnNone=False):
    morphy_tag = {'NN':wn.NOUN, 'JJ':wn.ADJ,
                  'VB':wn.VERB, 'RB':wn.ADV}
    try:
        return morphy_tag[penntag[:2]]
    except:
        return None if returnNone else ''

def word_tokenize(text,tokenize=tokenizer):
    #return tokenize.tokenize(text.lower())# doesn't remove stop words
    return [w for w in tokenize.tokenize(text.lower()) if not w in stopwords.words("english")]

def lemmatize_sentence(sentence, neverstem=False, keepWordPOS=False, 
                       tokenizer=word_tokenize, postagger=pos_tag, 
                       lemmatizer=wnl, stemmer=porter):
    words, lemmas, poss = [], [], []
    for word, pos in postagger(tokenizer(sentence)):
        pos = penn2morphy(pos)
        lemmas.append(lemmatize(word.lower(), pos, neverstem,
                                lemmatizer, stemmer))
        poss.append(pos)
        words.append(word)
    if keepWordPOS:
        return words, lemmas, [None if i == '' else i for i in poss]
    return lemmas

In [3]:
def extract(path, sentence_stream):
    count = 0
    subfolder_list = glob.glob(path +'/*')
    pbar = tqdm(total=len(subfolder_list))
    folders = [x[0] for x in os.walk(path)]
    for x in folders[1:-1]:
        #print(count)
        pbar.set_description('Extracting {}'.format(x))
        pbar.update(1)
        for i in listdir(x):
            count += 1
            with open(x + "/" + i, 'r') as f:
                lines = (line for line in islice(f, 7, None))
                try:
                    for line in lines:
                        line = re.sub("\d+\S\d+", 'xxx', line)#remove numbers
                        line = re.sub("\d+", 'xxx', line)#remove numbers
                        sentence_stream +=[lemmatize_sentence(i) for i in line.strip().split('. ')]
                        #sentence_stream +=[word_tokenize(i) for i in line.strip().split('. ')]
                        '''sentence_stream += [
                            list(filter(None, i.strip().lower().translate(
                                str.maketrans(string.punctuation, ' ' * len(string.punctuation))).split(' '))) for i
                            in line.strip().split('. ')]'''
                        # line=[i.strip().lower().replace('^[{}]'.format(string.punctuation), ' ') for i in
                        #                   line]
                except UnicodeDecodeError as e:
                    print(str(e))
    pbar.close()

In [4]:
sentence_stream_bloom = []
#sentence_stream_our = []
start = timeit.default_timer()
print("="*80)
print('start the bloombergs')
extract("20061020_20131126_bloomberg_news", sentence_stream_bloom)
print(len(sentence_stream_bloom))

start = timeit.default_timer()
sentence_stream = list(filter(None, sentence_stream_bloom))
print('after:{}'.format(len(sentence_stream)))
print(timeit.default_timer() - start)
#with open("sentence_bloom.csv", "w") as f:
 #   writer = csv.writer(f)
  #  writer.writerows(sentence_stream)

  0%|          | 0/1944 [00:00<?, ?it/s]

start the bloombergs


Extracting 20061020_20131126_bloomberg_news/2011-06-13: 100%|█████████▉| 1943/1944 [69:18:55<02:08, 128.43s/it]   


23970107
after:23284555
0.9336905162781477


In [6]:
with open("sentence_reuters.csv", "r") as f:
    reader = csv.reader(f)
    sentence_stream=list(reader)

    
with open("sentence_bloom.csv", "r") as f:
    reader = csv.reader(f)
    sentence_stream_bloom=list(reader)
    
sentence_stream  += sentence_stream_bloom
#print(sentence_stream.shape)
with open("sentence.csv", "w") as f:
    writer = csv.writer(f)
    writer.writerows(sentence_stream)

In [7]:
with open("sentence.csv", "r") as f:
    reader = csv.reader(f)
    sentence_stream=list(reader)

phrases = Phrases(sentence_stream,min_count=500, threshold=2)
bigram = Phraser(phrases)
# print(list(bigram[sentence_stream]))
print(bigram['u', 's', 'wall', 'st', 'wall', 'street','s','p','500','s','p','xxx'])

['u', 's', 'wall', 'st', 'wall_street', 's', 'p', '500', 's', 'p', 'xxx']


In [8]:
bigram.save("big_phrase.pickle")

In [9]:
print('start trigram')
start = timeit.default_timer()
phrases = Phrases(bigram[sentence_stream],min_count=500, threshold=2)
trigram = Phraser(phrases)
trigram.save("trig_phrase.pickle")
print(trigram[bigram['u', 's', 'wall', 'st', 'wall', 'street','bank','of','america','s','p','500','s','p','xxx']])
print('finish phrase time:{}'.format(timeit.default_timer() - start))

start trigram
['u', 's', 'wall', 'st', 'wall_street', 'bank', 'of', 'america', 's', 'p', '500', 's', 'p', 'xxx']
finish phrase time:841.0161524452269
