In [1]:
import nltk,re
import pandas as pd
import numpy as np
from nltk import sent_tokenize,word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from pprint import pprint
from collections import Counter
from math import *
from datetime import datetime
from progressbar import progressbar
import matplotlib.pyplot as plt
# import 
# nltk.download('punkt')

In [2]:
def word_to_vector(this_year_token,pairwise_with_windows_list,mystopword,window_size):
    this_year_vs=[]
    for tokenized_article in progressbar(this_year_token,prefix="word to vector"):
        finder = nltk.BigramCollocationFinder.from_words([word for word in tokenized_article if word not in mystopword.stop_word],window_size=window_size)
        this_vs= {key: 0 for key in pairwise_with_windows_list}
        for pair,times in finder.ngram_fd.items():
            if pair in this_vs.keys():
                this_vs[pair]=times
        this_year_vs.append(this_vs)            
    return(this_year_vs)

def train(train_documents,vector_size=300):
    model = Doc2Vec(train_documents, vector_size=vector_size, window=4, min_count=2, workers=12)
    model.train(train_documents,total_examples=model.corpus_count,epochs=30)
    return(model)
def get_content(data_df):
    content=data_df.content
    content.index = pd.DatetimeIndex(content.index)
    content=content.dropna(how="any")
    return(content)
class Preprocessor:
    def __init__(self,stopword=[],use_stem=False,use_summarize=True,summarize_word_count=200):
        self.use_stem=use_stem
        self.use_summarize=use_summarize
        self.summarize_word_count=summarize_word_count
        self.stopword=stopword
    def stem_and_other_stuff(self,each_news):
        ps=PorterStemmer()
        return([ps.stem(word.lower()) for word in each_news.split(" ") if word.isalpha() and word not in self.stopword])
    
    def check_alpha_tolower(self,each_news):
        return([word.lower() for word in each_news.split(" ") if word.isalpha()])
        
    def get_tokenized_content(self,content):
        tokenized_content_s=content.apply(word_tokenize)
        if self.use_stem:        
            output_token=tokenized_content_s.apply(self.stem_and_other_stuff)
        else:
            output_token=tokenized_content_s.apply(self.check_alpha_tolower)
        return(output_token)
    
    def get_counter(self,content):
        tokenized_content_s=self.get_tokenized_content(content)
        content_counter=Counter()
        for aStemmed_token in tokenized_content_s:
            content_counter.update(aStemmed_token)
#             self.counter = content_counter
        return(content_counter)
    
    def get_summarize(self,content,summarize_ratio=None):
        if summarize_ratio:
            return(content.apply(lambda txt:summarize(txt,word_count = summarize_word_count)))
        else:
            return(content.apply(lambda txt:summarize(txt,word_count = self.summarize_word_count)))
#             return(content.apply(lambda txt:summarize(txt,ratio = self.summarize_ratio)))
    def preprocess(self,content):
        if self.use_summarize:
            content=content.loc[content.apply(clean_text_by_sentences).apply(list).apply(len).apply(lambda x:x>1)]
            content=self.get_summarize(content)
        content_counter=self.get_counter(content)
        return(content_counter)
        
        


class MyStopWord:
    def __init__(self,content_counter,most_common=100,stop_word=None):
        from nltk.corpus import stopwords
        self.counter_stop_word=[word for word,time in content_counter.most_common(most_common)]
        self.user_keep=[]
        self.user_define=[]
        if stop_word:
            self.stop_word=stop_word
        else:
            self.stop_word=set(self.counter_stop_word+stopwords.words('english')) 
    def keep(self,word):
        self.user_keep.append(word)
        self.stop_word.discard(word)
    def define(self,word):
        self.user_define.append(word)
        self.stop_word.add(word)

class Unigram:
    def __init__(self,target_counter,other_counter):
        self.target_counter = target_counter
        self.other_counter = other_counter
        
    def get_different_corpus_set(self,mystopword,TF_OTHER_THRESHOLD=20,TF_TARGET_THRESHOLD=5):
        other_corpus_set=set(key for key,times in self.other_counter.items() if times>TF_OTHER_THRESHOLD)-mystopword.stop_word
        target_corpus_set=set(key for key,times in self.target_counter.items() if times>TF_TARGET_THRESHOLD)-mystopword.stop_word
        self.different_corpus_set = target_corpus_set-other_corpus_set

class Bigram:
    def __init__(self,token):
        self.token = token
    def count_word_pair_with_windows(self,window_size,mystopword):
        stop_word = mystopword.stop_word
        self.pair_counts = Counter()
        self.pair_distance_counts = Counter()
        for tokens in self.token:
            for i in range(len(tokens) - 1):
                for distance in range(1, window_size):
                    if i + distance < len(tokens):
                        w1 = tokens[i]
                        w2 = tokens[i + distance]
                        if w1 not in stop_word and w2 not in stop_word:
                            self.pair_distance_counts[(w1, w2, distance)] += 1
                            self.pair_counts[(w1, w2)] += 1

In [17]:
# for time in a:
#     TRAIN_START_DATE=pd.to_datetime(time).strftime("%Y-%m")
#     TRAIN_INTIVAL=11
#     TEST_INTIVAL=1

#     from_date = TRAIN_START_DATE
#     to_date = str(np.datetime64(TRAIN_START_DATE)+np.timedelta64(TRAIN_INTIVAL,'M'))
#     test_from_date=to_date
#     test_to_date=str(np.datetime64(to_date)+np.timedelta64(TEST_INTIVAL,'M'))
#     print("from_date",from_date,"to_date",to_date,"test_from_date",test_from_date,"test_to_date",test_to_date)

TRAIN_START_DATE = "2017-12"

TRAIN_INTIVAL = 4
TEST_INTIVAL = 1

from_date = TRAIN_START_DATE
to_date = str(np.datetime64(TRAIN_START_DATE) +
              np.timedelta64(TRAIN_INTIVAL, 'M'))
test_from_date = str(np.datetime64(to_date) +
              np.timedelta64(1, 'M'))
test_to_date = str(np.datetime64(test_from_date)+np.timedelta64(TEST_INTIVAL, 'M'))

print(from_date,to_date,test_from_date,test_to_date)
window_size=5

2017-12 2018-04 2018-05 2018-06


In [18]:
raw_df = pd.read_csv("../../data/crawler_news_data/oilprice_news.csv")
raw_df_cnbc = pd.read_csv("../../data/crawler_news_data/cnbc_oil_news.csv")
data_df=raw_df.sort_values(by="publish_datetime",ascending=True).set_index('publish_datetime')
data_df_cnbc = raw_df_cnbc.sort_values(by="story_publish_datetime",ascending=True).set_index('story_publish_datetime')
data_df_oilprice = pd.DataFrame({"date":raw_df.publish_datetime,"content":raw_df.content})
data_df_cnbc = pd.DataFrame({"date":raw_df_cnbc.story_publish_datetime,"content":raw_df_cnbc.story_full_article})
data_df_oilprice_cnbc = data_df_oilprice.append(data_df_cnbc)
data_df_oilprice_cnbc = data_df_oilprice_cnbc.sort_values(by="date",ascending=True).set_index('date')
raw_content = get_content(data_df_oilprice_cnbc)
train_content = raw_content[from_date:to_date]

## 建立stopword

In [19]:
all_year_preprocessor = Preprocessor(content=train_content)
all_year_preprocessor.to_counter()
mystopword=MyStopWord(content_counter=all_year_preprocessor.counter,most_common=100)
mystopword.define('c')
mystopword.keep('demand')


to counter100% (635 of 635) |############| Elapsed Time: 0:00:00 Time:  0:00:00


## 用Target corpus - Other corpus find dictionary

### 1. effectivate news date

In [20]:
effective_news_df = pd.read_csv("../../data/crude_oil_price/effective_news_date_percentage_positive.csv")
effective_news_date = effective_news_df['date']
effective_news_date=pd.DatetimeIndex(effective_news_date)

### 2. find target and other corpus

In [21]:
train_content = raw_content[from_date:to_date]
target_content = train_content.loc[train_content.index.isin(effective_news_date.values)]
other_content = train_content.loc[~train_content.index.isin(effective_news_date.values)]
target_preprocessor = Preprocessor(content = target_content)
other_preprocessor = Preprocessor(content= other_content)
target_preprocessor.to_counter()
other_preprocessor.to_counter()

to counter100% (53 of 53) |##############| Elapsed Time: 0:00:00 Time:  0:00:00
to counter100% (582 of 582) |############| Elapsed Time: 0:00:00 Time:  0:00:00


### 3. find bigram dictionary

In [22]:
TF_TARGET_THRESHOLD=1
OTHER_TARGET_THRESHOLD=5

target_bigram = Bigram(token=target_preprocessor.tokens)
other_bigram = Bigram(token=other_preprocessor.tokens)

target_bigram.count_word_pair_with_windows(mystopword=mystopword,window_size=window_size)
other_bigram.count_word_pair_with_windows(mystopword=mystopword,window_size=window_size)

target_corpus_set=set([key for key,times in target_bigram.pair_counts.most_common(1000) if times>TF_TARGET_THRESHOLD])
other_corpus_set=set([key for key,times in other_bigram.pair_counts.items() if times>OTHER_TARGET_THRESHOLD])

pairwise_dictionary = target_corpus_set - other_corpus_set
print("len(pairwise_dictionary)",len(pairwise_dictionary))
# pairwise_dictionary=[pair for pair,count in pairwise_dictionary]
# pairwise_dictionary = set(target_bigram.pair_counts.most_common(100)) - set(other_bigram.pair_counts)
# pairwise_dictionary=[pair for pair,count in pairwise_dictionary]

len(pairwise_dictionary) 852


## 4. word to vector

In [23]:
train_preprocessor = Preprocessor(content=train_content)
train_preprocessor.to_counter()
train_vs = word_to_vector(train_preprocessor.tokens,pairwise_dictionary,mystopword,window_size)
train_vs_df=pd.DataFrame(train_vs)
train_vs_df=train_vs_df.set_index(pd.DatetimeIndex(train_content.index))

to counter100% (635 of 635) |############| Elapsed Time: 0:00:00 Time:  0:00:00
word to vector100% (635 of 635) |########| Elapsed Time: 0:00:01 Time:  0:00:01


In [24]:
print("target:",train_vs_df.loc[train_content.index.isin(effective_news_date.values)].sum(axis=1).mean())
print("other:",train_vs_df.loc[~train_content.index.isin(effective_news_date.values)].sum(axis=1).mean())

target: 43.60377358490566
other: 0.9518900343642611


In [25]:
train_vs_df['tags'] = 0
train_vs_df.loc[train_vs_df.index.isin(effective_news_date.values),'tags']=1
train_vs_df.to_csv("../../data/train_test_dataset/oilprice_cnbc_new_train.csv")

## Generate Test Data

In [26]:
if len(test_to_date):
    test_content = raw_content[test_from_date:test_to_date]
else:
    test_content = raw_content[test_from_date:]
# effective_date = pd.read_csv("../../data/crude_oil_price/effective_news_date_percentage_positive.csv")


In [31]:
test_content

date
2018-05-01                                                     
2018-05-01    Apple's second-quarter earnings beat on Tuesda...
2018-05-01    The increasingly volatile stock market has for...
2018-05-01    Iran's fragile economic recovery is in jeopard...
2018-05-01    Oil prices popped on Wednesday as the U.S. dol...
2018-05-01    Oil prices look to have climbed to unsustainab...
2018-05-01    BP beat analyst expectations on Tuesday, as hi...
2018-05-01    BP's finance chief does not believe there is a...
2018-05-02    The U.S. dollar has gained strength in recent ...
2018-05-02    China has become the world’s largest oil impor...
2018-05-02    “$60 is like the new $100,” Dallas Fed economi...
2018-05-02    Oil-importing countries in the Middle East and...
2018-05-02    Higher oil prices won't change the pace of Sau...
2018-05-02    Oil prices rose on Thursday, boosted by OPEC p...
2018-05-02    After yesterday the American Petroleum Institu...
2018-05-03    China’s Sinopec will 

In [27]:
test_preprocessor = Preprocessor(content=test_content)
test_preprocessor.to_counter()
test_vs = word_to_vector(test_preprocessor.tokens,pairwise_dictionary,mystopword,window_size)
test_vs_df=pd.DataFrame(test_vs)
test_vs_df=test_vs_df.set_index(pd.DatetimeIndex(test_content.index))
test_vs_df['tags'] = 0
###############挫賽了拉
test_vs_df.loc[test_vs_df.index.isin(effective_news_date.values),'tags'] =1
###############挫賽了拉
test_vs_df.to_csv("../../data/train_test_dataset/oilprice_cnbc_new_test.csv")

to counter100% (664 of 664) |############| Elapsed Time: 0:00:00 Time:  0:00:00
word to vector100% (664 of 664) |########| Elapsed Time: 0:00:01 Time:  0:00:01


In [28]:
print("target:",test_vs_df.loc[test_content.index.isin(effective_news_date.values)].sum(axis=1).mean())
print("other:",test_vs_df.loc[~test_content.index.isin(effective_news_date.values)].sum(axis=1).mean())

target: 1.9150943396226414
other: 0.8297491039426523


## do some experiment

In [None]:
test_vs_sum=test_vs_df.loc[~test_content.index.isin(effective_news_date.values)].sum(axis=1)

In [None]:
train_preprocessor.tokens