In [15]:
import nltk,re
import pandas as pd
import numpy as np
from pprint import pprint
from collections import Counter
from math import *
from datetime import datetime
from progressbar import progressbar

from nltk import sent_tokenize,word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.sentiment.vader import SentimentIntensityAnalyzer

from gensim.summarization.summarizer import summarize
from gensim.summarization import keywords
from gensim.summarization.textcleaner import get_sentences
from gensim.summarization.textcleaner import clean_text_by_sentences
from gensim.models.doc2vec import Doc2Vec,TaggedDocument

from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
# Experiment window.  Every boundary is a "YYYY-MM" string obtained from
# TRAIN_START_DATE by month arithmetic on numpy datetime64 values.
TRAIN_START_DATE = "2017-12"

TRAIN_INTIVAL = 7  # months between the training start and the training end
TEST_INTIVAL = 2   # months between the test start and the test end

from_date = TRAIN_START_DATE
to_date = str(np.timedelta64(TRAIN_INTIVAL, 'M') + np.datetime64(from_date))
# The test window opens one month after the training window closes.
test_from_date = str(np.timedelta64(1, 'M') + np.datetime64(to_date))
test_to_date = str(np.timedelta64(TEST_INTIVAL, 'M') + np.datetime64(test_from_date))

window_size = 5

print(from_date, to_date, test_from_date, test_to_date)

2017-12 2018-07 2018-08 2018-10


In [59]:
def train(train_documents, vector_size=300, epochs=30, workers=12):
    """Fit a Doc2Vec model on a corpus of tagged documents.

    The model is created *without* a corpus, the vocabulary is built
    explicitly, and ``train`` is called exactly once.  Passing the corpus to
    the ``Doc2Vec`` constructor and then calling ``train`` again (as this
    function used to do) runs two training passes over the same documents.

    Parameters
    ----------
    train_documents : re-iterable of TaggedDocument
        Training corpus.  It is iterated several times (vocabulary scan plus
        the training epochs), so a one-shot generator is not suitable.
    vector_size : int, default 300
        Dimensionality of the learned document vectors.
    epochs : int, default 30
        Number of training passes over the corpus.
    workers : int, default 12
        Number of worker threads handed to gensim.

    Returns
    -------
    Doc2Vec
        The trained model.
    """
    model = Doc2Vec(vector_size=vector_size, window=4, min_count=2,
                    workers=workers, epochs=epochs)
    model.build_vocab(train_documents)
    model.train(train_documents, total_examples=model.corpus_count, epochs=epochs)
    return model
def get_content(data_df):
    """Return the ``content`` column as a datetime-indexed Series without NaNs.

    Parameters
    ----------
    data_df : pandas.DataFrame
        Frame with a ``content`` column and an index that
        ``pd.DatetimeIndex`` can parse.

    Returns
    -------
    pandas.Series
        A new Series; rows whose content is missing are dropped.
    """
    # Work on a copy: assigning ``.index`` on the Series handed out by the
    # frame would otherwise modify an object the caller's frame may still
    # hold on to.
    content = data_df["content"].copy()
    content.index = pd.DatetimeIndex(content.index)
    return content.dropna()
class Preprocessor:
    """Tokenise, optionally stem and summarise, and count words of news texts.

    Parameters
    ----------
    stopword : collection of str, optional
        Words removed by ``stem_and_other_stuff``.  Stored by reference when
        given; a fresh empty list is used otherwise.
    use_stem : bool, default False
        When true, ``get_tokenized_content`` stems tokens and removes stop
        words; otherwise it only lower-cases alphabetic tokens.
    use_summarize : bool, default True
        When true, ``preprocess`` summarises every text before counting.
    summarize_word_count : int, default 200
        ``word_count`` passed to gensim's ``summarize``.
    """

    def __init__(self, stopword=None, use_stem=False, use_summarize=True, summarize_word_count=200):
        self.use_stem = use_stem
        self.use_summarize = use_summarize
        self.summarize_word_count = summarize_word_count
        # None instead of a literal [] default so instances never share one list.
        self.stopword = [] if stopword is None else stopword

    @staticmethod
    def _as_tokens(each_news):
        """Return a token list for either a raw string or an iterable of tokens."""
        if isinstance(each_news, str):
            return each_news.split(" ")
        return list(each_news)

    def stem_and_other_stuff(self, each_news):
        """Stem the alphabetic, non-stop-word tokens of one document.

        ``each_news`` may be a raw string (split on single spaces) or an
        already tokenised list.  A token is dropped when either its original
        or its lower-cased form is in ``self.stopword``.
        """
        ps = PorterStemmer()
        return [
            ps.stem(word.lower())
            for word in self._as_tokens(each_news)
            if word.isalpha()
            and word not in self.stopword
            and word.lower() not in self.stopword
        ]

    def check_alpha_tolower(self, each_news):
        """Lower-case the alphabetic tokens of one document.

        ``each_news`` may be a raw string (split on single spaces) or an
        already tokenised list.
        """
        return [word.lower() for word in self._as_tokens(each_news) if word.isalpha()]

    def get_tokenized_content(self, content):
        """Tokenise every text of ``content`` (a Series of strings).

        Returns a Series of token lists, cleaned by ``stem_and_other_stuff``
        when ``use_stem`` is set and by ``check_alpha_tolower`` otherwise.
        """
        tokenized_content_s = content.apply(word_tokenize)
        cleaner = self.stem_and_other_stuff if self.use_stem else self.check_alpha_tolower
        return tokenized_content_s.apply(cleaner)

    def get_counter(self, content):
        """Return a ``Counter`` of all cleaned tokens across ``content``."""
        content_counter = Counter()
        for doc_tokens in self.get_tokenized_content(content):
            content_counter.update(doc_tokens)
        return content_counter

    def get_summarize(self, content, summarize_ratio=None):
        """Summarise every text of ``content`` with gensim's ``summarize``.

        With a truthy ``summarize_ratio`` the summary length is set by
        ``ratio``; otherwise it is capped at ``self.summarize_word_count``
        words.
        """
        if summarize_ratio:
            return content.apply(lambda txt: summarize(txt, ratio=summarize_ratio))
        return content.apply(lambda txt: summarize(txt, word_count=self.summarize_word_count))

    def preprocess(self, content):
        """Optionally summarise ``content`` and return its token ``Counter``."""
        if self.use_summarize:
            # Keep only texts with more than one detected sentence before
            # summarising (presumably because ``summarize`` cannot handle
            # single-sentence input -- confirm against the gensim version used).
            has_many_sentences = content.apply(
                lambda txt: len(list(clean_text_by_sentences(txt))) > 1
            )
            content = self.get_summarize(content.loc[has_many_sentences])
        return self.get_counter(content)
        
        


class MyStopWord:
    """Editable stop-word set seeded from corpus frequencies and NLTK.

    Parameters
    ----------
    content_counter : collections.Counter
        Token counts; its ``most_common`` top entries become
        ``counter_stop_word``.
    most_common : int, default 100
        How many of the most frequent tokens to take from the counter.
    stop_word : iterable of str, optional
        Explicit stop words.  When given (and non-empty) they are copied into
        a new set and used *instead of* the default set.  When omitted or
        empty, the stop-word set is the union of ``counter_stop_word`` and
        NLTK's English stop words.

    Attributes
    ----------
    counter_stop_word : list of str
    stop_word : set of str
    user_keep, user_define : list of str
        Words passed to ``keep`` / ``define``, in call order.
    """

    def __init__(self, content_counter, most_common=100, stop_word=None):
        self.counter_stop_word = [word for word, _count in content_counter.most_common(most_common)]
        self.user_keep = []
        self.user_define = []
        if stop_word:
            # Copy into a set: ``keep``/``define`` need set operations, and
            # the caller's collection must not be modified by them.
            self.stop_word = set(stop_word)
        else:
            # Imported here so the NLTK corpus is only required when the
            # default stop-word list is actually used.
            from nltk.corpus import stopwords
            self.stop_word = set(self.counter_stop_word + stopwords.words('english'))

    def keep(self, word):
        """Remove ``word`` from the stop-word set and record it in ``user_keep``."""
        self.user_keep.append(word)
        self.stop_word.discard(word)

    def define(self, word):
        """Add ``word`` to the stop-word set and record it in ``user_define``."""
        self.user_define.append(word)
        self.stop_word.add(word)

In [39]:
# Load the two crawled news sources and merge them into one date-indexed
# Series of article texts, then cut the train / test windows.
raw_df = pd.read_csv("../../data/crawler_news_data/oilprice_news.csv")
raw_df_cnbc = pd.read_csv("../../data/crawler_news_data/cnbc_oil_news.csv")
bdate = pd.bdate_range("2009", "2019")

# Timestamps that are not members of `bdate` are blanked and then
# forward-filled, i.e. replaced by the value of the preceding row.
raw_df["publish_datetime"] = pd.DatetimeIndex(raw_df["publish_datetime"])
raw_df.loc[~raw_df["publish_datetime"].isin(bdate), "publish_datetime"] = np.nan
raw_df["publish_datetime"] = raw_df["publish_datetime"].ffill()

raw_df_cnbc["story_publish_datetime"] = pd.DatetimeIndex(raw_df_cnbc["story_publish_datetime"])
raw_df_cnbc.loc[~raw_df_cnbc["story_publish_datetime"].isin(bdate), "story_publish_datetime"] = np.nan
raw_df_cnbc["story_publish_datetime"] = raw_df_cnbc["story_publish_datetime"].ffill()

data_df = raw_df.sort_values(by="publish_datetime", ascending=True).set_index("publish_datetime")

# Reduce both sources to the same two columns so they can be stacked.
data_df_oilprice = pd.DataFrame({"date": raw_df["publish_datetime"], "content": raw_df["content"]})
data_df_cnbc = pd.DataFrame({"date": raw_df_cnbc["story_publish_datetime"], "content": raw_df_cnbc["story_full_article"]})
# pd.concat replaces DataFrame.append, which no longer exists in pandas 2.x.
data_df_oilprice_cnbc = pd.concat([data_df_oilprice, data_df_cnbc])
data_df_oilprice_cnbc = data_df_oilprice_cnbc.sort_values(by="date", ascending=True).set_index("date")

raw_content = get_content(data_df_oilprice_cnbc)
# Keep only articles in which more than one sentence is detected.
raw_content = raw_content.loc[raw_content.apply(lambda txt: len(list(clean_text_by_sentences(txt))) > 1)]
raw_train_content = raw_content[from_date:to_date]
raw_test_content = raw_content[test_from_date:test_to_date]

In [103]:
# Price table of "effective" news dates; the parsed `date` column becomes the
# index and is also kept separately for labelling document vectors.
effective_news_df = pd.read_csv(
    "../../data/crude_oil_price/effective_news_date_days_before_and_after.csv"
)
effective_news_df["date"] = pd.DatetimeIndex(effective_news_df["date"])
effective_news_df = effective_news_df.set_index("date")
effective_news_date = effective_news_df.index
effective_news_df.head()

Unnamed: 0_level_0,Unnamed: 0,latest,open,high,low,quantity,percentage,two_day_percentage
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2013-10-28,1331,98.68,97.88,98.82,97.37,230.16K,0.85,-0.019355
2013-10-30,1329,96.77,97.77,97.82,96.55,268.14K,-1.46,-0.022321
2013-11-08,1322,94.6,94.36,94.92,93.9,241.69K,0.42,-0.01649
2013-11-11,1321,95.14,94.45,95.38,94.11,206.72K,0.57,-0.013244
2013-11-19,1315,93.34,93.0,93.46,92.43,131.46K,0.33,0.022498


In [40]:
# Count tokens over the whole corpus with a default-configured Preprocessor,
# then seed a stop-word collection from the 300 most frequent tokens.
preprocesser = Preprocessor()
all_year_counter = preprocesser.preprocess(raw_content)
mystopword = MyStopWord(all_year_counter, most_common=300)

In [42]:
# Tokenise every article, cut the train / test windows by date, and tag each
# training document with its position in the training window.
doc_word_list = preprocesser.get_tokenized_content(raw_content)
train_word_list = doc_word_list[from_date:to_date]
test_word_list = doc_word_list[test_from_date:test_to_date]
train_documents = [
    TaggedDocument(words=tokens, tags=[position])
    for position, tokens in enumerate(train_word_list)
]

In [60]:
# Fit the Doc2Vec model on the tagged training documents (default vector size).
model = train(train_documents)

In [126]:
# One row per training document: the vector learned during training, indexed
# by the document's date.  `tags` is 1 when that date is an effective-news
# date and 0 otherwise.
train_vs_df = pd.DataFrame(model.docvecs.vectors_docs)
train_vs_df = train_vs_df.set_index(train_word_list.index)
train_vs_df['tags'] = train_vs_df.index.isin(effective_news_date).astype('int64')
train_vs_df.to_csv("../../data/train_test_dataset/oilprice_cnbc_new_train.csv")

len(train_vs_df['tags'])

1644

In [127]:
# One row per test document: a vector inferred by the trained model
# (alpha=0.01, steps=500), indexed by the document's date.  `tags` is 1 when
# that date is an effective-news date and 0 otherwise.
test_vs_np = np.array([
    model.infer_vector(doc_words=tokens, alpha=0.01, steps=500)
    for tokens in test_word_list
])
test_vs_df = pd.DataFrame(test_vs_np)
test_vs_df = test_vs_df.set_index(test_word_list.index)
test_vs_df['tags'] = test_vs_df.index.isin(effective_news_date).astype('int64')
test_vs_df.to_csv("../../data/train_test_dataset/oilprice_cnbc_new_test.csv")

len(test_vs_df['tags'])

893

In [129]:
# Display (rows, columns) of the test frame: the vector columns plus `tags`.
test_vs_df.shape

(893, 301)