In [1]:
import nltk,re
import pandas as pd
from nltk import sent_tokenize,word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from pprint import pprint
from collections import Counter
from math import *
from datetime import datetime
# nltk.download('punkt')

In [12]:
def get_content(data_df):
    content=data_df.content
    content.index = pd.DatetimeIndex(content.index)
    content=content.dropna(how="any")
    return(content)
class Preprocessor:
    def __init__(self,path="",content=False):
        if len(path)>1:
            raw_df = pd.read_csv(path)
            self.data_df = raw_df.sort_values(by="publish_datetime",ascending=True).set_index('publish_datetime')
            content=self.data_df.content
            content.index = pd.DatetimeIndex(content.index)
            content=content.dropna(how="any")
            self.content = content
        else:
            self.content = content
    def stem_and_other_stuff(self,each_news):
        ps=PorterStemmer()
        return([ps.stem(word.lower()) for word in each_news if word.isalpha()])
    def check_alpha_tolower(self,each_news):
        return([word.lower() for word in each_news if word.isalpha()])
    def get_content_from_date(self,from_date,to_date):
        self.content = self.content[from_date:to_date]
    def to_counter(self,stem=False):
        self.token_content=self.content.apply(word_tokenize)
        if stem:        
            self.tokens=self.token_content.apply(self.stem_and_other_stuff)
        else:
            self.tokens=self.token_content.apply(self.check_alpha_tolower)
        content_counter = Counter()
        for news in self.tokens:
            content_counter.update(news)
        self.counter = content_counter


class MyStopWord:
    def __init__(self,content_counter,most_common=100,stop_word=None):
        from nltk.corpus import stopwords
        self.counter_stop_word=[word for word,time in content_counter.most_common(most_common)]
        self.user_keep=[]
        self.user_define=[]
        if stop_word:
            self.stop_word=stop_word
        else:
            self.stop_word=set(self.counter_stop_word+stopwords.words('english')) 
    def keep(self,word):
        self.user_keep.append(word)
        self.stop_word.discard(word)
    def define(self,word):
        self.user_define.append(word)
        self.stop_word.add(word)

class Unigram:
    def __init__(self,target_counter,other_counter):
        self.target_counter = target_counter
        self.other_counter = other_counter
        
    def get_different_corpus_set(self,mystopword,TF_OTHER_THRESHOLD=20,TF_TARGET_THRESHOLD=5):
        other_corpus_set=set(key for key,times in self.other_counter.items() if times>TF_OTHER_THRESHOLD)-mystopword.stop_word
        target_corpus_set=set(key for key,times in self.target_counter.items() if times>TF_TARGET_THRESHOLD)-mystopword.stop_word
        self.different_corpus_set = target_corpus_set-other_corpus_set

class Bigram:
    def __init__(self,token):
        self.token = token
    def count_word_pair_with_windows(self,window_size,mystopword):
        stop_word = mystopword.stop_word
        self.pair_counts = Counter()
        self.pair_distance_counts = Counter()
        for tokens in self.token:
            for i in range(len(tokens) - 1):
                for distance in range(1, window_size):
                    if i + distance < len(tokens):
                        w1 = tokens[i]
                        w2 = tokens[i + distance]
                        if w1 not in stop_word and w2 not in stop_word:
                            self.pair_distance_counts[(w1, w2, distance)] += 1
                            self.pair_counts[(w1, w2)] += 1

In [81]:
raw_df = pd.read_csv("../../data/crawler_news_data/oilprice_news.csv")
raw_df_cnbc = pd.read_csv("../../data/crawler_news_data/cnbc_oil_news.csv")
data_df=raw_df.sort_values(by="publish_datetime",ascending=True).set_index('publish_datetime')
data_df_cnbc = raw_df_cnbc.sort_values(by="story_publish_datetime",ascending=True).set_index('story_publish_datetime')
data_df_oilprice = pd.DataFrame({"date":raw_df.publish_datetime,"content":raw_df.content})
data_df_cnbc = pd.DataFrame({"date":raw_df_cnbc.story_publish_datetime,"content":raw_df_cnbc.story_full_article})
data_df_oilprice_cnbc = data_df_oilprice.append(data_df_cnbc)
data_df_oilprice_cnbc = data_df_oilprice_cnbc.sort_values(by="date",ascending=True).set_index('date')
raw_content = get_content(data_df_oilprice_cnbc)

In [88]:
from_date = "2017-11"
to_date = "2018-09"
train_content = raw_content[from_date:to_date]

## 建立stopword

In [89]:
all_year_preprocessor = Preprocessor(content=train_content)
all_year_preprocessor.to_counter()
mystopword=MyStopWord(content_counter=all_year_preprocessor.counter,most_common=87)
mystopword.define('c')
mystopword.keep('demand')

## 用Target corpus - Other corpus find dictionary

### 1. effectivate news date

In [101]:
effective_news_df = pd.read_csv("../../data/crude_oil_price/effective_news_date_from_2013_3d_rolling.csv")
effective_news_date = effective_news_df['date']
effective_news_date=pd.DatetimeIndex(effective_news_date)
effective_news_date

DatetimeIndex(['2013-11-29', '2013-12-02', '2013-12-31', '2014-01-01',
               '2014-01-20', '2014-02-05', '2014-02-17', '2014-02-27',
               '2014-03-03', '2014-03-07',
               ...
               '2018-08-07', '2018-09-10', '2018-09-27', '2018-10-09',
               '2018-10-16', '2018-10-19', '2018-10-22', '2018-10-30',
               '2018-11-09', '2018-11-12'],
              dtype='datetime64[ns]', name='date', length=182, freq=None)

### 2. find target and other corpus

In [91]:
from_date = "2017-11"
to_date = "2018-09"
train_content = raw_content[from_date:to_date]
target_content = train_content.loc[train_content.index.isin(effective_news_date.values)]
other_content = train_content.loc[~train_content.index.isin(effective_news_date.values)]
target_preprocessor = Preprocessor(content = target_content)
other_preprocessor = Preprocessor(content= other_content)
target_preprocessor.to_counter()
other_preprocessor.to_counter()

### 3. find bigram dictionary

In [92]:
window_size = 5
target_bigram = Bigram(token=target_preprocessor.tokens)
other_bigram = Bigram(token=other_preprocessor.tokens)
target_bigram.count_word_pair_with_windows(mystopword=mystopword,window_size=window_size)
other_bigram.count_word_pair_with_windows(mystopword=mystopword,window_size=window_size)
pairwise_dictionary = set(target_bigram.pair_counts.most_common(100)) - set(other_bigram.pair_counts)
pairwise_dictionary=[pair for pair,count in pairwise_dictionary]

## 4. word to vector

In [93]:
def word_to_vector(this_year_token,pairwise_with_windows_list,mystopword,window_size):
    this_year_vs=[]
    for tokenized_article in this_year_token:
        finder = nltk.BigramCollocationFinder.from_words([word for word in tokenized_article if word not in mystopword.stop_word],window_size=window_size)
        this_vs= {key: 0 for key in pairwise_with_windows_list}
        for pair,times in finder.ngram_fd.items():
            if pair in this_vs.keys():
                this_vs[pair]=times
        this_year_vs.append(this_vs)            
    return(this_year_vs)

In [94]:
train_preprocessor = Preprocessor(content=train_content)
train_preprocessor.to_counter()
train_vs = word_to_vector(train_preprocessor.tokens,pairwise_dictionary,mystopword,window_size)
train_vs_df=pd.DataFrame(train_vs)
train_vs_df=train_vs_df.set_index(pd.DatetimeIndex(train_content.index))

In [95]:
print("target:",train_vs_df.loc[effective_news_date.values].sum(axis=1).mean())
print("other:",train_vs_df.loc[~train_content.index.isin(effective_news_date.values)].sum(axis=1).mean())

target: 10.72549019607843
other: 9.175970228601807


Passing list-likes to .loc or [] with any missing label will raise
KeyError in the future, you can use .reindex() as an alternative.

See the documentation here:
https://pandas.pydata.org/pandas-docs/stable/indexing.html#deprecate-loc-reindex-listlike
  """Entry point for launching an IPython kernel.


In [96]:
train_vs_df['tags'] = 0
train_vs_df.loc[train_vs_df.index.isin(effective_news_date.values),'tags']=1
train_vs_df.to_csv("../../data/train_test_dataset/oilprice_cnbc_new_train.csv")

## Generate Test Data

In [97]:
window_size=5
test_content = raw_content["2018-10-01":]
effective_date = pd.read_csv("../../data/crude_oil_price/effective_news_date_from_2013.csv")
test_target_date = effective_date.loc[effective_date.date>="2018-10-01","date"]

In [98]:
test_preprocessor = Preprocessor(content=test_content)
test_preprocessor.to_counter()
test_vs = word_to_vector(test_preprocessor.tokens,pairwise_dictionary,mystopword,window_size)
test_vs_df=pd.DataFrame(test_vs)
test_vs_df=test_vs_df.set_index(pd.DatetimeIndex(test_content.index))
test_vs_df['tags'] = 0
test_vs_df.loc[test_vs_df.index.isin(test_target_date)] =1
test_vs_df.to_csv("../../data/train_test_dataset/oilprice_cnbc_new_test.csv")

In [99]:
print("target:",test_vs_df.loc[effective_news_date.values].sum(axis=1).mean())
print("other:",test_vs_df.loc[~test_content.index.isin(effective_news_date.values)].sum(axis=1).mean())

target: 45.15172413793103
other: 20.21014492753623


Passing list-likes to .loc or [] with any missing label will raise
KeyError in the future, you can use .reindex() as an alternative.

See the documentation here:
https://pandas.pydata.org/pandas-docs/stable/indexing.html#deprecate-loc-reindex-listlike
  """Entry point for launching an IPython kernel.
