In [115]:
import pandas as pd
import numpy as np
from sklearn import svm
from sklearn.metrics import classification_report,confusion_matrix,consensus_score
from sklearn.preprocessing import Normalizer
from sklearn.feature_selection import SelectKBest,chi2,mutual_info_classif,f_classif,SelectPercentile
from sklearn.decomposition import PCA
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from imblearn.over_sampling import SMOTE,RandomOverSampler,ADASYN
from imblearn.under_sampling import AllKNN

In [116]:
test_df = pd.read_csv("../../data/train_test_dataset/oilprice_cnbc_new_summarize_test.csv")
train_df = pd.read_csv("../../data/train_test_dataset/oilprice_cnbc_new_summarize_train.csv")
train_df.index = pd.DatetimeIndex(train_df.date)
test_df.index = pd.DatetimeIndex(test_df.date)

## Include history price variables

In [76]:
history_price_df = pd.read_csv("../../data/crude_oil_price/history_price_clean_from2013.csv")
history_price_df.index = pd.DatetimeIndex(history_price_df.date)

In [135]:
history_10_mean = history_price_df['latest'].rolling(10).mean()
history_10_std = history_price_df['latest'].rolling(10).std()
history_5_mean = history_price_df['latest'].rolling(5).mean()
history_5_std = history_price_df['latest'].rolling(5).std()
history_3_mean = history_price_df['latest'].rolling(3).mean()
history_3_std = history_price_df['latest'].rolling(3).std()
# history_quantity = [ float(x[:-1]) for x in history_price_df.quantity if len(x)>1]
history_price_statistic_df = pd.DataFrame({"10_mean":history_10_mean,"10_std":history_10_std,"5_mean":history_5_mean,"5_std":history_5_std,"3_mean":history_3_mean,"3_std":history_3_std})
history_price_statistic_df=history_price_statistic_df#.fillna(method='bfill',axis=1).fillna(method='ffill',axis=1)

train_df = pd.concat([train_df, history_price_statistic_df], axis=1, join_axes=[train_df.index]).fillna(method='bfill',axis=0).fillna(method='ffill',axis=0)
test_df = pd.concat([test_df, history_price_statistic_df], axis=1, join_axes=[test_df.index]).fillna(method='bfill',axis=0).fillna(method='ffill',axis=0)

## To train test numpy

In [78]:
x_drop = ['tags','date']
train_x = train_df.drop(x_drop,axis=1).values.astype(float)
train_y = train_df['tags'].values.astype(float)
test_x = test_df.drop(x_drop,axis=1).values.astype(float)
test_y = test_df['tags'].values.astype(float)
print("train_x.shape",train_x.shape)
print("test_x.shape",test_x.shape)

train_x.shape (1925, 730)
test_x.shape (572, 730)


In [79]:
sm = SMOTE()
train_x, train_y = sm.fit_resample(train_x,train_y)
print("X_res.shape",train_x.shape)
print("y_res.shape",train_y.shape)

X_res.shape (2666, 730)
y_res.shape (2666,)


In [84]:
# pca=PCA()
# pca.fit(train_x)
# train_x = pca.transform(train_x)
# test_x = pca.transform(test_x)

selector=SelectFpr(chi2)
train_x=selector.fit_transform(train_x,train_y)
test_x=selector.transform(test_x)
# selector=SelectPercentile(mutual_info_classif,80)
# train_x=selector.fit_transform(train_x,train_y)
# test_x=selector.transform(test_x)

normalizer = Normalizer()
train_x = normalizer.fit_transform(train_x)
test_x = normalizer.transform(test_x)


print("train_x.shape",train_x.shape)
print("test_x.shape",test_x.shape)

train_x.shape (2666, 350)
test_x.shape (572, 350)


In [85]:
# Perform classification with SVM, kernel=linear 
svc_model = svm.LinearSVC(C=6000) 
svc_model.fit(train_x, train_y) 
prediction_svm = svc_model.predict(test_x)
print("svc_model")
print (classification_report(test_y, prediction,))
print(confusion_matrix(test_y, prediction))

nb_model=MultinomialNB(alpha=0.0001)
nb_model.fit(train_x, train_y) 
prediction_nb_model = nb_model.predict(test_x)
print("nb_model")
print (classification_report(test_y, prediction,))
print(confusion_matrix(test_y, prediction))

rfc_model=RandomForestClassifier()
rfc_model.fit(train_x, train_y) 
prediction = rfc_model.predict(test_x)
print("rfc_model")
print (classification_report(test_y, prediction,))
print(confusion_matrix(test_y, prediction))

svc_model
              precision    recall  f1-score   support

         0.0       0.71      0.73      0.72       429
         1.0       0.10      0.09      0.10       143

   micro avg       0.57      0.57      0.57       572
   macro avg       0.40      0.41      0.41       572
weighted avg       0.56      0.57      0.56       572

[[314 115]
 [130  13]]
nb_model
              precision    recall  f1-score   support

         0.0       0.71      0.73      0.72       429
         1.0       0.10      0.09      0.10       143

   micro avg       0.57      0.57      0.57       572
   macro avg       0.40      0.41      0.41       572
weighted avg       0.56      0.57      0.56       572

[[314 115]
 [130  13]]
rfc_model
              precision    recall  f1-score   support

         0.0       0.76      0.82      0.79       429
         1.0       0.29      0.22      0.25       143

   micro avg       0.67      0.67      0.67       572
   macro avg       0.53      0.52      0.52       572



In [57]:
for c in range(1000,30000,1000):
    svc_model = svm.LinearSVC(C=c) 
    svc_model.fit(train_x, train_y) 
    prediction = svc_model.predict(test_x)
    print("svc_model",c)
    print (classification_report(test_y, prediction,))
    print(confusion_matrix(test_y, prediction))



svc_model 1000
              precision    recall  f1-score   support

         0.0       0.74      0.75      0.75       429
         1.0       0.23      0.22      0.23       143

   micro avg       0.62      0.62      0.62       572
   macro avg       0.49      0.49      0.49       572
weighted avg       0.62      0.62      0.62       572

[[322 107]
 [111  32]]
svc_model 2000
              precision    recall  f1-score   support

         0.0       0.74      0.76      0.75       429
         1.0       0.22      0.20      0.21       143

   micro avg       0.62      0.62      0.62       572
   macro avg       0.48      0.48      0.48       572
weighted avg       0.61      0.62      0.62       572

[[328 101]
 [115  28]]




svc_model 3000
              precision    recall  f1-score   support

         0.0       0.47      0.02      0.04       429
         1.0       0.24      0.93      0.38       143

   micro avg       0.25      0.25      0.25       572
   macro avg       0.36      0.48      0.21       572
weighted avg       0.42      0.25      0.13       572

[[  9 420]
 [ 10 133]]
svc_model 4000
              precision    recall  f1-score   support

         0.0       0.75      0.28      0.40       429
         1.0       0.25      0.72      0.37       143

   micro avg       0.39      0.39      0.39       572
   macro avg       0.50      0.50      0.39       572
weighted avg       0.62      0.39      0.39       572

[[118 311]
 [ 40 103]]




svc_model 5000
              precision    recall  f1-score   support

         0.0       0.67      0.09      0.16       429
         1.0       0.24      0.86      0.38       143

   micro avg       0.28      0.28      0.28       572
   macro avg       0.45      0.48      0.27       572
weighted avg       0.56      0.28      0.22       572

[[ 40 389]
 [ 20 123]]
svc_model 6000
              precision    recall  f1-score   support

         0.0       0.00      0.00      0.00       429
         1.0       0.25      0.99      0.40       143

   micro avg       0.25      0.25      0.25       572
   macro avg       0.12      0.50      0.20       572
weighted avg       0.06      0.25      0.10       572

[[  0 429]
 [  1 142]]
svc_model 7000
              precision    recall  f1-score   support

         0.0       0.75      1.00      0.86       429
         1.0       0.00      0.00      0.00       143

   micro avg       0.75      0.75      0.75       572
   macro avg       0.38      0.50    

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


svc_model 8000
              precision    recall  f1-score   support

         0.0       0.75      1.00      0.86       429
         1.0       0.00      0.00      0.00       143

   micro avg       0.75      0.75      0.75       572
   macro avg       0.38      0.50      0.43       572
weighted avg       0.56      0.75      0.64       572

[[429   0]
 [143   0]]
svc_model 9000
              precision    recall  f1-score   support

         0.0       0.00      0.00      0.00       429
         1.0       0.25      0.99      0.40       143

   micro avg       0.25      0.25      0.25       572
   macro avg       0.12      0.49      0.20       572
weighted avg       0.06      0.25      0.10       572

[[  0 429]
 [  2 141]]




svc_model 10000
              precision    recall  f1-score   support

         0.0       0.00      0.00      0.00       429
         1.0       0.25      0.99      0.40       143

   micro avg       0.25      0.25      0.25       572
   macro avg       0.12      0.50      0.20       572
weighted avg       0.06      0.25      0.10       572

[[  0 429]
 [  1 142]]


  'precision', 'predicted', average, warn_for)


svc_model 11000
              precision    recall  f1-score   support

         0.0       0.75      1.00      0.86       429
         1.0       0.00      0.00      0.00       143

   micro avg       0.75      0.75      0.75       572
   macro avg       0.38      0.50      0.43       572
weighted avg       0.56      0.75      0.64       572

[[429   0]
 [143   0]]
svc_model 12000
              precision    recall  f1-score   support

         0.0       0.00      0.00      0.00       429
         1.0       0.25      0.99      0.40       143

   micro avg       0.25      0.25      0.25       572
   macro avg       0.12      0.49      0.20       572
weighted avg       0.06      0.25      0.10       572

[[  0 429]
 [  2 141]]




svc_model 13000
              precision    recall  f1-score   support

         0.0       0.50      0.02      0.03       429
         1.0       0.24      0.95      0.39       143

   micro avg       0.25      0.25      0.25       572
   macro avg       0.37      0.48      0.21       572
weighted avg       0.44      0.25      0.12       572

[[  7 422]
 [  7 136]]
svc_model 14000
              precision    recall  f1-score   support

         0.0       0.75      0.98      0.85       429
         1.0       0.10      0.01      0.01       143

   micro avg       0.74      0.74      0.74       572
   macro avg       0.42      0.49      0.43       572
weighted avg       0.59      0.74      0.64       572

[[420   9]
 [142   1]]
svc_model 15000
              precision    recall  f1-score   support

         0.0       0.00      0.00      0.00       429
         1.0       0.25      0.99      0.40       143

   micro avg       0.25      0.25      0.25       572
   macro avg       0.12      0.50 



svc_model 16000
              precision    recall  f1-score   support

         0.0       0.75      0.98      0.85       429
         1.0       0.10      0.01      0.01       143

   micro avg       0.74      0.74      0.74       572
   macro avg       0.42      0.49      0.43       572
weighted avg       0.59      0.74      0.64       572

[[420   9]
 [142   1]]
svc_model 17000
              precision    recall  f1-score   support

         0.0       0.73      0.66      0.70       429
         1.0       0.21      0.27      0.24       143

   micro avg       0.56      0.56      0.56       572
   macro avg       0.47      0.47      0.47       572
weighted avg       0.60      0.56      0.58       572

[[284 145]
 [104  39]]




svc_model 18000
              precision    recall  f1-score   support

         0.0       0.43      0.01      0.01       429
         1.0       0.25      0.97      0.39       143

   micro avg       0.25      0.25      0.25       572
   macro avg       0.34      0.49      0.20       572
weighted avg       0.38      0.25      0.11       572

[[  3 426]
 [  4 139]]


  'precision', 'predicted', average, warn_for)


svc_model 19000
              precision    recall  f1-score   support

         0.0       0.74      0.70      0.72       429
         1.0       0.23      0.26      0.24       143

   micro avg       0.59      0.59      0.59       572
   macro avg       0.48      0.48      0.48       572
weighted avg       0.61      0.59      0.60       572

[[302 127]
 [106  37]]
svc_model 20000
              precision    recall  f1-score   support

         0.0       0.75      1.00      0.86       429
         1.0       0.00      0.00      0.00       143

   micro avg       0.75      0.75      0.75       572
   macro avg       0.38      0.50      0.43       572
weighted avg       0.56      0.75      0.64       572

[[429   0]
 [143   0]]




svc_model 21000
              precision    recall  f1-score   support

         0.0       0.50      0.01      0.02       429
         1.0       0.25      0.97      0.39       143

   micro avg       0.25      0.25      0.25       572
   macro avg       0.37      0.49      0.21       572
weighted avg       0.44      0.25      0.11       572

[[  5 424]
 [  5 138]]
svc_model 22000
              precision    recall  f1-score   support

         0.0       0.75      0.99      0.85       429
         1.0       0.00      0.00      0.00       143

   micro avg       0.74      0.74      0.74       572
   macro avg       0.37      0.50      0.43       572
weighted avg       0.56      0.74      0.64       572

[[425   4]
 [143   0]]


  'precision', 'predicted', average, warn_for)


svc_model 23000
              precision    recall  f1-score   support

         0.0       0.75      1.00      0.86       429
         1.0       0.00      0.00      0.00       143

   micro avg       0.75      0.75      0.75       572
   macro avg       0.38      0.50      0.43       572
weighted avg       0.56      0.75      0.64       572

[[429   0]
 [143   0]]
svc_model 24000
              precision    recall  f1-score   support

         0.0       0.00      0.00      0.00       429
         1.0       0.25      0.99      0.40       143

   micro avg       0.25      0.25      0.25       572
   macro avg       0.12      0.50      0.20       572
weighted avg       0.06      0.25      0.10       572

[[  0 429]
 [  1 142]]
svc_model 25000
              precision    recall  f1-score   support

         0.0       0.72      0.30      0.42       429
         1.0       0.24      0.66      0.35       143

   micro avg       0.39      0.39      0.39       572
   macro avg       0.48      0.48 

  'precision', 'predicted', average, warn_for)


svc_model 26000
              precision    recall  f1-score   support

         0.0       0.75      1.00      0.86       429
         1.0       0.00      0.00      0.00       143

   micro avg       0.75      0.75      0.75       572
   macro avg       0.38      0.50      0.43       572
weighted avg       0.56      0.75      0.64       572

[[429   0]
 [143   0]]
svc_model 27000
              precision    recall  f1-score   support

         0.0       0.75      1.00      0.86       429
         1.0       0.00      0.00      0.00       143

   micro avg       0.75      0.75      0.75       572
   macro avg       0.38      0.50      0.43       572
weighted avg       0.56      0.75      0.64       572

[[429   0]
 [143   0]]


  'precision', 'predicted', average, warn_for)


svc_model 28000
              precision    recall  f1-score   support

         0.0       0.75      0.89      0.81       429
         1.0       0.23      0.10      0.14       143

   micro avg       0.69      0.69      0.69       572
   macro avg       0.49      0.49      0.48       572
weighted avg       0.62      0.69      0.64       572

[[382  47]
 [129  14]]
svc_model 29000
              precision    recall  f1-score   support

         0.0       0.46      0.01      0.03       429
         1.0       0.24      0.95      0.39       143

   micro avg       0.25      0.25      0.25       572
   macro avg       0.35      0.48      0.21       572
weighted avg       0.41      0.25      0.12       572

[[  6 423]
 [  7 136]]




## Analyze prediction

In [66]:
def word_to_vector(this_year_token,pairwise_with_windows_list,mystopword,window_size):
    this_year_vs=[]
    for tokenized_article in progressbar(this_year_token,prefix="word to vector"):
        finder = nltk.BigramCollocationFinder.from_words([word for word in tokenized_article if word not in mystopword.stop_word],window_size=window_size)
        this_vs= {key: 0 for key in pairwise_with_windows_list}
        for pair,times in finder.ngram_fd.items():
            if pair in this_vs.keys():
                this_vs[pair]=times
        this_year_vs.append(this_vs)            
    return(this_year_vs)

def get_content(data_df):
    content=data_df.content
    content.index = pd.DatetimeIndex(content.index)
    content=content.dropna(how="any")
    return(content)
class Preprocessor:
    def __init__(self,path="",content=False):
        if len(path)>1:
            raw_df = pd.read_csv(path)
            self.data_df = raw_df.sort_values(by="publish_datetime",ascending=True).set_index('publish_datetime')
            content=self.data_df.content
            content.index = pd.DatetimeIndex(content.index)
            content=content.dropna(how="any")
            self.content = content
        else:
            self.content = content
    def stem_and_other_stuff(self,each_news):
        ps=PorterStemmer()
        return([ps.stem(word.lower()) for word in each_news if word.isalpha()])
    def check_alpha_tolower(self,each_news):
        return([word.lower() for word in each_news if word.isalpha()])
    def get_content_from_date(self,from_date,to_date):
        self.content = self.content[from_date:to_date]
    def to_counter(self,stem=False):
        self.token_content=self.content.apply(word_tokenize)
        if stem:        
            self.tokens=self.token_content.apply(self.stem_and_other_stuff)
        else:
            self.tokens=self.token_content.apply(self.check_alpha_tolower)
        content_counter = Counter()
        for news in progressbar(self.tokens,prefix="to counter"):
            content_counter.update(news)
        self.counter = content_counter


class MyStopWord:
    def __init__(self,content_counter,most_common=100,stop_word=None):
        from nltk.corpus import stopwords
        self.counter_stop_word=[word for word,time in content_counter.most_common(most_common)]
        self.user_keep=[]
        self.user_define=[]
        if stop_word:
            self.stop_word=stop_word
        else:
            self.stop_word=set(self.counter_stop_word+stopwords.words('english')) 
    def keep(self,word):
        self.user_keep.append(word)
        self.stop_word.discard(word)
    def define(self,word):
        self.user_define.append(word)
        self.stop_word.add(word)

class Unigram:
    def __init__(self,target_counter,other_counter):
        self.target_counter = target_counter
        self.other_counter = other_counter
        
    def get_different_corpus_set(self,mystopword,TF_OTHER_THRESHOLD=20,TF_TARGET_THRESHOLD=5):
        other_corpus_set=set(key for key,times in self.other_counter.items() if times>TF_OTHER_THRESHOLD)-mystopword.stop_word
        target_corpus_set=set(key for key,times in self.target_counter.items() if times>TF_TARGET_THRESHOLD)-mystopword.stop_word
        self.different_corpus_set = target_corpus_set-other_corpus_set

class Bigram:
    def __init__(self,token):
        self.token = token
    def count_word_pair_with_windows(self,window_size,mystopword):
        stop_word = mystopword.stop_word
        self.pair_counts = Counter()
        self.pair_distance_counts = Counter()
        for tokens in self.token:
            for i in range(len(tokens) - 1):
                for distance in range(1, window_size):
                    if i + distance < len(tokens):
                        w1 = tokens[i]
                        w2 = tokens[i + distance]
                        if w1 not in stop_word and w2 not in stop_word:
                            self.pair_distance_counts[(w1, w2, distance)] += 1
                            self.pair_counts[(w1, w2)] += 1

In [86]:

# raw_content=raw_df.set_index(pd.DatetimeIndex(raw_df.publish_datetime)).content.sort_index()

TRAIN_START_DATE = "2018-01"

TRAIN_INTIVAL = 8
TEST_INTIVAL = 2

TF_TARGET_THRESHOLD=2
OTHER_TARGET_THRESHOLD=20
DATA_ROOT = "../../../data/"
SUMMARIZE = True
TRAIN_SENTIMENT = False
TEST_SENTIMENT = False

most_common_count = 1000
summarize_word_count = 300

effective_date_path = DATA_ROOT+"crude_oil_price/effective_news_date_days_before_and_after.csv"
from_date = TRAIN_START_DATE
to_date = str(np.datetime64(TRAIN_START_DATE) +
              np.timedelta64(TRAIN_INTIVAL-1, 'M'))

test_from_date = str(np.datetime64(TRAIN_START_DATE) +
              np.timedelta64(TRAIN_INTIVAL, 'M'))
test_to_date = str(np.datetime64(test_from_date)+np.timedelta64(TEST_INTIVAL-1, 'M'))

bdate=pd.bdate_range("2009","2019")
window_size=5
raw_df = pd.read_csv("../../data/crawler_news_data/oilprice_news.csv")
raw_df_cnbc = pd.read_csv("../../data/crawler_news_data/cnbc_oil_news.csv")

raw_df.publish_datetime=pd.DatetimeIndex(raw_df.publish_datetime)
raw_df.loc[~raw_df.publish_datetime.isin(bdate),'publish_datetime']=np.nan
raw_df.publish_datetime=raw_df.publish_datetime.fillna(method='ffill')

raw_df_cnbc.story_publish_datetime=pd.DatetimeIndex(raw_df_cnbc.story_publish_datetime)
raw_df_cnbc.loc[~raw_df_cnbc.story_publish_datetime.isin(bdate),'story_publish_datetime']=np.nan
raw_df_cnbc.story_publish_datetime=raw_df_cnbc.story_publish_datetime.fillna(method='ffill')

data_df=raw_df.sort_values(by="publish_datetime",ascending=True).set_index('publish_datetime')
data_df_cnbc = raw_df_cnbc.sort_values(by="story_publish_datetime",ascending=True).set_index('story_publish_datetime')
data_df_oilprice = pd.DataFrame({"date":raw_df.publish_datetime,"content":raw_df.content})
data_df_cnbc = pd.DataFrame({"date":raw_df_cnbc.story_publish_datetime,"content":raw_df_cnbc.story_full_article})
data_df_oilprice_cnbc = data_df_oilprice.append(data_df_cnbc)
data_df_oilprice_cnbc = data_df_oilprice_cnbc.sort_values(by="date",ascending=True).set_index('date')
raw_content = get_content(data_df_oilprice_cnbc)
train_content = raw_content[from_date:to_date]

In [90]:
if len(test_to_date):
    test_content = raw_content[test_from_date:test_to_date]
else:
    test_content = raw_content[test_from_date:]

test_y_date=test_content.loc[test_y.astype(bool)].index.unique()
prediction_date=test_content.loc[prediction.astype(bool)].index.unique()

In [117]:
#recall
print("recall:",sum(test_y_date.isin(prediction_date))/len(test_y_date), "precision:",sum(prediction_date.isin(test_y_date))/len(prediction_date))
test_df['prediction']=prediction
test_df.to_csv("../../data/prediction/75recall_60precision.csv")

recall: 0.75 precision: 0.6


In [138]:
print(len(test_y_date),len(prediction_date))

20 25


In [139]:
sum(prediction_date.isin(test_y_date))

15

## Draw

In [118]:
# plotly 
import plotly as py
import plotly.graph_objs as go
import plotly.io as pio
import numpy as np
import plotly
%load_ext autoreload
%autoreload 2

# offline plot  
# 因為如果寫成func去調用plotly會無法出現，所以只好用offline的方式。
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
init_notebook_mode(connected=True)

# 設定 path 不然找不到 orca
plotly.io.orca.config.executable = '/Users/amandachen/.npm-packages/bin/orca'

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [120]:
effective_vs = test_df.loc[prediction==1]
non_effective_vs = test_df.loc[prediction!=1]
pairwise_dictionary = list(test_df.columns[1:-1])
trace1 = go.Bar(
    x=pairwise_dictionary,
    y=[ i/len(effective_vs) for i in list(effective_vs.sum()[1:-1].values)],
    name='Effective_sum'
)
trace2 = go.Bar(
    x=pairwise_dictionary,
    y=[ i/len(non_effective_vs) for i in list(non_effective_vs.sum()[1:-1].values)],
    name='Non_effective_sum'
)

data = [trace1, trace2]
layout = go.Layout(
    barmode='group'
)

fig = go.Figure(data=data, layout=layout)
iplot(fig, filename='stacked-bar')