## Importing Libraries

In [94]:
import pandas as pd

In [95]:
from collections import Counter

## Loading Data and Preprocessing

In [27]:
# Relative paths to the three prediction CSV files. The following cell loads
# each of them with pd.read_csv using "~" as the field separator.
NEWS_DATA = "FINAL_DATA/predicted_news.csv"
INDICATOR_DATA = "FINAL_DATA/predicted_indicators.csv"
TWEETS_DATA = "FINAL_DATA/predicted_tweets.csv"

In [28]:
# Load the three "~"-separated prediction files in one pass; the generator
# yields the frames in the same order as the names on the left-hand side.
news_df, indicator_df, tweets_df = (
    pd.read_csv(csv_path, sep="~")
    for csv_path in (NEWS_DATA, INDICATOR_DATA, TWEETS_DATA)
)

In [74]:
# Collapse the news rows into one record per (date, Symbol) pair, carrying the
# most frequent `prediction` value for that pair as "news_pred".
# Keys and values are both read from the same aggregated Series, so a record
# can never be paired with another group's prediction (the previous version
# matched two independent groupby passes purely by position).
news_mode_by_key = news_df.groupby(["date", "Symbol"])["prediction"].agg(
    # mode() can return several values when there is a tie; keep the first.
    lambda preds: preds.mode().iloc[0]
)
all_news_data = [
    {"date": date, "Symbol": symbol, "news_pred": pred}
    for (date, symbol), pred in news_mode_by_key.items()
]

In [75]:
# Turn the list of per-(date, Symbol) news records into a DataFrame so it can
# be merged with the other prediction sources later on.
refined_news_data = pd.DataFrame(all_news_data)

In [77]:
# Preview the first rows of the aggregated news predictions.
refined_news_data.head()

Unnamed: 0,Symbol,date,news_pred
0,ICICIBANK,2019-01-02,0
1,AXISBANK,2019-01-14,0
2,HDFCBANK,2019-01-14,0
3,ICICIBANK,2019-01-14,0
4,AXISBANK,2019-01-15,0


In [20]:
# Preview the indicator predictions exactly as loaded from INDICATOR_DATA.
indicator_df.head()

Unnamed: 0,Date,Symbol,close_price_change,indicators_prediction
0,2019-03-25,HDFCBANK,1,1
1,2019-01-11,AXISBANK,1,1
2,2019-01-16,HDFCBANK,0,0
3,2019-03-13,HDFCBANK,1,1
4,2019-03-19,AXISBANK,1,1


In [38]:
# Drop the leftover "index" column. errors="ignore" keeps this cell safe to
# re-run: once the column is gone, a plain drop would raise KeyError.
tweets_df = tweets_df.drop(columns=["index"], errors="ignore")
# Keep only rows dated 2019-01-01 or later. The comparison is done on text —
# assumes `date` holds ISO "YYYY-MM-DD" strings so that lexical order matches
# chronological order; TODO confirm against the CSV.
tweets_df = tweets_df.loc[tweets_df["date"] >= "2019-01-01"]

In [71]:
# Collapse the tweet rows into one record per (date, Symbol) pair, carrying the
# most frequent `change_prediction` value for that pair as "tweets_pred".
# Keys and values are both read from the same aggregated Series, so a record
# can never be paired with another group's prediction (the previous version
# matched two independent groupby passes purely by position).
tweets_mode_by_key = tweets_df.groupby(["date", "Symbol"])["change_prediction"].agg(
    # mode() can return several values when there is a tie; keep the first.
    lambda preds: preds.mode().iloc[0]
)
all_tweets_data = [
    {"date": date, "Symbol": symbol, "tweets_pred": pred}
    for (date, symbol), pred in tweets_mode_by_key.items()
]

In [None]:
# Turn the list of per-(date, Symbol) tweet records into a DataFrame so it can
# be merged with the other prediction sources later on.
refined_tweets_data = pd.DataFrame(all_tweets_data)

In [82]:
# Preview the first rows of the aggregated tweet predictions.
refined_tweets_data.head()

Unnamed: 0,Symbol,date,tweets_pred
0,AXISBANK,2019-01-01,1
1,HDFCBANK,2019-01-01,1
2,ICICIBANK,2019-01-01,1
3,AXISBANK,2019-01-02,0
4,HDFCBANK,2019-01-02,0


## Combining All Data

In [80]:
# Inner-join the indicator rows with the aggregated news predictions, matching
# indicator_df's "Date" against refined_news_data's "date" together with the
# shared "Symbol" column.
combined_data = pd.merge(
    indicator_df,
    refined_news_data,
    how="inner",
    left_on=["Date", "Symbol"],
    right_on=["date", "Symbol"],
)

In [83]:
# Inner-join the aggregated tweet predictions onto the indicator+news frame,
# again matching "Date" on the left with "date" on the right plus "Symbol".
# Both sides carry a lowercase "date" column here, which is why the next cell
# has "date_x" and "date_y" columns to drop.
combined_data = pd.merge(
    combined_data,
    refined_tweets_data,
    how="inner",
    left_on=["Date", "Symbol"],
    right_on=["date", "Symbol"],
)

In [86]:
# Remove the two suffixed copies of the lowercase date key left behind by the
# merges; the "Date" column from indicator_df is kept.
combined_data = combined_data.drop(columns=["date_x", "date_y"])

In [112]:
# Preview the first six rows of the merged frame.
# NOTE(review): the saved output below already contains a `total_pred` column,
# which is only created in the Classification section further down — this cell
# was last executed after those cells, so a fresh top-to-bottom run will not
# show that column at this point.
combined_data.head(n=6)

Unnamed: 0,Date,Symbol,close_price_change,indicators_prediction,news_pred,tweets_pred,total_pred
0,2019-03-13,HDFCBANK,1,1,1,1,1
1,2019-02-04,ICICIBANK,0,0,0,0,0
2,2019-04-09,AXISBANK,1,0,0,1,0
3,2019-01-28,HDFCBANK,0,0,1,0,0
4,2019-03-05,ICICIBANK,1,0,1,1,0
5,2019-02-07,ICICIBANK,0,1,0,0,0


## Classification

### Fixed Weights

In [103]:
# Weighted score of the three predictions: indicators count for half, news and
# tweets for a quarter each. The terms are summed in the same left-to-right
# order as a single inline expression would be.
indicator_term = combined_data["indicators_prediction"].mul(0.5)
news_term = combined_data["news_pred"].mul(0.25)
tweets_term = combined_data["tweets_pred"].mul(0.25)
combined_data["total_pred"] = indicator_term.add(news_term).add(tweets_term)

In [104]:
# Binarise the weighted score: 1 where it is strictly greater than 0.6, else 0.
# Done as a vectorised comparison instead of a Python-level loop over .values.
# With the 0.5 / 0.25 / 0.25 weights and 0/1 inputs (as the previews suggest),
# this amounts to "indicator says 1 AND at least one of news/tweets says 1".
combined_data["total_pred"] = (combined_data["total_pred"] > 0.6).astype("int64")

In [105]:
# Tally the rows where total_pred equals close_price_change (True) against the
# rows where it does not (False).
Counter(combined_data["close_price_change"] == combined_data["total_pred"])

Counter({True: 63, False: 17})

In [107]:
# Accuracy (in percent) of the fixed-weight prediction. It is computed from the
# frame rather than typed in by hand from the Counter output, so it cannot go
# stale when the input data changes. An empty combined_data raises
# ZeroDivisionError, since accuracy is undefined with no rows.
prediction_matches = combined_data["close_price_change"] == combined_data["total_pred"]
accuracy_fixed_weights = 100 * int(prediction_matches.sum()) / len(prediction_matches)

In [108]:
from sklearn.metrics import confusion_matrix

In [110]:
# Confusion matrix of actual price change vs. the fixed-weight prediction,
# flattened into its four counts.
# labels=[0, 1] pins the matrix to 2x2 even if one class is absent from both
# columns; without it the four-way unpack below would fail on a 1x1 matrix.
# NOTE(review): assumes both columns only contain the labels 0 and 1, as the
# previews above suggest — confirm.
tn, fp, fn, tp = confusion_matrix(
    combined_data["close_price_change"],
    combined_data["total_pred"],
    labels=[0, 1],
).ravel()

In [111]:
# Print the four confusion-matrix counts in the order they were unpacked:
# tn, fp, fn, tp.
print(tn, fp, fn, tp)

39 2 15 24
