# Processing the tweets and extracting params for stock price movement

In [1]:
# import packages
import numpy as np
import pandas as pd
from functools import partial

In [2]:
# import utils
from lib.preprocess import pattern_remove
from lib.sentiment import get_score
from functools import partial

In [35]:
# Path of the raw tweets CSV; it is loaded into `data` with pd.read_csv below.
data_files = "./data/tweet.csv"

In [10]:
# Run parameters: ticker symbol and company name of the stock being analysed.
stock_name = "AMZN"
company_name = "Amazon"

In [36]:
# Load the raw tweets into a DataFrame. Later cells read the columns
# "datetime", "nbr_favorite", "is_reply" and "ID" from it.
data = pd.read_csv(data_files)

In [37]:
# Build the cleaned tweet text: pattern_remove (from lib.preprocess) is called
# once per row and its return value is stored in "text_clean".
# NOTE(review): which patterns get removed is defined in lib.preprocess, not here.
data["text_clean"] = data.apply(pattern_remove,axis=1)

In [38]:
# Row-wise scorers: each one calls get_score("text_clean", <score key>, row).
# NOTE(review): the keys look like the VADER polarity fields - confirm in lib.sentiment.
compound, negative, neutral, positive = (
    partial(get_score, "text_clean", score_key)
    for score_key in ("compound", "neg", "neu", "pos")
)

In [39]:
# Use VADER to analyse the sentiment of the cleaned text; every score is
# computed row by row and stored in its own column.
for score_column, score_row in (
    ("compound", compound),
    ("negative", negative),
    ("neutral", neutral),
    ("positive", positive),
):
    data[score_column] = data.apply(score_row, axis=1)

In [40]:
# Parse the "datetime" column and keep only the calendar date (time of day is
# dropped); sentiment_distribution later groups the tweets by this column.
data["datetime_format"] = pd.to_datetime(data["datetime"]).dt.date

In [41]:
def sentiment(row):
    """
    classify one record by the sign of its compound score
    :param row: mapping-like record (e.g. a DataFrame row) with a "compound" entry
    :return: int 1 for a positive score, -1 for a negative score, 0 otherwise
    """
    score = row["compound"]
    if score < 0:
        return -1
    # Anything that is neither below nor above zero falls through to neutral.
    return 1 if score > 0 else 0

In [42]:
def is_favorite(row):
    """
    flag whether a record has been favorited at least once
    :param row: mapping-like record (e.g. a DataFrame row) whose "nbr_favorite"
        entry can be converted with int()
    :return: int 1 if the favorite count is above zero, -1 otherwise
    """
    favorite_count = int(row["nbr_favorite"])
    return 1 if favorite_count > 0 else -1

In [43]:
# Add the sentiment class of every tweet, derived from the sign of its
# compound score: 1 positive, -1 negative, 0 neutral.
data["sentiment"] = data.apply(sentiment, axis=1)

In [44]:
# Convert the is_reply column to integers; sentiment_distribution uses it as a
# grouping key. NOTE(review): presumably a boolean flag in the CSV - confirm.
data["is_reply"] = data["is_reply"].astype(int)

In [45]:
# Add the favorite flag: 1 when nbr_favorite is above zero, otherwise -1.
data["is_favorite"] = data.apply(is_favorite, axis=1)

In [46]:
# Tag every record with the stock ticker and company name. The values come
# from the params cell so that changing stock_name / company_name there is
# enough to re-run the notebook for another stock.
data["stock"] = stock_name
data["company"] = company_name

In [47]:
# drop empty values
# TODO: dropping rows with empty values is not implemented yet

In [48]:
# Save the processed tweets as CSV.
# NOTE: to_csv is called with its defaults, so the DataFrame index is written
# as an unnamed first column of the file.
processed_data_save_path = "./result/preprocessed_data.csv"
data.to_csv(processed_data_save_path)

## Mining params from tweets
1. sentiment_distribution

----

### sentiment_distribution

In [49]:
def sentiment_distribution(raw_data):
    """
    get the sentiment distribution from the raw data

    Tweets are counted per day for each (is_reply, sentiment) combination and
    every count is divided by the total of that day, so the six shares of a
    row sum to 1. A combination that does not occur on a day (or anywhere in
    the data) is reported as a share of 0.

    :param raw_data: pandas.Dataframe cleaned & processed data; needs the
        columns "datetime_format", "is_reply" (0 or 1) and "sentiment"
        (-1, 0 or 1). The frame is not modified.
    :return: pandas.Dataframe sentiment_distribution, one row per day; the
        shares are strings formatted with four decimals. An empty input gives
        an empty frame with the same columns.
    :raises ValueError: if the data contains an (is_reply, sentiment) pair
        other than the six expected combinations.
    example of the output sentiment_distribution:

        datetime_format no_reply_neg no_reply_neu no_reply_pos replied_neg replied_neu replied_pos
    0   2018-01-01 0.1070 0.0642 0.2193 0.1818 0.0535 0.3743
    """
    # (is_reply, sentiment) key -> output column, in output order.
    share_columns = (
        ((0, -1), "no_reply_neg"),
        ((0, 0), "no_reply_neu"),
        ((0, 1), "no_reply_pos"),
        ((1, -1), "replied_neg"),
        ((1, 0), "replied_neu"),
        ((1, 1), "replied_pos"),
    )
    share_keys = [key for key, _ in share_columns]
    share_names = [name for _, name in share_columns]

    if raw_data.empty:
        return pd.DataFrame(columns=["datetime_format"] + share_names)

    # Number of tweets per day and (is_reply, sentiment) combination.
    counts = (
        raw_data.groupby(["datetime_format", "is_reply", "sentiment"])
        .size()
        .unstack(["is_reply", "sentiment"], fill_value=0)
    )

    expected = pd.MultiIndex.from_tuples(share_keys,
                                         names=["is_reply", "sentiment"])
    unexpected = counts.columns.difference(expected)
    if len(unexpected) > 0:
        raise ValueError(
            "unexpected (is_reply, sentiment) combinations: %s"
            % list(unexpected))

    # Always produce all six columns, even when a combination never occurs;
    # relabelling by position alone would mislabel or fail in that case.
    counts = counts.reindex(columns=expected, fill_value=0)
    counts.columns = share_names

    # Every day present in `counts` has at least one tweet, so the daily
    # total used as divisor is never zero.
    shares = counts.div(counts.sum(axis=1), axis=0)
    formatted = shares.apply(
        lambda column: column.map(lambda share: '%.4f' % share))

    return formatted.reset_index()

In [50]:
# Compute the per-day sentiment distribution from the processed tweets.
sentiment_dis = sentiment_distribution(data)

In [55]:
# Preview the first rows of the distribution table.
sentiment_dis.head()

Unnamed: 0,datetime_format,no_reply_neg,no_reply_neu,no_reply_pos,replied_neg,replied_neu,replied_pos
0,2018-01-01,0.1143,0.0667,0.6762,0.0381,0.0476,0.0571
1,2018-01-07,0.0857,0.0476,0.6762,0.0095,0.1143,0.0667
2,2018-01-13,0.1132,0.1415,0.5566,0.0283,0.0943,0.066
3,2018-01-19,0.1429,0.1238,0.5619,0.0286,0.0952,0.0476
4,2018-01-25,0.1333,0.0857,0.5714,0.0286,0.0762,0.1048


In [56]:
# Destination file for the mined params (the per-day sentiment distribution).
params_saving_path = "./result/params.csv"

In [57]:
# Write the sentiment distribution to the params file. to_csv is called with
# its defaults, so the index is written as an unnamed first column.
sentiment_dis.to_csv(params_saving_path)