# 数据预处理

In [1]:
from functools import partial
from pathlib import Path

import numpy as np
import pandas as pd

In [2]:
from lib.preprocess import pattern_remove
from lib.sentiment import get_score

In [3]:
# IPython introspection: show the signature/docstring of pattern_remove (development aid only).
pattern_remove?

## 导入测试数据
---

In [4]:
# Relative path of the input tweet CSV; resolved against the notebook's working directory.
test_data_filepath = "./data/tweet.csv"

In [5]:
# Load the tweet CSV into a DataFrame with default parsing options.
# NOTE(review): the preview below shows an `Unnamed: 0` column, which looks like a row
# index that was saved into the file — consider index_col=0 if that is confirmed.
data = pd.read_csv(test_data_filepath)

In [6]:
# Preview the first five rows to check the columns that were loaded.
data.head(5)

Unnamed: 0,nbr_retweet,user_id,url,text,usernameTweet,datetime,is_reply,is_retweet,ID,nbr_reply
0,0,361550292,/waltgolf3/status/1013207446532296705,Get your facts straight or get off the kool ...,waltgolf3,2018-06-30 23:47:34,True,False,1013207446532296705,0
1,0,734185973114773504,/PirateNinjaFrog/status/1013201101468807169,Unemployment and illegal immigration have been...,PirateNinjaFrog,2018-06-30 23:22:21,True,False,1013201101468807169,0
2,0,3073317560,/aaronauto2015/status/1013198130383327232,Need a Quality Used Engine for your Imported C...,aaronauto2015,2018-06-30 23:10:33,False,False,1013198130383327232,0
3,0,2262994542,/jkw_iii/status/1013197092267921412,"So, if that works for you, my d land price c...",jkw_iii,2018-06-30 23:06:25,True,False,1013197092267921412,1
4,0,949111167439589376,/podburnout/status/1013196248030089217,Hmm just a guess: maybe the wealthy pulled the...,podburnout,2018-06-30 23:03:04,False,False,1013196248030089217,0


In [7]:
# Add a `text_clean` column by calling pattern_remove once per row (axis=1).
# pattern_remove comes from lib.preprocess; judging by the preview in the next cell it
# returns the tweet text with punctuation and short tokens removed — TODO confirm
# against its docstring.
data["text_clean"] = data.apply(pattern_remove,axis=1)

In [8]:
# Inspect the cleaned text of the first 20 tweets.
data["text_clean"].head(20)

0     Get your facts straight get off the kool aid t...
1     Unemployment and illegal immigration have been...
2     Need Quality Used Engine for your Imported Car...
3     that works for you land price can pricing did ...
4     Hmm just guess maybe the wealthy pulled their ...
5     wasn referring one specific stock Plenty Just ...
6     Are you Bullish Bearish #Microsoft Leverage Re...
7     you buy the entire stock single item you are t...
8     not Neither Secretary Clinton President Obama ...
9     Are you Bullish Bearish #Google Leverage Regis...
10    They say the most powerful factor investing ti...
11    Obama used fake reports people being turned aw...
12    god expected interviewing him would more suite...
13    The best way gain financial market currency st...
14    #corporate #tax #cut doesn increase #investmen...
15    Hmm can entirely separate what know about the ...
16    What stupid article are strong bull market any...
17    Big difference Stock market reflects the c

In [9]:
# Row-wise scorers: each one is get_score pre-bound to the `text_clean` column and to
# one score key, so it can be handed directly to DataFrame.apply(..., axis=1).
compound, negative, neutral, positive = (
    partial(get_score, "text_clean", score_key)
    for score_key in ("compound", "neg", "neu", "pos")
)

In [10]:
# Score every row four times, once per pre-bound scorer, storing each result in its
# own column. Each call walks the frame row by row (axis=1).
data["compound"] = data.apply(compound,axis=1)
data["negative"] = data.apply(negative,axis=1)
data["neutral"] = data.apply(neutral,axis=1)
data["positive"] = data.apply(positive,axis=1)

In [29]:
# Preview the first four rows after scoring.
# NOTE(review): the saved output below already contains `stock`, `company`,
# `datetime_format` and `sentiment`, which are assigned in later cells; the execution
# count (29) indicates this cell was last run after them, so a top-to-bottom run will
# show fewer columns here.
data.head(4)

Unnamed: 0,nbr_retweet,user_id,url,text,usernameTweet,datetime,is_reply,is_retweet,ID,nbr_reply,text_clean,compound,negative,neutral,positive,stock,company,datetime_format,sentiment
0,0,361550292,/waltgolf3/status/1013207446532296705,Get your facts straight or get off the kool ...,waltgolf3,2018-06-30 23:47:34,True,False,1013207446532296705,0,Get your facts straight get off the kool aid t...,-0.738,0.203,0.749,0.047,AMZN,Amazon,2018-06-30,-1
1,0,734185973114773504,/PirateNinjaFrog/status/1013201101468807169,Unemployment and illegal immigration have been...,PirateNinjaFrog,2018-06-30 23:22:21,True,False,1013201101468807169,0,Unemployment and illegal immigration have been...,-0.8438,0.295,0.605,0.099,AMZN,Amazon,2018-06-30,-1
2,0,3073317560,/aaronauto2015/status/1013198130383327232,Need a Quality Used Engine for your Imported C...,aaronauto2015,2018-06-30 23:10:33,False,False,1013198130383327232,0,Need Quality Used Engine for your Imported Car...,0.296,0.0,0.939,0.061,AMZN,Amazon,2018-06-30,1
3,0,2262994542,/jkw_iii/status/1013197092267921412,"So, if that works for you, my d land price c...",jkw_iii,2018-06-30 23:06:25,True,False,1013197092267921412,1,that works for you land price can pricing did ...,0.6369,0.0,0.802,0.198,AMZN,Amazon,2018-06-30,1


In [12]:
# Tag every row with the ticker and company name this tweet file is labelled with.
data["stock"], data["company"] = "AMZN", "Amazon"

In [None]:
# Create the output directory from Python instead of shelling out to `mkdir`:
# exist_ok=True keeps the cell safe to re-run once the directory exists, and the
# call does not depend on the platform's shell.
Path("./result").mkdir(parents=True, exist_ok=True)

In [49]:
# Persist the preprocessed frame to ./result/preprocessed_data.csv.
# NOTE(review): in top-to-bottom order this cell comes before the cells that add
# `datetime_format` and `sentiment`, so a fresh Run All would save a file without
# those columns; the execution count (49) suggests it was actually run last —
# confirm the intended placement.
# NOTE(review): to_csv is called without index=False, so the row index is written as
# an extra column — presumably the origin of `Unnamed: 0` columns on reload; verify.
data.to_csv("./result/preprocessed_data.csv")

In [20]:
# Parse the `datetime` column and keep only the calendar date; this column is the
# key used for the per-day grouping further down.
data["datetime_format"] = pd.to_datetime(data["datetime"]).dt.date

## 标记tweet的sentiment

In [25]:
def sentiment_class(row, key="compound"):
    """Return the sign of a row's sentiment score as a discrete label.

    Args:
        row: Any mapping-like object supporting ``row[key]`` (for example a
            DataFrame row passed by ``DataFrame.apply(..., axis=1)``).
        key: Name of the field holding the numeric score.

    Returns:
        ``1`` if the score is strictly positive, ``-1`` if it is strictly
        negative, and ``0`` otherwise (including a score of exactly zero or NaN,
        for which both comparisons are false).
    """
    score = row[key]
    is_positive = bool(score > 0)
    is_negative = bool(score < 0)
    # At most one flag is set, so the difference is exactly the label.
    return int(is_positive) - int(is_negative)

In [26]:
# Label each row 1 / -1 / 0 by applying sentiment_class row-wise with its default
# key ("compound").
data["sentiment"] = data.apply(sentiment_class,axis=1)

## 根据日期合并每天的social media sentiment

### market sentiment score

In [30]:
# Group the tweets by calendar date so sentiment can be summarised per day.
data_groups = data.groupby("datetime_format")

In [39]:
# Column-oriented accumulator for the per-day summary: one independent, initially
# empty list per output column.
result = {column: [] for column in ("date", "neg", "pos", "neu")}

In [48]:
# Exploratory peek at the first date group only (the loop exits via `break` after one
# iteration): print the value distributions of a few columns and the shape that the
# sentiment counts take when turned into a one-row frame.
# Series.value_counts() is used instead of the top-level pd.value_counts(), which is
# deprecated in recent pandas releases; the resulting counts are the same.
for key, group in data_groups:
    value_count = group["sentiment"].value_counts()
    print(group["nbr_retweet"].value_counts())
    print('\n')
    print(group["is_reply"].value_counts())
    print('\n')
    print(value_count)
    print('\n')
    # Transposed so the sentiment labels become columns of a single row.
    print(pd.DataFrame(value_count).T)
    print(type(value_count))
    break

0     166
1      11
2       5
5       2
29      1
4       1
3       1
Name: nbr_retweet, dtype: int64


True     114
False     73
Name: is_reply, dtype: int64


 1    111
-1     54
 0     22
Name: sentiment, dtype: int64


             1  -1   0
sentiment  111  54  22
<class 'pandas.core.series.Series'>
