In [239]:
import pandas as pd
import numpy as np


def fix_date(df): 
    dx = df["Date"].apply(lambda x: x.split(" "))
    dx = dx.apply(lambda x: "{} {} {}".format(x[1][:-3], x[0], x[2])) 
    return dx 

data_names = []
file_names = []
companies = ["MMM", "SYF", "BAYRY", "HON"]
for w in companies:
    for i in np.arange(2009,2019):
        file_names.append(w+"_"+str(i)+".csv")
        data_names.append(w+"_"+str(i))
mmm = pd.read_pickle('3m_stocktwits.pickle', 'gzip').rename(columns = {'published_date':"Date", 'content':"Tweet"})
hon = pd.read_pickle('honeywell_stocktwits.pickle', 'gzip').rename(columns = {'published_date':"Date", 'content':"Tweet"})
syf = pd.read_pickle('synchrony_stocktwits.pickle', 'gzip').rename(columns = {'published_date':"Date", 'content':"Tweet"})
bayry = pd.read_pickle('bayer_stocktwits.pickle', 'gzip').rename(columns = {'published_date':"Date", 'content':"Tweet"})
mmm["Date"] = fix_date(mmm)
hon["Date"] = fix_date(hon)
syf["Date"] = fix_date(syf)
bayry["Date"] = fix_date(bayry)


def process_pickles(company_name, frame, dict_to_add):
    temp_dict = {}
    frame["Year"] = frame["Date"].apply(lambda x: x.split(" ")[2])
    years = frame["Year"].unique()
    for i in years:
        name = company_name+"_"+i
        temp = frame.groupby(['Year']).get_group(str(i))
        if int(i) != 2019:
            dict_to_add[name].append(temp[["Date", "Tweet"]])
#         else: 
#             dict_to_add[name] = (temp[["Date", "Tweet"]])
    return dict_to_add


file_dict = {}
for file, name in zip(file_names, data_names):
    file_dict[name] = pd.read_csv(file, names = ["Date", "Tweet"])
    
file_dict = process_pickles("MMM", mmm, file_dict)
file_dict = process_pickles("HON", hon, file_dict)
file_dict = process_pickles("SYF", syf, file_dict)
file_dict = process_pickles("BAYRY", bayry, file_dict)





In [240]:
pre_proc = {}

count = 1;
for key, value in file_dict.items():
    temp_dict = {}
    temp_frame = pd.DataFrame(columns = ["Tweet"])
    for index, row in value.iterrows():
        temp_dict.setdefault(row['Date'], [])
        temp_dict[row['Date']].append(row['Tweet'])
                              
    for keys, tweet in temp_dict.items():
        temp_frame.loc[keys] = [tweet]
    temp_frame = temp_frame.reset_index()
    temp_frame.rename(columns = {"index": "Date", "Tweet":"Tweet"}, inplace = True)
    temp_frame.sort_values(by=['Date'])
    pre_proc[key] = temp_frame

In [241]:
dates = np.arange(1,32)
feb = np.arange(1, 29)
months = {"Jan": 31, "Feb":28, "Mar":31, "Apr":30, "May":31, "Jun":30, "Jul":31, "Aug":31, "Sep":30, "Oct": 31, "Nov":30, "Dec":31}
index_dict = {}
for k, v in months.items():
    for j in range(1,v+1):
            key = str(j) + " " + k
            index_dict[key] = len(index_dict.keys())+1

def make_df(stack, key):
    temp = {}
    year = key.split("_")[-1]
    data = [(t+" "+str(year)) for t in index_dict.keys()]
    for i in data: 
        temp[i] = ""
    return temp

final_proc = {}
for k, v in pre_proc.items():
    temp = make_df(v, k)
    for indx, val in v.iterrows():
        temp[val["Date"]] = val["Tweet"]
    out = pd.DataFrame(columns = {"Date", "Tweet"})
    out["Date"] = temp.keys()
    out["Tweet"] = list(temp.values())
    final_proc[k] = out


In [150]:
def convert_dates(stack):
    for i, v in stack.items():
        v['Date'] = pd.to_datetime(v['Date'], dayfirst = True)
    return stack

In [245]:
import re
def preprocess_text(text):
    text = re.sub('((www\.[^\s]+)|(https?://[^\s]+))','URL', text)
    text = re.sub('@[^\s]+','USER', text)
    text = text.lower().replace("ё", "е")
    text = re.sub('[^a-zA-Zа-яА-Я1-9]+', ' ', text)
    text = re.sub(' +',' ', text)
    return text.strip()

for key, value in final_proc.items():
    for idx, txt in value.iterrows():
        txt["Tweet"] = [preprocess_text(t) for t in txt["Tweet"]]

final_proc = convert_dates(final_proc)


invalid escape sequence \.


invalid escape sequence \s


invalid escape sequence \.


invalid escape sequence \s


invalid escape sequence \.


invalid escape sequence \s


invalid escape sequence \.


invalid escape sequence \s



In [10]:
# Read the data from CSV files
n = ['ItemID', 'Sentiment', 'SentimentSource', 'SentimentText']
raw_data = pd.read_csv('Sentiment Analysis Dataset 2.csv', names=n, header = 0, usecols=['Sentiment', 'SentimentText'])

In [11]:
neg, pos = raw_data.groupby('Sentiment')
neg_c = raw_data.Sentiment.value_counts()
sample_size = int(min(neg_c[0], neg_c[1]))
raw_data = np.concatenate((neg[1].values[:sample_size], pos[1].values[:sample_size]), axis=0)
labels = [1]*sample_size + [0]*sample_size

In [12]:
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.model_selection import train_test_split, GridSearchCV

text_clf = Pipeline([('vect', CountVectorizer()),
                     ('tfidf', TfidfTransformer()),
                     ('clf', MultinomialNB())])
tuned_parameters = {
    'vect__ngram_range': [(1, 1), (1, 2), (2, 2)],
    'tfidf__use_idf': (True, False),
    'tfidf__norm': ('l1', 'l2'),
    'clf__alpha': [1, 1e-1, 1e-2]
}
data = [preprocess_text(t[1]) for t in raw_data]

In [14]:
x_train, x_test, y_train, y_test = train_test_split(data, labels, test_size=0.33, random_state=42)
print(len(x_test))

520368


In [15]:
from sklearn.metrics import classification_report
clf = GridSearchCV(text_clf, tuned_parameters, cv=3, verbose = 100)
clf.fit(x_train, y_train)

Fitting 3 folds for each of 36 candidates, totalling 108 fits
[CV] clf__alpha=1, tfidf__norm=l1, tfidf__use_idf=True, vect__ngram_range=(1, 1) 
[CV]  clf__alpha=1, tfidf__norm=l1, tfidf__use_idf=True, vect__ngram_range=(1, 1), score=0.7769849134932376, total=  17.0s
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   27.1s remaining:    0.0s
[CV] clf__alpha=1, tfidf__norm=l1, tfidf__use_idf=True, vect__ngram_range=(1, 1) 
[CV]  clf__alpha=1, tfidf__norm=l1, tfidf__use_idf=True, vect__ngram_range=(1, 1), score=0.776203402921333, total=  17.2s
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:   55.1s remaining:    0.0s
[CV] clf__alpha=1, tfidf__norm=l1, tfidf__use_idf=True, vect__ngram_range=(1, 1) 
[CV]  clf__alpha=1, tfidf__norm=l1, tfidf__use_idf=True, vect__ngram_range=(1, 1), score=0.7765463544284388, total=  17.8s
[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:  1.4min remaining:    0.0s
[CV] clf__alpha=1, tfidf__norm=l1, tfidf__use_idf=True, vect__ngram_range=(1, 2) 
[C

[CV]  clf__alpha=1, tfidf__norm=l2, tfidf__use_idf=False, vect__ngram_range=(1, 1), score=0.7809136006496917, total=  18.4s
[Parallel(n_jobs=1)]: Done  30 out of  30 | elapsed: 27.1min remaining:    0.0s
[CV] clf__alpha=1, tfidf__norm=l2, tfidf__use_idf=False, vect__ngram_range=(1, 2) 
[CV]  clf__alpha=1, tfidf__norm=l2, tfidf__use_idf=False, vect__ngram_range=(1, 2), score=0.7984036073589669, total=  52.4s
[Parallel(n_jobs=1)]: Done  31 out of  31 | elapsed: 28.4min remaining:    0.0s
[CV] clf__alpha=1, tfidf__norm=l2, tfidf__use_idf=False, vect__ngram_range=(1, 2) 
[CV]  clf__alpha=1, tfidf__norm=l2, tfidf__use_idf=False, vect__ngram_range=(1, 2), score=0.7976249971604461, total=  51.4s
[Parallel(n_jobs=1)]: Done  32 out of  32 | elapsed: 29.6min remaining:    0.0s
[CV] clf__alpha=1, tfidf__norm=l2, tfidf__use_idf=False, vect__ngram_range=(1, 2) 
[CV]  clf__alpha=1, tfidf__norm=l2, tfidf__use_idf=False, vect__ngram_range=(1, 2), score=0.798311596486894, total=  53.4s
[Parallel(n_jobs

[CV] clf__alpha=0.1, tfidf__norm=l2, tfidf__use_idf=True, vect__ngram_range=(1, 2) 
[CV]  clf__alpha=0.1, tfidf__norm=l2, tfidf__use_idf=True, vect__ngram_range=(1, 2), score=0.7875019876876945, total= 1.1min
[Parallel(n_jobs=1)]: Done  59 out of  59 | elapsed: 55.8min remaining:    0.0s
[CV] clf__alpha=0.1, tfidf__norm=l2, tfidf__use_idf=True, vect__ngram_range=(1, 2) 
[CV]  clf__alpha=0.1, tfidf__norm=l2, tfidf__use_idf=True, vect__ngram_range=(1, 2), score=0.7878676877731303, total= 1.0min
[Parallel(n_jobs=1)]: Done  60 out of  60 | elapsed: 57.4min remaining:    0.0s
[CV] clf__alpha=0.1, tfidf__norm=l2, tfidf__use_idf=True, vect__ngram_range=(2, 2) 
[CV]  clf__alpha=0.1, tfidf__norm=l2, tfidf__use_idf=True, vect__ngram_range=(2, 2), score=0.7688581334529728, total=  45.6s
[Parallel(n_jobs=1)]: Done  61 out of  61 | elapsed: 58.4min remaining:    0.0s
[CV] clf__alpha=0.1, tfidf__norm=l2, tfidf__use_idf=True, vect__ngram_range=(2, 2) 
[CV]  clf__alpha=0.1, tfidf__norm=l2, tfidf__use_

[CV] clf__alpha=0.01, tfidf__norm=l1, tfidf__use_idf=False, vect__ngram_range=(1, 2) 
[CV]  clf__alpha=0.01, tfidf__norm=l1, tfidf__use_idf=False, vect__ngram_range=(1, 2), score=0.7966050197775487, total=  52.0s
[Parallel(n_jobs=1)]: Done  87 out of  87 | elapsed: 82.8min remaining:    0.0s
[CV] clf__alpha=0.01, tfidf__norm=l1, tfidf__use_idf=False, vect__ngram_range=(2, 2) 
[CV]  clf__alpha=0.01, tfidf__norm=l1, tfidf__use_idf=False, vect__ngram_range=(2, 2), score=0.7734809139929977, total=  38.8s
[Parallel(n_jobs=1)]: Done  88 out of  88 | elapsed: 83.7min remaining:    0.0s
[CV] clf__alpha=0.01, tfidf__norm=l1, tfidf__use_idf=False, vect__ngram_range=(2, 2) 
[CV]  clf__alpha=0.01, tfidf__norm=l1, tfidf__use_idf=False, vect__ngram_range=(2, 2), score=0.7732701437950069, total=  38.6s
[Parallel(n_jobs=1)]: Done  89 out of  89 | elapsed: 84.6min remaining:    0.0s
[CV] clf__alpha=0.01, tfidf__norm=l1, tfidf__use_idf=False, vect__ngram_range=(2, 2) 
[CV]  clf__alpha=0.01, tfidf__norm=

GridSearchCV(cv=3, error_score='raise',
       estimator=Pipeline(memory=None,
     steps=[('vect', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip...inear_tf=False, use_idf=True)), ('clf', MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))]),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'vect__ngram_range': [(1, 1), (1, 2), (2, 2)], 'tfidf__use_idf': (True, False), 'tfidf__norm': ('l1', 'l2'), 'clf__alpha': [1, 0.1, 0.01]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=100)

In [16]:
print(classification_report(y_test, clf.predict(x_test), digits=4))    

             precision    recall  f1-score   support

          0     0.8209    0.7777    0.7987    260642
          1     0.7881    0.8297    0.8084    259726

avg / total     0.8045    0.8036    0.8035    520368



In [18]:
import pickle
model = "grid.sav"
pickle.dump(clf, open(model, "wb"))

In [246]:
from scipy import stats
import matplotlib.pyplot as plt

for i, v in final_proc.items():
    emot = []
    for index, val in v.iterrows():
        if len(val["Tweet"]) != 0:
            emot.append([clf.predict(t) for t in [val["Tweet"]]])
        else:
            emot.append(np.NaN)
    v["emot"] = emot
    v["mean"] = [np.mean(t) for t in emot]

In [247]:
final_proc

{'MMM_2009':                                                  Tweet       Date  \
 0                                                   [] 2009-01-01   
 1                                                   [] 2009-01-02   
 2                                                   [] 2009-01-03   
 3                                                   [] 2009-01-04   
 4                                                   [] 2009-01-05   
 5                                                   [] 2009-01-06   
 6                                                   [] 2009-01-07   
 7                                                   [] 2009-01-08   
 8                                                   [] 2009-01-09   
 9                                                   [] 2009-01-10   
 10                                                  [] 2009-01-11   
 11                                                  [] 2009-01-12   
 12                                                  [] 2009-01-13   
 13     

In [263]:
company_stack = {}
for name, val in final_proc.items():
    frames = []
    for key in data_names:
        frames.append(key)
#     frames.sort()
# #     print(name, frames)
    stacked = [val.groupby(final_proc[t]["Date"].dt.year).get_group(int(t.split("_")[1])) for t in frames]
    company_stack[name.split("_")[0]] = pd.concat(stacked)
    company_stack[name.split("_")[0]].reset_index(inplace = True)

In [270]:
for i, v in final_proc["HON_2016"].iterrows():
    if(len(v) > 364):
        v.drop([364])
for i, v in final_proc["HON_2017"].iterrows():
    if(len(v) > 364):
        v.drop([364])
for i, v in final_proc["HON_2018"].iterrows():
    if(len(v) > 364):
        v.drop([364])


In [273]:
%history

from fancyimpute import KNN

copy = company_stack 
completed = {}

for key, value in copy.items():
    completed[key] = KNN(k=10).complete(value)
import pandas as pd
import numpy as np
data_names = []
file_names = []
companies = ["MMM", "SYF", "BAYRY", "HON"]
for w in companies:
    for i in np.arange(2009,2019):
        file_names.append(w+"_"+str(i)+".csv")
        data_names.append(w+"_"+str(i))
mmm = pd.read_pickle('3m_stocktwits.pickle', 'gzip').rename(columns = {'published_date':"Date", 'content':"Tweet"})
hon = pd.read_pickle('honeywell_stocktwits.pickle', 'gzip').rename(columns = {'published_date':"Date", 'content':"Tweet"})
syf = pd.read_pickle('synchrony_stocktwits.pickle', 'gzip').rename(columns = {'published_date':"Date", 'content':"Tweet"})
bayry = pd.read_pickle('bayer_stocktwits.pickle', 'gzip').rename(columns = {'published_date':"Date", 'content':"Tweet"})
mmm["Date"] = fix_date(mmm)
hon["Date"] = fix_date(hon)
syf["Date"] = fix_date(syf)
bayry["Date"] = fix_date(bayry)

In [271]:
print(company_stack["HON"])

       index                                              Tweet       Date  \
0          0                                                 [] 2018-01-01   
1          1                                                 [] 2018-01-02   
2          2  [covered quite a bit in today s user daily run... 2018-01-03   
3          3  [top 1 position 7 32 vlo 6 65 t 6 62 sna 6 4 a... 2018-01-04   
4          4  [the latest update to the r i p portfolio bac ... 2018-01-05   
5          5                                                 [] 2018-01-06   
6          6  [one hell of a nice breakout setup in hon rt i... 2018-01-07   
7          7  [honeywell hon settled 5 at 154 74 much like a... 2018-01-08   
8          8  [hon daily chart could be one to watch for a p... 2018-01-09   
9          9  [doing scans just to keep my skills sharp here... 2018-01-10   
10        10  [wall street breakfast ceos warn of daca hit t... 2018-01-11   
11        11  [user all star trader labu atvi hon nvda mscc ... 

In [274]:
'''
from tabulate import tabulate
for i, v in company_stack.items():
    print(tabulate(v, headers='keys', tablefmt='psql'))
'''
import plotly.plotly as py
import plotly.graph_objs as go
import matplotlib.pyplot as plt
from matplotlib.pyplot import figure


temp = company_stack["HON"]

temp = company_stack["HON"].dropna()
temp['MA_w_imputation'] = temp["imputed_mean"].rolling(window=30).mean()
# temp["interpolated_mean"].apply(lambda x: abs(x-1))
plt.figure(figsize=(30, 12))    
# ax = plt.subplot()
# print(temp['MA_no_imputation'].isna().sum())
ax.get_xaxis().tick_bottom()    
ax.get_yaxis().tick_left()
ax.spines["top"].set_visible(False)    
ax.spines["bottom"].set_visible(False)    
ax.spines["right"].set_visible(False)    
ax.spines["left"].set_visible(False)    

plt.plot_date(temp["Date"], temp["MA_w_imputation"], "--", lw=1, color="black", alpha=1)
# plt.plot_date(temp["Date"], temp['MA_w_imputation'], "--", lw=0.5, color="pink", alpha=0.3)
plt.show()
print(temp)

KeyError: 'imputed_mean'

In [146]:
temp

Unnamed: 0,index,Tweet,Date,emot,mean,imputed_mean,MA_w_imputation,MA_no_imputation
0,0,[],2009-01-01,,,0.000000,,
1,1,[],2009-01-02,,,0.000000,,
2,2,[],2009-01-03,,,0.000000,,
3,3,[],2009-01-04,,,0.000000,,
4,4,[],2009-01-05,,,0.000000,,
5,5,[],2009-01-06,,,0.000000,,
6,6,[],2009-01-07,,,0.000000,,
7,7,[],2009-01-08,,,0.000000,,
8,8,[],2009-01-09,,,0.000000,,
9,9,[],2009-01-10,,,0.000000,0.000000,


In [1296]:
print(len(emot.index.values))

3653
