In [196]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score, f1_score
from sklearn.linear_model import LogisticRegression
from sklearn import svm

In [197]:
df_pos = pd.read_excel('data/output/clean_handlabeled_data.xlsx')
df_neg = pd.read_excel('data/output/clean_handlabeled_data.xlsx')

Raghava

In [198]:
df_bit = pd.read_excel("data/output/clean_bit_data.xlsx") 
df_ts = pd.read_excel("data/output/clean_tesla_data.xlsx") 
df_dj = pd.read_excel("data/output/clean_dowjones_data.xlsx") 

In [199]:
df_bit = df_bit[pd.notnull(df_bit['COMMENT'])]
df_ts = df_ts[pd.notnull(df_ts['COMMENT'])]
df_dj = df_dj[pd.notnull(df_dj['COMMENT'])]

df_pos = df_pos[pd.notnull(df_pos['LABEL'])]
df_neg = df_neg[pd.notnull(df_neg['LABEL'])]


In [200]:
df_bit.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 22682 entries, 0 to 22681
Data columns (total 3 columns):
date       22682 non-null object
COMMENT    22682 non-null object
LABEL      0 non-null float64
dtypes: float64(1), object(2)
memory usage: 708.8+ KB


## Classifier

In [201]:
df_pos['LABEL'] = df_pos['LABEL'].map({'NEU':'POS','POS':'POS','NEG':'NEG'})

In [202]:
df_pos.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 997 entries, 0 to 998
Data columns (total 2 columns):
COMMENT    974 non-null object
LABEL      997 non-null object
dtypes: object(2)
memory usage: 23.4+ KB


In [203]:
df_neg['LABEL'] = df_neg['LABEL'].map({'NEU':'NEG','POS':'POS','NEG':'NEG'})

In [204]:
df_neg

Unnamed: 0,COMMENT,LABEL
0,"thats, bull, say, since, pessimisticvalue, k, ...",NEG
1,"hello, guy, anyone, use, ig, market, issue, pl...",NEG
2,"year, old, hand, pin, falcon, wing, door, subm...",NEG
3,"time, breka, never, really, guys",NEG
4,"world, economy, th, final, one, trump, bankruptcy",NEG
5,"come, q, extremely, strong, ford, expect, bill...",NEG
6,"bad, pessimisticvalue",NEG
7,"see, joke, right, smell, ponzihistory, doesnt,...",NEG
8,"bear, case, touch, fall, bull, case, retest, take",NEG
9,"distance, reentering, downward, channel, pullb...",NEG


In [205]:
def classifier(clf,strat,vec,df):
    
    y = df['LABEL'].astype(str)
    X = df.COMMENT.astype(str)
    #y = pd.get_dummies(y)
    
    if vec == "cv":
        count = CountVectorizer()
    elif vec == "tdidf":
        count = TfidfVectorizer(binary=True, ngram_range=(1,1), use_idf=False)
    
    if strat == "y":
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2,random_state=42)
        X_train_counts = count.fit_transform(X_train)
        #print(X_train_counts)
        X_test_counts = count.transform(X_test)

        
    elif strat == "n":
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2) 
        X_train_counts = count.fit_transform(X_train)
        X_test_counts = count.transform(X_test)

    
    if clf == "lr":
        clf = LogisticRegression(solver="lbfgs",C=0.5).fit(X_train_counts, y_train)
        #clf = LogisticRegression(C=0.4).fit(X_train_counts, y_train)
        
    elif clf == "mnb":
        clf = MultinomialNB().fit(X_train_counts, y_train)
        #clf = GaussianNB().fit(X_train_counts, y_train)
        
    
    elif clf == "svm":
        clf = svm.SVC(kernel='linear').fit(X_train_counts, y_train)

    yPred = clf.predict(X_test_counts)
    yPred_log = clf.predict_proba(X_test_counts)
    
    list_P = list(zip(yPred, X_test))
    list_T = list(zip(yPred, y_test, X_test)) 
    
    
    acc=accuracy_score(y_test,yPred)
    f1=f1_score(y_test,yPred, average='weighted')

    return clf, count,"Accuracy: %s" % acc, "F1 %s" % f1, list_P, list_T

In [206]:
result_hl_neg=classifier("lr","y","cv",df_neg)
result_hl_pos=classifier("lr","y","cv",df_pos)

df_result_hl_neg = pd.DataFrame(result_hl_neg[5], columns = ['Predicted_label', 'True_Label', 'Comment'])

In [207]:
df_result_hl_neg

Unnamed: 0,Predicted_label,True_Label,Comment
0,NEG,NEG,"go, say, break, pessimisticvalue, wti, beat, l..."
1,NEG,POS,"winampwin, bull"
2,NEG,POS,"short, play, think, well, hit, pessimisticvalu..."
3,NEG,NEG,"chaos, come, first, situatio, wait, us"
4,NEG,NEG,"st, support, pessimisticvalue"
5,POS,POS,"tesla, go, test, resistance, optimisticvalue, ..."
6,NEG,NEG,"elon, musk, develop, secret, sauce, tesla, lon..."
7,NEG,NEG,"ok, bull, ypu, know, ’, temporary, right"
8,NEG,NEG,"reason, christmas, rally, every, year, shorter..."
9,NEG,NEG,"pessimisticvalue, still, seem, inevitable"


In [208]:
result_hl_neg[2:4]

('Accuracy: 0.775', 'F1 0.7352774163019046')

In [214]:
df_result_hl_pos = pd.DataFrame(result_hl_pos[5], columns = ['Predicted_label', 'True_Label', 'Comment'])
df_result_hl_pos

Unnamed: 0,Predicted_label,True_Label,Comment
0,POS,NEG,"go, say, break, pessimisticvalue, wti, beat, l..."
1,POS,POS,"winampwin, bull"
2,NEG,POS,"short, play, think, well, hit, pessimisticvalu..."
3,POS,POS,"chaos, come, first, situatio, wait, us"
4,NEG,NEG,"st, support, pessimisticvalue"
5,POS,POS,"tesla, go, test, resistance, optimisticvalue, ..."
6,NEG,POS,"elon, musk, develop, secret, sauce, tesla, lon..."
7,POS,NEG,"ok, bull, ypu, know, ’, temporary, right"
8,NEG,NEG,"reason, christmas, rally, every, year, shorter..."
9,NEG,NEG,"pessimisticvalue, still, seem, inevitable"


In [212]:
result_hl_pos[2:4]

('Accuracy: 0.69', 'F1 0.6835891544117646')

## Results for neutral mapped to negative

In [59]:
result_hl_neg[0].classes_

array(['NEG', 'POS'], dtype=object)

In [60]:
feature_to_coef_hl_neg = {
    word: coef for word, coef in zip(
        result_hl_neg[1].get_feature_names(), result_hl_neg[0].coef_[0]
    )    
}

In [61]:
for best_positive in sorted(
    feature_to_coef_hl_neg.items(), 
    key=lambda x: x[1], reverse=True, 
    )[:10]:
    print (best_positive)

('optimisticvalue', 2.054611455174004)
('buy', 1.309284254694718)
('good', 1.219017906624262)
('close', 1.159741976205025)
('party', 0.8992120340128902)
('firework', 0.8896525784428868)
('jump', 0.8849633864320349)
('rise', 0.8667432167268505)
('come', 0.8637719401616092)
('another', 0.8542424216090926)


In [62]:
for best_negative in sorted(
    feature_to_coef_hl_neg.items(), 
    key=lambda x: x[1] 
    )[:10]:
    print (best_negative)

('sell', -1.2219976181921164)
('know', -1.1572584436833175)
('wait', -1.1111493883131662)
('intraday', -1.0451686078580784)
('fall', -1.0343017475699385)
('need', -1.0004581426100616)
('well', -0.964731815712976)
('many', -0.8402754586377033)
('yet', -0.8068871504245725)
('look', -0.7768099569608871)


## Results for neutral mapped to positive

In [63]:
feature_to_coef_hl_pos = {
    word: coef for word, coef in zip(
        result_hl_pos[1].get_feature_names(), result_hl_pos[0].coef_[0]
    )    
}

In [64]:
for best_positive in sorted(
    feature_to_coef_hl_pos.items(), 
    key=lambda x: x[1], reverse=True, 
    )[:10]:
    print (best_positive)

('nan', 1.2197711953820918)
('kumar', 0.994170061331511)
('lol', 0.9730095966195736)
('bullish', 0.9569605129483826)
('optimisticvalue', 0.9383160947355668)
('chart', 0.9116094187060659)
('buy', 0.8200277432371164)
('good', 0.8093041339654273)
('currently', 0.7984330838667806)
('think', 0.7918177961598891)


In [65]:
for best_negative in sorted(
    feature_to_coef_hl_pos.items(), 
    key=lambda x: x[1] 
    )[:10]:
    print (best_negative)

('pessimisticvalue', -1.4039932012117189)
('level', -1.1132433714621937)
('low', -1.0979359224760457)
('fall', -1.0963148127269406)
('bond', -1.0911445207100003)
('short', -1.026077249250611)
('lose', -1.0228452650079287)
('sell', -1.0031698886522442)
('warn', -0.9455212131556906)
('investor', -0.9259313829289989)


## Labeling new comments


In [66]:
def assign_coef(comment, feature):
        comment = comment.split(', ')
        #print(comment)
        coef = []
        for word in comment:
            for w,c in feature.items():
                if w == word:    
                    coef.append(c)
        #print(comment,sum(coef))
        if len(coef) == 0:
            return sum(coef)
        else:
            return sum(coef) / len(coef)


def get_sentiment(compound_score):
    if compound_score > 0:
        return 1
    else:
        return 0


def add_label(df, feature):
    
    df['COEF'] = df['COMMENT'].apply(lambda x: assign_coef(x,feature))
    df['LABEL'] = df['COEF'].apply(lambda x: get_sentiment(x))
    
    return df

## BITCOIN

Labeled to pos

In [67]:
df_bitlabeled_pos=add_label(df_bit, feature_to_coef_hl_pos)


In [68]:
df_bitlabeled_pos.head()

Unnamed: 0,date,COMMENT,LABEL,COEF
0,04-05-19,"hold, optimisticvalue, becomes, distinct, poss...",1,0.559575
1,04-05-19,"thumb, nasty, bear, love, get, spank, cryptoland",0,-0.187625
2,04-05-19,"mysterious, crypto, whale, unveils, meteoric, ...",0,-0.053632
3,04-05-19,"probably, pessimisticvalue, monday",0,-0.441968
4,04-05-19,"short, btc, maßsive, profit",0,-0.2829


In [69]:
df_bitlabeled_pos['LABEL'].value_counts()

0    13295
1     9387
Name: LABEL, dtype: int64

In [71]:
df_bitlabeled_pos.to_excel("ML_Data/bit_classified_pos.xlsx", sheet_name='Sheet_1') 

Labeled to neg

In [72]:
df_bitlabeled_neg=add_label(df_bit, feature_to_coef_hl_neg)

In [73]:
df_bitlabeled_neg.head()

Unnamed: 0,date,COMMENT,LABEL,COEF
0,04-05-19,"hold, optimisticvalue, becomes, distinct, poss...",1,0.482363
1,04-05-19,"thumb, nasty, bear, love, get, spank, cryptoland",0,-0.31993
2,04-05-19,"mysterious, crypto, whale, unveils, meteoric, ...",0,-0.018088
3,04-05-19,"probably, pessimisticvalue, monday",0,-0.328881
4,04-05-19,"short, btc, maßsive, profit",0,-0.251865


In [74]:
df_bitlabeled_neg['LABEL'].value_counts()

0    12815
1     9867
Name: LABEL, dtype: int64

In [174]:
df_bitlabeled_neg['LABEL'].value_counts()

0    12233
1    10449
Name: LABEL, dtype: int64

In [40]:
df_bitlabeled_neg.to_excel("ML_Data/bit_classified_neg.xlsx", sheet_name='Sheet_1') 

Results for classifier 

In [75]:
result_bit_pos=classifier("lr","y","cv",df_bitlabeled_pos)

In [76]:
result_bit_pos[2:4]

('Accuracy: 0.9418117698919991', 'F1 0.941623447254783')

In [77]:
result_bit_neg=classifier("lr","y","cv",df_bitlabeled_pos)

In [78]:
result_bit_neg[2:4]

('Accuracy: 0.9418117698919991', 'F1 0.941623447254783')

In [113]:
df=pd.DataFrame(result_bit_neg[4])

### Bitcoin result mapped, neutral mapped to positive

In [79]:
feature_to_coef_bit_pos = {
    word: coef for word, coef in zip(
        result_bit_pos[1].get_feature_names(), result_bit_pos[0].coef_[0]
    )    
}

In [80]:
for best_positive in sorted(
    feature_to_coef_bit_pos.items(), 
    key=lambda x: x[1], reverse=True, 
    )[:10]:
    print (best_positive)

('optimisticvalue', 9.069971461058072)
('buy', 6.53253854010206)
('good', 6.091401922109993)
('close', 5.194592694561069)
('come', 4.638013357593434)
('another', 4.125160711652745)
('rise', 3.9511575092092937)
('end', 3.7407835437519594)
('jump', 3.739000325200053)
('hold', 3.449158124622038)


In [81]:
for best_negative in sorted(
    feature_to_coef_bit_pos.items(), 
    key=lambda x: x[1] 
    )[:10]:
    print (best_negative)

('sell', -5.67007255256357)
('wait', -5.018186663038611)
('know', -4.933466486390538)
('fall', -4.721715832854903)
('need', -4.6459294601970935)
('well', -4.400763358177993)
('look', -3.8024955434280696)
('low', -3.759205256552319)
('many', -3.677106380009633)
('try', -3.5544266413262946)


### Bitcoin result mapped, neutral mapped to negative

In [107]:
feature_to_coef_bit_neg = {
    word: coef for word, coef in zip(
        result_bit_neg[1].get_feature_names(), result_bit_neg[0].coef_[0]
    )    
}

In [108]:
for best_positive in sorted(
    feature_to_coef_bit_neg.items(), 
    key=lambda x: x[1], reverse=True, 
    )[:10]:
    print (best_positive)

('optimisticvalue', 9.069971461058072)
('buy', 6.53253854010206)
('good', 6.091401922109993)
('close', 5.194592694561069)
('come', 4.638013357593434)
('another', 4.125160711652745)
('rise', 3.9511575092092937)
('end', 3.7407835437519594)
('jump', 3.739000325200053)
('hold', 3.449158124622038)


In [109]:
for best_negative in sorted(
    feature_to_coef_bit_neg.items(), 
    key=lambda x: x[1] 
    )[:10]:
    print (best_negative)

('sell', -5.67007255256357)
('wait', -5.018186663038611)
('know', -4.933466486390538)
('fall', -4.721715832854903)
('need', -4.6459294601970935)
('well', -4.400763358177993)
('look', -3.8024955434280696)
('low', -3.759205256552319)
('many', -3.677106380009633)
('try', -3.5544266413262946)


## Tesla

In [176]:
df_teslabeled_pos=add_label(df_ts, feature_to_coef_hl_pos)


Labeled to pos

In [110]:
df_teslabeled_pos=add_label(df_ts, feature_to_coef_hl_pos)


In [111]:
df_teslabeled_pos.head()

Unnamed: 0,date,COMMENT,LABEL,COEF
0,03-05-19,"tell, ratio, bull, bear, comment, section, bas...",1,0.042675
1,03-05-19,"right, total, debt, billion, dollar",0,-0.255872
2,03-05-19,"dont, know, day, tesla, hasnt, make, solid, pr...",0,-0.250183
3,03-05-19,"day, great, way, end, week, cheer, cleaner, fu...",0,-0.082739
4,03-05-19,"tesla, waymo, uber, mercedes, ge, batterelectr...",1,0.066279


In [112]:
df_teslabeled_pos['LABEL'].value_counts()

0    2763
1    1444
Name: LABEL, dtype: int64

In [179]:
df_teslabeled_pos['LABEL'].value_counts()

0    2464
1    1743
Name: LABEL, dtype: int64

In [180]:
df_teslabeled_pos.to_excel("ML_Data/tes_classified_pos.xlsx", sheet_name='Sheet_1') 

Labeled to neg

In [181]:
df_teslabeled_neg=add_label(df_ts, feature_to_coef_hl_neg)

In [182]:
df_teslabeled_neg.head()

Unnamed: 0,date,COMMENT,LABEL,COEF
0,03-05-19,"tell, ratio, bull, bear, comment, section, bas...",0,-0.141945
1,03-05-19,"right, total, debt, billion, dollar",0,-0.831411
2,03-05-19,"dont, know, day, tesla, hasnt, make, solid, pr...",0,-2.396838
3,03-05-19,"day, great, way, end, week, cheer, cleaner, fu...",1,0.844712
4,03-05-19,"tesla, waymo, uber, mercedes, ge, batterelectr...",1,2.302141


In [183]:
df_teslabeled_neg.to_excel("ML_Data/tes_classified_neg.xlsx", sheet_name='Sheet_1') 

In [73]:
result_tes_pos=classifier("lr","y","cv",df_teslabeled_pos)

In [74]:
result_tes_neg=classifier("lr","y","cv",df_teslabeled_neg)

In [86]:
result_tes_pos[2:4]

('Accuracy: 0.8919239904988123', 'F1 0.8914246003167761')

In [88]:
result_tes_neg[2:4]

('Accuracy: 0.9121140142517815', 'F1 0.911857361407464')

### Tesla result mapped, neutral mapped to positive

In [75]:
feature_to_coef_tes_pos = {
    word: coef for word, coef in zip(
        result_tes_pos[1].get_feature_names(), result_tes_pos[0].coef_[0]
    )    
}

In [76]:
for best_positive in sorted(
    feature_to_coef_tes_pos.items(), 
    key=lambda x: x[1], reverse=True, 
    )[:10]:
    print (best_positive)

('optimisticvalue', 5.237980426137993)
('buy', 3.254523656833232)
('hold', 3.2134451255218317)
('come', 3.15356919295822)
('good', 3.0031003014560467)
('go', 2.4898691895762557)
('hour', 2.3925249651576097)
('buying', 2.1704773616104394)
('close', 1.99301537007207)
('ok', 1.968232448930962)


In [77]:
for best_negative in sorted(
    feature_to_coef_tes_pos.items(), 
    key=lambda x: x[1] 
    )[:10]:
    print (best_negative)

('sell', -3.113630611835505)
('know', -2.558442440560808)
('one', -2.5009558992494454)
('yet', -2.1690605027328735)
('need', -2.165512188969157)
('bad', -2.0168996742788914)
('try', -1.8843146907282229)
('fall', -1.8818429346322458)
('pessimisticvalue', -1.8032735692601785)
('car', -1.7742209455915134)


## Dow Jones

Labeled to pos

In [184]:
df_djlabeled_pos=add_label(df_dj, feature_to_coef_hl_pos)

In [185]:
df_djlabeled_pos.head()

Unnamed: 0,date,COMMENT,LABEL,COEF
0,03-05-19,"stock, market, gold, silver, one, lie, answer,...",0,-2.464765
1,03-05-19,"company, pay, income, tax, amazon, delta, air,...",1,1.072745
2,03-05-19,"gold, silver, interesting",0,-0.206652
3,03-05-19,"melt, up？crazy",0,0.0
4,03-05-19,"crazy, money, flow, daily, show, get, yet, mar...",0,-0.467521


In [186]:
df_djlabeled_pos['LABEL'].value_counts()

0    7553
1    5476
Name: LABEL, dtype: int64

In [187]:
df_djlabeled_pos.to_excel("ML_Data/dj_classified_pos.xlsx", sheet_name='Sheet_1') 

Labeled to neg

In [188]:
df_djlabeled_neg=add_label(df_dj, feature_to_coef_hl_neg)

In [189]:
df_djlabeled_neg.head()

Unnamed: 0,date,COMMENT,LABEL,COEF
0,03-05-19,"stock, market, gold, silver, one, lie, answer,...",0,-1.375268
1,03-05-19,"company, pay, income, tax, amazon, delta, air,...",1,0.055596
2,03-05-19,"gold, silver, interesting",1,0.441337
3,03-05-19,"melt, up？crazy",0,0.0
4,03-05-19,"crazy, money, flow, daily, show, get, yet, mar...",0,-1.416995


In [190]:
df_djlabeled_neg['LABEL'].value_counts()

0    7329
1    5700
Name: LABEL, dtype: int64

In [191]:
df_djlabeled_neg.to_excel("ML_Data/dj_classified_neg.xlsx", sheet_name='Sheet_1') 

Results for classifier 

In [80]:
result_dj_pos=classifier("lr","y","cv",df_djlabeled_pos)

In [81]:
result_dj_neg=classifier("lr","y","cv",df_djlabeled_neg)

In [91]:
result_dj_pos[2:4]

('Accuracy: 0.9405218726016884', 'F1 0.9403874234070891')

In [92]:
result_dj_neg[2:4]

('Accuracy: 0.9282425172678435', 'F1 0.927937611947455')

### Dow jones result mapped, neutral mapped to positive

In [82]:
feature_to_coef_dj_pos = {
    word: coef for word, coef in zip(
        result_dj_pos[1].get_feature_names(), result_dj_pos[0].coef_[0]
    )    
}

In [89]:
for best_positive in sorted(
    feature_to_coef_dj_pos.items(), 
    key=lambda x: x[1], reverse=True, 
    )[:10]:
    print (best_positive)

('optimisticvalue', 8.540239418553975)
('come', 4.943839737513353)
('buy', 4.5748817121086605)
('good', 4.57241670714433)
('hold', 4.331025666951921)
('close', 3.8068796384917283)
('go', 3.3586561581872085)
('cover', 3.1961506969535924)
('soon', 2.9161872489876695)
('buying', 2.8956733451183583)


In [90]:
for best_negative in sorted(
    feature_to_coef_tes_pos.items(), 
    key=lambda x: x[1] 
    )[:10]:
    print (best_negative)

('sell', -3.113630611835505)
('know', -2.558442440560808)
('one', -2.5009558992494454)
('yet', -2.1690605027328735)
('need', -2.165512188969157)
('bad', -2.0168996742788914)
('try', -1.8843146907282229)
('fall', -1.8818429346322458)
('pessimisticvalue', -1.8032735692601785)
('car', -1.7742209455915134)
