In [35]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, f1_score
from sklearn.linear_model import LogisticRegression
from sklearn import svm

In [36]:
df_pos = pd.read_excel('data/output/clean_handlabeled_data.xlsx')
df_neg = pd.read_excel('data/output/clean_handlabeled_data.xlsx')

Raghava

In [37]:
df_bit = pd.read_excel("data/output/clean_bit_data.xlsx") 
df_ts = pd.read_excel("data/output/clean_tesla_data.xlsx") 
df_dj = pd.read_excel("data/output/clean_dowjones_data.xlsx") 

In [38]:
df_bit = df_bit[pd.notnull(df_bit['COMMENT'])]
df_ts = df_ts[pd.notnull(df_ts['COMMENT'])]
df_dj = df_dj[pd.notnull(df_dj['COMMENT'])]

df_pos = df_pos[pd.notnull(df_pos['LABEL'])]
df_neg = df_neg[pd.notnull(df_neg['LABEL'])]

In [39]:
df_bit.head()

Unnamed: 0,date,COMMENT,LABEL
0,04-05-19,"hold, optimisticvalue, becomes, distinct, poss...",
1,04-05-19,"thumb, nasty, bear, love, get, spank, cryptoland",
2,04-05-19,"mysterious, crypto, whale, unveils, meteoric, ...",
3,04-05-19,"probably, pessimisticvalue, monday",
4,04-05-19,"short, btc, maßsive, profit",


## Classifier

In [40]:
df_pos['LABEL'] = df_pos['LABEL'].map({'NEU':'POS','POS':'POS','NEG':'NEG'})

In [41]:
df_pos.head()

Unnamed: 0,COMMENT,LABEL
0,"thats, bull, say, since, pessimisticvalue, k, ...",NEG
1,"hello, guy, anyone, use, ig, market, issue, pl...",POS
2,"year, old, hand, pin, falcon, wing, door, subm...",POS
3,"time, breka, never, really, guys",NEG
4,"world, economy, th, final, one, trump, bankruptcy",NEG


In [42]:
df_neg['LABEL'] = df_neg['LABEL'].map({'NEU':'NEG','POS':'POS','NEG':'NEG'})

In [43]:
df_neg.head()

Unnamed: 0,COMMENT,LABEL
0,"thats, bull, say, since, pessimisticvalue, k, ...",NEG
1,"hello, guy, anyone, use, ig, market, issue, pl...",NEG
2,"year, old, hand, pin, falcon, wing, door, subm...",NEG
3,"time, breka, never, really, guys",NEG
4,"world, economy, th, final, one, trump, bankruptcy",NEG


In [44]:
def classifier(clf,strat,vec,df):
    
    y = df['LABEL'].astype(str)
    X = df.COMMENT.astype(str)
    #y = pd.get_dummies(y)
    
    if vec == "cv":
        count = CountVectorizer()
    elif vec == "tdidf":
        count = TfidfVectorizer(binary=True, ngram_range=(1,1), use_idf=False)
    
    if strat == "y":
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2,random_state=42)
        X_train_counts = count.fit_transform(X_train)

        X_test_counts = count.transform(X_test)

        
    elif strat == "n":
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2) 
        X_train_counts = count.fit_transform(X_train)
        X_test_counts = count.transform(X_test)

    
    if clf == "lr":
        clf = LogisticRegression(solver="newton-cg",C=1).fit(X_train_counts, y_train)
        #clf = LogisticRegression(C=0.4).fit(X_train_counts, y_train)
        
    elif clf == "mnb":
        clf = MultinomialNB().fit(X_train_counts, y_train)
        
    
    elif clf == "svm":
        clf = svm.SVC(kernel='linear').fit(X_train_counts, y_train)

    yPred = clf.predict(X_test_counts)
    yPred_log = clf.predict_proba(X_test_counts)
    
    list_ = list(zip(yPred, X_test))
    
    acc=accuracy_score(y_test,yPred)
    f1=f1_score(y_test,yPred, average='weighted')

    return clf, count,"Accuracy: %s" % acc, "F1 %s" % f1, list_, yPred_log

In [45]:
result_hl_neg=classifier("lr","y","cv",df_neg)
result_hl_pos=classifier("lr","y","cv",df_pos)

#df_result_hl = pd.DataFrame(result_hl[4], columns = ['Predicted_label', 'Comment'])

## Results for neutral mapped to negative

In [46]:
result_hl_neg[0].classes_

array(['NEG', 'POS'], dtype=object)

In [47]:
feature_to_coef_hl_neg = {
    word: coef for word, coef in zip(
        result_hl_neg[1].get_feature_names(), result_hl_neg[0].coef_[0]
    )    
}

In [48]:
for best_positive in sorted(
    feature_to_coef_hl_neg.items(), 
    key=lambda x: x[1], reverse=True, 
    )[:10]:
    print (best_positive)

('optimisticvalue', 2.7145083657692686)
('come', 1.0961292733510961)
('hold', 1.0679674196517166)
('buy', 1.020816268033973)
('buying', 1.0034098955937554)
('good', 0.9735780649735009)
('firework', 0.9372279254387231)
('beautiful', 0.8973338073038175)
('cover', 0.8602513609399144)
('dip', 0.8460447827944467)


In [49]:
for best_negative in sorted(
    feature_to_coef_hl_neg.items(), 
    key=lambda x: x[1] 
    )[:10]:
    print (best_negative)

('know', -1.2924863501103778)
('sell', -1.1280357824214031)
('need', -1.0824113562979611)
('yet', -0.9119875769924252)
('one', -0.8789857629010837)
('many', -0.8503359114696245)
('take', -0.8414034901100296)
('last', -0.7755707766791274)
('low', -0.7696253755937699)
('try', -0.7666984086650084)


## Results for neutral mapped to positive

In [50]:
feature_to_coef_hl_pos = {
    word: coef for word, coef in zip(
        result_hl_pos[1].get_feature_names(), result_hl_pos[0].coef_[0]
    )    
}

In [51]:
for best_positive in sorted(
    feature_to_coef_hl_pos.items(), 
    key=lambda x: x[1], reverse=True, 
    )[:10]:
    print (best_positive)

('optimisticvalue', 1.1752182609415582)
('nan', 1.144959760135993)
('kumar', 1.0153703684787325)
('lol', 0.9835062943025485)
('hold', 0.9523235134991022)
('long', 0.925757224616499)
('guy', 0.9128486155419003)
('put', 0.9093551587611052)
('buy', 0.8233909258223885)
('eth', 0.8156784356697819)


In [52]:
for best_negative in sorted(
    feature_to_coef_hl_pos.items(), 
    key=lambda x: x[1] 
    )[:10]:
    print (best_negative)

('pessimisticvalue', -1.4407768537093855)
('level', -1.4056787681052145)
('fall', -1.096170520728939)
('bankruptcy', -1.0550437533775)
('sell', -1.0222387335352676)
('low', -1.0218587162027533)
('warn', -1.0120960520803493)
('without', -0.8520663845938539)
('investor', -0.8412913566162127)
('way', -0.8289507796724974)


## Labeling new comments


In [118]:
def assign_coef(comment, feature):
        comment = comment.split(', ')
        #print(comment)
        coef = []
        for word in comment:
            for w,c in feature.items():
                if w == word:
                    coef.append(c)
        #print(comment,sum(coef))
        return sum(coef)


def get_sentiment(compound_score):
    if compound_score > 0:
        return 1
    else:
        return 0


def add_label(df, feature):
    
    df['COEF'] = df['COMMENT'].apply(lambda x: assign_coef(x,feature))
    
    df['LABEL'] = df['COEF'].apply(lambda x: get_sentiment(x))
    
    return df

## BITCOIN

Labeled to pos

In [128]:
df_bitlabeled_pos=add_label(df_bit, feature_to_coef_hl_pos)


In [129]:
df_bitlabeled_pos.head()

Unnamed: 0,date,COMMENT,LABEL,COEF
0,04-05-19,"hold, optimisticvalue, becomes, distinct, poss...",1,2.623776
1,04-05-19,"thumb, nasty, bear, love, get, spank, cryptoland",0,-0.6018
2,04-05-19,"mysterious, crypto, whale, unveils, meteoric, ...",1,0.44658
3,04-05-19,"probably, pessimisticvalue, monday",0,-1.374982
4,04-05-19,"short, btc, maßsive, profit",0,-0.609115


In [130]:
df_bitlabeled_pos['LABEL'].value_counts()

0    12120
1    10562
Name: LABEL, dtype: int64

In [None]:
df_bitlabeled_pos.to_excel("ML_Data/bit_classified_pos.xlsx", sheet_name='Sheet_1') 

Labeled to neg

In [121]:
df_bitlabeled_neg=add_label(df_bit, feature_to_coef_hl_neg)

In [122]:
df_bitlabeled_neg.head()

Unnamed: 0,date,COMMENT,LABEL,COEF
0,04-05-19,"hold, optimisticvalue, becomes, distinct, poss...",1,3.024616
1,04-05-19,"thumb, nasty, bear, love, get, spank, cryptoland",0,-0.798447
2,04-05-19,"mysterious, crypto, whale, unveils, meteoric, ...",1,0.840196
3,04-05-19,"probably, pessimisticvalue, monday",0,-1.026745
4,04-05-19,"short, btc, maßsive, profit",1,0.130944


In [None]:
df_bitlabeled_neg['LABEL'].value_counts()

In [123]:
df_bitlabeled_neg.to_excel("ML_Data/bit_classified_neg.xlsx", sheet_name='Sheet_1') 

Results for classifier 

In [63]:
result_bit_pos=classifier("lr","y","cv",df_bitlabeled_pos)

In [84]:
result_bit_pos[2:4]

('Accuracy: 0.943134229667181', 'F1 0.9430249965665214')

In [64]:
result_bit_neg=classifier("lr","y","cv",df_bitlabeled_pos)

In [85]:
result_bit_neg[2:4]

('Accuracy: 0.9519506281683933', 'F1 0.9519027405085999')

### Bitcoin result mapped, neutral mapped to positive

In [67]:
feature_to_coef_bit_pos = {
    word: coef for word, coef in zip(
        result_bit_pos[1].get_feature_names(), result_bit_pos[0].coef_[0]
    )    
}

In [68]:
for best_positive in sorted(
    feature_to_coef_bit_pos.items(), 
    key=lambda x: x[1], reverse=True, 
    )[:10]:
    print (best_positive)

('optimisticvalue', 10.448498174869902)
('come', 5.5309449643145845)
('buy', 5.3025002942963075)
('hold', 4.9594003005378084)
('good', 4.946538698795578)
('buying', 4.349338440048048)
('bitcoin', 4.070501625769858)
('close', 3.898465854725408)
('cover', 3.831341516367673)
('soon', 3.624530194450121)


In [69]:
for best_negative in sorted(
    feature_to_coef_bit_pos.items(), 
    key=lambda x: x[1] 
    )[:10]:
    print (best_negative)

('know', -5.64748413706954)
('sell', -5.631504469423946)
('need', -4.644676866110371)
('take', -4.043689807135337)
('try', -4.0315031929983505)
('one', -4.024366196233891)
('many', -3.986269912720603)
('yet', -3.9645309734836274)
('last', -3.75212645617468)
('low', -3.5909646603354712)


## Tesla

In [131]:
df_teslabeled_pos=add_label(df_ts, feature_to_coef_hl_pos)


Labeled to pos

In [132]:
df_teslabeled_pos=add_label(df_ts, feature_to_coef_hl_pos)


In [133]:
df_teslabeled_pos.head()

Unnamed: 0,date,COMMENT,LABEL,COEF
0,03-05-19,"tell, ratio, bull, bear, comment, section, bas...",1,0.704567
1,03-05-19,"right, total, debt, billion, dollar",0,-0.799932
2,03-05-19,"dont, know, day, tesla, hasnt, make, solid, pr...",0,-2.799863
3,03-05-19,"day, great, way, end, week, cheer, cleaner, fu...",0,-1.239853
4,03-05-19,"tesla, waymo, uber, mercedes, ge, batterelectr...",1,1.261437


In [134]:
df_teslabeled_pos['LABEL'].value_counts()

0    2464
1    1743
Name: LABEL, dtype: int64

In [135]:
df_teslabeled_pos.to_excel("ML_Data/bit_classified_pos.xlsx", sheet_name='Sheet_1') 

Labeled to neg

In [136]:
df_teslabeled_neg=add_label(df_ts, feature_to_coef_hl_neg)

In [137]:
df_teslabeled_neg.head()

Unnamed: 0,date,COMMENT,LABEL,COEF
0,03-05-19,"tell, ratio, bull, bear, comment, section, bas...",0,-0.141945
1,03-05-19,"right, total, debt, billion, dollar",0,-0.831411
2,03-05-19,"dont, know, day, tesla, hasnt, make, solid, pr...",0,-2.396838
3,03-05-19,"day, great, way, end, week, cheer, cleaner, fu...",1,0.844712
4,03-05-19,"tesla, waymo, uber, mercedes, ge, batterelectr...",1,2.302141


In [138]:
df_teslabeled_neg.to_excel("ML_Data/bit_classified_neg.xlsx", sheet_name='Sheet_1') 

In [73]:
result_tes_pos=classifier("lr","y","cv",df_teslabeled_pos)

In [74]:
result_tes_neg=classifier("lr","y","cv",df_teslabeled_neg)

In [86]:
result_tes_pos[2:4]

('Accuracy: 0.8919239904988123', 'F1 0.8914246003167761')

In [88]:
result_tes_neg[2:4]

('Accuracy: 0.9121140142517815', 'F1 0.911857361407464')

### Tesla result mapped, neutral mapped to positive

In [75]:
feature_to_coef_tes_pos = {
    word: coef for word, coef in zip(
        result_tes_pos[1].get_feature_names(), result_tes_pos[0].coef_[0]
    )    
}

In [76]:
for best_positive in sorted(
    feature_to_coef_tes_pos.items(), 
    key=lambda x: x[1], reverse=True, 
    )[:10]:
    print (best_positive)

('optimisticvalue', 5.237980426137993)
('buy', 3.254523656833232)
('hold', 3.2134451255218317)
('come', 3.15356919295822)
('good', 3.0031003014560467)
('go', 2.4898691895762557)
('hour', 2.3925249651576097)
('buying', 2.1704773616104394)
('close', 1.99301537007207)
('ok', 1.968232448930962)


In [77]:
for best_negative in sorted(
    feature_to_coef_tes_pos.items(), 
    key=lambda x: x[1] 
    )[:10]:
    print (best_negative)

('sell', -3.113630611835505)
('know', -2.558442440560808)
('one', -2.5009558992494454)
('yet', -2.1690605027328735)
('need', -2.165512188969157)
('bad', -2.0168996742788914)
('try', -1.8843146907282229)
('fall', -1.8818429346322458)
('pessimisticvalue', -1.8032735692601785)
('car', -1.7742209455915134)


## Dow Jones

Labeled to pos

In [139]:
df_djlabeled_pos=add_label(df_dj, feature_to_coef_hl_pos)

In [140]:
df_djlabeled_pos.head()

Unnamed: 0,date,COMMENT,LABEL,COEF
0,03-05-19,"stock, market, gold, silver, one, lie, answer,...",0,-2.464765
1,03-05-19,"company, pay, income, tax, amazon, delta, air,...",1,1.072745
2,03-05-19,"gold, silver, interesting",0,-0.206652
3,03-05-19,"melt, up？crazy",0,0.0
4,03-05-19,"crazy, money, flow, daily, show, get, yet, mar...",0,-0.467521


In [141]:
df_djlabeled_pos['LABEL'].value_counts()

0    7553
1    5476
Name: LABEL, dtype: int64

In [142]:
df_djlabeled_pos.to_excel("ML_Data/bit_classified_pos.xlsx", sheet_name='Sheet_1') 

Labeled to neg

In [143]:
df_djlabeled_neg=add_label(df_dj, feature_to_coef_hl_neg)

In [144]:
df_djlabeled_neg.head()

Unnamed: 0,date,COMMENT,LABEL,COEF
0,04-05-19,"hold, optimisticvalue, becomes, distinct, poss...",1,3.024616
1,04-05-19,"thumb, nasty, bear, love, get, spank, cryptoland",0,-0.798447
2,04-05-19,"mysterious, crypto, whale, unveils, meteoric, ...",1,0.840196
3,04-05-19,"probably, pessimisticvalue, monday",0,-1.026745
4,04-05-19,"short, btc, maßsive, profit",1,0.130944


In [145]:
df_djlabeled_neg['LABEL'].value_counts()

0    12233
1    10449
Name: LABEL, dtype: int64

In [146]:
df_djlabeled_neg.to_excel("ML_Data/bit_classified_neg.xlsx", sheet_name='Sheet_1') 

Results for classifier 

In [80]:
result_dj_pos=classifier("lr","y","cv",df_djlabeled_pos)

In [81]:
result_dj_neg=classifier("lr","y","cv",df_djlabeled_neg)

In [91]:
result_dj_pos[2:4]

('Accuracy: 0.9405218726016884', 'F1 0.9403874234070891')

In [92]:
result_dj_neg[2:4]

('Accuracy: 0.9282425172678435', 'F1 0.927937611947455')

### Dow jones result mapped, neutral mapped to positive

In [82]:
feature_to_coef_dj_pos = {
    word: coef for word, coef in zip(
        result_dj_pos[1].get_feature_names(), result_dj_pos[0].coef_[0]
    )    
}

In [89]:
for best_positive in sorted(
    feature_to_coef_dj_pos.items(), 
    key=lambda x: x[1], reverse=True, 
    )[:10]:
    print (best_positive)

('optimisticvalue', 8.540239418553975)
('come', 4.943839737513353)
('buy', 4.5748817121086605)
('good', 4.57241670714433)
('hold', 4.331025666951921)
('close', 3.8068796384917283)
('go', 3.3586561581872085)
('cover', 3.1961506969535924)
('soon', 2.9161872489876695)
('buying', 2.8956733451183583)


In [90]:
for best_negative in sorted(
    feature_to_coef_tes_pos.items(), 
    key=lambda x: x[1] 
    )[:10]:
    print (best_negative)

('sell', -3.113630611835505)
('know', -2.558442440560808)
('one', -2.5009558992494454)
('yet', -2.1690605027328735)
('need', -2.165512188969157)
('bad', -2.0168996742788914)
('try', -1.8843146907282229)
('fall', -1.8818429346322458)
('pessimisticvalue', -1.8032735692601785)
('car', -1.7742209455915134)
