In [1]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score, f1_score
from sklearn.linear_model import LogisticRegression
from sklearn import svm

In [2]:
df_pos = pd.read_excel('data/output/clean_handlabeled_data.xlsx')
df_neg = pd.read_excel('data/output/clean_handlabeled_data.xlsx')

Raghava

In [3]:
df_bit = pd.read_excel("data/output/clean_bit_data.xlsx") 
df_ts = pd.read_excel("data/output/clean_tesla_data.xlsx") 
df_dj = pd.read_excel("data/output/clean_dowjones_data.xlsx") 

In [4]:
df_bit = df_bit[pd.notnull(df_bit['COMMENT'])]
df_ts = df_ts[pd.notnull(df_ts['COMMENT'])]
df_dj = df_dj[pd.notnull(df_dj['COMMENT'])]

df_pos = df_pos[pd.notnull(df_pos['LABEL'])]
df_neg = df_neg[pd.notnull(df_neg['LABEL'])]


In [5]:
#df_bit.info()

## Classifier

In [6]:
df_pos['LABEL'] = df_pos['LABEL'].map({'NEU':'POS','POS':'POS','NEG':'NEG'})

In [7]:
#df_pos.info()

In [8]:
df_neg['LABEL'] = df_neg['LABEL'].map({'NEU':'NEG','POS':'POS','NEG':'NEG'})

In [9]:
def classifier(clf,strat,vec,df):
    
    y = df['LABEL'].astype(str)
    X = df.COMMENT.astype(str)
    #y = pd.get_dummies(y)
    
    if vec == "cv":
        count = CountVectorizer()
    elif vec == "tdidf":
        count = TfidfVectorizer(binary=True, ngram_range=(1,1), use_idf=False)
    
    if strat == "y":
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2,random_state=42)
        X_train_counts = count.fit_transform(X_train)
        #print(X_train_counts)
        X_test_counts = count.transform(X_test)

        
    elif strat == "n":
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2) 
        X_train_counts = count.fit_transform(X_train)
        X_test_counts = count.transform(X_test)

    
    if clf == "lr":
        clf = LogisticRegression(solver="lbfgs",C=0.5).fit(X_train_counts, y_train)
        #clf = LogisticRegression(C=0.4).fit(X_train_counts, y_train)
        
    elif clf == "mnb":
        clf = MultinomialNB().fit(X_train_counts, y_train)
        #clf = GaussianNB().fit(X_train_counts, y_train)
        
    
    elif clf == "svm":
        clf = svm.SVC(kernel='linear').fit(X_train_counts, y_train)

    yPred = clf.predict(X_test_counts)
    yPred_log = clf.predict_proba(X_test_counts)
    
    list_P = list(zip(yPred, X_test))
    list_T = list(zip(yPred, y_test, X_test)) 
    
    
    acc=accuracy_score(y_test,yPred)
    f1=f1_score(y_test,yPred, average='weighted')

    return clf, count,"Accuracy: %s" % acc, "F1 %s" % f1, list_P, list_T

In [10]:
result_hl_neg=classifier("lr","y","cv",df_neg)
result_hl_pos=classifier("lr","y","cv",df_pos)

df_result_hl_neg = pd.DataFrame(result_hl_neg[5], columns = ['Predicted_label', 'True_Label', 'Comment'])

In [11]:
result_hl_neg[2:4]

('Accuracy: 0.775', 'F1 0.7352774163019046')

In [12]:
df_result_hl_pos = pd.DataFrame(result_hl_pos[5], columns = ['Predicted_label', 'True_Label', 'Comment'])
#df_result_hl_pos

In [13]:
result_hl_pos[2:4]

('Accuracy: 0.69', 'F1 0.6835891544117646')

## Results for neutral mapped to negative

In [14]:
result_hl_neg[0].classes_

array(['NEG', 'POS'], dtype=object)

In [15]:
feature_to_coef_hl_neg = {
    word: coef for word, coef in zip(
        result_hl_neg[1].get_feature_names(), result_hl_neg[0].coef_[0]
    )    
}

In [16]:
for best_positive in sorted(
    feature_to_coef_hl_neg.items(), 
    key=lambda x: x[1], reverse=True, 
    )[:10]:
    print (best_positive)

('optimisticvalue', 1.6633055330297633)
('buy', 1.1088551070091754)
('good', 0.9863128707972757)
('close', 0.7874933644493388)
('end', 0.6570643647082953)
('come', 0.6475146249303012)
('rise', 0.6231366668155226)
('party', 0.6177713127833961)
('jump', 0.5748821888029564)
('another', 0.56154129065098)


In [17]:
for best_negative in sorted(
    feature_to_coef_hl_neg.items(), 
    key=lambda x: x[1] 
    )[:10]:
    print (best_negative)

('know', -0.9001195381818783)
('sell', -0.8651384674148198)
('fall', -0.8005000603736819)
('wait', -0.7169378940220923)
('need', -0.7138591775106807)
('well', -0.6863833362578536)
('intraday', -0.6257504102456345)
('look', -0.6000433994396605)
('many', -0.5805928825229436)
('yet', -0.5623536651639781)


## Results for neutral mapped to positive

In [18]:
feature_to_coef_hl_pos = {
    word: coef for word, coef in zip(
        result_hl_pos[1].get_feature_names(), result_hl_pos[0].coef_[0]
    )    
}

In [19]:
for best_positive in sorted(
    feature_to_coef_hl_pos.items(), 
    key=lambda x: x[1], reverse=True, 
    )[:10]:
    print (best_positive)

('nan', 0.9881546785457501)
('optimisticvalue', 0.7775493888266164)
('lol', 0.7098935341446149)
('chart', 0.7068419329653763)
('buy', 0.7057758366751355)
('kumar', 0.6898600859647903)
('think', 0.6110897431290209)
('good', 0.6078266033174625)
('bullish', 0.5906734266535262)
('long', 0.5764122489146538)


In [20]:
for best_negative in sorted(
    feature_to_coef_hl_pos.items(), 
    key=lambda x: x[1] 
    )[:10]:
    print (best_negative)

('pessimisticvalue', -1.168371663604319)
('fall', -0.8957375293939392)
('low', -0.836442313482042)
('level', -0.7868794656092445)
('short', -0.77744706604852)
('sell', -0.7526709289877251)
('bond', -0.7293278743794027)
('lose', -0.7099183201879716)
('one', -0.7088252982021231)
('investor', -0.6837972550044226)


## Labeling new comments


In [21]:
def assign_coef(comment, feature):
        comment = comment.split(', ')
        #print(comment)
        coef = []
        for word in comment:
            for w,c in feature.items():
                if w == word:    
                    coef.append(c)
        #print(comment,sum(coef))
        if len(coef) == 0:
            return sum(coef)
        else:
            return sum(coef) / len(coef)


def get_sentiment(compound_score):
    if compound_score > 0:
        return 1
    else:
        return 0


def add_label(df, feature):
    
    df['COEF'] = df['COMMENT'].apply(lambda x: assign_coef(x,feature))
    df['LABEL'] = df['COEF'].apply(lambda x: get_sentiment(x))
    
    return df

## BITCOIN

Labeled to pos

In [22]:
df_bitlabeled_pos=add_label(df_bit, feature_to_coef_hl_pos)


In [23]:
df_bitlabeled_pos.head()

Unnamed: 0,date,COMMENT,LABEL,COEF
0,04-05-19,"hold, optimisticvalue, becomes, distinct, poss...",1,0.406405
1,04-05-19,"thumb, nasty, bear, love, get, spank, cryptoland",0,-0.13815
2,04-05-19,"mysterious, crypto, whale, unveils, meteoric, ...",0,-0.021925
3,04-05-19,"probably, pessimisticvalue, monday",0,-0.360791
4,04-05-19,"short, btc, maßsive, profit",0,-0.21524


In [24]:
df_bitlabeled_pos['LABEL'].value_counts()

0    13429
1     9253
Name: LABEL, dtype: int64

In [25]:
df_bitlabeled_pos.to_excel("ML_Data/bit_classified_pos.xlsx", sheet_name='Sheet_1') 

Labeled to neg

In [26]:
df_bitlabeled_neg=add_label(df_bit, feature_to_coef_hl_neg)

In [27]:
df_bitlabeled_neg.head()

Unnamed: 0,date,COMMENT,LABEL,COEF
0,04-05-19,"hold, optimisticvalue, becomes, distinct, poss...",1,0.388685
1,04-05-19,"thumb, nasty, bear, love, get, spank, cryptoland",0,-0.236963
2,04-05-19,"mysterious, crypto, whale, unveils, meteoric, ...",1,0.013416
3,04-05-19,"probably, pessimisticvalue, monday",0,-0.276061
4,04-05-19,"short, btc, maßsive, profit",0,-0.159362


In [28]:
df_bitlabeled_neg['LABEL'].value_counts()

0    12654
1    10028
Name: LABEL, dtype: int64

In [29]:
df_bitlabeled_neg['LABEL'].value_counts()

0    12654
1    10028
Name: LABEL, dtype: int64

In [30]:
df_bitlabeled_neg.to_excel("ML_Data/bit_classified_neg.xlsx", sheet_name='Sheet_1') 

Results for classifier 

In [31]:
result_bit_pos=classifier("lr","y","cv",df_bitlabeled_pos)

In [32]:
result_bit_pos[2:4]

('Accuracy: 0.9404893101168172', 'F1 0.9402554084862831')

In [33]:
result_bit_neg=classifier("lr","y","cv",df_bitlabeled_pos)

In [34]:
result_bit_neg[2:4]

('Accuracy: 0.9404893101168172', 'F1 0.9402554084862831')

In [35]:
df=pd.DataFrame(result_bit_neg[4])

### Bitcoin result mapped, neutral mapped to positive

In [36]:
feature_to_coef_bit_pos = {
    word: coef for word, coef in zip(
        result_bit_pos[1].get_feature_names(), result_bit_pos[0].coef_[0]
    )    
}

In [37]:
for best_positive in sorted(
    feature_to_coef_bit_pos.items(), 
    key=lambda x: x[1], reverse=True, 
    )[:10]:
    print (best_positive)

('optimisticvalue', 7.408290596728253)
('buy', 6.067280663352652)
('good', 5.5454051676754865)
('come', 3.9313699312826964)
('close', 3.909843098542778)
('end', 3.478791512914319)
('rise', 3.148596539385836)
('another', 2.8695622733308754)
('go', 2.82430299774397)
('start', 2.6937403322046736)


In [38]:
for best_negative in sorted(
    feature_to_coef_bit_pos.items(), 
    key=lambda x: x[1] 
    )[:10]:
    print (best_negative)

('sell', -4.301259411376448)
('know', -4.2668084054694555)
('fall', -3.8223914413455478)
('need', -3.7289688123295748)
('wait', -3.5289874804490493)
('pessimisticvalue', -3.317281739806295)
('well', -3.1797570196138243)
('look', -3.0832998035555583)
('low', -3.0732314507127168)
('many', -2.899791405696808)


### Bitcoin result mapped, neutral mapped to negative

In [39]:
feature_to_coef_bit_neg = {
    word: coef for word, coef in zip(
        result_bit_neg[1].get_feature_names(), result_bit_neg[0].coef_[0]
    )    
}

In [40]:
for best_positive in sorted(
    feature_to_coef_bit_neg.items(), 
    key=lambda x: x[1], reverse=True, 
    )[:10]:
    print (best_positive)

('optimisticvalue', 7.408290596728253)
('buy', 6.067280663352652)
('good', 5.5454051676754865)
('come', 3.9313699312826964)
('close', 3.909843098542778)
('end', 3.478791512914319)
('rise', 3.148596539385836)
('another', 2.8695622733308754)
('go', 2.82430299774397)
('start', 2.6937403322046736)


In [41]:
for best_negative in sorted(
    feature_to_coef_bit_neg.items(), 
    key=lambda x: x[1] 
    )[:10]:
    print (best_negative)

('sell', -4.301259411376448)
('know', -4.2668084054694555)
('fall', -3.8223914413455478)
('need', -3.7289688123295748)
('wait', -3.5289874804490493)
('pessimisticvalue', -3.317281739806295)
('well', -3.1797570196138243)
('look', -3.0832998035555583)
('low', -3.0732314507127168)
('many', -2.899791405696808)


## Tesla

In [42]:
df_teslabeled_pos=add_label(df_ts, feature_to_coef_hl_pos)


Labeled to pos

In [43]:
df_teslabeled_pos=add_label(df_ts, feature_to_coef_hl_pos)


In [44]:
df_teslabeled_pos.head()

Unnamed: 0,date,COMMENT,LABEL,COEF
0,03-05-19,"tell, ratio, bull, bear, comment, section, bas...",1,0.028203
1,03-05-19,"right, total, debt, billion, dollar",0,-0.162984
2,03-05-19,"dont, know, day, tesla, hasnt, make, solid, pr...",0,-0.203656
3,03-05-19,"day, great, way, end, week, cheer, cleaner, fu...",0,-0.060399
4,03-05-19,"tesla, waymo, uber, mercedes, ge, batterelectr...",1,0.044957


In [45]:
df_teslabeled_pos['LABEL'].value_counts()

0    2791
1    1416
Name: LABEL, dtype: int64

In [46]:
df_teslabeled_pos['LABEL'].value_counts()

0    2791
1    1416
Name: LABEL, dtype: int64

In [47]:
df_teslabeled_pos.to_excel("ML_Data/tes_classified_pos.xlsx", sheet_name='Sheet_1') 

Labeled to neg

In [48]:
df_teslabeled_neg=add_label(df_ts, feature_to_coef_hl_neg)

In [49]:
df_teslabeled_neg.head()

Unnamed: 0,date,COMMENT,LABEL,COEF
0,03-05-19,"tell, ratio, bull, bear, comment, section, bas...",0,-0.048429
1,03-05-19,"right, total, debt, billion, dollar",0,-0.10344
2,03-05-19,"dont, know, day, tesla, hasnt, make, solid, pr...",0,-0.169488
3,03-05-19,"day, great, way, end, week, cheer, cleaner, fu...",1,0.138365
4,03-05-19,"tesla, waymo, uber, mercedes, ge, batterelectr...",1,0.138239


In [50]:
df_teslabeled_neg.to_excel("ML_Data/tes_classified_neg.xlsx", sheet_name='Sheet_1') 

In [51]:
result_tes_pos=classifier("lr","y","cv",df_teslabeled_pos)

In [52]:
result_tes_neg=classifier("lr","y","cv",df_teslabeled_neg)

In [53]:
result_tes_pos[2:4]

('Accuracy: 0.8942992874109263', 'F1 0.8936117728477377')

In [54]:
result_tes_neg[2:4]

('Accuracy: 0.8942992874109263', 'F1 0.8936117728477377')

### Tesla result mapped, neutral mapped to positive

In [55]:
feature_to_coef_tes_pos = {
    word: coef for word, coef in zip(
        result_tes_pos[1].get_feature_names(), result_tes_pos[0].coef_[0]
    )    
}

In [56]:
for best_positive in sorted(
    feature_to_coef_tes_pos.items(), 
    key=lambda x: x[1], reverse=True, 
    )[:10]:
    print (best_positive)

('optimisticvalue', 4.015446103627258)
('buy', 3.774252722315349)
('good', 3.0209445209269763)
('close', 2.2740004618778342)
('come', 2.236379411728284)
('go', 1.9335739972290893)
('new', 1.5587963826163014)
('dont', 1.5208699415206761)
('end', 1.5006772912590813)
('next', 1.4996110667617244)


In [57]:
for best_negative in sorted(
    feature_to_coef_tes_pos.items(), 
    key=lambda x: x[1] 
    )[:10]:
    print (best_negative)

('sell', -2.4359389791943546)
('know', -2.3266012890416086)
('pessimisticvalue', -2.0993034875865324)
('fall', -1.793485208308045)
('look', -1.6995932609675102)
('wait', -1.4975881706282363)
('one', -1.497144918218633)
('well', -1.4889288985539648)
('need', -1.4344692067400397)
('level', -1.3899459030605943)


## Dow Jones

Labeled to pos

In [58]:
df_djlabeled_pos=add_label(df_dj, feature_to_coef_hl_pos)

In [59]:
df_djlabeled_pos.head()

Unnamed: 0,date,COMMENT,LABEL,COEF
0,03-05-19,"stock, market, gold, silver, one, lie, answer,...",0,-0.313185
1,03-05-19,"company, pay, income, tax, amazon, delta, air,...",0,-0.024678
2,03-05-19,"gold, silver, interesting",0,-0.110888
3,03-05-19,"melt, up？crazy",0,0.0
4,03-05-19,"crazy, money, flow, daily, show, get, yet, mar...",0,-0.002134


In [60]:
df_djlabeled_pos['LABEL'].value_counts()

0    7741
1    5288
Name: LABEL, dtype: int64

In [61]:
df_djlabeled_pos.to_excel("ML_Data/dj_classified_pos.xlsx", sheet_name='Sheet_1') 

Labeled to neg

In [62]:
df_djlabeled_neg=add_label(df_dj, feature_to_coef_hl_neg)

In [63]:
df_djlabeled_neg.head()

Unnamed: 0,date,COMMENT,LABEL,COEF
0,03-05-19,"stock, market, gold, silver, one, lie, answer,...",0,-0.209115
1,03-05-19,"company, pay, income, tax, amazon, delta, air,...",0,-0.110253
2,03-05-19,"gold, silver, interesting",1,0.2691
3,03-05-19,"melt, up？crazy",0,0.0
4,03-05-19,"crazy, money, flow, daily, show, get, yet, mar...",0,-0.057322


In [64]:
df_djlabeled_neg['LABEL'].value_counts()

0    6743
1    6286
Name: LABEL, dtype: int64

In [65]:
df_djlabeled_neg.to_excel("ML_Data/dj_classified_neg.xlsx", sheet_name='Sheet_1') 

Results for classifier 

In [66]:
result_dj_pos=classifier("lr","y","cv",df_djlabeled_pos)

In [67]:
result_dj_neg=classifier("lr","y","cv",df_djlabeled_neg)

In [68]:
result_dj_pos[2:4]

('Accuracy: 0.9259401381427476', 'F1 0.9258730910409005')

In [69]:
result_dj_neg[2:4]

('Accuracy: 0.9259401381427476', 'F1 0.9258730910409005')

### Dow jones result mapped, neutral mapped to positive

In [70]:
feature_to_coef_dj_pos = {
    word: coef for word, coef in zip(
        result_dj_pos[1].get_feature_names(), result_dj_pos[0].coef_[0]
    )    
}

In [71]:
for best_positive in sorted(
    feature_to_coef_dj_pos.items(), 
    key=lambda x: x[1], reverse=True, 
    )[:10]:
    print (best_positive)

('optimisticvalue', 5.93832225898062)
('buy', 5.333313636539722)
('good', 4.215020711320868)
('close', 4.071599447684046)
('come', 3.4543755542415067)
('end', 3.0778413480829996)
('go', 2.659352667602555)
('another', 2.5796694033035186)
('rise', 2.4785946914601396)
('start', 2.3918274891581968)


In [72]:
for best_negative in sorted(
    feature_to_coef_tes_pos.items(), 
    key=lambda x: x[1] 
    )[:10]:
    print (best_negative)

('sell', -2.4359389791943546)
('know', -2.3266012890416086)
('pessimisticvalue', -2.0993034875865324)
('fall', -1.793485208308045)
('look', -1.6995932609675102)
('wait', -1.4975881706282363)
('one', -1.497144918218633)
('well', -1.4889288985539648)
('need', -1.4344692067400397)
('level', -1.3899459030605943)
