In [18]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, f1_score
from sklearn.linear_model import LogisticRegression
from sklearn import svm

In [19]:
df_pos = pd.read_excel('data/output/clean_handlabeled_data.xlsx')
df_neg = pd.read_excel('data/output/clean_handlabeled_data.xlsx')

Raghava

In [20]:
df_bit = pd.read_excel("data/output/clean_bit_data.xlsx") 
df_ts = pd.read_excel("data/output/clean_tesla_data.xlsx") 
df_dj = pd.read_excel("data/output/clean_dowjones_data.xlsx") 

In [21]:
df_bit = df_bit[pd.notnull(df_bit['COMMENT'])]


In [22]:
df_bit.head()

Unnamed: 0,date,COMMENT,LABEL
0,04-05-19,"hold, optimisticvalue, becomes, distinct, poss...",
1,04-05-19,"thumb, nasty, bear, love, get, spank, cryptoland",
2,04-05-19,"mysterious, crypto, whale, unveils, meteoric, ...",
3,04-05-19,"probably, pessimisticvalue, monday",
4,04-05-19,"short, btc, maßsive, profit",


## Classifier

In [23]:
df_pos['LABEL'] = df_pos['LABEL'].map({'NEU':'POS','POS':'POS','NEG':'NEG'})

In [24]:
df_neg['LABEL'] = df_neg['LABEL'].map({'NEU':'NEG','POS':'POS','NEG':'NEG'})

In [25]:
def classifier(clf,strat,vec,df):
    
    y = df['LABEL'].astype(str)
    X = df.COMMENT.astype(str)
    #y = pd.get_dummies(y)
    
    if vec == "cv":
        count = CountVectorizer()
    elif vec == "tdidf":
        count = TfidfVectorizer(binary=True, ngram_range=(1,1), use_idf=False)
    
    if strat == "y":
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y)
        X_train_counts = count.fit_transform(X_train)

        X_test_counts = count.transform(X_test)

        
    elif strat == "n":
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42) 
        X_train_counts = count.fit_transform(X_train)
        X_test_counts = count.transform(X_test)

    
    if clf == "lr":
        clf = LogisticRegression(solver="newton-cg",C=1).fit(X_train_counts, y_train)
        #clf = LogisticRegression(C=0.4).fit(X_train_counts, y_train)
        
    elif clf == "mnb":
        clf = MultinomialNB().fit(X_train_counts, y_train)
        
    
    elif clf == "svm":
        clf = svm.SVC(kernel='linear').fit(X_train_counts, y_train)

    yPred = clf.predict(X_test_counts)
    yPred_log = clf.predict_proba(X_test_counts)
    
    list_ = list(zip(yPred, X_test))
    
    acc=accuracy_score(y_test,yPred)
    f1=f1_score(y_test,yPred, average='weighted')

    return clf, count,"Accuracy: %s" % acc, "F1 %s" % f1, list_, yPred_log

In [26]:
result_hl_neg=classifier("lr","y","cv",df_neg)
result_hl_pos=classifier("lr","y","cv",df_pos)

#df_result_hl = pd.DataFrame(result_hl[4], columns = ['Predicted_label', 'Comment'])



## Results for neutral mapped to negative

In [27]:
feature_to_coef_hl_neg = {
    word: coef for word, coef in zip(
        result_hl_neg[1].get_feature_names(), result_hl_neg[0].coef_[0]
    )    
}

In [28]:
for best_positive in sorted(
    feature_to_coef_hl_neg.items(), 
    key=lambda x: x[1], reverse=True, 
    )[:10]:
    print (best_positive)

('sell', 1.3469036233955975)
('know', 1.1025283992193007)
('yet', 1.0113021384029357)
('fall', 0.8980452007292674)
('need', 0.7589709486020646)
('low', 0.7543595307308957)
('intraday', 0.7529236191003188)
('anyone', 0.7349947801898926)
('many', 0.7172859012506345)
('work', 0.7159181282621839)


In [29]:
for best_negative in sorted(
    feature_to_coef_hl_neg.items(), 
    key=lambda x: x[1] 
    )[:10]:
    print (best_negative)

('optimisticvalue', -2.6418419543467526)
('buy', -1.3912036445863203)
('rise', -1.222858022520405)
('good', -1.1070113479794592)
('jump', -1.0197063011953424)
('growth', -0.9503048688353173)
('hold', -0.9448394020480498)
('train', -0.9254667206579154)
('dip', -0.8771018261063488)
('happy', -0.8760628688854692)


## Results for neutral mapped to positive

In [30]:
feature_to_coef_hl_pos = {
    word: coef for word, coef in zip(
        result_hl_pos[1].get_feature_names(), result_hl_pos[0].coef_[0]
    )    
}

In [31]:
for best_positive in sorted(
    feature_to_coef_hl_pos.items(), 
    key=lambda x: x[1], reverse=True, 
    )[:10]:
    print (best_positive)

('low', 1.475503089696932)
('pessimisticvalue', 1.454003745256364)
('sell', 1.206198028648288)
('way', 1.1610869243105986)
('level', 1.0764770229389657)
('warn', 0.9488375911127448)
('short', 0.9311341723107671)
('lose', 0.8935537382758374)
('really', 0.8823657723915983)
('investor', 0.8766818539130368)


In [32]:
for best_negative in sorted(
    feature_to_coef_hl_pos.items(), 
    key=lambda x: x[1] 
    )[:10]:
    print (best_negative)

('nan', -1.2310570375681982)
('optimisticvalue', -1.1032100802889768)
('hold', -1.0100532974484633)
('think', -0.9999745223686365)
('kumar', -0.9347827880920723)
('long', -0.8823410025633482)
('chart', -0.8731210453053964)
('put', -0.8087267836059421)
('talk', -0.7977309466373406)
('bore', -0.7937919054565385)


## Labeling new comments


In [36]:
def assign_coef(comment, feature):
        comment = comment.split(', ')
        #print(comment)
        coef = []
        for word in comment:
            for w,c in feature.items():
                if w == word:
                    coef.append(c)
        return sum(coef)


def get_sentiment(compound_score):
    if compound_score > 0:
        return 1
    else:
        return 0


def add_label(df, feature):
    
    df['COEF'] = df['COMMENT'].apply(lambda x: assign_coef(x,feature))
    df['LABEL'] = df['COEF'].apply(lambda x: get_sentiment(x))
    
    return df

## BITCOIN

In [37]:
df_bitlabeled_pos=add_label(df_bit, feature_to_coef_hl_pos)
df_bitlabeled_neg=add_label(df_bit, feature_to_coef_hl_neg)

In [None]:
to_excel("data/output/clean_bit_data.xlsx", sheet_name='Sheet_1') 

In [None]:
result_bit_pos=classifier("lr","y","cv",df_bitlabeled_pos)

In [None]:
result_bit_neg=classifier("lr","y","cv",df_bitlabeled_pos)

### Bitcoin result mapped, neutral mapped to positive

In [214]:
feature_to_coef_bit_pos = {
    word: coef for word, coef in zip(
        result_bit_pos[1].get_feature_names(), result_bit_pos[0].coef_[0]
    )    
}

In [215]:
for best_positive in sorted(
    feature_to_coef_bit_pos.items(), 
    key=lambda x: x[1], reverse=True, 
    )[:10]:
    print (best_positive)

('optimisticvalue', 8.536060901941065)
('buy', 6.545442530590317)
('rise', 4.929929974434978)
('trend', 4.7463539619898905)
('close', 3.939052002639244)
('hold', 3.778934382839763)
('come', 3.7090573824294406)
('add', 3.7058638227861023)
('bitcoin', 3.6607861476085)
('bounce', 3.6147214375855645)


In [216]:
for best_negative in sorted(
    feature_to_coef_bit_pos.items(), 
    key=lambda x: x[1] 
    )[:10]:
    print (best_negative)

('know', -5.660962660305671)
('sell', -5.650913691475601)
('low', -4.7566105833886825)
('fall', -4.46418508102759)
('try', -4.065880573385342)
('wait', -3.8393292629173286)
('yet', -3.798312579064531)
('anyone', -3.6786722575200357)
('head', -3.6588563004284738)
('look', -3.59304445754055)
