In [2]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, f1_score
from sklearn.linear_model import LogisticRegression
from sklearn import svm

In [3]:
df_pos = pd.read_excel('data/output/clean_handlabeled_data.xlsx')
df_neg = pd.read_excel('data/output/clean_handlabeled_data.xlsx')

Raghava

In [4]:
df_bit = pd.read_excel("data/output/clean_bit_data.xlsx") 
df_ts = pd.read_excel("data/output/clean_tesla_data.xlsx") 
df_dj = pd.read_excel("data/output/clean_dowjones_data.xlsx") 

In [5]:
df_bit = df_bit[pd.notnull(df_bit['COMMENT'])]


In [6]:
df_bit.head()

Unnamed: 0,date,COMMENT,LABEL
0,04-05-19,"hold, optimisticvalue, becomes, distinct, poss...",
1,04-05-19,"thumb, nasty, bear, love, get, spank, cryptoland",
2,04-05-19,"mysterious, crypto, whale, unveils, meteoric, ...",
3,04-05-19,"probably, pessimisticvalue, monday",
4,04-05-19,"short, btc, maßsive, profit",


## Classifier

In [7]:
df_pos['LABEL'] = df_pos['LABEL'].map({'NEU':'POS','POS':'POS','NEG':'NEG'})

In [8]:
df_neg['LABEL'] = df_neg['LABEL'].map({'NEU':'NEG','POS':'POS','NEG':'NEG'})

In [9]:
def classifier(clf,strat,vec,df):
    
    y = df['LABEL'].astype(str)
    X = df.COMMENT.astype(str)
    #y = pd.get_dummies(y)
    
    if vec == "cv":
        count = CountVectorizer()
    elif vec == "tdidf":
        count = TfidfVectorizer(binary=True, ngram_range=(1,1), use_idf=False)
    
    if strat == "y":
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y)
        X_train_counts = count.fit_transform(X_train)

        X_test_counts = count.transform(X_test)

        
    elif strat == "n":
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42) 
        X_train_counts = count.fit_transform(X_train)
        X_test_counts = count.transform(X_test)

    
    if clf == "lr":
        clf = LogisticRegression(solver="newton-cg",C=1).fit(X_train_counts, y_train)
        #clf = LogisticRegression(C=0.4).fit(X_train_counts, y_train)
        
    elif clf == "mnb":
        clf = MultinomialNB().fit(X_train_counts, y_train)
        
    
    elif clf == "svm":
        clf = svm.SVC(kernel='linear').fit(X_train_counts, y_train)

    yPred = clf.predict(X_test_counts)
    yPred_log = clf.predict_proba(X_test_counts)
    
    list_ = list(zip(yPred, X_test))
    
    acc=accuracy_score(y_test,yPred)
    f1=f1_score(y_test,yPred, average='weighted')

    return clf, count,"Accuracy: %s" % acc, "F1 %s" % f1, list_, yPred_log

In [10]:
result_hl_neg=classifier("lr","y","cv",df_neg)
result_hl_pos=classifier("lr","y","cv",df_pos)

#df_result_hl = pd.DataFrame(result_hl[4], columns = ['Predicted_label', 'Comment'])



## Results for neutral mapped to negative

In [11]:
feature_to_coef_hl_neg = {
    word: coef for word, coef in zip(
        result_hl_neg[1].get_feature_names(), result_hl_neg[0].coef_[0]
    )    
}

In [12]:
for best_positive in sorted(
    feature_to_coef_hl_neg.items(), 
    key=lambda x: x[1], reverse=True, 
    )[:10]:
    print (best_positive)

('sell', 1.5841023913990104)
('know', 1.2105639564711146)
('yet', 0.9294525991017818)
('need', 0.9203480052260168)
('anyone', 0.869712579404843)
('wait', 0.8574035040620622)
('intraday', 0.8118329566666895)
('real', 0.8037720599207229)
('take', 0.7858178192513599)
('low', 0.7819470790899263)


In [13]:
for best_negative in sorted(
    feature_to_coef_hl_neg.items(), 
    key=lambda x: x[1] 
    )[:10]:
    print (best_negative)

('optimisticvalue', -2.2979502187523906)
('buy', -1.1858543137740418)
('hold', -1.1403859875246796)
('start', -1.052613315633209)
('jump', -1.0332515790377867)
('good', -1.0097801222480263)
('rise', -0.9828816141907135)
('come', -0.9543462150115056)
('firework', -0.9224230044697869)
('panic', -0.876437795021588)


## Results for neutral mapped to positive

In [14]:
feature_to_coef_hl_pos = {
    word: coef for word, coef in zip(
        result_hl_pos[1].get_feature_names(), result_hl_pos[0].coef_[0]
    )    
}

In [15]:
for best_positive in sorted(
    feature_to_coef_hl_pos.items(), 
    key=lambda x: x[1], reverse=True, 
    )[:10]:
    print (best_positive)

('pessimisticvalue', 1.5072056345060534)
('level', 1.2853446903104424)
('low', 1.2282658458366613)
('bankruptcy', 1.0742376224462495)
('crash', 1.0537010023919522)
('sell', 0.9682174003066277)
('red', 0.934734508863142)
('christmas', 0.9071018712318768)
('mad', 0.8521856029468351)
('tesla', 0.8261368249600541)


In [16]:
for best_negative in sorted(
    feature_to_coef_hl_pos.items(), 
    key=lambda x: x[1] 
    )[:10]:
    print (best_negative)

('hold', -1.1084564214666213)
('optimisticvalue', -1.105325922089465)
('nan', -1.0784493644518427)
('put', -0.9639588062332387)
('think', -0.9594895536989415)
('long', -0.9323100811121111)
('lol', -0.9271121687356098)
('currently', -0.8807807463826035)
('kumar', -0.8179052877439119)
('happen', -0.794430014509056)


## Labeling new comments


In [None]:
def assign_coef(comment, feature):
        comment = comment.split(', ')
        #print(comment)
        coef = []
        for word in comment:
            for w,c in feature.items():
                if w == word:
                    coef.append(c)
        return sum(coef)


def get_sentiment(compound_score):
    if compound_score > 0:
        return 1
    else:
        return 0


def add_label(df, feature):
    
    df['COEF'] = df['COMMENT'].apply(lambda x: assign_coef(x))
    df['LABEL'] = df['COEF'].apply(lambda x: get_sentiment(x))
    
    return df

In [None]:
df_bitlabeled_pos=add_label(df_bit, feature_to_coef_hl_pos)
df_bitlabeled_neg=add_label(df_bit, feature_to_coef_hl_neg)

In [None]:
df=classifier("lr","y","cv",df_bitlabeled)

In [None]:
df_result_ml = pd.DataFrame(result_ml[4], columns = ['Predicted_label', 'Comment'])

In [213]:
result_ml[2:4]

('Accuracy: 0.943134229667181', 'F1 0.9430443979803095')

In [214]:
feature_to_coef = {
    word: coef for word, coef in zip(
        result_ml[1].get_feature_names(), result_ml[0].coef_[0]
    )    
}

In [215]:
for best_positive in sorted(
    feature_to_coef.items(), 
    key=lambda x: x[1], reverse=True, 
    )[:10]:
    print (best_positive)

('optimisticvalue', 8.536060901941065)
('buy', 6.545442530590317)
('rise', 4.929929974434978)
('trend', 4.7463539619898905)
('close', 3.939052002639244)
('hold', 3.778934382839763)
('come', 3.7090573824294406)
('add', 3.7058638227861023)
('bitcoin', 3.6607861476085)
('bounce', 3.6147214375855645)


In [216]:
for best_negative in sorted(
    feature_to_coef.items(), 
    key=lambda x: x[1] 
    )[:10]:
    print (best_negative)

('know', -5.660962660305671)
('sell', -5.650913691475601)
('low', -4.7566105833886825)
('fall', -4.46418508102759)
('try', -4.065880573385342)
('wait', -3.8393292629173286)
('yet', -3.798312579064531)
('anyone', -3.6786722575200357)
('head', -3.6588563004284738)
('look', -3.59304445754055)
