In [1]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, f1_score
from sklearn.linear_model import LogisticRegression
from sklearn import svm

In [2]:
df_pos = pd.read_excel('data/output/clean_handlabeled_data.xlsx')
df_neg = pd.read_excel('data/output/clean_handlabeled_data.xlsx')

Raghava

In [3]:
df_bit = pd.read_excel("data/output/clean_bit_data.xlsx") 
df_ts = pd.read_excel("data/output/clean_tesla_data.xlsx") 
df_dj = pd.read_excel("data/output/clean_dowjones_data.xlsx") 

In [18]:
df_bit = df_bit[pd.notnull(df_bit['COMMENT'])]
df_ts = df_ts[pd.notnull(df_ts['COMMENT'])]
df_dj = df_dj[pd.notnull(df_dj['COMMENT'])]


In [5]:
df_bit.head()

Unnamed: 0,date,COMMENT,LABEL
0,04-05-19,"hold, optimisticvalue, becomes, distinct, poss...",
1,04-05-19,"thumb, nasty, bear, love, get, spank, cryptoland",
2,04-05-19,"mysterious, crypto, whale, unveils, meteoric, ...",
3,04-05-19,"probably, pessimisticvalue, monday",
4,04-05-19,"short, btc, maßsive, profit",


## Classifier

In [6]:
df_pos['LABEL'] = df_pos['LABEL'].map({'NEU':'POS','POS':'POS','NEG':'NEG'})

In [7]:
df_neg['LABEL'] = df_neg['LABEL'].map({'NEU':'NEG','POS':'POS','NEG':'NEG'})

In [8]:
def classifier(clf,strat,vec,df):
    
    y = df['LABEL'].astype(str)
    X = df.COMMENT.astype(str)
    #y = pd.get_dummies(y)
    
    if vec == "cv":
        count = CountVectorizer()
    elif vec == "tdidf":
        count = TfidfVectorizer(binary=True, ngram_range=(1,1), use_idf=False)
    
    if strat == "y":
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y)
        X_train_counts = count.fit_transform(X_train)

        X_test_counts = count.transform(X_test)

        
    elif strat == "n":
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42) 
        X_train_counts = count.fit_transform(X_train)
        X_test_counts = count.transform(X_test)

    
    if clf == "lr":
        clf = LogisticRegression(solver="newton-cg",C=1).fit(X_train_counts, y_train)
        #clf = LogisticRegression(C=0.4).fit(X_train_counts, y_train)
        
    elif clf == "mnb":
        clf = MultinomialNB().fit(X_train_counts, y_train)
        
    
    elif clf == "svm":
        clf = svm.SVC(kernel='linear').fit(X_train_counts, y_train)

    yPred = clf.predict(X_test_counts)
    yPred_log = clf.predict_proba(X_test_counts)
    
    list_ = list(zip(yPred, X_test))
    
    acc=accuracy_score(y_test,yPred)
    f1=f1_score(y_test,yPred, average='weighted')

    return clf, count,"Accuracy: %s" % acc, "F1 %s" % f1, list_, yPred_log

In [9]:
result_hl_neg=classifier("lr","y","cv",df_neg)
result_hl_pos=classifier("lr","y","cv",df_pos)

#df_result_hl = pd.DataFrame(result_hl[4], columns = ['Predicted_label', 'Comment'])



## Results for neutral mapped to negative

In [10]:
feature_to_coef_hl_neg = {
    word: coef for word, coef in zip(
        result_hl_neg[1].get_feature_names(), result_hl_neg[0].coef_[0]
    )    
}

In [11]:
for best_positive in sorted(
    feature_to_coef_hl_neg.items(), 
    key=lambda x: x[1], reverse=True, 
    )[:10]:
    print (best_positive)

('know', 1.2445967495916543)
('sell', 1.1584127933932038)
('intraday', 1.1242359734385283)
('fall', 1.041339177154852)
('yet', 1.0280067019832926)
('need', 0.9331545849716029)
('wait', 0.9303053628685605)
('pessimisticvalue', 0.92357285840614)
('one', 0.8306902195713199)
('around', 0.806438309772821)


In [12]:
for best_negative in sorted(
    feature_to_coef_hl_neg.items(), 
    key=lambda x: x[1] 
    )[:10]:
    print (best_negative)

('optimisticvalue', -2.387722055051833)
('buy', -1.362476097656649)
('rise', -1.3154862893213966)
('come', -1.0366642846664857)
('jump', -1.033879127319575)
('buying', -1.0048671864235204)
('hold', -0.9966387539967203)
('dip', -0.9691222923574213)
('party', -0.9585706108267759)
('growth', -0.9495668881459498)


## Results for neutral mapped to positive

In [13]:
feature_to_coef_hl_pos = {
    word: coef for word, coef in zip(
        result_hl_pos[1].get_feature_names(), result_hl_pos[0].coef_[0]
    )    
}

In [14]:
for best_positive in sorted(
    feature_to_coef_hl_pos.items(), 
    key=lambda x: x[1], reverse=True, 
    )[:10]:
    print (best_positive)

('pessimisticvalue', 1.4563059213682312)
('without', 1.1974855989337985)
('sec', 1.0509872845380217)
('low', 1.0384135685712763)
('bankruptcy', 1.020806754693085)
('sell', 0.9631217581307564)
('thing', 0.943443513054218)
('investor', 0.9277196408821016)
('really', 0.9186150382573769)
('crash', 0.9179971591156206)


In [15]:
for best_negative in sorted(
    feature_to_coef_hl_pos.items(), 
    key=lambda x: x[1] 
    )[:10]:
    print (best_negative)

('optimisticvalue', -1.3087116820073705)
('nan', -1.1180555301750232)
('put', -1.058328679968951)
('kumar', -1.0117475948709873)
('chart', -0.9994504153579313)
('buy', -0.8856263472573686)
('hold', -0.8375794157870795)
('currently', -0.8122375900851678)
('bullish', -0.8014797153667941)
('data', -0.7985974874600406)


## Labeling new comments


In [16]:
def assign_coef(comment, feature):
        comment = comment.split(', ')
        #print(comment)
        coef = []
        for word in comment:
            for w,c in feature.items():
                if w == word:
                    coef.append(c)
        return sum(coef)


def get_sentiment(compound_score):
    if compound_score > 0:
        return 1
    else:
        return 0


def add_label(df, feature):
    
    df['COEF'] = df['COMMENT'].apply(lambda x: assign_coef(x,feature))
    df['LABEL'] = df['COEF'].apply(lambda x: get_sentiment(x))
    
    return df

## BITCOIN

In [19]:
df_bitlabeled_pos=add_label(df_bit, feature_to_coef_hl_pos)
df_bitlabeled_neg=add_label(df_bit, feature_to_coef_hl_neg)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [32]:
df_bitlabeled_pos.to_excel("ML_Data/bit_classified_pos.xlsx", sheet_name='Sheet_1') 
df_bitlabeled_neg.to_excel("ML_Data/bit_classified_neg.xlsx", sheet_name='Sheet_1') 

In [None]:
result_bit_pos=classifier("lr","y","cv",df_bitlabeled_pos)

In [None]:
result_bit_neg=classifier("lr","y","cv",df_bitlabeled_pos)

### Bitcoin result mapped, neutral mapped to positive

In [None]:
feature_to_coef_bit_pos = {
    word: coef for word, coef in zip(
        result_bit_pos[1].get_feature_names(), result_bit_pos[0].coef_[0]
    )    
}

In [None]:
for best_positive in sorted(
    feature_to_coef_bit_pos.items(), 
    key=lambda x: x[1], reverse=True, 
    )[:10]:
    print (best_positive)

In [None]:
for best_negative in sorted(
    feature_to_coef_bit_pos.items(), 
    key=lambda x: x[1] 
    )[:10]:
    print (best_negative)

## Tesla

In [22]:
df_teslabeled_pos=add_label(df_ts, feature_to_coef_hl_pos)
df_teslabeled_neg=add_label(df_ts, feature_to_coef_hl_neg)

In [23]:
df_teslabeled_pos

Unnamed: 0,date,COMMENT,LABEL,COEF
0,03-05-19,"tell, ratio, bull, bear, comment, section, bas...",1,1.319381
1,03-05-19,"right, total, debt, billion, dollar",1,1.306930
2,03-05-19,"dont, know, day, tesla, hasnt, make, solid, pr...",1,2.625258
3,03-05-19,"day, great, way, end, week, cheer, cleaner, fu...",0,-0.506040
4,03-05-19,"tesla, waymo, uber, mercedes, ge, batterelectr...",0,-0.145474
5,03-05-19,"pessimisticvalue, perfect, buy, aint, fall",1,0.350535
6,03-05-19,"dont, vaccine, kid, sell, “, alternative, medi...",1,0.953549
7,03-05-19,"happy, buying, optimisticvalue",0,-4.230480
8,03-05-19,"hater, gon, na, hate",0,-0.444994
9,03-05-19,"us, delivery, k, mar, stay, k, also, apr, mean...",0,-1.089269


In [33]:
df_teslabeled_pos.to_excel("ML_Data/tes_classified_pos.xlsx", sheet_name='Sheet_1') 
df_teslabeled_neg.to_excel("ML_Data/tes_classified_neg.xlsx", sheet_name='Sheet_1') 

In [None]:
result_tes_pos=classifier("lr","y","cv",df_teslabeled_pos)

In [None]:
result_tes_neg=classifier("lr","y","cv",df_teslabeled_neg)

### Tesla result mapped, neutral mapped to positive

In [214]:
feature_to_coef_tes_pos = {
    word: coef for word, coef in zip(
        result_tes_pos[1].get_feature_names(), result_tes_pos[0].coef_[0]
    )    
}

In [215]:
for best_positive in sorted(
    feature_to_coef_tes_pos.items(), 
    key=lambda x: x[1], reverse=True, 
    )[:10]:
    print (best_positive)

('optimisticvalue', 8.536060901941065)
('buy', 6.545442530590317)
('rise', 4.929929974434978)
('trend', 4.7463539619898905)
('close', 3.939052002639244)
('hold', 3.778934382839763)
('come', 3.7090573824294406)
('add', 3.7058638227861023)
('bitcoin', 3.6607861476085)
('bounce', 3.6147214375855645)


In [216]:
for best_negative in sorted(
    feature_to_coef_tes_pos.items(), 
    key=lambda x: x[1] 
    )[:10]:
    print (best_negative)

('know', -5.660962660305671)
('sell', -5.650913691475601)
('low', -4.7566105833886825)
('fall', -4.46418508102759)
('try', -4.065880573385342)
('wait', -3.8393292629173286)
('yet', -3.798312579064531)
('anyone', -3.6786722575200357)
('head', -3.6588563004284738)
('look', -3.59304445754055)


## Dow Jones

In [30]:
df_djlabeled_pos=add_label(df_dj, feature_to_coef_hl_pos)
df_djlabeled_neg=add_label(df_dj, feature_to_coef_hl_neg)

In [34]:
df_djlabeled_pos.to_excel("ML_Data/dj_classified_pos.xlsx", sheet_name='Sheet_1') 
df_djlabeled_neg.to_excel("ML_Data/dj_classified_neg.xlsx", sheet_name='Sheet_1') 

In [None]:
result_dj_pos=classifier("lr","y","cv",df_djlabeled_pos)

In [None]:
result_dj_neg=classifier("lr","y","cv",df_djlabeled_neg)

### Dow jones result mapped, neutral mapped to positive

In [214]:
feature_to_coef_dj_pos = {
    word: coef for word, coef in zip(
        result_dj_pos[1].get_feature_names(), result_dj_pos[0].coef_[0]
    )    
}

In [215]:
for best_positive in sorted(
    feature_to_coef_tes_dj.items(), 
    key=lambda x: x[1], reverse=True, 
    )[:10]:
    print (best_positive)

('optimisticvalue', 8.536060901941065)
('buy', 6.545442530590317)
('rise', 4.929929974434978)
('trend', 4.7463539619898905)
('close', 3.939052002639244)
('hold', 3.778934382839763)
('come', 3.7090573824294406)
('add', 3.7058638227861023)
('bitcoin', 3.6607861476085)
('bounce', 3.6147214375855645)


In [216]:
for best_negative in sorted(
    feature_to_coef_tes_pos.items(), 
    key=lambda x: x[1] 
    )[:10]:
    print (best_negative)

('know', -5.660962660305671)
('sell', -5.650913691475601)
('low', -4.7566105833886825)
('fall', -4.46418508102759)
('try', -4.065880573385342)
('wait', -3.8393292629173286)
('yet', -3.798312579064531)
('anyone', -3.6786722575200357)
('head', -3.6588563004284738)
('look', -3.59304445754055)


In [None]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, f1_score
from sklearn.linear_model import LogisticRegression
from sklearn import svm

df_pos = pd.read_excel('data/output/clean_handlabeled_data.xlsx')
df_neg = pd.read_excel('data/output/clean_handlabeled_data.xlsx')

Raghava

df_bit = pd.read_excel("data/output/clean_bit_data.xlsx") 
df_ts = pd.read_excel("data/output/clean_tesla_data.xlsx") 
df_dj = pd.read_excel("data/output/clean_dowjones_data.xlsx") 

df_bit = df_bit[pd.notnull(df_bit['COMMENT'])]
df_ts = df_ts[pd.notnull(df_ts['COMMENT'])]
df_dj = df_dj[pd.notnull(df_dj['COMMENT'])]


df_bit.head()

## Classifier

df_pos['LABEL'] = df_pos['LABEL'].map({'NEU':'POS','POS':'POS','NEG':'NEG'})

df_neg['LABEL'] = df_neg['LABEL'].map({'NEU':'NEG','POS':'POS','NEG':'NEG'})

def classifier(clf,strat,vec,df):
    
    y = df['LABEL'].astype(str)
    X = df.COMMENT.astype(str)
    #y = pd.get_dummies(y)
    
    if vec == "cv":
        count = CountVectorizer()
    elif vec == "tdidf":
        count = TfidfVectorizer(binary=True, ngram_range=(1,1), use_idf=False)
    
    if strat == "y":
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y)
        X_train_counts = count.fit_transform(X_train)

        X_test_counts = count.transform(X_test)

        
    elif strat == "n":
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42) 
        X_train_counts = count.fit_transform(X_train)
        X_test_counts = count.transform(X_test)

    
    if clf == "lr":
        clf = LogisticRegression(solver="newton-cg",C=1).fit(X_train_counts, y_train)
        #clf = LogisticRegression(C=0.4).fit(X_train_counts, y_train)
        
    elif clf == "mnb":
        clf = MultinomialNB().fit(X_train_counts, y_train)
        
    
    elif clf == "svm":
        clf = svm.SVC(kernel='linear').fit(X_train_counts, y_train)

    yPred = clf.predict(X_test_counts)
    yPred_log = clf.predict_proba(X_test_counts)
    
    list_ = list(zip(yPred, X_test))
    
    acc=accuracy_score(y_test,yPred)
    f1=f1_score(y_test,yPred, average='weighted')

    return clf, count,"Accuracy: %s" % acc, "F1 %s" % f1, list_, yPred_log

result_hl_neg=classifier("lr","y","cv",df_neg)
result_hl_pos=classifier("lr","y","cv",df_pos)

#df_result_hl = pd.DataFrame(result_hl[4], columns = ['Predicted_label', 'Comment'])

## Results for neutral mapped to negative

feature_to_coef_hl_neg = {
    word: coef for word, coef in zip(
        result_hl_neg[1].get_feature_names(), result_hl_neg[0].coef_[0]
    )    
}

for best_positive in sorted(
    feature_to_coef_hl_neg.items(), 
    key=lambda x: x[1], reverse=True, 
    )[:10]:
    print (best_positive)

for best_negative in sorted(
    feature_to_coef_hl_neg.items(), 
    key=lambda x: x[1] 
    )[:10]:
    print (best_negative)

## Results for neutral mapped to positive

feature_to_coef_hl_pos = {
    word: coef for word, coef in zip(
        result_hl_pos[1].get_feature_names(), result_hl_pos[0].coef_[0]
    )    
}

for best_positive in sorted(
    feature_to_coef_hl_pos.items(), 
    key=lambda x: x[1], reverse=True, 
    )[:10]:
    print (best_positive)

for best_negative in sorted(
    feature_to_coef_hl_pos.items(), 
    key=lambda x: x[1] 
    )[:10]:
    print (best_negative)



## Labeling new comments


def assign_coef(comment, feature):
        comment = comment.split(', ')
        #print(comment)
        coef = []
        for word in comment:
            for w,c in feature.items():
                if w == word:
                    coef.append(c)
        return sum(coef)


def get_sentiment(compound_score):
    if compound_score > 0:
        return 1
    else:
        return 0


def add_label(df, feature):
    
    df['COEF'] = df['COMMENT'].apply(lambda x: assign_coef(x,feature))
    df['LABEL'] = df['COEF'].apply(lambda x: get_sentiment(x))
    
    return df

## BITCOIN

df_bitlabeled_pos=add_label(df_bit, feature_to_coef_hl_pos)
df_bitlabeled_neg=add_label(df_bit, feature_to_coef_hl_neg)

df_bitlabeled_pos.to_excel("ML_Data/bit_classified_pos.xlsx", sheet_name='Sheet_1') 
df_bitlabeled_neg.to_excel("ML_Data/bit_classified_neg.xlsx", sheet_name='Sheet_1') 

result_bit_pos=classifier("lr","y","cv",df_bitlabeled_pos)



result_bit_neg=classifier("lr","y","cv",df_bitlabeled_pos)



### Bitcoin result mapped, neutral mapped to positive

feature_to_coef_bit_pos = {
    word: coef for word, coef in zip(
        result_bit_pos[1].get_feature_names(), result_bit_pos[0].coef_[0]
    )    
}

for best_positive in sorted(
    feature_to_coef_bit_pos.items(), 
    key=lambda x: x[1], reverse=True, 
    )[:10]:
    print (best_positive)

for best_negative in sorted(
    feature_to_coef_bit_pos.items(), 
    key=lambda x: x[1] 
    )[:10]:
    print (best_negative)

## Tesla

df_teslabeled_pos=add_label(df_ts, feature_to_coef_hl_pos)
df_teslabeled_neg=add_label(df_ts, feature_to_coef_hl_neg)

df_teslabeled_pos

df_teslabeled_pos.to_excel("ML_Data/tes_classified_pos.xlsx", sheet_name='Sheet_1') 
df_teslabeled_neg.to_excel("ML_Data/tes_classified_neg.xlsx", sheet_name='Sheet_1') 

result_tes_pos=classifier("lr","y","cv",df_teslabeled_pos)

result_tes_neg=classifier("lr","y","cv",df_teslabeled_neg)



### Tesla result mapped, neutral mapped to positive

feature_to_coef_tes_pos = {
    word: coef for word, coef in zip(
        result_tes_pos[1].get_feature_names(), result_tes_pos[0].coef_[0]
    )    
}

for best_positive in sorted(
    feature_to_coef_tes_pos.items(), 
    key=lambda x: x[1], reverse=True, 
    )[:10]:
    print (best_positive)

for best_negative in sorted(
    feature_to_coef_tes_pos.items(), 
    key=lambda x: x[1] 
    )[:10]:
    print (best_negative)

## Dow Jones

df_djlabeled_pos=add_label(df_dj, feature_to_coef_hl_pos)
df_djlabeled_neg=add_label(df_dj, feature_to_coef_hl_neg)

df_djlabeled_pos.to_excel("ML_Data/dj_classified_pos.xlsx", sheet_name='Sheet_1') 
df_djlabeled_neg.to_excel("ML_Data/dj_classified_neg.xlsx", sheet_name='Sheet_1') 

result_dj_pos=classifier("lr","y","cv",df_djlabeled_pos)

result_dj_neg=classifier("lr","y","cv",df_djlabeled_neg)



### Dow jones result mapped, neutral mapped to positive

feature_to_coef_dj_pos = {
    word: coef for word, coef in zip(
        result_dj_pos[1].get_feature_names(), result_dj_pos[0].coef_[0]
    )    
}

for best_positive in sorted(
    feature_to_coef_tes_dj.items(), 
    key=lambda x: x[1], reverse=True, 
    )[:10]:
    print (best_positive)

for best_negative in sorted(
    feature_to_coef_tes_pos.items(), 
    key=lambda x: x[1] 
    )[:10]:
    print (best_negative)

