In [1]:
import nltk
import pandas as pd
import numpy as np
#nltk.download('vader_lexicon') #*remember to download*
from nltk.sentiment.vader import SentimentIntensityAnalyzer

In [13]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score, f1_score
from sklearn.linear_model import LogisticRegression
from sklearn import svm

In [2]:
# Load the cleaned Bitcoin, Dow Jones and Tesla comment tables.
df_bc, df_dj, df_ts = map(pd.read_excel, (
    'data/output/clean_bit_data.xlsx',
    'data/output/clean_dowjones_data.xlsx',
    'data/output/clean_tesla_data.xlsx',
))

In [3]:
# Keep only rows that have a comment. The explicit .copy() gives each name an
# independent frame, so the column assignments made later by vader() do not
# operate on (and warn about) a slice of the loaded data.
df_bc = df_bc[df_bc['COMMENT'].notna()].copy()
df_dj = df_dj[df_dj['COMMENT'].notna()].copy()
df_ts = df_ts[df_ts['COMMENT'].notna()].copy()


In [4]:
# Shared VADER analyzer used by the scoring helpers defined in the next cell.
# Needs the 'vader_lexicon' NLTK resource (see the commented-out
# nltk.download line in the import cell).
sid = SentimentIntensityAnalyzer()

In [5]:
def get_compound_score(comment):
    """Return the VADER compound polarity score for one comment.

    The comment is coerced to ``str`` before scoring, and the ``'compound'``
    entry of the analyzer's score dict is returned. Uses the module-level
    ``sid`` analyzer.
    """
    return sid.polarity_scores(str(comment))['compound']

def get_sentiment(compound_score):
    """Map a compound score to a binary label.

    Returns 1 when the score is strictly greater than zero, otherwise 0
    (so zero and negative scores share label 0).
    """
    return 1 if compound_score > 0 else 0


def vader(df):
    """Attach VADER compound scores and binary sentiment labels to ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``COMMENT`` column. The frame is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object with two columns set: ``compound_score``
        (via ``get_compound_score``) and ``LABEL`` (via ``get_sentiment``).
        Existing columns with those names are overwritten.
    """
    # Scoring goes through get_compound_score only, so every cell is coerced
    # to str first and each comment is scored exactly once.
    df['compound_score'] = df['COMMENT'].apply(get_compound_score)
    df['LABEL'] = df['compound_score'].apply(get_sentiment)

    return df

In [6]:
# Score and label all three datasets; vader() returns the frame it was given.
df_bc, df_dj, df_ts = (vader(frame) for frame in (df_bc, df_dj, df_ts))

In [8]:
# Preview the first rows of the Dow Jones frame after VADER labelling.
df_dj.head()

Unnamed: 0,date,COMMENT,LABEL,compound_score
0,03-05-19,"stock, market, gold, silver, one, lie, answer,...",0,0.0
1,03-05-19,"company, pay, income, tax, amazon, delta, air,...",1,0.5423
2,03-05-19,"gold, silver, interesting",1,0.4019
3,03-05-19,"melt, up？crazy",0,0.0
4,03-05-19,"crazy, money, flow, daily, show, get, yet, mar...",1,0.1027


In [9]:
# Preview the first rows of the Tesla frame after VADER labelling.
df_ts.head()

Unnamed: 0,date,COMMENT,LABEL,compound_score
0,03-05-19,"tell, ratio, bull, bear, comment, section, bas...",1,0.4215
1,03-05-19,"right, total, debt, billion, dollar",0,-0.3612
2,03-05-19,"dont, know, day, tesla, hasnt, make, solid, pr...",0,-0.6542
3,03-05-19,"day, great, way, end, week, cheer, cleaner, fu...",1,0.8442
4,03-05-19,"tesla, waymo, uber, mercedes, ge, batterelectr...",1,0.4404


In [10]:
# Persist the VADER-labelled frames, one workbook per dataset.
# NOTE(review): presumably data/output/vader/ must already exist before this
# cell runs — confirm, since nothing in this notebook creates it.
df_bc.to_excel("data/output/vader/vader_bitcoin.xlsx", sheet_name='Sheet_1') 
df_dj.to_excel("data/output/vader/vader_dowjones.xlsx", sheet_name='Sheet_1') 
df_ts.to_excel("data/output/vader/vader_tesla.xlsx", sheet_name='Sheet_1') 

In [21]:
def classifier(clf,strat,vec,df):
    """Train and evaluate a bag-of-words classifier on labelled comments.

    Parameters
    ----------
    clf : str
        Model selector: ``"lr"`` (LogisticRegression, newton-cg solver, C=1),
        ``"mnb"`` (MultinomialNB) or ``"svm"`` (linear-kernel SVC).
    strat : str
        ``"y"`` seeds the 80/20 train/test split with ``random_state=42`` so
        it is reproducible; ``"n"`` leaves the split unseeded. Despite the
        name, no stratified sampling is performed in either mode.
    vec : str
        ``"cv"`` for CountVectorizer, or ``"tfidf"`` for
        ``TfidfVectorizer(binary=True, ngram_range=(1,1), use_idf=False)``.
        The historical spelling ``"tdidf"`` is still accepted.
    df : pandas.DataFrame
        Must provide ``COMMENT`` and ``LABEL`` columns; both are cast to str.

    Returns
    -------
    tuple
        ``(model, vectorizer, "Accuracy: <acc>", "F1 <weighted f1>",
        [(predicted_label, comment), ...], predict_proba_output)``, where
        the model and vectorizer are fitted on the training split and the
        remaining items are computed on the test split.

    Raises
    ------
    ValueError
        If ``clf``, ``strat`` or ``vec`` is not a recognised selector.
    """
    # Validate every selector up front so a typo fails with a clear message
    # rather than an UnboundLocalError/AttributeError further down.
    if clf not in ("lr", "mnb", "svm"):
        raise ValueError("clf must be 'lr', 'mnb' or 'svm', got %r" % (clf,))
    if strat not in ("y", "n"):
        raise ValueError("strat must be 'y' or 'n', got %r" % (strat,))
    if vec not in ("cv", "tfidf", "tdidf"):
        raise ValueError("vec must be 'cv' or 'tfidf', got %r" % (vec,))

    # random_state=None is train_test_split's default, i.e. an unseeded split.
    seed = 42 if strat == "y" else None

    y = df['LABEL'].astype(str)
    X = df.COMMENT.astype(str)

    if vec == "cv":
        count = CountVectorizer()
    else:
        # Binary term presence without IDF re-weighting; the remaining
        # TfidfVectorizer defaults apply.
        count = TfidfVectorizer(binary=True, ngram_range=(1,1), use_idf=False)

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=seed
    )
    # Fit the vocabulary on the training split only, then reuse it for test.
    X_train_counts = count.fit_transform(X_train)
    X_test_counts = count.transform(X_test)

    if clf == "lr":
        model = LogisticRegression(solver="newton-cg", C=1)
    elif clf == "mnb":
        model = MultinomialNB()
    else:
        # SVC only exposes predict_proba when built with probability=True;
        # without it the predict_proba call below fails for the SVM option.
        model = svm.SVC(kernel='linear', probability=True, random_state=seed)
    model.fit(X_train_counts, y_train)

    yPred = model.predict(X_test_counts)
    yPred_log = model.predict_proba(X_test_counts)

    # Pair each prediction with the raw comment it was made for.
    list_ = list(zip(yPred, X_test))

    acc = accuracy_score(y_test, yPred)
    f1 = f1_score(y_test, yPred, average='weighted')

    return model, count,"Accuracy: %s" % acc, "F1 %s" % f1, list_, yPred_log

In [32]:
# Inspect the first 1000 Tesla rows, the same slice used for training below.
df_ts[:1000]

Unnamed: 0,date,COMMENT,LABEL,compound_score
0,03-05-19,"tell, ratio, bull, bear, comment, section, bas...",1,0.4215
1,03-05-19,"right, total, debt, billion, dollar",0,-0.3612
2,03-05-19,"dont, know, day, tesla, hasnt, make, solid, pr...",0,-0.6542
3,03-05-19,"day, great, way, end, week, cheer, cleaner, fu...",1,0.8442
4,03-05-19,"tesla, waymo, uber, mercedes, ge, batterelectr...",1,0.4404
5,03-05-19,"pessimisticvalue, perfect, buy, aint, fall",1,0.5719
6,03-05-19,"dont, vaccine, kid, sell, “, alternative, medi...",0,-0.3400
7,03-05-19,"happy, buying, optimisticvalue",1,0.5719
8,03-05-19,"hater, gon, na, hate",0,-0.7579
9,03-05-19,"us, delivery, k, mar, stay, k, also, apr, mean...",1,0.0772


In [33]:
# Logistic regression on count features, seeded split, first 1000 Tesla rows.
result_ts = classifier(clf="lr", strat="y", vec="cv", df=df_ts[:1000])
# Elements 2 and 3 of the result are the formatted accuracy and F1 strings.
result_ts[2:4]

('Accuracy: 0.81', 'F1 0.8047018447018448')

In [34]:
# Logistic regression on count features, seeded split, first 1000 Bitcoin rows.
result_bit = classifier(clf="lr", strat="y", vec="cv", df=df_bc[:1000])

In [35]:
# Elements 2 and 3 of the classifier() result are the formatted accuracy and F1 strings.
result_bit[2:4]

('Accuracy: 0.83', 'F1 0.8232047685834502')

In [36]:
# Logistic regression on count features, seeded split, first 1000 Dow Jones rows.
result_dj = classifier(clf="lr", strat="y", vec="cv", df=df_dj[:1000])

In [37]:
# Elements 2 and 3 of the classifier() result are the formatted accuracy and F1 strings.
result_dj[2:4]

('Accuracy: 0.845', 'F1 0.8347722185440902')