In [332]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, f1_score
from sklearn.linear_model import LogisticRegression
from sklearn import svm

In [350]:
# Read the spreadsheet at data/output/clean_labeled_data.xlsx into a DataFrame.
df = pd.read_excel('data/output/clean_labeled_data.xlsx')

Raghava

## Classifier

In [352]:
# Preview the first rows of the loaded frame.
df.head()

Unnamed: 0,COMMENT,LABEL
0,"thats, bull, say, since, pessimisticvalue, k, ...",NEG
1,"hello, guy, anyone, use, ig, market, issue, pl...",NEU
2,"year, old, hand, pin, falcon, wing, door, subm...",NEU
3,"time, breka, never, really, guys",NEG
4,"world, economy, th, final, one, trump, bankruptcy",NEG


In [353]:
# Collapse the labels into two classes: NEU is folded into NEG, POS and NEG
# are kept as they are.
# NOTE(review): Series.map turns any label that is not a key of this dict
# into NaN — confirm the LABEL column only holds NEU / POS / NEG.
df['LABEL'] = df['LABEL'].map(
    {
        'NEU': 'NEG',
        'POS': 'POS',
        'NEG': 'NEG',
    }
)

In [354]:
def classifier(clf, strat, vec, data=None):
    """Train a text classifier on COMMENT -> LABEL and score it on a hold-out split.

    Parameters
    ----------
    clf : str
        Model selector: "lr" (LogisticRegression, newton-cg solver, C=1),
        "mnb" (MultinomialNB) or "svm" (SVC with a linear kernel).
    strat : str
        "y" to stratify the train/test split on the label, "n" for a plain
        split.
    vec : str
        "cv" for CountVectorizer, "tfidf" for TfidfVectorizer. The historical
        spelling "tdidf" is still accepted. Both use binary=True and unigrams.
    data : DataFrame-like, optional
        Object providing 'COMMENT' and 'LABEL' columns. When omitted, the
        notebook-level ``df`` is used, as before.

    Returns
    -------
    tuple
        ``(model, vectorizer, accuracy_text, f1_text, predictions)`` where
        ``model`` and ``vectorizer`` are the fitted estimators,
        ``accuracy_text`` is ``"Accuracy: <value>"``, ``f1_text`` is
        ``"F1 <value>"`` (weighted average), and ``predictions`` is a list of
        ``(predicted_label, comment)`` pairs for the test split.

    Raises
    ------
    ValueError
        If ``clf``, ``strat`` or ``vec`` is not one of the accepted selectors.
    """
    # Validate the selectors before doing any work, so a typo fails with a
    # clear message instead of an unbound-variable error further down.
    if clf not in ("lr", "mnb", "svm"):
        raise ValueError("clf must be 'lr', 'mnb' or 'svm', got %r" % (clf,))
    if strat not in ("y", "n"):
        raise ValueError("strat must be 'y' or 'n', got %r" % (strat,))
    if vec not in ("cv", "tdidf", "tfidf"):
        raise ValueError("vec must be 'cv' or 'tfidf', got %r" % (vec,))

    frame = df if data is None else data
    labels = frame['LABEL'].astype(str)
    comments = frame['COMMENT'].astype(str)

    if vec == "cv":
        vectorizer = CountVectorizer(binary=True, ngram_range=(1, 1))
    else:
        vectorizer = TfidfVectorizer(binary=True, ngram_range=(1, 1))

    # Fixed seed and 80/20 split; stratify=None is the plain (unstratified) split.
    X_train, X_test, y_train, y_test = train_test_split(
        comments,
        labels,
        test_size=0.2,
        random_state=42,
        stratify=labels if strat == "y" else None,
    )

    # Fit the vocabulary on the training comments only, then reuse it for the
    # test comments.
    X_train_counts = vectorizer.fit_transform(X_train)
    X_test_counts = vectorizer.transform(X_test)

    if clf == "lr":
        model = LogisticRegression(solver="newton-cg", C=1)
    elif clf == "mnb":
        model = MultinomialNB()
    else:
        model = svm.SVC(kernel='linear')
    model.fit(X_train_counts, y_train)

    yPred = model.predict(X_test_counts)

    list_ = list(zip(yPred, X_test))
    acc = accuracy_score(y_test, yPred)
    f1 = f1_score(y_test, yPred, average='weighted')

    return model, vectorizer, "Accuracy: %s" % acc, "F1 %s" % f1, list_


In [355]:
# Train the logistic-regression model on a stratified split with count features.
result = classifier(clf="lr", strat="y", vec="cv")

# Elements 2 and 3 of the returned tuple are the accuracy and F1 strings.
print(result[2:4])

# Element 4 holds the (predicted label, comment) pairs for the test split.
df_result = pd.DataFrame(data=result[4], columns=['Predicted_label', 'Comment'])



('Accuracy: 0.745', 'F1 0.712148349698332')


In [356]:
# Display the table of predicted labels and their comments.
df_result

Unnamed: 0,Predicted_label,Comment
0,NEG,"hit, pessimisticvalue, morning, go, easter, al..."
1,NEG,bottom
2,NEG,"time, announcement, expect"
3,NEG,"yes, indeed, look, pessimisticvalue, percent, ..."
4,POS,"btc, attempt, stay, daily, however, pull, back..."
5,NEG,"today, point"
6,NEG,"please, dont, panic, there, tesla, space"
7,NEG,"short, folk, agree, overvalue, market, however..."
8,NEG,"wait, next, tweet, mass, destruction, potus"
9,NEG,"market, investor, lose, confidence, stock, pes..."


In [357]:
# Map every vocabulary term of the fitted vectorizer (result[1]) to the matching
# weight in the first coefficient row of the fitted model (result[0]).
# NOTE(review): for a two-class LogisticRegression the single coef_ row scores
# classes_[1]; with more classes row 0 scores classes_[0] — confirm which case
# applies before interpreting the sign of a weight.
feature_to_coef = dict(
    zip(
        (
            # get_feature_names() was removed in scikit-learn 1.2; prefer its
            # replacement and fall back only on older releases.
            result[1].get_feature_names_out()
            if hasattr(result[1], "get_feature_names_out")
            else result[1].get_feature_names()
        ),
        result[0].coef_[0],
    )
)

In [358]:
# Print the ten (term, weight) pairs with the lowest weights, lowest first.
# NOTE(review): whether the lowest weights indicate POS or NEG depends on which
# class the coefficient row scores — confirm against the fitted model's classes_.
for best_positive in [
    (term, feature_to_coef[term])
    for term in sorted(feature_to_coef, key=feature_to_coef.get)[:10]
]:
    print(best_positive)

('optimisticvalue', -2.422512550325552)
('buy', -1.5710786811484896)
('rise', -1.2797122289554046)
('hold', -0.9872163915830383)
('close', -0.9345578983426864)
('party', -0.9259176500457456)
('firework', -0.912427022694593)
('ok', -0.9071106345044886)
('youd', -0.8893736546303984)
('growth', -0.8739392312184157)


In [359]:
# Print the ten (term, weight) pairs with the highest weights, highest first.
# NOTE(review): whether the highest weights indicate POS or NEG depends on which
# class the coefficient row scores — confirm against the fitted model's classes_.
for best_negative in [
    (term, feature_to_coef[term])
    for term in sorted(feature_to_coef, key=feature_to_coef.get, reverse=True)[:10]
]:
    print(best_negative)

('know', 1.4358593353276883)
('sell', 1.3435594110826363)
('need', 1.1392660880089616)
('yet', 1.019535854264432)
('real', 0.9222283185665422)
('low', 0.8437433837231554)
('please', 0.834576609359245)
('look', 0.8150168521096357)
('try', 0.8127270312043091)
('one', 0.7991813499821042)


In [296]:
# Show the complete term -> weight mapping.
# NOTE(review): this cell's execution count (296) is lower than that of the
# cells before it (357-359), and the values in its saved output (around -8) do
# not match the coefficients printed earlier — the output looks stale; re-run
# the notebook top to bottom before relying on it.
feature_to_coef

{'aamcoxxx': -8.11701408773731,
 'aapl': -8.11701408773731,
 'aback': -8.11701408773731,
 'abandon': -8.11701408773731,
 'abc': -8.810161268297255,
 'able': -8.11701408773731,
 'abruptly': -8.11701408773731,
 'absolutely': -8.11701408773731,
 'abuse': -8.11701408773731,
 'ac': -8.11701408773731,
 'accept': -7.711548979629145,
 'acceptance': -8.11701408773731,
 'accident': -8.11701408773731,
 'accommodative': -8.810161268297255,
 'accord': -7.423866907177365,
 'account': -7.711548979629145,
 'accumulate': -8.11701408773731,
 'achieve': -8.11701408773731,
 'action': -7.018401799069201,
 'activate': -8.810161268297255,
 'actual': -7.711548979629145,
 'add': -7.711548979629145,
 'added': -8.810161268297255,
 'additionally': -8.11701408773731,
 'address': -8.11701408773731,
 'advance': -7.711548979629145,
 'advise': -8.11701408773731,
 'affair': -8.11701408773731,
 'affect': -7.711548979629145,
 'afraid': -8.11701408773731,
 'afterhours': -8.11701408773731,
 'afternoon': -8.11701408773731,
