In [None]:
# for frequncy vectors
def entropy1(freq1):
    N=sum(freq1)
    if N==0:
        return float('NaN')
    else:
        H=0
        for f in freq1:
            if f!=0:
                p=f/N
                H=H-p*np.log2(p)
        return H 

# Entropy of joint distribution from contingency tables
def entropy2(freq):
    N=sum(sum(freq))
    if N==0:
        return float('NaN')
    else:
        H=0
        for frow in freq:
            for f in frow:
                if f!=0:
                    p=f/N
                    H=H-p*np.log2(p)
        return H 
                
# Mutual Information for contingency tables
def mutual_info(freq): 
    freq0=np.sum(freq,axis=0)
    freq1=np.sum(freq,axis=1)
    return entropy1(freq0)+entropy1(freq1)-entropy2(freq)

# Mutual Information for contingency tables, normalized by entropy
def norm_mutual_info(freq): 
    freq0=np.sum(freq,axis=0)
    freq1=np.sum(freq,axis=1)
    return mutual_info(freq)/np.sqrt(entropy1(freq0)*entropy1(freq1))

In [46]:
def create_freq_table(num_rows,num_cols,predictions,answers,label_choices):
    freq=np.zeros((num_rows,num_cols))
    tdict={}
    i=0
    for p in label_choices:
        tdict[p]=i
        i=i+1
    train_size=len(predictions)
    for i in range(train_size):
        freq[tdict[predictions[i]]][tdict[answers[i]]]+=1
    return freq

In [52]:
import numpy as np

from scipy.sparse import coo_matrix
from sklearn.svm import LinearSVC
from sklearn.model_selection import train_test_split

from happierfuntokenizing import Tokenizer


def get_data(data_f):
    """
    Load the data from file and seperate into train/tune/test sets

    Output format is a dictionary that maps tweet ids to (stance, tweet) tuples.
    Note that stance is the gold label (what is to be predicted).
    """

    train = {}
    test = {}
    # Load data from file
    # It has already been split into testing and training data
    with open(data_f,'r',encoding='latin-1') as f:
        for l_count, line in enumerate(f):
            if l_count == 0:
                continue
            tid, _, test_train, target, stance, tweet = line.strip().split("\t")
            if target not in train:
                train[target] = {}
                test[target] = {}
            if test_train == "test":
                test[target][tid] = (stance, tweet)
            else:
                train[target][tid] = (stance, tweet)

    # Split into test/tune/train
    new_train = {}
    tune = {}
    for target in train:
        tids = train[target].keys()
        tids1=list(train[target].keys())
        tids1=np.array(tids1)
        train_tids, tune_tids = train_test_split(tids1, test_size=.2)
        new_train[target] = {tid: train[target][tid] for tid in train_tids}
        tune[target] = {tid: train[target][tid] for tid in tune_tids}

    return test, tune, new_train


def get_char_ngram_feats(tweet, n):
    """Extract character ngram features"""
    feat_template = "CHAR_{}:".format(n)
    feats = []
    for i in range(len(tweet)):
        if i + n <= len(tweet):
            feat = feat_template + tweet[i:i + n]
            pair = (feat, 1)
            if pair not in feats:
                feats.append(pair)
    return feats


def get_word_ngram_feats(tokens, n):
    """Extract word ngram features"""
    feat_template = "WORD_{}:".format(n)
    feats = []
    tokens=list(tokens)
    for i in range(len(tokens)):
        if i + n <= len(tokens):
            feat = feat_template + "|".join(tokens[i:i + n])
            pair = (feat, 1)
            if pair not in feats:
                feats.append(pair)
    return feats


def create_feat_dict(data, threshold=0):
    """
    Create a dictionary that maps feature names to indices (what column of the
    feature matrix they are).

    Can optionally specify a threshold; only features which occur more than this
    threshold will be included.
    """
    feat_counts = {}
    tokenizer = Tokenizer()
    for tid in data:
        tweet = data[tid][1]
        tokens = tokenizer.tokenize(tweet)

        feats = []
        feats += get_char_ngram_feats(tweet, 2)
        feats += get_char_ngram_feats(tweet, 3)
        feats += get_char_ngram_feats(tweet, 4)
        feats += get_char_ngram_feats(tweet, 5)
        feats += get_word_ngram_feats(tokens, 1)
        feats += get_word_ngram_feats(tokens, 2)
        feats += get_word_ngram_feats(tokens, 3)

        for feat_name, _ in feats:
            if feat_name not in feat_counts:
                feat_counts[feat_name] = 0
            feat_counts[feat_name] += 1
    # Only keep features which occur more often than the given threshold
    feat_dict = {}
    for feat_name in feat_counts:
        if feat_counts[feat_name] > threshold:
            feat_dict[feat_name] = len(feat_dict)
    return feat_dict


def create_matrix(data, feat_dict, ncols=None):
    """
    Given the raw data and the dictionary of feature names, creates the feature
    matrix which will be used for training/testing.

    ncols is used to specify the number of cols (features) that the matrix will have.
    This is necessary because the test/train/tune matrices all need the same number.
    """
    X = [[], [], []]
    Y = []
    tokenizer = Tokenizer()
    cur_row = 0
    for tid in data:
        stance, tweet = data[tid]

        if stance == "AGAINST":
            Y.append(-1)
        elif stance == "FAVOR":
            Y.append(1)
        else:
            Y.append(0)

        tokens = tokenizer.tokenize(tweet)

        feats = []
        feats += get_char_ngram_feats(tweet, 2)
        feats += get_char_ngram_feats(tweet, 3)
        feats += get_char_ngram_feats(tweet, 4)
        feats += get_char_ngram_feats(tweet, 5)
        feats += get_word_ngram_feats(tokens, 1)
        feats += get_word_ngram_feats(tokens, 2)
        feats += get_word_ngram_feats(tokens, 3)

        for feat_name, feat_val in feats:
            if feat_name not in feat_dict:
                continue
            feat_idx = feat_dict[feat_name]
            X[0].append(feat_val)
            X[1].append(cur_row)
            X[2].append(feat_idx)
        cur_row += 1
    if ncols is not None:
        nrows = len(Y)
        X_mat = coo_matrix((X[0], (X[1], X[2])), shape=(nrows, ncols))
    else:
        X_mat = coo_matrix((X[0], (X[1], X[2])))
    X_array = X_mat.toarray()
    Y_array = np.array(Y)
    return X_array, Y_array


def compute_score(predictions, labels):
    """
    Compute the F1 score for the model's predictions
    """
    # true positive, num guessed, num gold
    against = [0.0, 0.0, 0.0]
    favor = [0.0, 0.0, 0.0]
    for i in range(len(predictions)):
        predict = predictions[i]
        label = labels[i]
        if predict == label:
            if label == -1:
                against[0] += 1
            elif label == 1:
                favor[0] += 1
        if predict == 1:
            favor[1] += 1
        elif predict == -1:
            against[1] += 1
        if label == 1:
            favor[2] += 1
        elif label == -1:
            against[2] += 1
    scores = []
    for cat in [against, favor]:
        prec = 0.0
        rec = 0.0
        f1 = 0.0
        if cat[1] > 0:
            prec = cat[0] / cat[1]
        if cat[2] > 0:
            rec = cat[0] / cat[2]
        if prec + rec > 0:
            f1 = 2 * prec * rec / (prec + rec)
        scores.append((cat[0], cat[1], cat[2], prec, rec, f1))
    return scores


def train_model(train_X, train_Y, tune_X, tune_Y):
    # SVM parameters
    fit_intercept = True
    penalty = "l2"
    loss = "squared_hinge"
    dual = True
    tol = .0001
    max_iter = 1000
    random_state = 5

    c_cands = [10 ** i for i in range(-5, 1, 1)]
    c_votes = {c: 0 for c in c_cands}

    # Use the tune set to choose the best value for c
    scores = []
    for c in c_cands:
        classif = LinearSVC(C=c, random_state=random_state, penalty=penalty,
                            fit_intercept=fit_intercept, loss=loss, dual=dual,
                            tol=tol, max_iter=max_iter)
        classif.fit(train_X, train_Y)
        predictions = classif.predict(tune_X)
        against, favor = compute_score(predictions, tune_Y)
        score = (against[-1] + favor[-1]) / 2
        scores.append(score)
    best_score = max(scores)
    best_cs = []
    for i in range(len(scores)):
        if scores[i] == best_score:
            best_cs.append(str(c_cands[i]))
            c_votes[c_cands[i]] += 1

    votes = [(y, x) for x, y in c_votes.items()]
    c_opt = sorted(votes, reverse=True)

    # Once best value for c is found, train model with that value
    print("\tUsing best c={}".format(c_opt[0][1]))
    classif = LinearSVC(C=c_opt[0][1], random_state=random_state, penalty=penalty,
                        fit_intercept=fit_intercept, loss=loss, dual=dual, tol=tol,
                        max_iter=max_iter)
    classif.fit(train_X, train_Y)
    return classif


def main():
    threshold = 0
    data_f = "../stance/data/dataset.tsv"

    test, tune, train = get_data(data_f)

    targets = ["Atheism", "Climate Change is a Real Concern", "Feminist Movement",
               "Hillary Clinton", "Legalization of Abortion"]
    stances=['AGAINST','NONE','FAVOR']

    target_num=len(targets)
    stance_num=len(stances)
    for target in targets:
        print(target+'\n')
        feat_dict = create_feat_dict(train[target], threshold=threshold)
        num_feats = len(feat_dict)
        train_X, train_Y = create_matrix(train[target], feat_dict, ncols=num_feats)
        tune_X, tune_Y = create_matrix(tune[target], feat_dict, ncols=num_feats)
        test_X, test_Y = create_matrix(test[target], feat_dict, ncols=num_feats)

        model = train_model(train_X, train_Y, tune_X, tune_Y)
        predictions = model.predict(test_X)
        against, favor = compute_score(predictions, test_Y)
        score = (against[-1] + favor[-1]) / 2 * 100
        print('Model Score (mean accuracy): '+str(model.score(test_X,test_Y)))
        print("\t", "Test:", round(score, 2))
        print("\t", "Against:",round(against[-1]*100,2),"\t", "Favor:",round(favor[-1]*100,2))
        pred_string=[]
        for pred in predictions: 
            if pred==-1:
                pred_string.append("AGAINST")
            elif pred==0:
                pred_string.append("NONE")
            else:
                pred_string.append("FAVOR")
        testY_string=[]
        for t in test_Y: 
            if t==-1:
                testY_string.append("AGAINST")
            elif t==0:
                testY_string.append("NONE")
            else:
                testY_string.append("FAVOR")
        freqS=create_freq_table(stance_num,stance_num,pred_string,testY_string,stances)
        print(freqS)
        print('Normalized Mutual Information')
        print(norm_mutual_info(freqS))
        print("")


if __name__ == '__main__':
    main()


Atheism

	Using best c=1
Model Score (mean accuracy): 0.709090909091
	 Test: 60.99
	 Against: 81.99 	 Favor: 40.0
[[ 132.   13.   17.]
 [  15.   12.    3.]
 [  13.    3.   12.]]
Normalized Mutual Information
0.10118035106

Climate Change is a Real Concern

	Using best c=0.001
Model Score (mean accuracy): 0.727810650888
	 Test: 41.51
	 Against: 0.0 	 Favor: 83.02
[[   0.    0.    0.]
 [   1.   13.   13.]
 [  10.   22.  110.]]
Normalized Mutual Information
0.0655515431295

Feminist Movement

	Using best c=0.01
Model Score (mean accuracy): 0.561403508772
	 Test: 55.19
	 Against: 66.07 	 Favor: 44.3
[[ 111.   20.   22.]
 [  17.   14.    1.]
 [  55.   10.   35.]]
Normalized Mutual Information
0.0692518926367

Hillary Clinton

	Using best c=0.1
Model Score (mean accuracy): 0.650847457627
	 Test: 57.28
	 Against: 74.02 	 Favor: 40.54
[[ 141.   41.   27.]
 [  18.   36.    3.]
 [  13.    1.   15.]]
Normalized Mutual Information
0.132155100586

Legalization of Abortion

	Using best c=0.1
Model S

In [24]:
def multiclassScore(answers,pred):
    favord=0
    favorn=0
    againstd=0
    againstn=0
    noned=0
    nonen=0
    wrong_indices=[]
    for i in range (0,len(answers)):
        if answers[i]=="AGAINST": 
            againstd+=1
        if answers[i]=="FAVOR": 
            favord+=1 
        if answers[i]=="NONE": 
            noned+=1
        if answers[i]==pred[i]: 
            if answers[i]=="AGAINST": 
              
                againstn+=1
            if answers[i]=="FAVOR": 
              
                favorn+=1
            if answers[i]=="NONE": 
             
                nonen+=1
        if answers[i]!=pred[i]: 
            wrong_indices.append(i)
       
    if favord==0: 
        favord=1
    if againstd==0: 
        againstd=1
    if noned==0: 
        noned=1
    print('\t'+'against'+'\n')
    print('\t'+'\t'+str(againstn/againstd))
    print('\t'+'favor'+'\n')
    print('\t'+'\t'+str(favorn/favord))
    print('\t'+'none'+'\n')
    print('\t'+'\t'+str(nonen/noned))
    return wrong_indices

In [47]:
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import BernoulliNB,MultinomialNB
def get_data(data_f):
    """
    Load the data from file and seperate into train/tune/test sets

    Output format is a dictionary that maps tweet ids to (stance, tweet) tuples.
    Note that stance is the gold label (what is to be predicted).
    """

    train = {}
    test = {}
    user_list=[]
    # Load data from file
    # It has already been split into testing and training data
    with open(data_f,'r',encoding='latin-1') as f:
        for l_count, line in enumerate(f):
            if l_count == 0:
                continue
            tid, user, test_train, target, stance, tweet = line.strip().split("\t")
            if user not in user_list:
                user_list.append(user)
            if target not in train:
                train[target] = {}
                test[target] = {}
            if test_train == "test":
                test[target][tid] = (stance, tweet)
            else:
                train[target][tid] = (stance, tweet)

    # Split into test/tune/train
    new_train = {}
    for target in train:
        tids = train[target].keys()
        tids1=list(train[target].keys())
    return test,train,user_list

data_f='../stance/datacopy.tsv'
users=[]
testset,trainset,users=get_data(data_f)

        
targets=["Atheism", "Feminist Movement","Climate Change is a Real Concern","Hillary Clinton","Legalization of Abortion"]
stances=['AGAINST','NONE','FAVOR']

target_num=len(targets)
stance_num=len(stances)
tweets_of_target={}
stances_of_target={}
choices=['train','test']
for opt in choices:
    tweets_of_target[opt]={}
    stances_of_target[opt]={}
    for target in targets: 
        tweets_of_target[opt][target]=[]
        stances_of_target[opt][target]=[]

for target in targets:  
    choice=''
    for tid in trainset[target]: 
        choice='train'
        tweets_of_target[choice][target].append(trainset[target][tid][1])
        stances_of_target[choice][target].append(trainset[target][tid][0])
     
    for tid in testset[target]: 
        choice='test'
        tweets_of_target[choice][target].append(testset[target][tid][1])
        stances_of_target[choice][target].append(testset[target][tid][0])
       
        
    vectorizer=TfidfVectorizer(binary=True,sublinear_tf=True,max_df=0.5, analyzer='word', 
                 stop_words='english')
    
    train_matrix=vectorizer.fit_transform(tweets_of_target['train'][target])
    test_matrix=vectorizer.transform(tweets_of_target['test'][target])
    model=BernoulliNB()
    model.fit(train_matrix,stances_of_target['train'][target])
    predictions=model.predict(test_matrix)
    print(target+'\n')
    print('\t'+"Bernoulli Score"+'\n')
    print('\t'+str(model.score(test_matrix,stances_of_target['test'][target]))+'\n')
    multiclassScore(stances_of_target['test'][target],predictions)

    freqB=create_freq_table(stance_num,stance_num,predictions,stances_of_target['test'][target],stances)
    
    print('Contingency Table (Rows Representing predicted stances, columns representing correct stances)'+'\n')
    print(freqB)
    print('Normalized Mutual Information')
    print(norm_mutual_info(freqB))
#precision_recall_fscore_support(stances_test,predictions, labels=['FAVOR','NONE','AGAINST'])
    print("")

    multi=MultinomialNB()
    multi.fit(train_matrix,stances_of_target['train'][target])
    predictions1=multi.predict(test_matrix)
    print('\t'+"Multinomial Score"+'\n')
    print('\t'+str(multi.score(test_matrix,stances_of_target['test'][target]))+'\n')
    multiclassScore(stances_of_target['test'][target],predictions1)
#precision_recall_fscore_support(stances_test,predictions1)
    freqM=create_freq_table(stance_num,stance_num,predictions,stances_of_target['test'][target],stances)
    print('Contingency Table (Rows Representing predicted stances, columns representing correct stances)'+'\n')
    print(freqM)
   
    print('Normalized Mutual Information')
    print(norm_mutual_info(freqM))
    print("")

Atheism

	Bernoulli Score

	0.727272727273

	against

		1.0
	favor

		0.0
	none

		0.0
Contingency Table (Rows Representing predicted stances, columns representing correct stances)

[[ 160.   28.   32.]
 [   0.    0.    0.]
 [   0.    0.    0.]]
Normalized Mutual Information
nan

	Multinomial Score

	0.740909090909

	against

		1.0
	favor

		0.0625
	none

		0.03571428571428571
Contingency Table (Rows Representing predicted stances, columns representing correct stances)

[[ 160.   28.   32.]
 [   0.    0.    0.]
 [   0.    0.    0.]]
Normalized Mutual Information
nan

Feminist Movement

	Bernoulli Score

	0.638596491228

	against

		0.9726775956284153
	favor

		0.06896551724137931
	none

		0.0
Contingency Table (Rows Representing predicted stances, columns representing correct stances)

[[ 178.   44.   54.]
 [   0.    0.    0.]
 [   5.    0.    4.]]
Normalized Mutual Information
0.024412451723

	Multinomial Score

	0.610526315789

	against

		0.8961748633879781
	favor

		0.1724137931034

