In [110]:
import json
import random
import pandas as pd
import numpy as np
from tqdm.notebook import trange, tqdm

# tools
from nltk.tokenize import sent_tokenize
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

from textblob import TextBlob
from arglex.Classifier import Classifier

stopWords = set(stopwords.words('english'))
arglex = Classifier()

In [42]:
with open("./data/users.json", "r") as f:
    users = json.load(f)

with open("./data/debates.json", "r") as f:
    debates = json.load(f)
    
with open("big_issue_embedding.json","r",encoding="UTF-8") as f:
    issue_emb_dic = json.load(f)

with open("user_attritbute_embedding.json","r",encoding="UTF-8") as f:
    att_emb_dic = json.load(f)


In [59]:
# Here we convert the debate dataset into a desired form for the model
user_collection = {}
cats = list(users["ahuggies30"].keys())
useful_cats= ['political_ideology', 'education', 'ethnicity', 'interested', 'gender' , 'religious_ideology']

per_cat_choices = dict()
for cat in useful_cats:
    per_cat_choices[cat] = []

for name,user in users.items():
    user_data = dict()
    user_data["name"] = (name)
    
    # categorical data
    for cat in useful_cats:
        user_data[cat] = user[cat]

        if user[cat] not in per_cat_choices[cat]:
            per_cat_choices[cat].append(user[cat])
    
    if not (len(debates) == 0 and len(opinions) == 0):
        user_collection[name] = user
    
# prepare one-hot
for key, user_data in user_collection.items():
    for cat in useful_cats:
        user_data["name"] = key
        user_data[cat + "_id"] = per_cat_choices[cat].index(user_data[cat])
        user_data[cat + "_len"] = len(per_cat_choices[cat])

In [48]:
number_of_users=len(users)
# number_of_users
print(users["ahuggies30"].keys())
# print(len(users["ahuggies30"].keys()))

print(debates["0-is-an-even-number./1/"].keys())

print(debates["0-is-an-even-number./1/"]['title'])
print(debates["0-is-an-even-number./1/"]['forfeit_label'])
print(debates["0-is-an-even-number./1/"]['participant_1_name'])
print(debates["0-is-an-even-number./1/"]['rounds'][0])
print(debates["0-is-an-even-number./1/"]['votes'][0])
# print(users["Chunkymilk"])

# categories = ["birthday","education","ethnicity","gender","looking","number_of_friends","party","political_ideology","president","relationship","religious_ideology"]
# print(len(categories))

dict_keys(['all_debates', 'big_issues_dict', 'birthday', 'description', 'education', 'elo_ranking', 'email', 'ethnicity', 'gender', 'friends', 'income', 'interested', 'joined', 'last_online', 'last_updated', 'looking', 'lost_debates', 'number_of_all_debates', 'number_of_lost_debates', 'number_of_tied_debates', 'number_of_won_debates', 'number_of_friends', 'number_of_opinion_arguments', 'number_of_opinion_questions', 'number_of_poll_topics', 'number_of_poll_votes', 'number_of_voted_debates', 'opinion_arguments', 'opinion_questions', 'party', 'percentile', 'political_ideology', 'poll_topics', 'poll_votes', 'president', 'relationship', 'religious_ideology', 'url', 'voted_debates', 'win_ratio', 'won_debates', 'tied_debates', 'political_ideology_id', 'political_ideology_len', 'education_id', 'education_len', 'ethnicity_id', 'ethnicity_len', 'interested_id', 'interested_len', 'gender_id', 'gender_len', 'religious_ideology_id', 'religious_ideology_len'])
dict_keys(['url', 'category', 'title',

In [49]:
for key,value in debates.items():
    if value['participant_1_position'] != "Pro":
        print(key)
    

In [87]:
# get number of involved debates:
count_list  = []
all_count = 0
all_votes = 0
involved_debates = []

for issue in BIG_ISSUES:
    count_list.append(0)
    
for key,value in debates.items():
    title = value["title"]

    pnames = []
    
    pnames.append(value['participant_1_name'])
    pnames.append(value['participant_2_name'])
    
    names = []
    for vote in value["votes"]:
        if vote["votes_map"].get(value['participant_1_name'], -1) == -1:
            continue
        if vote["votes_map"].get(value['participant_2_name'], -1) == -1:
            continue
        if vote["votes_map"].get("Tied", -1) == -1:
            names.append(vote["user_name"])
        elif vote["votes_map"]["Tied"]['Made more convincing arguments'] == False:
            names.append(vote["user_name"])
            
    if len(names) == 0:
        continue
        
    puser_available = 1
    
    for name in pnames:
        if users.get(name, -1) == -1:
            puser_available = 0
    
    user_available = 0
    
    for name in names:
        if users.get(name, -1) != -1:
            user_available += 1
    
    if not user_available or not puser_available:
        continue
    
    for i, issue in enumerate(BIG_ISSUES):
        if issue in title or issue.lower() in title:
            count_list[i] += 1
            all_count += 1
            all_votes += user_available
            
            involved_debates.append(value)
    
print("All: ", len(involved_debates))
print("Votes", all_votes)


All:  2893
Votes 10441


In [51]:
def flat_list(list_of_list):
    flat_list = []
    for i in list_of_list:
        flat_list+=i
        
    return flat_list

def compute_cosine(a, b):

    return np.dot(a, b)/(np.linalg.norm(a)*np.linalg.norm(b))

In [52]:
BIG_ISSUES = ['Abortion', 'Affirmative Action', 'Animal Rights', 'Barack Obama', 'Border Fence',
              'Capitalism', 'Civil Unions', 'Death Penalty', 'Drug Legalization', 'Electoral College',
              'Environmental Protection', 'Estate Tax', 'European Union', 'Euthanasia', 'Federal Reserve',
              'Flat Tax', 'Free Trade', 'Gay Marriage', 'Global Warming Exists', 'Globalization', 'Gold Standard',
              'Gun Rights', 'Homeschooling', 'Internet Censorship', 'Iran-Iraq War', 'Labor Union',
              'Legalized Prostitution', 'Medicaid & Medicare', 'Medical Marijuana', 'Military Intervention',
              'Minimum Wage', 'National Health Care', 'National Retail Sales Tax', 'Occupy Movement', 'Progressive Tax',
              'Racial Profiling', 'Redistribution', 'Smoking Ban', 'Social Programs', 'Social Security', 'Socialism',
              'Stimulus Spending', 'Term Limits', 'Torture', 'United Nations', 'War in Afghanistan', 'War on Terror', 'Welfare']

USEFUL_CATS = ['political_ideology', 'education', 'ethnicity', 'interested', 'gender' , 'religious_ideology']

In [69]:
def user_feature_generator(user_data, name, att_emb_dic):
    user = user_data[name]
    
    # background features
    cat_one_hot = []
    cat_emb = []
    
    for cat in USEFUL_CATS:
        cat_emb.extend(att_emb_dic[cat + ":" +user[cat]])

        cat_id = user[cat+"_id"]
        cat_num = user[cat+"_len"]
        one_hot = [0.0 for i in range(cat_num)]
        one_hot[cat_id] = 1.0
        cat_one_hot.extend(one_hot)
        
    # opinions
    options = ["Pro", "Con", "Und", "N/O", "N/S"]
    op_one_hot = []
    op_emb = []
    
    for issue in BIG_ISSUES:
        op_emb.extend(att_emb_dic[issue+"-"+user["big_issues_dict"][issue]])
        
        op_id = options.index(user["big_issues_dict"][issue])
        one_hot = [0.0 for i in range(len(options))]
        one_hot[op_id] = 1.0
        op_one_hot.extend(one_hot)
        
    return [cat_one_hot, cat_emb, op_one_hot, op_emb]


def argument_topic_feature(sent_list, title):

    for sent in sent_list:
        if sent[-1] in "abcdefghijklmnopqrstuvwxyz":
            sent += "."

    sep = " "
    text = sep.join(sent_list)
    
    # relevancy
    relevancy = 0.0
    rel_count = 0
    topic_words = word_tokenize(title)
    
    for word in topic_words:
        if word not in stopWords:
            rel_count += 1
            relevancy += text.count(word)
    
    relevancy /= rel_count
    
    # Consistency
    exist_pos = 0
    exist_neg = 0
    
    for sent in sent_list:
        rel = 0
        for word in topic_words:
            if word not in stopWords:
                rel = 1
        
        if rel:
            sent_blob = TextBlob(text)
            average_polar = sent_blob.sentiment.polarity
            if average_polar > 0:
                exist_pos = 1
            elif average_polar < 0:
                exist_neg = 1
    
    if exist_pos != exist_neg:
        consistency = 1
    else:
        consistency = 0
        
    return [relevancy, consistency]
    
    
def linguistic_feature_generator(sent_list, feature_set=["len","sub-polar","arglex","referring_op", "links"]):
    
    linguistic_vec = []
    for sent in sent_list:
        if sent[-1] in "abcdefghijklmnopqrstuvwxyz":
            sent += "."

    sep = " "
    text = sep.join(sent_list)

    avg_length = 0
    avg_sub = 0
    avg_polar = 0
    count = len(sent_list)

    # failed situtaion
    if count == 0:
        return [0.0 for i in range(10)]

    # sentiment
    sent_blob = TextBlob(text)
    average_sub = sent_blob.sentiment.subjectivity
    average_polar = sent_blob.sentiment.polarity

    # len
    for sent in sent_list:
        avg_length += len(word_tokenize(sent))

    avg_length = avg_length/count

    # arg
    lexicon_score = arglex.analyse(text)
    # in the original arglex, we have
    # ['0-Assessments', '1-Authority', '2-Causation', '3-Conditionals', '4-Contrast', '5-Difficulty', '6-Doubt', '7-Emphasis',\
    #     '8-Generalization', '9-Inconsistency', '10-Inyourshoes', '11-Necessity', '12-Possibility', '13-Priority', '14-Rhetoricalquestion',\
    #    '15-Structure', '16-Wants']
    # Here we only care about Authority, Conditionals, Contrast, Difficulty, Necessity
    lex_vec = [lexicon_score[1], lexicon_score[3], lexicon_score[4], lexicon_score[5], lexicon_score[11]]
    
    # referring opponent
    ref_op = 0.0
    if "opponent" in text or "debater" in text:
        ref_op = 1.0
    
    # using links:
    use_links = 0
    if "www." in text or "http" in text:
        use_links = 1
    
    if "len" in feature_set:
        linguistic_vec.append(avg_length)
    if "sub-polar" in feature_set:
        linguistic_vec.append(average_sub)
        linguistic_vec.append(average_polar)
    if "arglex" in feature_set:
        linguistic_vec.extend(lex_vec)
    if "referring_op" in feature_set:
        linguistic_vec.extend([ref_op])
    if "links" in feature_set:
        linguistic_vec.extend([use_links])
        
    return linguistic_vec

In [104]:
def prepare_debate_data(debates, users, useful_cats, max_arg_sent, issue_emb_dic , att_emb_dic):
    dataset = []
    
    for debate in tqdm(debates):
        
        debater1 = users[debate['participant_1_name']]
        debater2 = users[debate['participant_2_name']]
        title = debate["title"]
        
        # issue_emb
        this_issue = ""
        for i, issue in enumerate(BIG_ISSUES):
            if issue in title or issue.lower() in title:
                this_issue = issue
                break
                
        issue_emb = issue_emb_dic[this_issue]
        
        # [cat_one_hot, cat_emb, op_one_hot, op_emb]
        uf1 =  user_feature_generator(users, debate['participant_1_name'], att_emb_dic)
        uf2 =  user_feature_generator(users, debate['participant_2_name'], att_emb_dic)
        
        valid_votes = []
        for vote in debate["votes"]:
            if users.get(vote['user_name'], -1) == -1:
                continue
            
            if vote["votes_map"].get(debate['participant_1_name'], -1) == -1:
                continue
            if vote["votes_map"].get(debate['participant_2_name'], -1) == -1:
                continue
            
            if vote["votes_map"].get("Tied", -1) == -1:
                valid_votes.append(vote)
            elif vote["votes_map"]["Tied"]['Made more convincing arguments'] == False:
                valid_votes.append(vote)
                
        if len(valid_votes) == 0:
            continue
                
        debater1_args = []
        debater2_args = []
        
                
        for debate_round in debate["rounds"]:
            for argument in debate_round:
                temp_text = argument["text"].replace("\n","").replace("\r","").replace("\t","").lstrip().rstrip()
                temp_sents = sent_tokenize(temp_text)
                
                if len(temp_sents) > 3:
                    temp_sents = temp_sents[-4:]
                
                if argument["side"] == "Pro":
                    debater1_args.extend(temp_sents)
                if argument["side"] == "Con":
                    debater2_args.extend(temp_sents)
                    
        
        # create the dataset
        
        lf1 = linguistic_feature_generator(debater1_args)
        lf2 = linguistic_feature_generator(debater2_args)
        
        atf1 = argument_topic_feature(debater1_args, title)
        atf2 = argument_topic_feature(debater2_args, title)
        
        debater1_score = 0
        debater2_score = 0
        
        voters_ufs = []
        
        for vote in valid_votes:
            voters_ufs.append(user_feature_generator(users, vote['user_name'], att_emb_dic))
            
            if vote["votes_map"][debate['participant_1_name']].get('Made more convincing arguments', -1) != -1:
                if vote["votes_map"][debate['participant_1_name']]['Made more convincing arguments'] == True:
                    debater1_score += 1
                else:
                    debater2_score += 1
            else:
                debater2_score += 1
        
        # one-hot b sim, o sim; emb b sim osim
        d1_sims = [0,0,0,0]
        d2_sims = [0,0,0,0]
        
        for uf in voters_ufs:
            # [cat_one_hot, cat_emb, op_one_hot, op_emb]
            for i in range(4):
                d1_sims[i] += compute_cosine(uf1[i], uf[i])
                d2_sims[i] += compute_cosine(uf2[i], uf[i])
                
        for i in range(4):
            d1_sims[i] /= len(voters_ufs)
            d2_sims[i] /= len(voters_ufs)
            
        
        if debater1_score > debater2_score:
            label1 = 1
            label2 = 0
        elif debater1_score < debater2_score:
            label1 = 0
            label2 = 1
        else:
            continue
            
            
        # participant 1 data
        temp  = {}
        temp["title"] = debate["title"]
        temp["debate_key"] = key
        temp["issue"] = this_issue
        temp["issue_emb"] = issue_emb
        
        temp["user_name"] = debate['participant_1_name']
        
        temp["cat_one_hot"] = uf1[0]
        temp["cat_emb"] = uf1[1]
        temp["op_one_hot"] = uf1[2]
        temp["op_emb"] = uf1[3]
        temp["sims"] = d1_sims 
        temp["sim_ref"] = ["cat_one_hot", "cat_emb", "op_one_hot", "op_emb"]
        
        temp["ling_features"] = lf1
        temp["topic_features"] = atf1
        temp["args"] = debater1_args
        temp["label"] = label1
        
        dataset.append(temp)
        
        # participant 2 data
        temp  = {}
        temp["title"] = debate["title"]
        temp["debate_key"] = key
        temp["issue"] = this_issue
        temp["issue_emb"] = issue_emb
        
        temp["user_name"] = debate['participant_2_name']
        
        temp["cat_one_hot"] = uf2[0]
        temp["cat_emb"] = uf2[1]
        temp["op_one_hot"] = uf2[2]
        temp["op_emb"] = uf2[3]
        temp["sims"] = d2_sims 
        temp["sim_ref"] = ["cat_one_hot", "cat_emb", "op_one_hot", "op_emb"]
        
        temp["ling_features"] = lf2
        temp["topic_features"] = atf2
        temp["args"] = debater2_args
        temp["label"] = label2
    
        dataset.append(temp)
        
    print("Number of the data collected", len(dataset))

    return dataset
            

In [105]:
# loading the dataset
# print(involved_debates[0])
dataset = prepare_debate_data(involved_debates, user_collection, USEFUL_CATS, -1, issue_emb_dic , att_emb_dic)

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=2893.0), HTML(value='')))


Number of the data collected 5496


In [138]:
num_sent = []

for data in dataset:
    num_sent.append(len(data["args"]))
    
print(np.mean(num_sent))
print(np.max(num_sent))

10.845705967976711
22


In [None]:
# print(list(users["ahuggies30"]["big_issues_dict"].values()))
# print(len(users["ahuggies30"]["big_issues_dict"].keys()))
with open("persuasion.json","w",encoding="UTF-8") as f:
    json.dump(dataset,f)

In [141]:
print(dataset[0].keys())

print(len(dataset[0]["cat_one_hot"]))
print(len(dataset[0]["op_one_hot"]))

dict_keys(['title', 'debate_key', 'issue', 'issue_emb', 'user_name', 'cat_one_hot', 'cat_emb', 'op_one_hot', 'op_emb', 'sims', 'sim_ref', 'ling_features', 'topic_features', 'args', 'label'])
119
240


In [121]:
# train_test_split
split = {}
split["train"] = []
split["dev"] = []
split["test"] = []


mask = list(range(len(dataset)))
random.Random(2021).shuffle(mask)

for select in mask[:int(0.7*len(mask))]:
    split["train"].append(dataset[select])
    
for select in mask[int(0.7*len(mask)):int(0.85*len(mask))]:
    split["dev"].append(dataset[select])
    
for select in mask[int(0.85*len(mask)):]:
    split["test"].append(dataset[select])
    
with open("split.json","w",encoding="UTF-8") as f:
    json.dump(split,f)

In [124]:
# baselines - majority
label = []
pred = []

for case in split["test"]:
    if case["label"] == 1:
        label.append(1)
        pred.append(1)
    if case["label"] == 0:
        label.append(0)
        pred.append(1)
        
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score

acc = accuracy_score(label, pred)
a = precision_score(label, pred, average='macro')
b = recall_score(label, pred, average='macro')
c = f1_score(label, pred, average='macro')
print(acc, a, b, c)

0.49818181818181817 0.24909090909090909 0.5 0.3325242718446602


In [127]:
def prepare_svm_data(dataset, fsets=["user", "ling", "topic"], uset = "emb"): # uset = "one_hot"
    X = []
    y = []
    
    # ['title', 'debate_key', 'issue', 'issue_emb', 'user_name', 
    # 'cat_one_hot', 'cat_emb', 'op_one_hot', 'op_emb', 'sims', 'sim_ref', 'ling_features', 'topic_features', 'args', 'label']
    
    for datapiece in dataset:
        temp = []
        
        if "user" in fsets:
            # ["cat_one_hot", "cat_emb", "op_one_hot", "op_emb"]
            for key in ["cat_"+uset, "op_"+uset]:
                temp.extend(datapiece[key])
            
            if uset == "emb":
                temp.append(datapiece["sims"][1])
                temp.append(datapiece["sims"][2])
                
        if "ling" in fsets:
            temp.extend(datapiece["ling_features"])
            
        if "topic" in fsets:
            temp.extend(datapiece["topic_features"])
            
        X.append(temp)
        y.append(datapiece["label"])
        
    return X, y
        
        

In [173]:
# svm with rbf
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

train_X, train_y = prepare_svm_data(split["train"], fsets=["topic", "user", "ling"], uset = "one_hot")
test_X, test_y = prepare_svm_data(split["test"], fsets=["topic", "user", "ling"], uset = "one_hot")

from sklearn.linear_model import LogisticRegression

clf = LogisticRegression(random_state=0)

clf.fit(train_X, train_y)

pred_y = clf.predict(test_X)

a = precision_score(test_y, pred_y, average='macro')
b = recall_score(test_y, pred_y, average='macro')
c = f1_score(test_y, pred_y, average='macro')
print(a, b, c)
# Pipeline(steps=[('standardscaler', StandardScaler()),
#                 ('svc', SVC(gamma='auto'))])

0.6691542288557214 0.6690468634295991 0.6690208837572247


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


In [38]:
def get_debate_text(user, all_debates):
    name = user["name"]
    debates = []

    debates_names = user["all_debates"]
    for debate_name in debates_names:
        if all_debates.get(debate_name):
            this_debate = all_debates[debate_name]
            position = ""
            if this_debate["participant_1_name"] == name:
                position = this_debate["participant_1_position"]
            else:
                position = this_debate["participant_2_position"]

            for round in this_debate["rounds"]:
                for argument in round:
                    if argument["side"] == position:
                        text = argument["text"].replace("\n","").replace("\r","").replace("\t","").lstrip().rstrip()
                        text += " "
                        debates.append(text)

    return debates

def get_opinion_text(user):
    opinions = []
    arguments = user["opinion_arguments"]
    
    for argument in arguments:
        text = argument['opinion text'].replace("\n","").replace("\r","").replace("\t","").lstrip().rstrip()
        text += " "
        opinions.append(text)
        
    return opinions

In [153]:
# BiLSTM
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.utils.data.dataloader as dataloader
import torch.optim as optim
import torch.autograd as autograd
import torchtext.vocab as torchvocab
from torch.autograd import Variable
import tqdm
import os
import time
import re
import pandas as pd
import string
import gensim
import time
import random
import snowballstemmer
import collections
from collections import Counter
from nltk.corpus import stopwords
from itertools import chain
from sklearn.metrics import accuracy_score
from spacy.vocab import Vocab

In [157]:
def tokenizer(text):
    return [tok.lower() for tok in text.split(' ')]


def encode_samples(tokenized_samples, vocab):
    features = []
    for sample in tokenized_samples:
        feature = []
        for token in sample:
            if token in word_to_idx:
                feature.append(word_to_idx[token])
            else:
                feature.append(0)
        features.append(feature)
    return features

def pad_samples(features, maxlen=500, PAD=0):
    padded_features = []
    for feature in features:
        if len(feature) >= maxlen:
            padded_feature = feature[:maxlen]
        else:
            padded_feature = feature
            while(len(padded_feature) < maxlen):
                padded_feature.append(PAD)
        padded_features.append(padded_feature)
    return padded_features

In [158]:
def prepare_lstm(dataset, vocab, wvmodel):
    all_texts = []
    all_titles = []
    all_labels = []
                 
    for piece in dataset:
        all_text = tokenizer(" ".join(piece['args']))
        all_texts.append(all_text)
        title = piece["title"]
        all_titles.append(tokenizer(title))
                 
        all_labels.append(piece["label"])
                 
    arg_features = torch.tensor(pad_samples(encode_samples(all_texts, vocab)))
    title_features = torch.tensor(pad_samples(encode_samples(all_titles, vocab)))
    labels = torch.tensor([score for score in all_labels])
                 
                 
    return arg_features, title_features, labels

In [199]:
class ArgLSTM(nn.Module):
    def __init__(self, vocab_size, embed_size, num_hiddens, num_layers,
                 bidirectional, weight, labels, use_gpu, **kwargs):
        
        super(ArgLSTM, self).__init__(**kwargs)
        
        self.num_hiddens = num_hiddens
        self.num_layers = num_layers
        self.use_gpu = use_gpu
        self.bidirectional = bidirectional
        self.embedding = nn.Embedding.from_pretrained(weight)
        self.embedding.weight.requires_grad = False
        
        self.encoder = nn.LSTM(input_size=embed_size, hidden_size=self.num_hiddens,
                               num_layers=num_layers, bidirectional=self.bidirectional,
                               dropout=0)
        
        self.hidden1 = nn.Linear(num_hiddens * 8, num_hiddens * 4)
        self.hidden2 = nn.Linear(num_hiddens * 4, num_hiddens * 2)
        self.hidden3 = nn.Linear(num_hiddens * 2, num_hiddens * 2)
        
        self.decoder = nn.Linear(num_hiddens * 2, labels)

    def forward(self, inputs1, inputs2):
        embeddings1 = self.embedding(inputs1)
        embeddings2 = self.embedding(inputs2)
        
        states1, hidden1 = self.encoder(embeddings1.permute([1, 0, 2]))
        states2, hidden2 = self.encoder(embeddings2.permute([1, 0, 2]))
        
        encoding = torch.cat([states1[0], states1[-1], states2[0], states2[-1]], dim=1)
        encoding = self.hidden3(self.hidden2(self.hidden1(encoding)))
        
        outputs = self.decoder(encoding)
        
        return outputs

In [200]:
all_args = []

for temp in dataset:
    all_args.extend(temp["args"])

vocab = set(Vocab(strings=all_args))

In [180]:
from gensim.test.utils import datapath, get_tmpfile
from gensim.models import KeyedVectors
from gensim.scripts.glove2word2vec import glove2word2vec

glove_file = datapath('C:\\Users\Razer\Desktop\debate_args\glove.txt')
tmp_file = get_tmpfile("C:\\Users\Razer\Desktop\debate_args\word2vec.txt")
_ = glove2word2vec(glove_file, tmp_file)
wvmodel = KeyedVectors.load_word2vec_format(tmp_file)

In [201]:
num_epochs = 20
embed_size = 100
num_hiddens = 200
num_layers = 2
bidirectional = True
batch_size = 64
labels = 2
lr = 0.08
device = torch.device('cuda:0')
use_gpu = True


word_to_idx  = {word: i+1 for i, word in enumerate(vocab)}
word_to_idx['<unk>'] = 0
idx_to_word = {i+1: word for i, word in enumerate(vocab)}
idx_to_word[0] = '<unk>'
vocab_size = len(vocab)
weight = torch.zeros(vocab_size+1, embed_size)

for i in range(len(wvmodel.index2word)):
    try:
        index = word_to_idx[wvmodel.index2word[i]]
    except:
        continue
    weight[index, :] = torch.from_numpy(wvmodel.get_vector(
        idx_to_word[word_to_idx[wvmodel.index2word[i]]]))

In [202]:
net =  ArgLSTM(vocab_size=(vocab_size+1), embed_size=embed_size,
                   num_hiddens=num_hiddens, num_layers=num_layers,
                   bidirectional=bidirectional, weight=weight,
                   labels=labels, use_gpu=use_gpu)
net.to(device)
loss_function = nn.CrossEntropyLoss()
optimizer = optim.SGD(net.parameters(), lr=lr)

tr_args, tr_titles, tr_labels = prepare_lstm(split["train"], vocab, wvmodel)
tst_args, tst_titles, tst_labels = prepare_lstm(split["test"], vocab, wvmodel)

train_set = torch.utils.data.TensorDataset(tr_args, tr_titles, tr_labels)
test_set = torch.utils.data.TensorDataset(tst_args, tst_titles, tst_labels)

train_iter = torch.utils.data.DataLoader(train_set, batch_size=batch_size,
                                         shuffle=True)
test_iter = torch.utils.data.DataLoader(test_set, batch_size=batch_size,
                                        shuffle=False)

In [203]:
for epoch in range(num_epochs):
    start = time.time()
    train_loss, test_losses = 0, 0
    test_labels, test_preds = [], []
    n, m = 0, 0
    for args, titles, label in train_iter:
        n += 1
        net.zero_grad()
        args = Variable(args.cuda())
        titles = Variable(titles.cuda())
        label = Variable(label.cuda())
        
        score = net(args, titles)
        
        loss = loss_function(score, label)
        loss.backward()
        optimizer.step()

        train_loss += loss
        
    with torch.no_grad():
        for args, titles, label in test_iter:
            m += 1
            
            args = Variable(args.cuda())
            titles = Variable(titles.cuda())
            tensor_label = Variable(label.cuda())
            test_score = net(args, titles)
            test_loss = loss_function(test_score, tensor_label)
            
            out_preds = torch.argmax(test_score.cpu().data, dim=1)
            test_preds.extend(out_preds.tolist())
            test_labels.extend(label.tolist())
            
            test_losses += test_loss
            
    a = precision_score(test_labels, test_preds, average='macro')
    b = recall_score(test_labels, test_preds, average='macro')
    c = f1_score(test_labels, test_preds, average='macro')
    print(a, b, c)

0.2509090909090909 0.5 0.3341404358353511
0.24909090909090909 0.5 0.3325242718446602
0.24909090909090909 0.5 0.3325242718446602
0.2509090909090909 0.5 0.3341404358353511
0.24909090909090909 0.5 0.3325242718446602
0.2509090909090909 0.5 0.3341404358353511
0.2509090909090909 0.5 0.3341404358353511
0.24909090909090909 0.5 0.3325242718446602
0.24909090909090909 0.5 0.3325242718446602
0.24909090909090909 0.5 0.3325242718446602
0.24909090909090909 0.5 0.3325242718446602
0.2509090909090909 0.5 0.3341404358353511
0.24909090909090909 0.5 0.3325242718446602
0.24909090909090909 0.5 0.3325242718446602
0.24909090909090909 0.5 0.3325242718446602
0.2509090909090909 0.5 0.3341404358353511
0.2509090909090909 0.5 0.3341404358353511
0.2509090909090909 0.5 0.3341404358353511
0.24909090909090909 0.5 0.3325242718446602
0.24909090909090909 0.5 0.3325242718446602
