# This tutorial is largely based on the paper "Context-aware Argumentative Relation Mining"
by Huy V. Nguyen and Diane J. Litman.

However, the implementation does not follow the exact procedure stated in the paper; rather, it covers the overall intention.
The main intention is to give contextual information to the classifier model by extracting topics using LDA.

The implementation is divided into 6 parts :
    
    Part 1 : Importing and structuring the Dataset 
    Part 2 : Window Context Extraction ( TODO )
    Part 3 : LDA topic Extraction ( TODO )
    Part 4 : Creating and Adding the features ( TODO )
    Part 5 : Applying Classification Models ( TODO )
    Part 6 : Hyperparameter tuning ( additional )

#### Library installation cells. Uncomment and run the ones you need; skip any library that is already installed.

In [None]:
#!pip install numpy

In [None]:
#!pip install pandas

In [None]:
#!pip install scikit-learn

In [None]:
#!pip install imbalanced-learn

In [None]:
#!pip install nltk

In [None]:
#!pip install gensim

In [None]:
import pandas as pd
import numpy as np
import itertools
import os
import re
import matplotlib.pyplot as plt
%matplotlib inline
import warnings
warnings.filterwarnings("ignore")



import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel,phrases,Phrases


import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from nltk.tag import pos_tag
# English stop-word list taken from NLTK's stopwords corpus
# (presumably needs the corpus downloaded beforehand, e.g. nltk.download('stopwords') - TODO confirm)
stop_words = stopwords.words('english')

# Part 1.
## Importing and structuring the Dataset of 90 Persuasive ESSAYs

#### Importing the actual essays

In [None]:
# Read every essay (*.txt) of the brat project folder and store it as a list of
# sentences, keyed by the file name without its ".txt" extension.
essay_dict = {}
for filename in os.listdir("brat-project"):
    if not filename.endswith(".txt"):
        continue

    filepath = os.path.join("brat-project", filename)
    #print(filepath,'\n')
    try:
        # the context manager closes the file handle even when reading fails
        with open(filepath,'r') as file:
            essay = file.read()
    except IOError as err:
        print(err)
        continue

    essay_dict[filename[:-4]] = sent_tokenize(essay)

In [None]:
def rearrange_dataset(ann,essay):
    '''
        Build, for one essay, a table holding every ordered (source, target) pair of
        ADUs together with the relation annotated between the two.

        ann   : annotation table with the columns 'ID', 'TYPE' and 'ADU'
        essay : essay identifier written into the 'essay' column of the result

        Pairs without an annotated relation are labelled 'no relation'.
    '''
    adu_columns = ['ID','ADU','TYPE','strt','end']
    pair_columns = ['src_id','src','src_type','src_strt','src_end',
                    'tgt_id','tgt','tgt_type','tgt_strt','tgt_end']

    # rows whose ID starts with 'T' are ADUs, rows whose ID starts with 'R' are relations
    adu_rows = ann[ann['ID'].str.startswith('T')]
    relation_rows = ann[ann['ID'].str.startswith('R')][['ID','TYPE']]

    # the 'TYPE' field of an ADU is split on spaces into its type, start and end
    adu_records = [[adu_id, text] + type_field.split(' ')
                   for adu_id, text, type_field
                   in zip(adu_rows['ID'], adu_rows['ADU'], adu_rows['TYPE'])]
    adu_table = pd.DataFrame(adu_records, columns=adu_columns)

    # every ordered pair of distinct ADUs becomes one source-target row
    pair_records = [source + target
                    for source, target
                    in itertools.permutations(adu_table.values.tolist(), 2)]
    pair_table = pd.DataFrame(pair_records, columns=pair_columns)

    # the 'TYPE' field of a relation reads '<relation> <name>:<src id> <name>:<tgt id>'
    relation_records = []
    for type_field in relation_rows['TYPE']:
        parts = type_field.split(' ')
        relation_records.append([parts[0],
                                 parts[1].split(':')[1],
                                 parts[2].split(':')[1]])
    relation_table = pd.DataFrame(relation_records,
                                  columns=['relation','src_id','tgt_id'])

    # attach the relation to its pair; whatever stays empty is marked 'no relation'
    merged = pd.merge(pair_table, relation_table, on=['src_id','tgt_id'], how='outer')
    rearranged_ann = merged.fillna('no relation')
    rearranged_ann['essay'] = essay

    return rearranged_ann

#### Importing the Annotations

In [None]:
# Parse every brat annotation file (*.ann) and stack the rearranged source-target
# tables of all essays into one dataset.
essay_frames = []
for filename in os.listdir("brat-project"):
    if not filename.endswith(".ann"):
        continue

    filepath = os.path.join("brat-project", filename)
    #print(filepath,'\n')
    try:
        annotation = pd.read_table(filepath,header=None)
    except pd.errors.ParserError as err:
        # an unparsable annotation file is reported and skipped
        print(err)
        continue

    annotation.columns = ['ID','TYPE','ADU']
    essay_frames.append(rearrange_dataset(annotation, filename[:-4]))

# DataFrame.append was removed in pandas 2.0; concatenating once is also much
# cheaper than growing the frame inside the loop. pd.concat rejects an empty
# list, hence the fallback to an empty frame.
dataset = pd.concat(essay_frames) if essay_frames else pd.DataFrame()

In [None]:
# how often each annotated relation type occurs ('no relation' pairs are left out)
dataset.loc[dataset['relation'] != 'no relation', 'relation'].value_counts()

#### keeping only the 'supports' and 'attacks' relation in the dataset

In [None]:
# renumber the rows 0..n-1, then keep only the pairs that carry an annotated relation
dataset.reset_index(drop=True, inplace=True)
dataset_model2 = dataset[dataset['relation'] != 'no relation']
dataset_model2.shape

# Part 2.
## Context Window Extraction


### TASK :
#### Implement a Loop for tokenizing each sentence contained in variable 'sent' and store it in 'sentToken' dictionary
       
       For example for some list of sentences in 'sent', 1 entry in the dictionary should look like :  
   
             { 6 : ['In', 'a', 'word', ',', 'the', 'notion', 'of', 'being', 'afraid',
                   'of', 'social', 'misleading', 'is', 'unjustified', '.'] }
    
       where key '6' refers to the sentence number in the list and the value is the tokenized sentence

In [None]:
def getSentNeigh(sent, segment,prefix, num_neighbours = 0): 
    '''
        Locate the sentence that contains an ADU segment and collect the sentences
        surrounding it.

        sent           : list of the sentences of one essay
        segment        : text of the ADU to look for
        prefix         : string prepended to every key of the result (e.g. 'src')
        num_neighbours : number of sentences to collect on each side of the match

        Returns a dict mapping '<prefix>_prev_sent<i>' / '<prefix>_next_sent<i>' to a
        one-element list holding the i-th sentence before / after the matching
        sentence. Neighbours that would fall outside the essay are left out.
        Returns None when no sentence contains every token of the segment.
    '''
    
    sentToken = dict() 
    
    ##### YOUR CODE HERE ######

    ###########################
              

    # tokenize the ADU segment
    wordList = word_tokenize(segment)
    res = -1

    # the first sentence containing every token of the segment is taken as the match
    # (sentToken is expected to be keyed by the sentence positions 0..len-1)
    for s in range(len(sentToken)):
        if all(w in sentToken[s] for w in wordList):
            res = s
            break

    if res == -1:
        return None

    ret_dict = {}

    for i in range(1, num_neighbours + 1):

        # a negative index would silently wrap around to the end of the essay,
        # so a previous sentence is only taken when it really exists
        if res - i >= 0:
            ret_dict[prefix+'_prev_sent'+str(i)] = [sent[res-i]]

        # to check if the segment has neighbours left on its right.
        if res + i < len(sent):
            ret_dict[prefix+'_next_sent'+str(i)] = [sent[res+i]]

    return ret_dict

#### Getting the neighbouring sentences for each source and target pair

In [None]:
# runtime: at least 40 - 60 secs

window_size = 4

dataset_model2.reset_index(drop=True,inplace=True)

# One record per source-target pair, so that `neighbours` keeps exactly the row
# order and row count of dataset_model2: a pair whose segments cannot be located
# gets an empty record instead of silently dropping out of the table.
neighbour_records = []

for src_adu, tgt_adu, essay_num in zip(dataset_model2['src'],
                                       dataset_model2['tgt'],
                                       dataset_model2['essay']):

    essay = essay_dict[essay_num]
    record = {}

    for prefix, segment in (('src', src_adu), ('tgt', tgt_adu)):
        found = getSentNeigh(sent = essay,
                             segment = segment,
                             prefix = prefix,
                             num_neighbours = window_size)

        # getSentNeigh returns None when the segment is not found
        for key, value in (found or {}).items():
            # every neighbour arrives wrapped in a one-element list
            record[key] = value[0] if isinstance(value, list) else value

    neighbour_records.append(record)

# building the frame once replaces DataFrame.append (removed in pandas 2.0) and
# already yields a 0..n-1 index
neighbours = pd.DataFrame(neighbour_records)

#### Adding the neighbours extracted to the dataset

In [None]:
# missing neighbours become empty strings before the columns are attached
neighbours = neighbours.fillna(value='')
dataset_model2 = pd.concat((dataset_model2, neighbours), axis=1)
dataset_model2.shape

# Part 3.
## LDA Topic Extraction 

#### Loading the extra Essay corpus

In [None]:
# Collect the first line (the heading) of every essay in the extra corpus.
data = []

for filename in os.listdir("developemental_data"):
    if not filename.endswith(".txt"):
        continue

    filepath = os.path.join("developemental_data", filename)
    #print(filepath,'\n')
    try:
        # the context manager closes the file handle even when reading fails
        with open(filepath,'r') as file:
            data.append(file.readline())
    except IOError as err:
        print(err)

### TASK: 
#### Use the re library to remove single quotes and newline characters from the heading of each essay stored in the variable data in the previous step.

In [None]:
##### YOUR CODE HERE ######

###########################

#### Text preprocessing functions to tokenize, lemmatize and create bigrams or trigrams.

In [None]:
def sent_to_words(sentences):
    '''
        Lazily yield one token list per sentence, as produced by gensim's
        simple_preprocess called with deacc=True on the sentence's string form.
    '''
    yield from (simple_preprocess(str(text), deacc=True) for text in sentences)

### TASK:
#### Implement the function to remove stopwords from each sentence contained in the texts variable.

In [None]:
def remove_stopwords(texts):
    '''
        Exercise stub: meant to remove the stopwords from every sentence in texts.
        Until the marked section is implemented, texts is returned unchanged.
    '''
    
    ##### YOUR CODE HERE ######

    ###########################
    
    return texts

### TASK : 
#### Complete the lemmatization function which checks for Parts of Speech tag to lemmatize each token. 
#### Hint : 
#### 1. Check the pos tag to identify if the token is a noun ('NN') or verb ('VB') or adjective ('JJ')
#### 2. Lemmatize each token by calling : lemmatizer.lemmatize(token, pos='n')

In [None]:
def lemmatization(texts):
    '''
        Exercise stub: meant to lemmatize every token of every text according to
        its part-of-speech tag.

        texts : list of token lists

        Returns a list of token lists of the same shape. Until the marked section
        is implemented, every token is copied over unchanged.
    '''
    lemmatizer = WordNetLemmatizer()
    # tag each text as a whole, giving one list of (token, tag) pairs per text
    pos_tagged_text = [pos_tag(text) for text in texts]
    
    texts_lemmatized =[]
    for text in pos_tagged_text:
        text_lemmatized = []
        for token, tag in text:
            # the exercise code is expected to replace `token` by its lemma here
            ##### YOUR CODE HERE ######

            ###########################
                
            text_lemmatized.append(token)
        texts_lemmatized.append(text_lemmatized)
    
    return texts_lemmatized

In [None]:
def make_bigrams(texts):
    '''
        Train a gensim Phrases model on texts (min_count=2, threshold=1) and return
        every document of texts passed through the resulting Phraser.
    '''
    # a higher threshold would yield fewer phrases
    phrase_model = Phrases(texts, threshold=1, min_count=2)
    phraser = phrases.Phraser(phrase_model)

    merged_docs = []
    for tokens in texts:
        merged_docs.append(phraser[tokens])
    return merged_docs

def make_trigrams(texts):
    '''
        Train a Phrases model on texts and a second one on the output of the first
        (both with min_count=2, threshold=1), then return every document of texts
        passed through both resulting Phrasers in turn.
    '''
    # a higher threshold would yield fewer phrases
    first_pass = Phrases(texts, threshold=1, min_count=2)
    second_pass = Phrases(first_pass[texts], threshold=1, min_count=2)

    # the Phraser objects are the faster way to apply the trained models
    first_phraser = phrases.Phraser(first_pass)
    second_phraser = phrases.Phraser(second_pass)

    merged_docs = []
    for tokens in texts:
        merged_docs.append(second_phraser[first_phraser[tokens]])
    return merged_docs

In [None]:
def preprocess_pipeline(data):
    '''
        Run the full text preprocessing chain on a collection of texts.

        data : iterable of raw texts

        Returns one token list per text, after word tokenization, stopword
        removal, lemmatization and bigram/trigram merging.
    '''

    # split every text into word tokens
    data_words = list(sent_to_words(data))

    # Remove Stop Words
    data_words_nostops = remove_stopwords(data_words)

    # Do lemmatization
    data_lemmatized = lemmatization(data_words_nostops)

    # Form Trigrams. make_trigrams trains its own bigram model internally, so the
    # separate bigram pass that used to run here (its result was never used) is gone.
    data_words_trigrams = make_trigrams(data_lemmatized)

    return data_words_trigrams

print(preprocess_pipeline(data)[:10])

In [None]:
# Preprocess the corpus once and reuse the tokens for both the dictionary and the
# bag-of-words corpus; running the pipeline a second time only repeated the
# (expensive) phrase-model training on the same input.
texts = preprocess_pipeline(data)

# Create Dictionary
id2word = corpora.Dictionary(texts)

# Term Document Frequency
corpus = [id2word.doc2bow(text) for text in texts]

# View
print(corpus[9])

In [None]:

# Train the LDA topic model on the bag-of-words corpus: 15 topics, 100 passes,
# fixed random seed, alpha and eta both set to 'auto'.
lda_model = gensim.models.ldamodel.LdaModel(
    corpus=corpus,
    id2word=id2word,
    num_topics=15,
    passes=100,
    chunksize=10,
    update_every=0,
    alpha='auto',
    eta='auto',
    per_word_topics=True,
    random_state=100,
)

# The string below holds a disabled alternative that trains the model through the
# Mallet wrapper instead; it is not executed.
'''



os.environ['MALLET_HOME'] = 'C:\\Users\\Arkajit\\Anaconda2\\envs\\TFENV\\mallet-2.0.8'

lda_model = gensim.models.wrappers.LdaMallet('C:\\Users\\Arkajit\\Anaconda2\\envs\\TFENV\\mallet-2.0.8\\bin\\mallet.bat',
                                            corpus=corpus,
                                            id2word=id2word,
                                            num_topics=36, 
                                            iterations=100
                                            )
                                            
'''                                            

In [None]:
# Score the trained model with the 'c_v' coherence measure
coherence_model_lda = CoherenceModel(coherence='c_v',
                                     dictionary=id2word,
                                     texts=texts,
                                     model=lda_model)
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score: ', coherence_lda)

In [None]:
def compute_coherence_values(dictionary, corpus, texts, limit, start=2, step=3,mallet = False,
                             mallet_path='C:\\Users\\Arkajit\\Anaconda2\\envs\\TFENV\\mallet-2.0.8\\bin\\mallet.bat'):
    '''
        Train one topic model per topic count in range(start, limit, step) and
        record the 'c_v' coherence score of each.

        dictionary  : gensim dictionary used by the models and the coherence measure
        corpus      : bag-of-words corpus the models are trained on
        texts       : tokenized texts the coherence is computed on
        limit       : exclusive upper bound of the topic counts
        start, step : first topic count and increment
        mallet      : train with the LdaMallet wrapper instead of gensim's LdaModel
        mallet_path : location of the Mallet executable, only used when mallet is set

        Returns (model_list, coherence_values), both in the order of the topic counts.
    '''
    coherence_values = []
    model_list = []
    for num_topics in range(start, limit, step):
        # the models are built on the `dictionary` argument; the previous code
        # read the global id2word here and ignored what the caller passed in
        if not mallet:
            model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                           id2word=dictionary,
                                           num_topics=num_topics, 
                                           random_state=100,
                                           passes=100,
                                           alpha='auto',
                                           eta='auto')
        
        else :
            # NOTE(review): gensim.models.wrappers is not shipped with gensim 4.x,
            # so this branch presumably needs gensim 3.x - confirm the installed version
            model = gensim.models.wrappers.LdaMallet(mallet_path,
                                           corpus=corpus,
                                           id2word=dictionary,
                                           num_topics=num_topics,
                                           iterations=100
                                           )
        model_list.append(model)
        coherencemodel = CoherenceModel(model=model, texts=texts, dictionary=dictionary, coherence='c_v')
        coherence_values.append(coherencemodel.get_coherence())

    return model_list, coherence_values

In [None]:
# sweep settings, defined once so that the training run and the plot axis agree
limit = 40
start = 2
step = 6

model_list, coherence_values = compute_coherence_values(dictionary=id2word, 
                                                        corpus=corpus, 
                                                        texts=texts, 
                                                        start=start, 
                                                        limit=limit, 
                                                        step=step,
                                                        mallet=True)

# coherence score against the number of topics
x = range(start, limit, step)
plt.plot(x, coherence_values)
plt.xlabel("Num Topics")
plt.ylabel("Coherence score")
# the labels must be a sequence: a bare string is read as one label per character,
# which labelled the curve "c"
plt.legend(["coherence_values"], loc='best')
plt.show()

# Part 4.
## Creating and adding Features 

In [None]:
# for every topic of the model, the words of its 50 highest ranked entries
topic_words = [[entry[0] for entry in lda_model.show_topic(topic_id, topn=50)]
               for topic_id in range(lda_model.num_topics)]

#### 1. Topic words common between source and target ADUs

### Task : 
#### Get the number of common tokens between the topic_words and src_ADU and tgt_ADU and store it in context_words_lda

In [None]:
# token lists of the source and target ADUs, produced by the same pipeline as the LDA corpus
src_ADU = preprocess_pipeline(dataset_model2['src'].values)
tgt_ADU = preprocess_pipeline(dataset_model2['tgt'].values)

context_words_lda = []
column_names = []

for i, topic in enumerate(topic_words):
    
    # the exercise code is expected to append two entries to context_words_lda per
    # topic (source first, then target), each holding one count per ADU pair, so
    # that the transposed array below lines up with column_names
    ##### YOUR CODE HERE ######

    ###########################
    column_names.append('topic_'+str(i)+'_src')
    column_names.append('topic_'+str(i)+'_tgt')
    
# transpose so that every source-target pair becomes a row and every topic feature a column
context_words_lda = pd.DataFrame(np.array(context_words_lda).T, columns = column_names )

#### 2. Common tokens between source and target ADUs and 8-window context sentences

In [None]:
# runtime: at least 30 secs, depending on the window size

# For every neighbour column: the number of distinct tokens its sentence shares
# with the ADU (source or target) the column belongs to.
common_counts = {}
for col in neighbours.columns:

    # the prefix in the column name tells which ADU the neighbour belongs to
    if 'src' in col:
        adu_col = 'src'
    elif 'tgt' in col:
        adu_col = 'tgt'
    else:
        # not a neighbour column; previously this reused the stale `temp` of the
        # preceding column (or failed on the first column)
        continue

    # set intersection counts the distinct shared tokens, as np.intersect1d did
    common_counts[col+'_comm'] = [
        len(set(simple_preprocess(str(adu), deacc=True))
            & set(simple_preprocess(str(context), deacc=True)))
        for adu, context in zip(dataset_model2[adu_col], dataset_model2[col])]

# building the frame once avoids regrowing it with pd.concat for every column
common_neigh = pd.DataFrame(common_counts)

column_names.extend(common_neigh.columns)

 #### 3. Word counts of source and target ADUs

In [None]:
# Number of whitespace-separated words in each source / target ADU.
# (len() applied directly to the ADU string counted characters, not words.)
word_count_src = [len(str(sent).split()) for sent in dataset_model2['src'].values]
word_count_tgt = [len(str(sent).split()) for sent in dataset_model2['tgt'].values]

word_counts = pd.DataFrame({'word_count_src': word_count_src,
                            'word_count_tgt': word_count_tgt})

column_names.extend(['word_count_src', 'word_count_tgt'])

#### 4. Boolean features of source and target ADU types (premise, claim and major claim)

In [None]:
# one indicator column per ADU type, separately for the source and the target ADU
adu_type_columns = dataset_model2[['src_type','tgt_type']]
src_tgt_type = pd.get_dummies(adu_type_columns)

column_names.extend(src_tgt_type.columns)

#### 5. Absolute differences between the positions of the source and target ADUs

In [None]:
# distance between the start offsets, and between the end offsets, of the two ADUs
abs_diff_strt = pd.DataFrame(
    {'abs_diff_strt': np.abs(dataset_model2['tgt_strt'].astype('int')
                             - dataset_model2['src_strt'].astype('int'))})
abs_diff_end = pd.DataFrame(
    {'abs_diff_end': np.abs(dataset_model2['tgt_end'].astype('int')
                            - dataset_model2['src_end'].astype('int'))})

column_names.extend(['abs_diff_strt','abs_diff_end'] )

#### Joining all the above feature vectors into a training dataset and separating out the target variable

In [None]:
# Feature matrix: all feature groups side by side, one row per source-target pair
X = pd.concat([src_tgt_type,
               abs_diff_strt,
               abs_diff_end,
               word_counts,
               common_neigh,
               context_words_lda], axis=1)

# column_names was filled in the order the features were created, which differs
# from the order they are concatenated in; take the names from X itself so that
# they really label its columns
column_names = list(X.columns)

# Target: the relation label as an integer category code (codes follow the sorted
# category order, so with only 'attacks' and 'supports' present: attacks=0, supports=1)
Y = dataset_model2['relation'].astype('category').cat.codes

# Part 5.
## Applying Classification models

In [None]:
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegressionCV , SGDClassifier
from sklearn.ensemble import RandomForestClassifier,GradientBoostingClassifier
from sklearn.metrics import make_scorer,f1_score , recall_score, precision_score,accuracy_score,precision_recall_fscore_support,cohen_kappa_score,confusion_matrix
from sklearn.model_selection import GridSearchCV,RandomizedSearchCV,cross_validate,train_test_split,ShuffleSplit,KFold,StratifiedKFold
from sklearn.preprocessing import StandardScaler, MinMaxScaler, Normalizer
from sklearn.decomposition import PCA


from imblearn import over_sampling, under_sampling

 ### Scale the independent variables

In [None]:
# Standardise every feature column; Normalizer() and MinMaxScaler() are the
# alternatives that were tried here.
# NOTE(review): the scaler is fitted on the full X before the train/test split,
# so test rows influence the scaling - consider fitting on the training part only.
s_scaler = StandardScaler()
s_scaler.fit(X)
X = s_scaler.transform(X)

### Task : 
#### Split X and Y into training and testing datasets using train_test_split. 

In [None]:
# Exercise stub: the four names start out as empty lists and are expected to be
# replaced by the split of X and Y in the marked section below.
X_train=[]
Y_train=[]
X_test=[]
Y_test=[]
##### YOUR CODE HERE ######

###########################
# sanity check of the split sizes (plain lists have no .shape, so this line
# fails until the section above is implemented)
print(X_train.shape, Y_train.shape, X_test.shape, Y_test.shape)

In [None]:
#sampler=over_sampling.SVMSMOTE(sampling_strategy= 1)
#X_train, Y_train=sampler.fit_resample(X_train,Y_train)

 #### Apply the best Classification model

In [None]:
# support vector classifier with default settings: train on the training split,
# then predict the held-out split
model = SVC()
Y_pred = model.fit(X_train, Y_train).predict(X_test)

#### Calculating the precision macro, recall macro, f1 macro and accuracy of the model

In [None]:
# macro-averaged precision / recall / F1 over the two classes
p_macro, r_macro, f_macro, support_macro = precision_recall_fscore_support(
    Y_test, Y_pred, labels=[0, 1], average='macro')

# every metric is printed rounded to two decimals, one per line
metric_report = [
    ('Accuracy:', accuracy_score(Y_test, Y_pred)),
    ('\nKappa:', cohen_kappa_score(Y_test, Y_pred)),
    ('\nMacro Precision:', p_macro),
    ('\nMacro Recall:', r_macro),
    ('\nMacro F1:', f_macro),
    ('\nF1:', f1_score(Y_test, Y_pred)),
]
print(*[part
        for label, value in metric_report
        for part in (label, round(value, 2))])

# Part 6.
## Hyperparameter tuning

In [None]:
# 5-fold cross-validation of the model on the full data set
# (ShuffleSplit(n_splits=10, test_size=0.25, random_state=0) is the alternative splitter)
cv = KFold(n_splits=5)

scoring = ['precision_macro', 'recall_macro','f1_macro']
scores = cross_validate(model, X, Y, cv=cv, scoring=scoring)
# mean test score per metric, in the order of `scoring`
tuple(scores['test_' + metric].mean() for metric in scoring)

In [None]:
# Only some estimators (e.g. the tree ensembles imported above) expose
# feature_importances_; the SVC fitted above does not, so the plot is guarded
# instead of raising AttributeError.
feature_importances = getattr(model, 'feature_importances_', None)
if feature_importances is None:
    print(type(model).__name__, 'does not provide feature_importances_')
else:
    plt.plot(feature_importances)

In [None]:
# macro-averaged scorers over the two classes, keyed by the name used in the search results
scorers = {
    scorer_name: make_scorer(metric, labels=[0,1], average='macro')
    for scorer_name, metric in (('precision_score', precision_score),
                                ('recall_score', recall_score),
                                ('f1_score', f1_score))
}

In [None]:
# Exhaustive search over the SVC regularisation strength C = 1, 3, ..., 119
param_test1 = {
    'C': list(range(1,120,2))
}

# alternatives tried: GradientBoostingClassifier(random_state = 42),
# RandomForestClassifier(class_weight='balanced_subsample')
clf = SVC(class_weight='balanced')

# The `iid` argument is no longer passed: it has been removed from scikit-learn
# and makes the constructor raise a TypeError on current versions.
gsearch1 = GridSearchCV(
    n_jobs=-1,
    estimator=clf, 
    param_grid=param_test1,
    scoring= scorers,#'precision_macro',
    verbose= True,
    # with several scorers, refit names the one used to pick the best parameters
    refit='precision_score',
    cv=KFold(n_splits=5))

gsearch1.fit(X_train, Y_train)

gsearch1.best_params_, gsearch1.best_score_

In [None]:
# parameter settings of the search next to their rank under each scorer
pd.DataFrame(gsearch1.cv_results_).loc[:, [
    'params',
    'rank_test_precision_score',
    'rank_test_recall_score',
    'rank_test_f1_score',
]]

In [None]:
from scipy.stats import rv_discrete

# uniform discrete distribution over the depths 1, 2, ..., 5
depths = rv_discrete(values=(list(range(1, 6)), [0.2] * 5))
# uniform discrete distribution over 100, 150, ..., 2050
n_iterations = rv_discrete(values=(list(range(100, 2100, 50)), [0.025] * 40))

In [None]:
# Randomised search over the number of estimators and the tree depth, both drawn
# from the discrete distributions defined above
param_test1 = {
    "n_estimators":n_iterations,
    'max_depth': depths,
}

# alternatives tried: RandomForestClassifier(class_weight='balanced_subsample'), SVC()
clf = GradientBoostingClassifier(random_state = 42)

# The `iid` argument is no longer passed: it has been removed from scikit-learn
# and makes the constructor raise a TypeError on current versions.
gsearch1 = RandomizedSearchCV(
    n_jobs=-1,
    estimator=clf, 
    param_distributions=param_test1,
    scoring= scorers,#'precision_macro',
    verbose= True,
    # with several scorers, refit names the one used to pick the best parameters
    refit='precision_score',
    n_iter= 100,
    cv=cv)

gsearch1.fit(X, Y)

gsearch1.best_params_, gsearch1.best_score_