In [1]:
import xml.etree.ElementTree as et 
import pandas as pd
import stanza
from tqdm import tqdm
from nltk import BigramCollocationFinder
from nltk.collocations import TrigramCollocationFinder
from nltk.metrics import TrigramAssocMeasures
from nltk.metrics import BigramAssocMeasures
import nltk
from nltk.tokenize import word_tokenize
import gensim
from gensim.models import word2vec
import os
import matplotlib.pyplot as plt

In [2]:
stanza.download('ru')
stanza_nlp = stanza.Pipeline('ru')

Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/master/resources_1.2.0.json: 128kB [00:00, 16.4MB/s]                    
2021-05-22 16:42:41 INFO: Downloading default packages for language: ru (Russian)...
2021-05-22 16:42:42 INFO: File exists: /Users/ciwwwnd/stanza_resources/ru/default.zip.
2021-05-22 16:42:46 INFO: Finished downloading models and saved to /Users/ciwwwnd/stanza_resources.
2021-05-22 16:42:46 INFO: Loading these models for language: ru (Russian):
| Processor | Package   |
-------------------------
| tokenize  | syntagrus |
| pos       | syntagrus |
| lemma     | syntagrus |
| depparse  | syntagrus |
| ner       | wikiner   |

2021-05-22 16:42:46 INFO: Use device: cpu
2021-05-22 16:42:46 INFO: Loading: tokenize
2021-05-22 16:42:46 INFO: Loading: pos
2021-05-22 16:42:46 INFO: Loading: lemma
2021-05-22 16:42:46 INFO: Loading: depparse
2021-05-22 16:42:47 INFO: Loading: ner
2021-05-22 16:42:48 INFO: Done loading processors!


In [3]:
etree = et.parse('/Users/ciwwwnd/Desktop/論文/тетрадки и прочее/final_version.xml')
root = etree.getroot()

In [4]:
tok_ids = []
poses = []
animacies = []
genders = []
nums = []
cases = []
funcs = []
left_c_pos = []
right_c_pos = []
deprel = []
head_pos = []
left_deprel = []
head_pos_left_c = []
right_deprel = []
head_pos_right_c = []

In [5]:
nlp = stanza.Pipeline(lang='ru', processors='tokenize, pos,lemma, depparse')

def get_tree_info(strng, text):
    doc = nlp(strng)
    doc = doc.to_dict()
    wrds = doc[0]
    for w in wrds:
        if w['text'] == text:
            deprel = w['deprel']
            head_id = w['head']
            for w in wrds:
                if w['id'] == head_id:
                    head_pos = w['upos']
                    return deprel, head_pos

2021-05-22 16:42:53 INFO: Loading these models for language: ru (Russian):
| Processor | Package   |
-------------------------
| tokenize  | syntagrus |
| pos       | syntagrus |
| lemma     | syntagrus |
| depparse  | syntagrus |

2021-05-22 16:42:53 INFO: Use device: cpu
2021-05-22 16:42:53 INFO: Loading: tokenize
2021-05-22 16:42:53 INFO: Loading: pos
2021-05-22 16:42:53 INFO: Loading: lemma
2021-05-22 16:42:53 INFO: Loading: depparse
2021-05-22 16:42:54 INFO: Done loading processors!


In [6]:
for paragraphs in tqdm(root.iter('paragraphs')):
    paragraph = paragraphs.findall('paragraph')
    for prr in paragraph:
        sent = prr.findall('sentence')
        for st in sent:
            src = st.find('source')
            st_str = et.tostring(src, encoding='unicode')
            my_sent = ''.join(et.fromstring(st_str).itertext())
            tk = st.find('tokens')
            tkk = tk.findall('token')
            for t in range(len(tkk)):
                tfr = tkk[t].find('tfr')
                v = tfr.find('v')
                l = v.find('l')
                gs = l.findall('g')
                d = gs[0].attrib
                if str(d['v']) == 'NOUN':
                    if len(gs) == 6:
                        d_id = tkk[t].attrib
                        tok_ids.append(d_id['id'])
                        poses.append(gs[0].attrib['v'])
                        animacies.append(gs[1].attrib['v'])
                        genders.append(gs[2].attrib['v'])
                        nums.append(gs[3].attrib['v'])
                        cases.append(gs[4].attrib['v'])
                        funcs.append(gs[5].attrib['v'])
                        
                        node_info = get_tree_info(my_sent, str(tkk[t].attrib['text']))
                        if node_info:
                            deprel.append(node_info[0])
                            head_pos.append(node_info[1])
                        else:
                            deprel.append(None)
                            head_pos.append(None)
                                
                        if t > 0:
                            left_context = tkk[t - 1]
                            tfr_lc = left_context.find('tfr')
                            v = tfr_lc.find('v')
                            l = v.find('l')
                            gs = l.findall('g')
                            d = gs[0].attrib
                            left_c_pos.append(str(d['v']))
                            
                            node_info = get_tree_info(my_sent, str(tkk[t - 1].attrib['text']))
                            if node_info:
                                left_deprel.append(node_info[0])
                                head_pos_left_c.append(node_info[1])
                            else:
                                left_deprel.append(None)
                                head_pos_left_c.append(None)
                        else:
                            left_c_pos.append(None)
                            left_deprel.append(None)
                            head_pos_left_c.append(None)
                            
                        if tkk[t] != tkk[-1]:
                            right_context = tkk[t + 1]
                            tfr_rc = right_context.find('tfr')
                            v = tfr_rc.find('v')
                            l = v.find('l')
                            gs = l.findall('g')
                            d = gs[0].attrib
                            right_c_pos.append(str(d['v']))
                            
                            node_info = get_tree_info(my_sent, str(tkk[t + 1].attrib['text']))
                            if node_info:
                                right_deprel.append(node_info[0])
                                head_pos_right_c.append(node_info[1])
                            else:
                                right_deprel.append(None)
                                head_pos_right_c.append(None)
                        else:
                            right_c_pos.append(None)
                            right_deprel.append(None)
                            head_pos_right_c.append(None) 
    
print(len(tok_ids), len(poses), len(animacies), len(genders), len(nums), len(cases), len(funcs),  len(deprel), len(head_pos), len(left_c_pos), len(right_c_pos))


27it [30:10, 67.07s/it] 

4030 4030 4030 4030 4030 4030 4030 4030 4030 4030 4030





In [7]:
df = pd.DataFrame({
    'id': tok_ids,
    'pos': poses,
    'animacy': animacies,
    'gender': genders,
    'number': nums,
    'case': cases,
    'function': funcs,
    'syntax': deprel,
    'head pos': head_pos,
    'left context pos': left_c_pos,
    'right context pos': right_c_pos,
    'left context syntax': left_deprel,
    'right context syntax': right_deprel,
    'left context head pos': head_pos_left_c,
    'right context head pos': head_pos_right_c
})

In [8]:
df.head(5)

Unnamed: 0,id,pos,animacy,gender,number,case,function,syntax,head pos,left context pos,right context pos,left context syntax,right context syntax,left context head pos,right context head pos
0,2,NOUN,inan,femn,sing,nomn,agent,nsubj,VERB,PNCT,NOUN,punct,nmod,VERB,NOUN
1,10,NOUN,inan,masc,sing,nomn,posessor,nsubj,VERB,PRCL,NOUN,advmod,nmod,VERB,NOUN
2,11,NOUN,inan,femn,sing,gent,posessor,nmod,NOUN,NOUN,PREP,nsubj,case,VERB,NOUN
3,18,NOUN,inan,femn,sing,nomn,agent,nsubj,VERB,PNCT,NOUN,amod,nmod,NOUN,NOUN
4,51,NOUN,inan,femn,sing,nomn,agent,nsubj,VERB,PNCT,PNCT,punct,punct,NOUN,NOUN


# Random Forest with an unbalanced dataset

In [9]:
df_2 = df.drop(columns=['id', 'function', 'pos'])
df_2.dropna()
df_2.head(5)

Unnamed: 0,animacy,gender,number,case,syntax,head pos,left context pos,right context pos,left context syntax,right context syntax,left context head pos,right context head pos
0,inan,femn,sing,nomn,nsubj,VERB,PNCT,NOUN,punct,nmod,VERB,NOUN
1,inan,masc,sing,nomn,nsubj,VERB,PRCL,NOUN,advmod,nmod,VERB,NOUN
2,inan,femn,sing,gent,nmod,NOUN,NOUN,PREP,nsubj,case,VERB,NOUN
3,inan,femn,sing,nomn,nsubj,VERB,PNCT,NOUN,amod,nmod,NOUN,NOUN
4,inan,femn,sing,nomn,nsubj,VERB,PNCT,PNCT,punct,punct,NOUN,NOUN


In [10]:
df_2 = pd.get_dummies(df_2)
df_2.head(5)

Unnamed: 0,animacy_anim,animacy_inan,gender_GNdr,gender_femn,gender_masc,gender_ms-f,gender_neut,number_Abbr,number_Fixd,number_Geox,...,right context head pos_CCONJ,right context head pos_DET,right context head pos_NOUN,right context head pos_NUM,right context head pos_PART,right context head pos_PRON,right context head pos_PROPN,right context head pos_PUNCT,right context head pos_SYM,right context head pos_VERB
0,0,1,0,1,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
1,0,1,0,0,1,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
2,0,1,0,1,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
3,0,1,0,1,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
4,0,1,0,1,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0


Clearing random and erroneous data

In [11]:
df_clean = df_2[df_2['case_sing'] == 0]
df_clean = df_clean[df_clean['case_plur'] == 0]
df_clean = df_clean[df_clean['case_patient'] == 0]
df_clean = df_clean[df_clean['number_Slng'] == 0]
df_clean = df_clean[df_clean['case_accs'] == 0]
df_clean = df_clean[df_clean['number_Patr'] == 0]
df_clean = df_clean[df_clean['number_Orgn'] == 0]
df_clean = df_clean[df_clean['number_Name'] == 0]
df_clean = df_clean[df_clean['number_Inmx'] == 0]
df_clean = df_clean[df_clean['number_Geox'] == 0]
df_clean = df_clean[df_clean['number_Fixd'] == 0]
df_clean = df_clean[df_clean['number_Abbr'] == 0]
df_clean = df_clean[df_clean['gender_GNdr'] == 0]
df_clean = df_clean[df_clean['gender_ms-f'] == 0]
df_clean = df_clean.drop(columns=['case_sing', 'case_plur', 'case_patient', 'number_Slng', 'case_accs', 'number_Patr', 'number_Orgn', 'number_Name', 'number_Inmx', 'number_Infr', 'number_Geox', 'number_Fixd', 'number_Abbr', 'gender_GNdr', 'gender_ms-f'])

print(df_2.shape, df_clean.shape)

(4030, 195) (3561, 180)


In [12]:
df_clean['function'] = df['function']

In [13]:
df_clean.head(3)

Unnamed: 0,animacy_anim,animacy_inan,gender_femn,gender_masc,gender_neut,number_Pltm,number_Sgtm,number_plur,number_sing,case_datv,...,right context head pos_DET,right context head pos_NOUN,right context head pos_NUM,right context head pos_PART,right context head pos_PRON,right context head pos_PROPN,right context head pos_PUNCT,right context head pos_SYM,right context head pos_VERB,function
0,0,1,1,0,0,0,0,0,1,0,...,0,1,0,0,0,0,0,0,0,agent
1,0,1,0,1,0,0,0,0,1,0,...,0,1,0,0,0,0,0,0,0,posessor
2,0,1,1,0,0,0,0,0,1,0,...,0,1,0,0,0,0,0,0,0,posessor


In [14]:
import numpy as np

labels = np.array(df_clean['function'])
features= df_clean.drop('function', axis = 1)
feature_list = list(features.columns)
features = np.array(features)

Using Skicit-learn to split data into training and testing sets

In [15]:
from sklearn.model_selection import train_test_split
train_features, test_features, train_labels, test_labels = train_test_split(features, labels, test_size = 0.25,
                                                                           random_state = 42)

In [16]:
print('Training Features Shape:', train_features.shape)
print('Training Labels Shape:', train_labels.shape)
print('Testing Features Shape:', test_features.shape)
print('Testing Labels Shape:', test_labels.shape)

Training Features Shape: (2670, 180)
Training Labels Shape: (2670,)
Testing Features Shape: (891, 180)
Testing Labels Shape: (891,)


Creating a Gaussian Classifier, training the model

In [17]:
from sklearn.ensemble import RandomForestClassifier

clf=RandomForestClassifier(n_estimators=100)
clf.fit(train_features, train_labels)

y_pred=clf.predict(test_features)

Using scikit-learn metrics module to calculate accuracy score

In [18]:
from sklearn import metrics

print("Accuracy:",metrics.accuracy_score(test_labels, y_pred))

Accuracy: 0.43434343434343436


In [19]:
print(metrics.classification_report(test_labels, y_pred))

                      precision    recall  f1-score   support

           addressee       0.33      0.25      0.29        12
               agent       0.59      0.64      0.62       197
              aspect       0.00      0.00      0.00         9
         benefactive       0.21      0.17      0.19        23
             content       0.12      0.08      0.10        13
        counterparty       0.00      0.00      0.00         3
            effector       0.00      0.00      0.00         0
            endpoint       0.00      0.00      0.00         4
         experiencer       0.00      0.00      0.00        12
                goal       0.41      0.48      0.44        29
               means       0.00      0.00      0.00         4
          motivation       0.00      0.00      0.00         9
                nomn       0.00      0.00      0.00         0
             patient       0.48      0.45      0.46       181
               place       0.00      0.00      0.00        18
       

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


# Random Forest with a balanced dataset

## Deleting data that is not enough

In [22]:
df_clean['function'].value_counts()

agent                   784
patient                 683
subj_with_properties    374
theme                   368
posessor                329
time                    181
goal                    130
second_member            80
place                    76
benefactive              74
term                     64
experiencer              51
recipient                49
addressee                47
content                  45
stimulus                 42
starting_point           41
motivation               29
endpoint                 29
aspect                   22
counterparty             19
result                   18
means                    10
trajectory                7
effector                  4
Abbr                      2
thema                     1
nomn                      1
instrument                1
Name: function, dtype: int64

In [23]:
df_clean = df_clean.drop(df_clean[df_clean['function'].values == 'thema'].index)
df_clean = df_clean.drop(df_clean[df_clean['function'].values == 'nomn'].index)
df_clean = df_clean.drop(df_clean[df_clean['function'].values == 'Abbr'].index)
df_clean = df_clean.drop(df_clean[df_clean['function'].values == 'instrument'].index)
df_clean = df_clean.drop(df_clean[df_clean['function'].values == 'Infr'].index)
df_clean = df_clean.drop(df_clean[df_clean['function'].values == 'effector'].index)
df_clean = df_clean.drop(df_clean[df_clean['function'].values == 'trajectory'].index)
df_clean = df_clean.drop(df_clean[df_clean['function'].values == 'means'].index)
df_clean = df_clean.drop(df_clean[df_clean['function'].values == 'result'].index)
df_clean = df_clean.drop(df_clean[df_clean['function'].values == 'counterparty'].index)
df_clean = df_clean.drop(df_clean[df_clean['function'].values == 'aspect'].index)
df_clean = df_clean.drop(df_clean[df_clean['function'].values == 'endpoint'].index)
df_clean = df_clean.drop(df_clean[df_clean['function'].values == 'motivation'].index)
df_clean = df_clean.drop(df_clean[df_clean['function'].values == 'starting_point'].index)

In [24]:
df_clean['function'].value_counts()

agent                   784
patient                 683
subj_with_properties    374
theme                   368
posessor                329
time                    181
goal                    130
second_member            80
place                    76
benefactive              74
term                     64
experiencer              51
recipient                49
addressee                47
content                  45
stimulus                 42
Name: function, dtype: int64

In [25]:
df_clean.shape

(3377, 181)

## Balancing the number of agents and patients

In [26]:
agent_df = df_clean[df_clean['function'].values == 'agent']
agent_df.head(5)
agent_df.shape

(784, 181)

In [27]:
agent_df = agent_df.sample(frac=1)

In [28]:
agent_df = agent_df.iloc[:390]
agent_df.shape

(390, 181)

In [29]:
patient_df = df_clean[df_clean['function'].values == 'patient']
patient_df.shape

(683, 181)

In [30]:
patient_df = patient_df.sample(frac=1)
patient_df = patient_df.iloc[:344]
patient_df.shape

(344, 181)

In [31]:
df_clean = df_clean.drop(df_clean[df_clean['function'].values == 'agent'].index)
df_clean = df_clean.drop(df_clean[df_clean['function'].values == 'patient'].index)
df_clean.shape

(1910, 181)

In [32]:
df_clean = df_clean.drop(df_clean[df_clean['function'].values == 'thema'].index)
df_clean = df_clean.drop(df_clean[df_clean['function'].values == 'nomn'].index)
df_clean = df_clean.drop(df_clean[df_clean['function'].values == 'Abbr'].index)
df_clean = df_clean.drop(df_clean[df_clean['function'].values == 'instrument'].index)
df_clean = df_clean.drop(df_clean[df_clean['function'].values == 'Infr'].index)
df_clean = df_clean.drop(df_clean[df_clean['function'].values == 'effector'].index)
df_clean = df_clean.drop(df_clean[df_clean['function'].values == 'trajectory'].index)
df_clean = df_clean.drop(df_clean[df_clean['function'].values == 'means'].index)
df_clean = df_clean.drop(df_clean[df_clean['function'].values == 'result'].index)
df_clean = df_clean.drop(df_clean[df_clean['function'].values == 'counterparty'].index)
df_clean = df_clean.drop(df_clean[df_clean['function'].values == 'aspect'].index)
df_clean = df_clean.drop(df_clean[df_clean['function'].values == 'endpoint'].index)
df_clean = df_clean.drop(df_clean[df_clean['function'].values == 'motivation'].index)
df_clean = df_clean.drop(df_clean[df_clean['function'].values == 'starting_point'].index)

finaldf = pd.concat([df_clean, agent_df, patient_df])
finaldf.shape

(2644, 181)

In [33]:
finaldf['function'].value_counts()

agent                   390
subj_with_properties    374
theme                   368
patient                 344
posessor                329
time                    181
goal                    130
second_member            80
place                    76
benefactive              74
term                     64
experiencer              51
recipient                49
addressee                47
content                  45
stimulus                 42
Name: function, dtype: int64

In [34]:
finaldf = finaldf.sample(frac=1)

## Training the model

In [35]:
import numpy as np

flabels = np.array(finaldf['function'])
ffeatures= finaldf.drop('function', axis = 1)
feature_list = list(ffeatures.columns)
features = np.array(ffeatures)

In [36]:
from sklearn.model_selection import train_test_split

train_features, test_features, train_labels, test_labels = train_test_split(ffeatures, flabels, test_size = 0.25,
                                                                           random_state = 42)

In [37]:
print('Training Features Shape:', train_features.shape)
print('Training Labels Shape:', train_labels.shape)
print('Testing Features Shape:', test_features.shape)
print('Testing Labels Shape:', test_labels.shape)

Training Features Shape: (1983, 180)
Training Labels Shape: (1983,)
Testing Features Shape: (661, 180)
Testing Labels Shape: (661,)


In [38]:
from sklearn.ensemble import RandomForestClassifier

clf=RandomForestClassifier(n_estimators=100)

clf.fit(train_features, train_labels)
y_pred=clf.predict(test_features)

In [39]:
from sklearn import metrics
print("Accuracy:",metrics.accuracy_score(test_labels, y_pred))

Accuracy: 0.45537065052950076


In [40]:
print(metrics.classification_report(test_labels, y_pred))

                      precision    recall  f1-score   support

           addressee       0.43      0.21      0.29        14
               agent       0.53      0.63      0.57        97
         benefactive       0.35      0.27      0.31        22
             content       0.20      0.09      0.13        11
         experiencer       0.00      0.00      0.00        14
                goal       0.36      0.47      0.41        30
             patient       0.42      0.37      0.39        87
               place       0.33      0.08      0.12        26
            posessor       0.46      0.54      0.50        81
           recipient       0.38      0.73      0.50        11
       second_member       0.18      0.18      0.18        17
            stimulus       0.00      0.00      0.00        11
subj_with_properties       0.56      0.55      0.55       102
                term       0.55      0.40      0.46        15
               theme       0.39      0.49      0.43        84
       

# Using PyCM to evaluate results

In [21]:
from pycm import *

def plot_confusion_matrix(cm, normalize=False, title='Confusion matrix', cmap=plt.cm.Blues):
    plt_cm = []
    for i in cm.classes :
        row=[]
        for j in cm.classes:
            row.append(cm.table[i][j])
        plt_cm.append(row)
    plt_cm = np.array(plt_cm)
    if normalize:
        plt_cm = plt_cm.astype('float') / plt_cm.sum(axis=1)[:, np.newaxis]     
    plt.imshow(plt_cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(cm.classes))
    plt.xticks(tick_marks, cm.classes, rotation=45)
    plt.yticks(tick_marks, cm.classes)

    fmt = '.2f' if normalize else 'd'
    thresh = plt_cm.max() / 2.
    for i, j in itertools.product(range(plt_cm.shape[0]), range(plt_cm.shape[1])):
        plt.text(j, i, format(plt_cm[i, j], fmt),
                horizontalalignment="center",
                color="white" if plt_cm[i, j] > thresh else "black")

    plt.tight_layout()
    plt.ylabel('Actual')
    plt.xlabel('Prediction')

In [46]:
clf.fit(train_features, train_labels)
y_predicted = clf.predict(test_features)

cm = ConfusionMatrix(actual_vector=test_labels, predict_vector=y_pred)

print("[INFO] Clases")
print(cm.classes)
print(cm.table)
print(cm)

[INFO] Clases
['addressee', 'agent', 'benefactive', 'content', 'experiencer', 'goal', 'patient', 'place', 'posessor', 'recipient', 'second_member', 'stimulus', 'subj_with_properties', 'term', 'theme', 'time']
{'addressee': {'addressee': 3, 'agent': 0, 'benefactive': 1, 'content': 0, 'experiencer': 0, 'goal': 2, 'patient': 0, 'place': 0, 'posessor': 1, 'recipient': 4, 'second_member': 0, 'stimulus': 1, 'subj_with_properties': 0, 'term': 0, 'theme': 1, 'time': 1}, 'agent': {'addressee': 0, 'agent': 61, 'benefactive': 0, 'content': 0, 'experiencer': 2, 'goal': 2, 'patient': 13, 'place': 0, 'posessor': 3, 'recipient': 0, 'second_member': 1, 'stimulus': 0, 'subj_with_properties': 12, 'term': 0, 'theme': 3, 'time': 0}, 'benefactive': {'addressee': 1, 'agent': 2, 'benefactive': 6, 'content': 0, 'experiencer': 0, 'goal': 2, 'patient': 1, 'place': 0, 'posessor': 4, 'recipient': 3, 'second_member': 0, 'stimulus': 0, 'subj_with_properties': 0, 'term': 0, 'theme': 3, 'time': 0}, 'content': {'addre

If confusion matrix has too many zeros (sparse matrix) you can set `sparse` flag to True in printing functions otherwise by using save_csv method to save the confusion matrix in csv format you'll have better demonstration.


In [47]:
from sklearn import svm
clf = svm.SVC(kernel='linear', C=1,gamma=1)

In [48]:
cm.save_csv(os.path.join("/Users/ciwwwnd/Desktop/論文","cm1_filtered2"))

{'Status': True, 'Message': '/Users/ciwwwnd/Desktop/論文/cm1_filtered2.csv'}