In [1]:
import pandas as pd
import torch
import torch.utils.data as tud
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from collections import Counter, defaultdict
import operator
import os, math
import numpy as np
import random
import copy
import spacy
nlp = spacy.load("en_core_web_sm")



In [2]:
cd 'Desktop/aml/ml_nlp_conflict/'

/Users/crismacg/Desktop/aml/ml_nlp_conflict


In [3]:
data = pd.read_csv('data/acled_all.csv')
VOCAB_SIZE = 5000
seed = 30255
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)
if torch.cuda.is_available():
    torch.cuda.manual_seed(seed)

  interactivity=interactivity, compiler=compiler, result=result)


In [4]:
def remove_stopwords(l):
    STOP  = ['ourselves', 'hers', 'between', 'yourself', 'but', 'again', 'there', 'about', 'once', 'during', 'out', 'very', 'having', 'with', 'they', 'own', 'an', 'be', 'some', 'for', 'do', 'its', 'yours', 'such', 'into', 'of', 'most', 'itself', 'other', 'off', 'is', 's', 'am', 'or', 'who', 'as', 'from', 'him', 'each', 'the', 'themselves', 'until', 'below', 'are', 'we', 'these', 'your', 'his', 'through', 'don', 'nor', 'me', 'were', 'her', 'more', 'himself', 'this', 'down', 'should', 'our', 'their', 'while', 'above', 'both', 'up', 'to', 'ours', 'had', 'she', 'all', 'no', 'when', 'at', 'any', 'before', 'them', 'same', 'and', 'been', 'have', 'in', 'will', 'on', 'does', 'yourselves', 'then', 'that', 'because', 'what', 'over', 'why', 'so', 'can', 'did', 'not', 'now', 'under', 'he', 'you', 'herself', 'has', 'just', 'where', 'too', 'only', 'myself', 'which', 'those', 'i', 'after', 'few', 'whom', 't', 'being', 'if', 'theirs', 'my', 'against', 'a', 'by', 'doing', 'it', 'how', 'further', 'was', 'here', 'than']
    l_clean = []
    for i in l:
        if i not in STOP:
            l_clean.append(i)
    return l_clean


def word_tokenize(s, clean = False):
    if type(s) != str:
        return None
    split_l = s.lower().replace('.', '').replace(',', '').replace(';', '').replace(':', '').replace('!', '').replace('?', '').split()
    if clean:
        clean_l = remove_stopwords(split_l)
        return clean_l

    return split_l

class Model:
    def __init__(self, data, clean = False):
        # Vocabulary is a set that stores every word seen in the
        # training data
        self.vocab_counts = Counter([word for content in data
                              for word in word_tokenize(content, clean) if word]
                            ).most_common(VOCAB_SIZE-1)
        # word to index mapping
        self.word_to_idx = {k[0]: v+1 for v, k in
                            enumerate(self.vocab_counts)}
        # all the unknown words will be mapped to index 0
        self.word_to_idx["UNK"] = 0
        self.vocab = set(self.word_to_idx.keys())

        self.verb_counts = Counter([token.lemma_ for content in data
                              for token in nlp(content) if token.pos_ == "VERB"]
                            ).most_common(VOCAB_SIZE-1)

        self.noun_counts = Counter([chunk.text for content in data
                              for chunk in nlp(content).noun_chunks]
                            ).most_common(VOCAB_SIZE-1)


class TextClassificationDataset(tud.Dataset):
    '''
    PyTorch provides a common dataset interface.
    See https://pytorch.org/tutorials/beginner/data_loading_tutorial.html
    The dataset encodes documents into indices.
    With the PyTorch dataloader, you can easily get batched data for
    training and evaluation.
    '''
    def __init__(self, word_to_idx, data):

        self.data = data
        self.word_to_idx = word_to_idx
        self.vocab_size = VOCAB_SIZE

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        item = np.zeros(self.vocab_size)
        item = torch.from_numpy(item)
        for word in word_tokenize(self.data[idx]):
            item[self.word_to_idx.get(word, 0)] += 1
        return item

In [5]:
# ******************************************************************************
# Get list of verbs for all data
# ******************************************************************************
data.dropna(subset=['NOTES'], inplace = True)
# x_wstopwords = Model(data['NOTES'], False)
# verb_count_complete = x_wstopwords.verb_counts

# verb_count_complete # This was exported to a csv and then manually classified. Then we imported it back 

In [6]:
# ******************************************************************************
# Subset my data
# ******************************************************************************
mlist = [26, 27, 36, 37, 56, 57]
data_filtered = data.loc[data['INTERACTION'].isin(mlist)]
data_filtered.shape

final_data = data_filtered.loc[data_filtered['NOTES'].str.len() > 100 ]

In [7]:
# ******************************************************************************
# Getting the nouns for a subset of type of events in which we know civilians are attacked
# ******************************************************************************

# data_filt2 = data_filtered[data_filtered['ACTOR2'].str.contains('Civilians') == True]
# z_wstopwords = Model(data_filt2['NOTES'], False)
#
# noun_count_complete = z_wstopwords.noun_counts
# noun_count_filtered = noun_count_complete
#

# ******************************************************************************
# Turn into list of verbs
# ******************************************************************************

# Remove verbs we're not interested in

# vbs_complete = [word for word, count in verb_count_complete]
# vbs_filtered = [word for word, count in verb_count_filtered]
#
# noun_filtered = [word for word, count in noun_count_filtered]
# print(vbs_complete)
# print(vbs_filtered)
# print(noun_count_filtered)

In [8]:
# ******************************************************************************
# Read processed list of verbs
# ******************************************************************************
vbs = pd.read_csv('data/verbs.csv')
vbs.head()

vbs['clean_list'] = vbs['List'].str.replace('(', '')
vbs['clean_list'] = vbs['clean_list'].str.replace('[', '')
vbs['clean_list'] = vbs['clean_list'].str.replace(']', '')
vbs['clean_list'] = vbs['clean_list'].str.replace("'", '')
vbs['clean_list'] = vbs['clean_list'].str.strip()

vbs = vbs[vbs['Filter'] == 1]
vbs.shape
list_vbs = vbs.clean_list
list_vbs[:15]

1        kill
3      injure
4       stage
6     protest
8      attack
9      demand
10       hold
11      shell
12      wound
14      shoot
15      clash
16       lead
20       fire
22     arrest
24        hit
Name: clean_list, dtype: object

In [57]:
# ******************************************************************************
# Create automatic labelling.
# ******************************************************************************
final_data['tagged_rel'] = ''
num_v = len(list_vbs)
for cnt, vb in enumerate(list_vbs):
    final_data['tagged_rel'] = final_data.apply(lambda x: vb if (vb in x['NOTES'] and x['tagged_rel'] == '') else x['tagged_rel'], axis =1)
    print(str(cnt) + '/' + str(num_v))
print('end_loop')

final_data_subset = final_data[final_data['tagged_rel'] != '']
print('we lose {} observations by filtering through our classification of relationship, which is equivalent to {} %'.format(final_data.shape[0] - final_data_subset.shape[0], ((final_data.shape[0] - final_data_subset.shape[0])/final_data.shape[0])*100))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  after removing the cwd from sys.path.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  import sys


0/68
1/68
2/68
3/68
4/68
5/68
6/68
7/68
8/68
9/68
10/68
11/68
12/68
13/68
14/68
15/68
16/68
17/68
18/68
19/68
20/68
21/68
22/68
23/68
24/68
25/68
26/68
27/68
28/68
29/68
30/68
31/68
32/68
33/68
34/68
35/68
36/68
37/68
38/68
39/68
40/68
41/68
42/68
43/68
44/68
45/68
46/68
47/68
48/68
49/68
50/68
51/68
52/68
53/68
54/68
55/68
56/68
57/68
58/68
59/68
60/68
61/68
62/68
63/68
64/68
65/68
66/68
67/68
end_loop
we lose 2445 observations by filtering through our classification of relationship, which is equivalent to 5.99529204060615 %


In [11]:
# ******************************************************************************
# Replace part of strings in the representation.
# ******************************************************************************

# 1. Replace aggresors *********************************************************
# -------- Get actors from Actors column 

final_data['clean_actor'] = final_data['ACTOR1'].str.split('(').str[0]
final_data['clean_actor'] = final_data['clean_actor'].str.replace('civilians', '')
final_data['temp_clean_actor1'] = final_data['clean_actor'].str.split(':').str[0].str.strip().apply(lambda x: ' ' + str(x) + ' ')
final_data['temp_clean_actor2'] = final_data['clean_actor'].str.split(':').str[1].str.strip().apply(lambda x: ' ' + str(x) + ' ')
uniq_actors = list(final_data['temp_clean_actor1'].str.lower().dropna().unique()) + list(final_data['temp_clean_actor2'].str.lower().dropna().unique())

# Getting those that are unidentified actors & different ways of spelling them
list_unident_actors = final_data['NOTES'].str.split('unidentified armed').str[1].str.split(' ').str[1].str.replace('.', '').str.replace(',', '').str.replace(';', '').str.replace("'s", "").str.lower().dropna().unique()
unident_actors_l = list_unident_actors[((list_unident_actors != 'and') & (list_unident_actors != 'in') & (list_unident_actors != 'on')& (list_unident_actors != 'nan'))]
unident_actors = list(map(lambda x: 'unidentified armed '+ str(x), unident_actors_l))
unident_actors_v2 = list(map(lambda x: 'armed '+ str(x), unident_actors_l))
unident_actors_v3 = list(map(lambda x: 'unidentified '+ str(x), unident_actors_l))
unident_actors_v4 = list(map(lambda x: 'unknown armed '+ str(x), unident_actors_l))
unident_actors_v5 = list(map(lambda x: 'unknown '+ str(x), unident_actors_l))


list_aggrs = uniq_actors + unident_actors + unident_actors_v2 + unident_actors_v3 + unident_actors_v4 + unident_actors_v5+ ['fighter','fighters','anti-houti forces']
num = len(list_aggrs)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  # Remove the CWD from sys.path while we load stuff.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.

In [12]:
# --------
final_data['tagged_str'] = final_data['NOTES']
for cnt, token in enumerate(list_aggrs):
    if len(token.strip()) >= 3: 
        if 'civilians'not in token: 
            final_data['tagged_str'] = final_data['tagged_str'].str.lower().str.replace(token, 'aggresor')
    print(str(cnt) + '/' + str(num))
print('end loop')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


0/1281
1/1281
2/1281
3/1281
4/1281
5/1281
6/1281
7/1281
8/1281
9/1281
10/1281
11/1281
12/1281
13/1281
14/1281
15/1281
16/1281
17/1281
18/1281
19/1281
20/1281
21/1281
22/1281
23/1281
24/1281
25/1281
26/1281
27/1281
28/1281
29/1281
30/1281
31/1281
32/1281
33/1281
34/1281
35/1281
36/1281
37/1281
38/1281
39/1281
40/1281
41/1281
42/1281
43/1281
44/1281
45/1281
46/1281
47/1281
48/1281
49/1281
50/1281
51/1281
52/1281
53/1281
54/1281
55/1281
56/1281
57/1281
58/1281
59/1281
60/1281
61/1281
62/1281
63/1281
64/1281
65/1281
66/1281
67/1281
68/1281
69/1281
70/1281
71/1281
72/1281
73/1281
74/1281
75/1281
76/1281
77/1281
78/1281
79/1281
80/1281
81/1281
82/1281
83/1281
84/1281
85/1281
86/1281
87/1281
88/1281
89/1281
90/1281
91/1281
92/1281
93/1281
94/1281
95/1281
96/1281
97/1281
98/1281
99/1281
100/1281
101/1281
102/1281
103/1281
104/1281
105/1281
106/1281
107/1281
108/1281
109/1281
110/1281
111/1281
112/1281
113/1281
114/1281
115/1281
116/1281
117/1281
118/1281
119/1281
120/1281
121/1281
122/1281
123

924/1281
925/1281
926/1281
927/1281
928/1281
929/1281
930/1281
931/1281
932/1281
933/1281
934/1281
935/1281
936/1281
937/1281
938/1281
939/1281
940/1281
941/1281
942/1281
943/1281
944/1281
945/1281
946/1281
947/1281
948/1281
949/1281
950/1281
951/1281
952/1281
953/1281
954/1281
955/1281
956/1281
957/1281
958/1281
959/1281
960/1281
961/1281
962/1281
963/1281
964/1281
965/1281
966/1281
967/1281
968/1281
969/1281
970/1281
971/1281
972/1281
973/1281
974/1281
975/1281
976/1281
977/1281
978/1281
979/1281
980/1281
981/1281
982/1281
983/1281
984/1281
985/1281
986/1281
987/1281
988/1281
989/1281
990/1281
991/1281
992/1281
993/1281
994/1281
995/1281
996/1281
997/1281
998/1281
999/1281
1000/1281
1001/1281
1002/1281
1003/1281
1004/1281
1005/1281
1006/1281
1007/1281
1008/1281
1009/1281
1010/1281
1011/1281
1012/1281
1013/1281
1014/1281
1015/1281
1016/1281
1017/1281
1018/1281
1019/1281
1020/1281
1021/1281
1022/1281
1023/1281
1024/1281
1025/1281
1026/1281
1027/1281
1028/1281
1029/1281
1030/1281
1031/1

In [13]:
final_data['tagged_str'] = final_data['tagged_str'].str.lower().str.replace('aggresors', 'AGGRESOR')
final_data['tagged_str'] = final_data['tagged_str'].str.lower().str.replace('aggresor', 'AGGRESOR')


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [14]:
x = final_data[final_data['tagged_str'].str.contains('AGGRESOR')]
x.shape # These are for how many we did NOT get the tag

(21597, 35)

In [15]:
final_data_subset = final_data[final_data['tagged_str'].str.contains('AGGRESOR')]
print('we lose {} observations by replacing aggresor in phrase, which is equivalent to {} %'.format(final_data.shape[0] - final_data_subset.shape[0], ((final_data.shape[0] - final_data_subset.shape[0])/final_data.shape[0])*100))

we lose 19185 observations by replacing aggresor in phrase, which is equivalent to 47.04281300573783 %


In [16]:
# 2. Replace civilians *********************************************************
list_civs = ['passengers', 'civilians','residents' 'passenger', 'civilian', 'family', 'families', 'people', 'tourist','tourists' 'villagers', 'women', 'children', 'citizen', 'citizens', 'population']

for cnt, token in enumerate(list_civs):
    if token:
        final_data['tagged_str'] = final_data['tagged_str'].str.lower().str.replace(token,'victim')
        print(str(cnt) + '/' + str(len(list_civs)))
print('end loop')


0/14
1/14


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


2/14
3/14
4/14
5/14
6/14
7/14
8/14
9/14
10/14
11/14
12/14
13/14
end loop


In [33]:
final_data['tagged_str'] = final_data['tagged_str'].str.lower().str.replace('victims', 'VICTIM')
final_data['tagged_str'] = final_data['tagged_str'].str.lower().str.replace('victim', 'VICTIM')
final_data['tagged_str'] = final_data['tagged_str'].str.replace('aggresor', 'AGGRESOR')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [34]:
y = final_data[~final_data['tagged_str'].str.contains('VICTIM')]
y.shape # These are for how many we did NOT get the tag

(25375, 35)

In [35]:
final_data_subset = final_data[final_data['tagged_str'].str.contains('VICTIM')]
print('we lose {} observations by replacing victim in phrase, which is equivalent to {} %'.format(final_data.shape[0] - final_data_subset.shape[0], ((final_data.shape[0] - final_data_subset.shape[0])/final_data.shape[0])*100))

we lose 25375 observations by replacing victim in phrase, which is equivalent to 62.22107792653622 %


In [36]:
# 3. Subset our data for this case *********************************************

In [50]:
#Tags that have both aggresor and victim labels 
final_data_subset = final_data[((final_data['tagged_str'].str.contains('AGGRESOR')) & (final_data['tagged_str'].str.contains('VICTIM')))]
print('{} dont have victim & aggressor, which is equivalent to {} %'.format(final_data.shape[0] - final_data_subset.shape[0], ((final_data.shape[0] - final_data_subset.shape[0])/final_data.shape[0])*100))

33706 dont have victim & aggressor, which is equivalent to 82.64920798391448 %


In [51]:
final_data_subset = final_data[((final_data['tagged_str'].str.contains('AGGRESOR')) | (final_data['tagged_str'].str.contains('VICTIM')))]
print('{} dont have victim OR aggressor, which is equivalent to {} %'.format(final_data.shape[0] - final_data_subset.shape[0], ((final_data.shape[0] - final_data_subset.shape[0])/final_data.shape[0])*100))

10854 dont have victim OR aggressor, which is equivalent to 26.61468294835957 %


In [58]:
final_data_subset = final_data[(final_data['tagged_rel'] != '' ) & ((final_data['tagged_str'].str.contains('AGGRESOR')) | (final_data['tagged_str'].str.contains('VICTIM')))]
print('{} dont have victim OR aggressor, NOR relationship, which is equivalent to {} %'.format(final_data.shape[0] - final_data_subset.shape[0], ((final_data.shape[0] - final_data_subset.shape[0])/final_data.shape[0])*100))
final = final_data_subset
print('Final Dataset for training has {} observations'.format(final.shape[0]))

12325 dont have victim OR aggressor, NOR relationship, which is equivalent to 30.221666421460448 %
Final Dataset for training has 28457 observations


In [59]:
final.to_csv('Data/data_prepr.csv')


In [46]:
final_v2 = final_data[(final_data['tagged_rel'] != '' ) & ((final_data['tagged_str'].str.contains('AGGRESOR')) & (final_data['tagged_str'].str.contains('VICTIM')))]
print('{} dont have victim AND aggressor, NOR relationship, which is equivalent to {} %'.format(final_data.shape[0] - final_v2.shape[0], ((final_data.shape[0] - final_data_v2.shape[0])/final_data.shape[0])*100))

final_v2.to_csv('Data/data_prepr_v2.csv')

