In [3]:
import pandas as pd
import torch
import torch.utils.data as tud
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from collections import Counter, defaultdict
import operator
import os, math
import numpy as np
import random
import copy
import spacy
nlp = spacy.load("en_core_web_sm")

In [9]:
data = pd.read_csv('data/050319_acled_all.csv')
VOCAB_SIZE = 5000
seed = 30255
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)
if torch.cuda.is_available():
    torch.cuda.manual_seed(seed)

In [10]:
data.shape

(509157, 31)

In [11]:
def remove_stopwords(l):
    STOP  = ['ourselves', 'hers', 'between', 'yourself', 'but', 'again', 'there', 'about', 'once', 'during', 'out', 'very', 'having', 'with', 'they', 'own', 'an', 'be', 'some', 'for', 'do', 'its', 'yours', 'such', 'into', 'of', 'most', 'itself', 'other', 'off', 'is', 's', 'am', 'or', 'who', 'as', 'from', 'him', 'each', 'the', 'themselves', 'until', 'below', 'are', 'we', 'these', 'your', 'his', 'through', 'don', 'nor', 'me', 'were', 'her', 'more', 'himself', 'this', 'down', 'should', 'our', 'their', 'while', 'above', 'both', 'up', 'to', 'ours', 'had', 'she', 'all', 'no', 'when', 'at', 'any', 'before', 'them', 'same', 'and', 'been', 'have', 'in', 'will', 'on', 'does', 'yourselves', 'then', 'that', 'because', 'what', 'over', 'why', 'so', 'can', 'did', 'not', 'now', 'under', 'he', 'you', 'herself', 'has', 'just', 'where', 'too', 'only', 'myself', 'which', 'those', 'i', 'after', 'few', 'whom', 't', 'being', 'if', 'theirs', 'my', 'against', 'a', 'by', 'doing', 'it', 'how', 'further', 'was', 'here', 'than']
    l_clean = []
    for i in l:
        if i not in STOP:
            l_clean.append(i)
    return l_clean


def word_tokenize(s, clean = False):
    if type(s) != str:
        return None
    split_l = s.lower().replace('.', '').replace(',', '').replace(';', '').replace(':', '').replace('!', '').replace('?', '').split()
    if clean:
        clean_l = remove_stopwords(split_l)
        return clean_l

    return split_l

class Model:
    def __init__(self, data, clean = False):
        # Vocabulary is a set that stores every word seen in the
        # training data
        self.vocab_counts = Counter([word for content in data
                              for word in word_tokenize(content, clean) if word]
                            ).most_common(VOCAB_SIZE-1)
        # word to index mapping
        self.word_to_idx = {k[0]: v+1 for v, k in
                            enumerate(self.vocab_counts)}
        # all the unknown words will be mapped to index 0
        self.word_to_idx["UNK"] = 0
        self.vocab = set(self.word_to_idx.keys())

        self.verb_counts = Counter([token.lemma_ for content in data
                              for token in nlp(content) if token.pos_ == "VERB"]
                            ).most_common(VOCAB_SIZE-1)

        self.noun_counts = Counter([chunk.text for content in data
                              for chunk in nlp(content).noun_chunks]
                            ).most_common(VOCAB_SIZE-1)


class TextClassificationDataset(tud.Dataset):
    '''
    PyTorch provides a common dataset interface.
    See https://pytorch.org/tutorials/beginner/data_loading_tutorial.html
    The dataset encodes documents into indices.
    With the PyTorch dataloader, you can easily get batched data for
    training and evaluation.
    '''
    def __init__(self, word_to_idx, data):

        self.data = data
        self.word_to_idx = word_to_idx
        self.vocab_size = VOCAB_SIZE

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        item = np.zeros(self.vocab_size)
        item = torch.from_numpy(item)
        for word in word_tokenize(self.data[idx]):
            item[self.word_to_idx.get(word, 0)] += 1
        return item

In [13]:
# ******************************************************************************
# Get list of verbs for all data
# ******************************************************************************
data.dropna(subset=['notes'], inplace = True)
x_wstopwords = Model(data['NOTES'], False)
verb_count_complete = x_wstopwords.verb_counts

verb_count_complete # This was exported to a csv and then manually classified. Then we imported it back 

In [19]:
# ******************************************************************************
# Subset my data
# ******************************************************************************
data_filtered = data.loc[data['notes'].str.len() > 100 ]

mlist = [26, 27, 36, 37, 56, 57]
final_data = data_filtered.loc[data_filtered['interaction'].isin(mlist)]
final_data.shape

(50671, 31)

In [22]:
# ******************************************************************************
# Read processed list of verbs
# ******************************************************************************
vbs = pd.read_csv('data/verbs.csv')
vbs.head()

vbs['clean_list'] = vbs['List'].str.replace('(', '')
vbs['clean_list'] = vbs['clean_list'].str.replace('[', '')
vbs['clean_list'] = vbs['clean_list'].str.replace(']', '')
vbs['clean_list'] = vbs['clean_list'].str.replace("'", '')
vbs['clean_list'] = vbs['clean_list'].str.strip()

vbs = vbs[vbs['Filter'] == 1]
vbs.shape
list_vbs = vbs.clean_list
list_vbs[:15]

1        kill
3      injure
4       stage
6     protest
8      attack
9      demand
10       hold
11      shell
12      wound
14      shoot
15      clash
16       lead
20       fire
22     arrest
24        hit
Name: clean_list, dtype: object

In [24]:
# ******************************************************************************
# Create automatic labelling.
# ******************************************************************************
final_data['tagged_rel'] = ''
num_v = len(list_vbs)
for cnt, vb in enumerate(list_vbs):
    final_data['tagged_rel'] = final_data.apply(lambda x: vb if (vb in x['notes'] and x['tagged_rel'] == '') else x['tagged_rel'], axis =1)
    print(str(cnt) + '/' + str(num_v))
print('end_loop')

final_data_subset = final_data[final_data['tagged_rel'] != '']
print('we lose {} observations by filtering through our classification of relationship, which is equivalent to {} %'.format(final_data.shape[0] - final_data_subset.shape[0], ((final_data.shape[0] - final_data_subset.shape[0])/final_data.shape[0])*100))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  after removing the cwd from sys.path.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  import sys


0/68
1/68
2/68
3/68
4/68
5/68
6/68
7/68
8/68
9/68
10/68
11/68
12/68
13/68
14/68
15/68
16/68
17/68
18/68
19/68
20/68
21/68
22/68
23/68
24/68
25/68
26/68
27/68
28/68
29/68
30/68
31/68
32/68
33/68
34/68
35/68
36/68
37/68
38/68
39/68
40/68
41/68
42/68
43/68
44/68
45/68
46/68
47/68
48/68
49/68
50/68
51/68
52/68
53/68
54/68
55/68
56/68
57/68
58/68
59/68
60/68
61/68
62/68
63/68
64/68
65/68
66/68
67/68
end_loop
we lose 3028 observations by filtering through our classification of relationship, which is equivalent to 5.975804700913737 %


In [26]:
# ******************************************************************************
# Replace part of strings in the representation. - Replace actors 
# ******************************************************************************

# -------- Get actors from Actors column 

final_data['clean_actor'] = final_data['actor1'].str.split('(').str[0]
final_data['clean_actor'] = final_data['clean_actor'].str.replace('civilians', '')
final_data['temp_clean_actor1'] = final_data['clean_actor'].str.split(':').str[0].str.strip().apply(lambda x: ' ' + str(x) + ' ')
final_data['temp_clean_actor2'] = final_data['clean_actor'].str.split(':').str[1].str.strip().apply(lambda x: ' ' + str(x) + ' ')
uniq_actors = list(final_data['temp_clean_actor1'].str.lower().dropna().unique()) + list(final_data['temp_clean_actor2'].str.lower().dropna().unique())

# Getting those that are unidentified actors & different ways of spelling them
list_unident_actors = final_data['notes'].str.split('unidentified armed').str[1].str.split(' ').str[1].str.replace('.', '').str.replace(',', '').str.replace(';', '').str.replace("'s", "").str.lower().dropna().unique()
unident_actors_l = list_unident_actors[((list_unident_actors != 'and') & (list_unident_actors != 'in') & (list_unident_actors != 'on')& (list_unident_actors != 'nan'))]
unident_actors = list(map(lambda x: 'unidentified armed '+ str(x), unident_actors_l))
unident_actors_v2 = list(map(lambda x: 'armed '+ str(x), unident_actors_l))
unident_actors_v3 = list(map(lambda x: 'unidentified '+ str(x), unident_actors_l))
unident_actors_v4 = list(map(lambda x: 'unknown armed '+ str(x), unident_actors_l))
unident_actors_v5 = list(map(lambda x: 'unknown '+ str(x), unident_actors_l))


list_aggrs = uniq_actors + unident_actors + unident_actors_v2 + unident_actors_v3 + unident_actors_v4 + unident_actors_v5+ ['fighter','fighters','anti-houti forces']
num = len(list_aggrs)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  # Remove the CWD from sys.path while we load stuff.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.

In [27]:
final_data['tagged_str'] = final_data['notes']
for cnt, token in enumerate(list_aggrs):
    if len(token.strip()) >= 3: 
        if 'civilians'not in token: 
            final_data['tagged_str'] = final_data['tagged_str'].str.lower().str.replace(token, 'aggresor')
    print(str(cnt) + '/' + str(num))
print('end loop')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


0/1380
1/1380
2/1380
3/1380
4/1380
5/1380
6/1380
7/1380
8/1380
9/1380
10/1380
11/1380
12/1380
13/1380
14/1380
15/1380
16/1380
17/1380
18/1380
19/1380
20/1380
21/1380
22/1380
23/1380
24/1380
25/1380
26/1380
27/1380
28/1380
29/1380
30/1380
31/1380
32/1380
33/1380
34/1380
35/1380
36/1380
37/1380
38/1380
39/1380
40/1380
41/1380
42/1380
43/1380
44/1380
45/1380
46/1380
47/1380
48/1380
49/1380
50/1380
51/1380
52/1380
53/1380
54/1380
55/1380
56/1380
57/1380
58/1380
59/1380
60/1380
61/1380
62/1380
63/1380
64/1380
65/1380
66/1380
67/1380
68/1380
69/1380
70/1380
71/1380
72/1380
73/1380
74/1380
75/1380
76/1380
77/1380
78/1380
79/1380
80/1380
81/1380
82/1380
83/1380
84/1380
85/1380
86/1380
87/1380
88/1380
89/1380
90/1380
91/1380
92/1380
93/1380
94/1380
95/1380
96/1380
97/1380
98/1380
99/1380
100/1380
101/1380
102/1380
103/1380
104/1380
105/1380
106/1380
107/1380
108/1380
109/1380
110/1380
111/1380
112/1380
113/1380
114/1380
115/1380
116/1380
117/1380
118/1380
119/1380
120/1380
121/1380
122/1380
123

923/1380
924/1380
925/1380
926/1380
927/1380
928/1380
929/1380
930/1380
931/1380
932/1380
933/1380
934/1380
935/1380
936/1380
937/1380
938/1380
939/1380
940/1380
941/1380
942/1380
943/1380
944/1380
945/1380
946/1380
947/1380
948/1380
949/1380
950/1380
951/1380
952/1380
953/1380
954/1380
955/1380
956/1380
957/1380
958/1380
959/1380
960/1380
961/1380
962/1380
963/1380
964/1380
965/1380
966/1380
967/1380
968/1380
969/1380
970/1380
971/1380
972/1380
973/1380
974/1380
975/1380
976/1380
977/1380
978/1380
979/1380
980/1380
981/1380
982/1380
983/1380
984/1380
985/1380
986/1380
987/1380
988/1380
989/1380
990/1380
991/1380
992/1380
993/1380
994/1380
995/1380
996/1380
997/1380
998/1380
999/1380
1000/1380
1001/1380
1002/1380
1003/1380
1004/1380
1005/1380
1006/1380
1007/1380
1008/1380
1009/1380
1010/1380
1011/1380
1012/1380
1013/1380
1014/1380
1015/1380
1016/1380
1017/1380
1018/1380
1019/1380
1020/1380
1021/1380
1022/1380
1023/1380
1024/1380
1025/1380
1026/1380
1027/1380
1028/1380
1029/1380
1030/13

In [28]:
# Homologize 
final_data['tagged_str'] = final_data['tagged_str'].str.lower().str.replace('aggresors', 'AGGRESOR')
final_data['tagged_str'] = final_data['tagged_str'].str.lower().str.replace('aggresor', 'AGGRESOR')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [30]:
final_data_subset = final_data[final_data['tagged_str'].str.contains('AGGRESOR')]
print('we lose {} observations by replacing aggresor in phrase, which is equivalent to {} %'.format(final_data.shape[0] - final_data_subset.shape[0], ((final_data.shape[0] - final_data_subset.shape[0])/final_data.shape[0])*100))

we lose 24460 observations by replacing aggresor in phrase, which is equivalent to 48.272187247143336 %


In [31]:
# ******************************************************************************
# Replace part of strings in the representation. - Replace victims
# ******************************************************************************


# 2. Replace civilians *********************************************************
list_civs = ['passengers', 'civilians','residents' 'passenger', 'civilian', 'family', 'families', 'people', 'tourist','tourists' 'villagers', 'women', 'children', 'citizen', 'citizens', 'population']

for cnt, token in enumerate(list_civs):
    if token:
        final_data['tagged_str'] = final_data['tagged_str'].str.lower().str.replace(token,'victim')
        print(str(cnt) + '/' + str(len(list_civs)))
print('end loop')


0/14
1/14


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


2/14
3/14
4/14
5/14
6/14
7/14
8/14
9/14
10/14
11/14
12/14
13/14
end loop


In [32]:
final_data['tagged_str'] = final_data['tagged_str'].str.lower().str.replace('victims', 'VICTIM')
final_data['tagged_str'] = final_data['tagged_str'].str.lower().str.replace('victim', 'VICTIM')
final_data['tagged_str'] = final_data['tagged_str'].str.replace('aggresor', 'AGGRESOR')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [33]:
y = final_data[~final_data['tagged_str'].str.contains('VICTIM')]
y.shape # These are for how many we did NOT get the tag

(31996, 36)

In [34]:
final_data_subset = final_data[final_data['tagged_str'].str.contains('VICTIM')]
print('we lose {} observations by replacing victim in phrase, which is equivalent to {} %'.format(final_data.shape[0] - final_data_subset.shape[0], ((final_data.shape[0] - final_data_subset.shape[0])/final_data.shape[0])*100))

we lose 31996 observations by replacing victim in phrase, which is equivalent to 63.14459947504489 %


In [35]:
# 3. Subset our data for this case *********************************************

In [39]:
#Tags that have both aggresor and victim labels 
final_data_subset = final_data[((final_data['tagged_str'].str.contains('AGGRESOR')) & (final_data['tagged_str'].str.contains('VICTIM')))]
print('{} have victim & aggressor, which is equivalent to a {} % loss of the data'.format(final_data.shape[0] - final_data_subset.shape[0], ((final_data.shape[0] - final_data_subset.shape[0])/final_data.shape[0])*100))

42125 have victim & aggressor, which is equivalent to a 83.13433719484517 % loss of the data


In [40]:
final_data_subset = final_data[((final_data['tagged_str'].str.contains('AGGRESOR')) | (final_data['tagged_str'].str.contains('VICTIM')))]
print('{} have victim OR aggressor, which is equivalent to {} % loss of the data'.format(final_data.shape[0] - final_data_subset.shape[0], ((final_data.shape[0] - final_data_subset.shape[0])/final_data.shape[0])*100))

14331 have victim OR aggressor, which is equivalent to 28.282449527343058 % loss of the data


In [41]:
final_data_subset = final_data[(final_data['tagged_rel'] != '' ) & ((final_data['tagged_str'].str.contains('AGGRESOR')) | (final_data['tagged_str'].str.contains('VICTIM')))]
print('{} have a victim OR aggressor, NOR relationship, which is equivalent to a  {} % loss'.format(final_data.shape[0] - final_data_subset.shape[0], ((final_data.shape[0] - final_data_subset.shape[0])/final_data.shape[0])*100))
final = final_data_subset
print('Final Dataset for training has {} observations'.format(final.shape[0]))

16115 have a victim OR aggressor, NOR relationship, which is equivalent to a  31.803201042016145 % loss
Final Dataset for training has 34556 observations


In [44]:
final_v2 = final_data[(final_data['tagged_rel'] != '' ) & ((final_data['tagged_str'].str.contains('AGGRESOR')) & (final_data['tagged_str'].str.contains('VICTIM')))]
print('{} dont have victim AND aggressor, NOR relationship, which is equivalent to {} %'.format(final_data.shape[0] - final_v2.shape[0], ((final_data.shape[0] - final_v2.shape[0])/final_data.shape[0])*100))

final_v2.to_csv('Data/data_prepr_final_v2.csv')



42377 dont have victim AND aggressor, NOR relationship, which is equivalent to 83.63166308144699 %
