In [None]:
import pickle
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import confusion_matrix
from sklearn import linear_model
import nltk

#import data
with open("data_matrix_sample_programs2.pickle",'rb') as f:
        data = pickle.load(f)

X = data['data'][:,4]
y = data['data'][:,6]

# Now convert y to a binary indicator matrix (1 is claim, 0 no claim)
y = np.asarray([y[i] is not None for i in range(len(X))])  



In [None]:
def init_list_of_objects(size):
    list_of_objects = list()
    for i in range(0,size):
        list_of_objects.append( list() ) #different object reference each time
    return list_of_objects

In [1]:
#count the claims
claim_indexes=[]
non_zeros=0
for (i,bool) in enumerate(y):
        if(bool==True):
            non_zeros=non_zeros+1
            claim_indexes.append(i)
    
print('Total number of claims: ', non_zeros)
print('Total number of sentences: ', len(X))
#print('Claim indexes: ', claim_indexes)

# Make a Bag-of-Words
X_new=init_list_of_objects(len(X))
stop_words=['.nr','b.t', 'ph.d', '.dk', '.com','bl.a','p.t']
X_new=init_list_of_objects(len(X))
for (i,sent) in enumerate(X):
    X_new[i] = X[i].replace("-", " ")   
    X_new[i] = X_new[i].replace(".", '')
    X_new[i] = X_new[i].replace("=", ' ')
    X_new[i] = X_new[i].replace("/", ' ')
    X_new[i] = X_new[i].replace("'", ' ')
    X_new[i] = X_new[i].replace(":", ' ')

vectorizer = CountVectorizer(max_df=0.1,stop_words=stop_words)
X_bow = vectorizer.fit_transform(X_new)
words = vectorizer.get_feature_names()
stop=vectorizer.get_stop_words()
print('Stop words: ', stop)
print('Number of words: ', len(words))


# 'Translate' sentences into ordered lists of ids
sentences_id = init_list_of_objects(len(X))
sentences = init_list_of_objects(len(X))
err = []

for j, sentence in enumerate(X_new):
	tok = nltk.word_tokenize(sentence)
	for word in tok:
		word = word.lower()
		if word in words:
			sentences_id[j].append(words.index(word)+1)
			sentences[j].append(word)
		else:
			err.append(word)


err = set(err)
print('Discarded words: ', err)



Total number of claims:  568
Total number of sentences:  8466
Stop words:  frozenset({'.com', 'ph.d', '.dk', 'p.t', '.nr', 'b.t', 'bl.a'})
Number of words:  10391
Discarded words:  {'p', '1,6', 'kan', ',', '16,6', 'skal', 'at', '8', '42,6', 'jeg', 'ø', 'e', '1,5', 'det', '68,5', 's', 'k', '10,5', 'til', '4', 'x', '11,1', 'har', 'på', 'm', '17,5', 'en', 'og', 'så', '6', '2,5', '``', 'a', '5', '!', '0,05', '2,25', '0', 'vi', "''", 'er', 'i', 'b', '%', 'man', '1,3', '13,3', '?', '>', 'af', 'som', '15,7', 'et', '8,6', 'den', 'for', '7', '2', '2,9', 'w', 'r', 'd', '3', 'de', 'om', 'med', 'der', 'ikke', '47,5', 'f', '1', '9', 'c', 'v', '<'}


In [2]:
num_sentences=len(sentences_id)
num_words=max(max(sentences_id))

#splitting exemples in training, validation and test set
num_training=round(6/10*num_sentences)
num_valid=round(2/10*num_sentences)
num_test=num_sentences-(num_training+num_valid)

index_permutated=np.random.permutation(num_sentences)

sentences_id_permutated=init_list_of_objects(num_sentences)
sentences_permutated=init_list_of_objects(num_sentences)
word_sentences_permutated=init_list_of_objects(num_sentences)
labels_permutated=init_list_of_objects(num_sentences)
for (i,perm_index) in enumerate(index_permutated):
    sentences_id_permutated[i]=sentences_id[perm_index]
    sentences_permutated[i]=sentences[perm_index]
    word_sentences_permutated[i]=X[perm_index]
    labels_permutated[i]=y[perm_index]
    

#check the permutation works in the right way
print(sentences_id_permutated[0])
print(sentences_id[index_permutated[0]])
print(sentences_permutated[0])
print(sentences[index_permutated[0]])
print(word_sentences_permutated[0])
print(X[index_permutated[0]])
print(labels_permutated[0])
print(y[index_permutated[0]])


    
training_sentences_id=sentences_id_permutated[:num_training]
validation_sentences_id=sentences_id_permutated[num_training:num_training+num_valid]
test_sentences_id=sentences_id_permutated[num_training+num_valid:num_sentences+1]

training_sentences=sentences_permutated[:num_training]
validation_sentences=sentences_permutated[num_training:num_training+num_valid]
test_sentences=sentences_permutated[num_training+num_valid:num_sentences+1]

training_labels=labels_permutated[:num_training]
valid_labels=labels_permutated[num_training:num_training+num_valid]
test_labels=labels_permutated[num_training+num_valid:num_sentences+1]

training_word_sentences=word_sentences_permutated[:num_training]
validation_word_sentences=word_sentences_permutated[num_training:num_training+num_valid]
test_word_sentences=word_sentences_permutated[num_training+num_valid:num_sentences+1]


print('Number of training exemples: ',len(training_sentences))
print('Number of validation exemples: ',len(validation_sentences))
print('Number of test exemples: ',len(test_sentences))





[3490, 2949, 4925, 10201, 8243, 10201, 7588, 234, 10201, 8243]
[3490, 2949, 4925, 10201, 8243, 10201, 7588, 234, 10201, 8243]
['gået', 'fra', 'kun', 'være', 'spidse', 'være', 'sammen', 'afghanerne', 'være', 'spidse']
['gået', 'fra', 'kun', 'være', 'spidse', 'være', 'sammen', 'afghanerne', 'være', 'spidse']
Vi er gået fra kun at være spidse til at være sammen med afghanerne om at være spidse.
Vi er gået fra kun at være spidse til at være sammen med afghanerne om at være spidse.
False
False
Number of training exemples:  5080
Number of validation exemples:  1693
Number of test exemples:  1693


In [3]:
#count number of claims

non_zeros_tot=0
non_zeros_train=0
train_claim_indexes=[]
for (i,lab) in enumerate(training_labels):
        if(lab==1):
            non_zeros_train=non_zeros_train+1
            train_claim_indexes.append(i)
for (i,lab) in enumerate(y):
        if(lab==1):
            non_zeros_tot=non_zeros_tot+1       
            
non_zeros_valid=0
valid_claim_indexes=[]
for (i,lab) in enumerate(valid_labels):
        if(lab==1):
            non_zeros_valid=non_zeros_valid+1
            valid_claim_indexes.append(i)

non_zeros_test=0
test_claim_indexes=[]
for (i,lab) in enumerate(test_labels):
        if(lab==1):
            non_zeros_test=non_zeros_test+1
            test_claim_indexes.append(i)            
            
            
print('Total number of claims: ', non_zeros_tot)
print('Number of claims in the training set: ',non_zeros_train)
print('Number of claims in the validation set: ',non_zeros_valid)
print('Number of claims in the test set: ',non_zeros_test)




Total number of claims:  568
Number of claims in the training set:  341
Number of claims in the validation set:  104
Number of claims in the test set:  123


In [4]:
#creating a balanced dataset

new_num_training=num_training+non_zeros_train*9
new_training_sentences_id = init_list_of_objects(new_num_training)
new_training_sentences = init_list_of_objects(new_num_training)
new_training_labels= np.zeros((new_num_training,))
new_training_word_sentences=init_list_of_objects(new_num_training)

k=0
for (i,sent) in enumerate(training_sentences_id):
    if(training_labels[i]==False):
        for (j,word) in enumerate(sent):
            new_training_sentences_id[k].append(word)
        for (j,word) in enumerate(training_sentences[i]):
            new_training_sentences[k].append(word)
        new_training_labels[k]=0
        new_training_word_sentences[k].append(training_word_sentences[i])
        k=k+1
    else:
        for (j,word) in enumerate(sent):
            for g in np.arange(10):
                new_training_sentences_id[k+g].append(word)
        for (j,word) in enumerate(training_sentences[i]):
            for g in np.arange(10):
                new_training_sentences[k+g].append(word)
        new_training_labels[k:k+10]=1
        for g in np.arange(10):
            new_training_word_sentences[k+g].append(training_word_sentences[i])
        k=k+10
    #print(k) 

    
new_num_valid=num_valid+non_zeros_valid*9
new_valid_sentences_id = init_list_of_objects(new_num_valid)
new_valid_sentences = init_list_of_objects(new_num_valid)
new_valid_labels= np.zeros((new_num_valid,))
new_valid_word_sentences=init_list_of_objects(new_num_valid)

k=0
for (i,sent) in enumerate(validation_sentences_id):
    if(valid_labels[i]==False):
        for (j,word) in enumerate(sent):
            new_valid_sentences_id[k].append(word)
        for (j,word) in enumerate(validation_sentences[i]):
            new_valid_sentences[k].append(word)
        new_valid_labels[k]=0
        new_valid_word_sentences[k].append(validation_word_sentences[i])
        k=k+1
    else:
        for (j,word) in enumerate(sent):
            for g in np.arange(10):
                new_valid_sentences_id[k+g].append(word)
        for (j,word) in enumerate(validation_sentences[i]):
            for g in np.arange(10):
                new_valid_sentences[k+g].append(word)
        new_valid_labels[k:k+10]=1
        for g in np.arange(10):
            new_valid_word_sentences[k+g].append(validation_word_sentences[i])
        k=k+10
        
new_num_test=num_test+non_zeros_test*9
new_test_sentences_id = init_list_of_objects(new_num_test)
new_test_sentences = init_list_of_objects(new_num_test)
new_test_labels= np.zeros((new_num_test,))
new_test_word_sentences=init_list_of_objects(new_num_test)

k=0
for (i,sent) in enumerate(test_sentences_id):
    if(test_labels[i]==False):
        for (j,word) in enumerate(sent):
            new_test_sentences_id[k].append(word)
        for (j,word) in enumerate(test_sentences[i]):
            new_test_sentences[k].append(word)
        new_test_labels[k]=0
        new_test_word_sentences[k].append(test_word_sentences[i])
        k=k+1
    else:
        for (j,word) in enumerate(sent):
            for g in np.arange(10):
                new_test_sentences_id[k+g].append(word)
        for (j,word) in enumerate(test_sentences[i]):
            for g in np.arange(10):
                new_test_sentences[k+g].append(word)
        new_test_labels[k:k+10]=1
        for g in np.arange(10):
            new_test_word_sentences[k+g].append(test_word_sentences[i])
        k=k+10
    
    
    
print('New number of training exemples: ', len(new_test_word_sentences))
print('New number of training claims: ', sum(new_test_labels))

print('Exemples of sentences and labels: ')
for i in np.arange(23,35):
    print(i)
    print(new_training_word_sentences[i])
    print(new_training_sentences_id[i][:])
    print(new_training_sentences[i][:])
    print(new_training_labels[i])






New number of training exemples:  2800
New number of training claims:  1230.0
Exemples of sentences and labels: 
23
['Det er et fremskridt. Taleban forbød dem at gå uden for husene at tage uddannelser og at have job. Alt det har vi gjort op med.']
[3008, 8863, 2597, 1555, 3489, 9436, 3806, 8840, 9410, 3581, 4303, 413, 3339, 6358]
['fremskridt', 'taleban', 'forbød', 'dem', 'gå', 'uden', 'husene', 'tage', 'uddannelser', 'have', 'job', 'alt', 'gjort', 'op']
1.0
24
['Det er et fremskridt. Taleban forbød dem at gå uden for husene at tage uddannelser og at have job. Alt det har vi gjort op med.']
[3008, 8863, 2597, 1555, 3489, 9436, 3806, 8840, 9410, 3581, 4303, 413, 3339, 6358]
['fremskridt', 'taleban', 'forbød', 'dem', 'gå', 'uden', 'husene', 'tage', 'uddannelser', 'have', 'job', 'alt', 'gjort', 'op']
1.0
25
['Det er et fremskridt. Taleban forbød dem at gå uden for husene at tage uddannelser og at have job. Alt det har vi gjort op med.']
[3008, 8863, 2597, 1555, 3489, 9436, 3806, 8840, 941

In [5]:
#mix the exemples inside the training, validation and test set

train_index_permutated=np.random.permutation(new_num_training)

new_train_sentences_id_permutated=init_list_of_objects(new_num_training)
new_train_sentences_permutated=init_list_of_objects(new_num_training)
new_train_word_sentences_permutated=init_list_of_objects(new_num_training)
new_train_labels_permutated=init_list_of_objects(new_num_training)
for (i,perm_index) in enumerate(train_index_permutated):
    new_train_sentences_id_permutated[i]=new_training_sentences_id[perm_index]
    new_train_sentences_permutated[i]=new_training_sentences[perm_index]
    new_train_word_sentences_permutated[i]=new_training_word_sentences[perm_index]
    new_train_labels_permutated[i]=new_training_labels[perm_index]
    
    
valid_index_permutated=np.random.permutation(new_num_valid)

new_valid_sentences_id_permutated=init_list_of_objects(new_num_valid)
new_valid_sentences_permutated=init_list_of_objects(new_num_valid)
new_valid_word_sentences_permutated=init_list_of_objects(new_num_valid)
new_valid_labels_permutated=init_list_of_objects(new_num_valid)
for (i,perm_index) in enumerate(valid_index_permutated):
    new_valid_sentences_id_permutated[i]=new_valid_sentences_id[perm_index]
    new_valid_sentences_permutated[i]=new_valid_sentences[perm_index]
    new_valid_word_sentences_permutated[i]=new_valid_word_sentences[perm_index]
    new_valid_labels_permutated[i]=new_valid_labels[perm_index]
    
    
test_index_permutated=np.random.permutation(new_num_test)

new_test_sentences_id_permutated=init_list_of_objects(new_num_test)
new_test_sentences_permutated=init_list_of_objects(new_num_test)
new_test_word_sentences_permutated=init_list_of_objects(new_num_test)
new_test_labels_permutated=init_list_of_objects(new_num_test)
for (i,perm_index) in enumerate(test_index_permutated):
    new_test_sentences_permutated[i]=new_test_sentences[perm_index]
    new_test_sentences_id_permutated[i]=new_test_sentences_id[perm_index]
    new_test_word_sentences_permutated[i]=new_test_word_sentences[perm_index]
    new_test_labels_permutated[i]=new_test_labels[perm_index]    
    
    
#check the permutation works in the right way
print(new_train_sentences_id_permutated[0])
print(new_training_sentences_id[train_index_permutated[0]])
print(new_train_sentences_permutated[0])
print(new_training_sentences[train_index_permutated[0]])
print(new_train_word_sentences_permutated[0])
print(new_training_word_sentences[train_index_permutated[0]])
print(new_train_labels_permutated[0])
print(new_training_labels[train_index_permutated[0]])

[4270, 1100, 8114, 10201, 9056, 3839, 3373, 7795, 4886, 8402, 10201, 2268]
[4270, 1100, 8114, 10201, 9056, 3839, 3373, 7795, 4886, 8402, 10201, 2268]
['ja', 'blive', 'smuk', 'være', 'tilstand', 'hvor', 'godt', 'sin', 'krop', 'stedet', 'være', 'fanget']
['ja', 'blive', 'smuk', 'være', 'tilstand', 'hvor', 'godt', 'sin', 'krop', 'stedet', 'være', 'fanget']
['Ja, man skal blive smuk. Man skal være i en tilstand hvor man har det godt i sin krop i stedet for at være fanget i den.']
['Ja, man skal blive smuk. Man skal være i en tilstand hvor man har det godt i sin krop i stedet for at være fanget i den.']
0.0
0.0


In [6]:
#fasttext embedding

from pyfasttext import FastText
model = FastText('wiki.da.bin')

# Printing out number of tokens available
print ('Number of words: ', model.nwords) 
# Printing out the dimension of a word vector 
print("Dimension of a word vector: ",len(model.get_numpy_vector('ikke')))

# Print out the vector of a word 
#print("Vector components of a word: ",model.get_numpy_vector('ikke'))


#sentence embedding
embedded_train_sentences = init_list_of_objects(num_training)
for (i,sent) in enumerate(training_sentences):
    for (j,word) in enumerate(sent):
        embedded_train_sentences[i].append(model.get_numpy_vector(word))

new_embedded_train_sentences = init_list_of_objects(new_num_training)
for (i,sent) in enumerate(new_train_sentences_permutated):
    for (j,word) in enumerate(sent):
        new_embedded_train_sentences[i].append(model.get_numpy_vector(word))
        
embedded_valid_sentences = init_list_of_objects(num_valid)
for (i,sent) in enumerate(validation_sentences):
    for (j,word) in enumerate(sent):
        embedded_valid_sentences[i].append(model.get_numpy_vector(word))

new_embedded_valid_sentences = init_list_of_objects(new_num_valid)
for (i,sent) in enumerate(new_valid_sentences_permutated):
    for (j,word) in enumerate(sent):
        new_embedded_valid_sentences[i].append(model.get_numpy_vector(word))
        
embedded_test_sentences = init_list_of_objects(num_test)
for (i,sent) in enumerate(test_sentences):
    for (j,word) in enumerate(sent):
        embedded_test_sentences[i].append(model.get_numpy_vector(word))

new_embedded_test_sentences = init_list_of_objects(new_num_test)
for (i,sent) in enumerate(new_test_sentences_permutated):
    for (j,word) in enumerate(sent):
        new_embedded_test_sentences[i].append(model.get_numpy_vector(word))

print (training_sentences_id[0]) 
print(embedded_train_sentences[0])  
print(training_word_sentences[0])
print(training_labels[0])



Number of words:  312956
Dimension of a word vector:  300
[3490, 2949, 4925, 10201, 8243, 10201, 7588, 234, 10201, 8243]
[array([ -2.85874367e-01,  -9.02902558e-02,   3.86890583e-02,
        -2.75295138e-01,   3.94206107e-01,   2.15953708e-01,
         1.74526423e-01,  -2.63020784e-01,   4.34191316e-01,
        -3.59307155e-02,  -7.82948956e-02,  -1.92232896e-02,
        -2.39860296e-01,   1.74513295e-01,  -4.01518591e-05,
         8.06377605e-02,   2.04292670e-01,   1.35362819e-01,
         3.69114652e-02,   4.47239727e-01,   7.76901022e-02,
        -9.91798043e-02,  -7.06609935e-02,   1.91169977e-02,
        -1.40223771e-01,  -2.32665628e-01,   6.98608309e-02,
        -2.52802908e-01,   1.49005085e-01,  -7.51850247e-01,
         1.74580589e-01,   5.59078678e-02,   2.62454711e-02,
         1.98225543e-01,  -3.53351116e-01,  -1.73368692e-01,
        -4.95047048e-02,  -3.17610592e-01,  -1.19630642e-01,
        -4.49542552e-02,   9.44609419e-02,   4.10273820e-01,
         8.08542147e-02,

In [7]:
#save training, validation and test sets

with open('train_sentences_id_balanced.pickle','wb') as file:
	pickle.dump(new_train_sentences_id_permutated, file)
    
with open('train_sentences_fasttext_balanced.pickle','wb') as file:
	pickle.dump(new_embedded_train_sentences, file)
    
with open('train_labels_balanced.pickle','wb') as file:
	pickle.dump(new_train_labels_permutated, file)
    
with open('train_word_sentences_balanced.pickle','wb') as file:
	pickle.dump(new_train_word_sentences_permutated, file)


    
with open('valid_sentences_id_balanced.pickle','wb') as file:
	pickle.dump(new_valid_sentences_id_permutated, file)
    
with open('valid_sentences_fasttext_balanced.pickle','wb') as file:
	pickle.dump(new_embedded_valid_sentences, file)
    
with open('valid_labels_balanced.pickle','wb') as file:
	pickle.dump(new_valid_labels_permutated, file)
    
with open('valid_word_sentences_balanced.pickle','wb') as file:
	pickle.dump(new_valid_word_sentences_permutated, file)
    
    
    
with open('test_sentences_id_balanced.pickle','wb') as file:
	pickle.dump(new_test_sentences_id_permutated, file)
    
with open('test_sentences_fasttext_balanced.pickle','wb') as file:
	pickle.dump(new_embedded_test_sentences, file)
    
with open('test_labels_balanced.pickle','wb') as file:
	pickle.dump(new_test_labels_permutated, file)
    
with open('test_word_sentences_balanced.pickle','wb') as file:
	pickle.dump(new_test_word_sentences_permutated, file)

In [8]:
with open('train_sentences_id_unbalanced.pickle','wb') as file:
	pickle.dump(training_sentences_id, file)
    
with open('train_sentences_fasttext_unbalanced.pickle','wb') as file:
	pickle.dump(embedded_train_sentences, file)
    
with open('train_labels_unbalanced.pickle','wb') as file:
	pickle.dump(training_labels, file)
    
with open('train_word_sentences_unbalanced.pickle','wb') as file:
	pickle.dump(training_word_sentences, file)


    
with open('valid_sentences_id_unbalanced.pickle','wb') as file:
	pickle.dump(validation_sentences_id, file)
    
with open('valid_sentences_fasttext_unbalanced.pickle','wb') as file:
	pickle.dump(embedded_valid_sentences, file)
    
with open('valid_labels_unbalanced.pickle','wb') as file:
	pickle.dump(valid_labels, file)
    
with open('valid_word_sentences_unbalanced.pickle','wb') as file:
	pickle.dump(validation_word_sentences, file)
    
    
    
with open('test_sentences_id_unbalanced.pickle','wb') as file:
	pickle.dump(test_sentences_id, file)
    
with open('test_sentences_fasttext_unbalanced.pickle','wb') as file:
	pickle.dump(embedded_test_sentences, file)
    
with open('test_labels_unbalanced.pickle','wb') as file:
	pickle.dump(test_labels, file)
    
with open('test_word_sentences_unbalanced.pickle','wb') as file:
	pickle.dump(test_word_sentences, file)