In [59]:
import re
from nltk.corpus import stopwords
import numpy as np
import nltk
import pandas as pd
import codecs
import string
import re, math, collections
import random

In [2]:
def clean_str(string):
    """
    Tokenization/string cleaning for all datasets except for SST.
    Original taken from https://github.com/yoonkim/CNN_sentence/blob/master/process_data.py
    """
    string = re.sub(r"[^A-Za-z0-9(),!?\'\`]", " ", string)
    string = re.sub(r"\'s", " \'s", string)
    string = re.sub(r"\'ve", " \'ve", string)
    string = re.sub(r"n\'t", " n\'t", string)
    string = re.sub(r"\'re", " \'re", string)
    string = re.sub(r"\'d", " \'d", string)
    string = re.sub(r"\'ll", " \'ll", string)
    string = re.sub(r",", " , ", string)
    string = re.sub(r"!", " ! ", string)
    string = re.sub(r"\(", " \( ", string)
    string = re.sub(r"\)", " \) ", string)
    string = re.sub(r"\?", " \? ", string)
    string = re.sub(r"\s{2,}", " ", string)
    return string.strip().lower()

In [3]:
def remove_stop(str):
    stop = set(stopwords.words('english'))
    lst = str.split(" ")
    lst = [i for i in lst if i not in stop]
    return ' '.join(lst)

In [4]:
def remove_unwanted_words(str):
    unwanted_words = ["httpaddress", "usrid", "dd", "rt", "amp", "pm", " ", "'s", "n't", "\t", '``', "''", "", "//", "\\", "\\'s", "\\?"]
    lst = str.split(" ")
    lst = [i for i in lst if i not in unwanted_words]
    return ' '.join(lst)

In [5]:
def toLower(str):
    lst = str.split(" ")
    lst = [i.lower() for i in lst]
    return ' '.join(lst)

In [6]:
def load_data_and_labels(positive_data_file, negative_data_file):
    """
    Loads MR polarity african_data from files, splits the african_data into words and generates labels.
    Returns split sentences and labels.
    """
    # Load african_data from files
    positive_examples = list(open(positive_data_file, "r").readlines())
    positive_examples = [s.strip() for s in positive_examples]
    positive_examples = [remove_stop(item) for item in positive_examples]
    positive_examples = [toLower(item) for item in positive_examples]
    positive_examples = [remove_unwanted_words(item) for item in positive_examples]
    
    positive_examples = [clean_str(sent) for sent in positive_examples]
    for i in positive_examples:
        if len(i.split(" ")) < 3:
            positive_examples.remove(i)
    positive_examples = list(filter(None, positive_examples) )    
    negative_examples = list(open(negative_data_file, "r").readlines())
    negative_examples = [s.strip() for s in negative_examples]
    negative_examples = [toLower(item) for item in negative_examples]
    negative_examples = [remove_stop(item) for item in negative_examples]
    negative_examples = [remove_unwanted_words(item) for item in negative_examples]
    negative_examples = list(filter(None, negative_examples) )
    negative_examples = [clean_str(sent) for sent in negative_examples]
    for i in negative_examples:
        if len(i.split(" ")) < 3:
            negative_examples.remove(i)
    negative_examples = list(filter(None, negative_examples) )    
    
    # Split by words
    x_text = positive_examples + negative_examples
   # x_text = [clean_str(sent) for sent in x_text]
    # Generate labels
    positive_labels = [[0, 1] for _ in positive_examples]
    negative_labels = [[1, 0] for _ in negative_examples]
    y = np.concatenate([positive_labels, negative_labels], 0)

    return positive_examples, negative_examples 

In [117]:
def generat_word_dist(filename):
    
    unwanted_words = ["'s"," ","n't","\t", '``', "''", "", "//", "\\", "\\'s", "\\?","httpaddress", "usrid", "dd", "rt", "amp", "pm", " ", "'s", "n't", "\t", '``', "''", "", "//", "\\", "\\'s", "\\?"]
    
    default_stopwords = set(nltk.corpus.stopwords.words('english'))
    
    fp = codecs.open(filename, 'r', 'utf-8',errors='ignore')
    
    words = nltk.word_tokenize(fp.read())
    
    # Remove single-character tokens (mostly punctuation)
    words = [word for word in words if len(word) > 1]

    # Remove numbers
    words = [word for word in words if not word.isnumeric()]

    # Lowercase all words (default_stopwords are lowercase too)
    words = [word.lower() for word in words]

    # Stemming words seems to make matters worse, disabled
    # stemmer = nltk.stem.snowball.SnowballStemmer('german')
    # words = [stemmer.stem(word) for word in words]

    # Remove stopwords
    words = [word for word in words if word not in default_stopwords]
    
    words = [clean_str(word) for word in words]
    
    words = [word for word in words if word not in unwanted_words]
    
    print(len(words))    
    # Calculate frequency distribution
    fdist = nltk.FreqDist(words)
    return fdist

In [82]:
def generat_word_dist_bigrams(filename):
    
    default_stopwords = set(nltk.corpus.stopwords.words('english'))
    
    fp = codecs.open(filename, 'r', 'utf-8',errors='ignore')

    words = nltk.word_tokenize(fp.read())

    # Remove single-character tokens (mostly punctuation)
    words = [word for word in words if len(word) > 1]

    # Remove numbers
    words = [word for word in words if not word.isnumeric()]

    # Lowercase all words (default_stopwords are lowercase too)
    words = [word.lower() for word in words]

    # Stemming words seems to make matters worse, disabled
    # stemmer = nltk.stem.snowball.SnowballStemmer('german')
    # words = [stemmer.stem(word) for word in words]

    # Remove stopwords
    words = [word for word in words if word not in default_stopwords]
    
    words = [clean_str(word) for word in words]
    
    words = nltk.bigrams(words)

    # Calculate frequency distribution
    fdist = nltk.FreqDist(words)
    return fdist

In [121]:
def print_word_list(freq_dist):
    word_lst = []
    for word, frequency in freq_dist.most_common(1000):
            word_lst.append(word)
    return word_lst   

In [9]:
PV_pos = 'Data/PV_Data/pos_vio_news.txt'
PV_neg = 'Data/PV_Data/neg_vio_news.txt'

In [69]:
pos_wrd_dist_doc = generat_word_dist(PV_pos)
neg_wrd_dist_doc = generat_word_dist(PV_neg)

372578
244662


In [79]:
pos_most_occr_words = print_word_list(pos_wrd_dist_doc)
neg_most_occr_words = print_word_list(neg_wrd_dist_doc)

In [132]:
print(pos_most_occr_words[30000:])

['quotidien', '01 02 2010', 'sid 15059', 'o3 as', 'afmadow middle', 'juba 20100201 500', 'morur', 'portugeuse', "'libya", 'wershefana', 'exchangse', "and libya 's western", 'ceel saliini', 'boudhima', 'zajiri', 'thor al abiyat', 'thomor', 'induction', 'christiana', 'dagadu', 'sanniqueilie', 'nov 5', 'kalenda', 'giriama', 'kipini', 'commissioned', 'ghanbatt', 'ghana armed', 'ghana for', 'cefups', 'government allied', 'ruyterwacht', 'premeditated', 'titi laoye', 'tomori', 'satisfied', 'galgduud', 'kaba kudukade', 'assuring', 'mirair', 'bungatira', 'rebs', 'reb', 'r55', 'yaqberiweyne', 'labagale', 'membres', 'senhuile sen thanol', 'agro industrial', 'fanaye', 'colusion', 'group attacked a', 'al hayy', 'nunu', 'gambella', 'shataya', 'attampted', 'transparency no', 'mandevu', 'ayr habar', 'saleban habar', 'balli howd', 'control clashes which', 'al khusus', 'armed clashes left', '01 05 2011', 'assertion', 'zewaid', 'sure', 'ajipowo', 'sisfm', 'sempana', 'gems for guns', 'ncs', 'seme', 'huffp

In [143]:
# the no. occurances of the most freq words in pos in the negative set
l = []
c=0
for i in pos_most_occr_words:
    if neg_wrd_dist_doc[i] > 1000:
        c +=1
        l.append(i)
        print(i, neg_wrd_dist_doc[i],'  -  ', i,pos_wrd_dist_doc[i])
print(c)
print(",".join(l))        

market 9066   -   market 344
new 2113   -   new 305
report 1619   -   report 231
health 3716   -   health 95
business 5662   -   business 73
tv 1200   -   tv 32
technology 4473   -   technology 28
finance 1254   -   finance 20
research 4608   -   research 12
announces 1311   -   announces 9
markets 4137   -   markets 7
global 2063   -   global 7
12
market,new,report,health,business,tv,technology,finance,research,announces,markets,global


In [142]:
# the no. occurances of the most freq words in neg in the positive set
c = 0
l = []
for i in neg_most_occr_words:
    if pos_wrd_dist_doc[i] > 1000:
        c+= 1
        l.append(i)
        print(i, pos_wrd_dist_doc[i],'  -  ', i,neg_wrd_dist_doc[i])
print(c)
print(",".join(l))

group 2011   -   group 732
security 1474   -   security 187
state 1074   -   state 167
one 1710   -   one 166
two 3095   -   two 113
government 1938   -   government 98
local 1001   -   local 88
three 1355   -   three 69
area 1819   -   area 68
students 1012   -   students 53
members 1279   -   members 49
military 1553   -   military 43
forces 3960   -   forces 39
people 2082   -   people 36
near 1322   -   near 25
district 1450   -   district 24
reported 1665   -   reported 21
police 4061   -   police 19
al 1828   -   al 12
attack 2866   -   attack 12
army 1071   -   army 12
town 1369   -   town 9
shot 1369   -   shot 8
fighting 1077   -   fighting 8
village 1577   -   village 6
protest 2353   -   protest 3
attacked 1838   -   attacked 3
injured 1976   -   injured 2
killed 5322   -   killed 2
armed 1729   -   armed 2
rebels 1893   -   rebels 2
soldiers 1813   -   soldiers 2
protesters 1008   -   protesters 1
killing 1011   -   killing 1
unknown 1117   -   unknown 1
suspected 1034   - 

In [137]:
# the no. occurances of the most freq words in neg in the positive set
c = 0
l = []
for i in neg_most_occr_words:
    if neg_wrd_dist_doc[i] > 1000:
        c+= 1
        l.append(i)
        print(i, neg_wrd_dist_doc[i])
print(c)
print(",".join(l))

market 9066
business 5662
research 4608
technology 4473
markets 4137
health 3716
new 2113
global 2063
report 1619
announces 1311
update 1292
finance 1254
tv 1200
13
market,business,research,technology,markets,health,new,global,report,announces,update,finance,tv


In [83]:
pos_big_dist_doc = generat_word_dist_bigrams(PV_pos)
neg_big_dist_doc = generat_word_dist_bigrams(PV_neg)

In [87]:
pos_most_occr_bigrams = print_word_list(pos_big_dist_doc)
neg_most_occr_bigrams = print_word_list(neg_big_dist_doc)

In [92]:
# the no. occurances of the most freq words in pos in the negative set
for i in pos_most_occr_bigrams:
    if neg_big_dist_doc[i] > 100:
        print(i, neg_big_dist_doc[i])

('vice', 'president') 188
('health', 'care') 602
('new', 'york') 161
('finance', 'minister') 154
('small', 'business') 410


In [91]:
# the no. occurances of the most freq words in neg in the positive set
for i in neg_most_occr_bigrams:
    if pos_big_dist_doc[i] > 100:
        print(i, pos_big_dist_doc[i])

('service', 'delivery') 119
('security', 'forces') 683
('reports', 'indicate') 422
('government', 'military') 115


In [105]:
CF_tweets_PV_pos = 'Data/turkish_protest_test_pos_prccd2.txt'
CF_tweets_PV_neg = 'Data/turkish_protest_test_neg_prccd2.txt'

In [122]:
CFT_pos_wrd_dist_doc = generat_word_dist(CF_tweets_PV_pos)
CFT_neg_wrd_dist_doc = generat_word_dist(CF_tweets_PV_neg)
CFT_pos_most_occr_words = print_word_list(CFT_pos_wrd_dist_doc)
CFT_neg_most_occr_words = print_word_list(CFT_neg_wrd_dist_doc)

498
3386


In [145]:
# the no. occurances of the most freq words in pos in the negative set
l = []
c = 0
for i in CFT_pos_most_occr_words:
    if CFT_neg_wrd_dist_doc[i] > 10:
        c +=1
        l.append(i)
        print(i, CFT_neg_wrd_dist_doc[i], '   -  ',i, CFT_pos_wrd_dist_doc[i])
print(c)        
print(",".join(l)) 

police 27    -   police 31
occupygezi 90    -   occupygezi 23
istanbul 65    -   istanbul 16
taksim 53    -   taksim 14
turkey 192    -   turkey 12
gezi 25    -   gezi 12
park 24    -   park 11
people 44    -   people 10
protesters 18    -   protesters 9
turkish 47    -   turkish 9
square 16    -   square 6
gas 12    -   gas 6
direngeziparki 27    -   direngeziparki 6
via 19    -   via 5
protests 30    -   protests 5
erdogan 46    -   erdogan 3
please 13    -   please 2
says 12    -   says 1
new 13    -   new 1
19
police,occupygezi,istanbul,taksim,turkey,gezi,park,people,protesters,turkish,square,gas,direngeziparki,via,protests,erdogan,please,says,new


In [146]:
# the no. occurances of the most freq words in neg in the positive set
c = 0
l = []
for i in CFT_neg_most_occr_words:
    if CFT_pos_wrd_dist_doc[i] > 10:
        c+= 1
        print(i, CFT_pos_wrd_dist_doc[i], '   -  ',i, CFT_neg_wrd_dist_doc[i])
        l.append(i)
print(c)        
print(",".join(l))

turkey 12    -   turkey 192
occupygezi 23    -   occupygezi 90
istanbul 16    -   istanbul 65
taksim 14    -   taksim 53
police 31    -   police 27
gezi 12    -   gezi 25
park 11    -   park 24
7
turkey,occupygezi,istanbul,taksim,police,gezi,park


In [159]:
print('positive',CFT_pos_wrd_dist_doc['clashes'])
print('negative',CFT_neg_wrd_dist_doc['clashes'])

positive 2
negative 3
