In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import re
import numpy as np
import pandas as pd
from itertools import islice

In [3]:
import nltk
from nltk import NaiveBayesClassifier
from nltk.metrics.scores import f_measure, precision, recall
import collections

In [4]:
def load_tsv(data_file, n):
    data_features = []
    data = []
    infile = open(data_file, encoding='utf-8')
    for line in infile:
        if not line.strip():
            continue
        label, text = line.split('\t')
        text_features = process_text(text, n)
        if text_features:
            data_features += text_features
            data.append((text_features, label))
    return data, data_features

def process_text(text, n=1, remove_repeated_chars=False):
    clean_text = text
    if remove_repeated_chars:
        clean_text = remove_repeating_char(clean_text)

    if n == 1:
        return clean_text.split()
    else:
        tokens = clean_text.split()
        grams = tokens
        for i in range(2, n + 1):
            grams += [  ' '.join(g) for g in list(window(tokens, i))  ]
        return grams

def remove_repeating_char(text):
    # return re.sub(r'(.)\1+', r'\1', text)
    return re.sub(r'(.)\1+', r'\1\1', text)  # keep 2 repeat

def window(words_seq, n):
    it = iter(words_seq)
    result = tuple(islice(it, n))
    if len(result) == n:
        yield result
    for elem in it:
        result = result[1:] + (elem,)
        yield result

def document_features(document, corpus_features):
    document_words = set(document)
    features = {}
    for word in corpus_features:
        features['has({})'.format(word)] = (word in document_words)
    return features

In [5]:
folder = 'drive/MyDrive/BuildUp/UNDPLeb/tweets_data/data2/'
pos_train_file = 'train_Arabic_tweets_positive_20190413.tsv'
neg_train_file = 'train_Arabic_tweets_negative_20190413.tsv'

pos_test_file = 'test_Arabic_tweets_positive_20190413.tsv'
neg_test_file = 'test_Arabic_tweets_negative_20190413.tsv'

In [6]:
n = 1

pos_train_data, pos_train_feat = load_tsv(folder+pos_train_file, n)
neg_train_data, neg_train_feat = load_tsv(folder+neg_train_file, n)

pos_test_data, pos_test_feat = load_tsv(folder+pos_test_file, n)
neg_test_data, neg_test_feat = load_tsv(folder+neg_test_file, n)

In [7]:
all_features = pos_train_feat + neg_train_feat + pos_test_feat + pos_test_feat

In [8]:
all_features_count = {}
for w in all_features:
    all_features_count[w] = all_features_count.get(w, 0) + 1

train_data = pos_train_data + neg_train_data
test_data = pos_test_data + neg_test_data

min_df = int(0.001 * len(train_data))
max_df = int(0.98 * len(train_data))
my_features = set([word for word, freq in all_features_count.items() if  max_df > freq > min_df ])

In [9]:
feature_sets = []
for d, c in train_data:
    feature_sets.append((document_features(d, my_features), c))

In [10]:
classifier = nltk.NaiveBayesClassifier.train(feature_sets)

In [11]:
test_features = [(document_features(d, my_features), c) for (d, c) in test_data]

In [12]:
ref_sets = collections.defaultdict(set)
test_sets = collections.defaultdict(set)

for i, (feats, label) in enumerate(test_features):
    ref_sets[label].add(i)
    observed = classifier.classify(feats)
    test_sets[observed].add(i)

In [13]:
print('accuracy: ', nltk.classify.accuracy(classifier, test_features))
print('pos precision: ', precision(ref_sets['pos'], test_sets['pos']))
print('pos recall:', recall(ref_sets['pos'], test_sets['pos']))
print('neg precision: ', precision(ref_sets['neg'], test_sets['neg']))
print('neg recall:', recall(ref_sets['neg'], test_sets['neg']))
print('positive f-score:', f_measure(ref_sets['pos'], test_sets['pos']))
print('negative f-score:', f_measure(ref_sets['neg'], test_sets['neg']))

accuracy:  0.8913283975831844
pos precision:  0.9198425478618716
pos recall: 0.8611390284757119
neg precision:  0.8654657578708211
neg recall: 0.9225047569624633
positive f-score: 0.8895233151656718
negative f-score: 0.8930754416813196
