In [1]:
%matplotlib inline
import codecs
from collections import Counter
import random 
import numpy as np
from numpy.random import permutation, shuffle, rand
from numpy.linalg import svd

import matplotlib.pyplot as plt

from scipy.optimize import minimize
from scipy.io import loadmat

from sklearn.pipeline import Pipeline
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn import svm
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

from sklearn import metrics
from sklearn.metrics import confusion_matrix

from sklearn.grid_search import GridSearchCV

In [14]:
# Paths of the shared-task training file and test file, built from one base directory.
_TASK_DIR = r'/home/disooqi/qcri/dialects/task'
train_set = _TASK_DIR + r'/DSL-training/task2-train.txt'
test_set = _TASK_DIR + r'/DSL2016-test/C.txt'

In [57]:
def from_buck_to_utf8(text):
    """Transliterate Buckwalter-encoded text into Arabic script.

    The input is stripped and split on whitespace. Each character that has an
    entry in the table below is replaced by its Arabic code point; any other
    character is passed through unchanged. The converted words are joined back
    with single spaces, so runs of whitespace collapse and an empty or
    all-whitespace input yields an empty string.
    """
    buckwalter_to_arabic = {
        # letters and hamza forms
        'A': u'\u0627', '<': u'\u0625', '|': u'\u0622', '>': u'\u0623',
        "'": u'\u0621', 'b': u'\u0628', 't': u'\u062a', 'v': u'\u062b',
        'j': u'\u062c', 'H': u'\u062d', 'x': u'\u062e', 'd': u'\u062f',
        '*': u'\u0630', 'r': u'\u0631', 'z': u'\u0632', 's': u'\u0633',
        '$': u'\u0634', 'S': u'\u0635', 'D': u'\u0636', 'T': u'\u0637',
        'Z': u'\u0638', 'E': u'\u0639', 'g': u'\u063a', 'f': u'\u0641',
        'q': u'\u0642', 'k': u'\u0643', 'l': u'\u0644', 'm': u'\u0645',
        'n': u'\u0646', 'h': u'\u0647', 'w': u'\u0648', 'y': u'\u064a',
        'Y': u'\u0649', 'p': u'\u0629', '&': u'\u0624', '}': u'\u0626',
        # diacritics
        'a': u'\u064e', 'F': u'\u064b', 'u': u'\u064f', 'N': u'\u064c',
        'i': u'\u0650', 'K': u'\u064d', 'o': u'\u0652', '~': u'\u0651',
    }

    def convert(token):
        # Character-by-character lookup; unmapped characters are kept as they are.
        return ''.join(buckwalter_to_arabic.get(ch, ch) for ch in token)

    return ' '.join(convert(token) for token in text.strip().split())

In [77]:
test_sentences = list()
with codecs.open(test_set) as test:
    for i, line in enumerate(test):
#         if len(line.strip().split()) <= 1:
#             print i, from_buck_to_utf8(line.strip())
#             continue
        test_sentences.append(from_buck_to_utf8(line.strip()))
    else:
        print 'Test sentence count:', len(test_sentences)

Train sentence count: 7619
135 أميركا
166 أقسام
179 أسعار
365 الحقيقة
617 بحث
704 على
944 كتب
945 كتب
984 لبنان
985 لبنان
1052 ما
1158 نعم
1263 تعليم
Test sentence count: 1527


In [59]:
# Bag-of-words features: fit a CountVectorizer (default settings) on the
# training sentences and reuse the fitted vocabulary for the test sentences.
# NOTE(review): `sentences` is not assigned in any cell above this one in the
# notebook text; it is assigned by the training-data loading cell further down,
# so this cell only works after that cell has been executed.
count_vect = CountVectorizer()
X_train = count_vect.fit_transform(sentences)
X_test = count_vect.transform(test_sentences)


In [65]:
train_one_str = ' '.join(sentences)
test_one_str = ' '.join(test_sentences)

words_in_train = set(train_one_str.split())
words_in_test = set(test_one_str.split())
print len(words_in_test), len(words_in_train), len(count_vect.vocabulary_)

OOV = list()
for word in words_in_test:
    if count_vect.vocabulary_.get(unicode(word), 0) == 0:
        OOV.append(word)

print 'Out-Of-Vocab: ', len(OOV)

19493 55992 55929
Out-Of-Vocab:  6803


# QCRI-closed-C-run1.txt -CANDIDATE 01

In [None]:
# Containers filled while reading the labelled training file.
sentences = []                 # every non-empty transliterated sentence
removed_sentences = []         # sentences with no tokens after transliteration
removed_sentences_labels = []  # labels belonging to removed_sentences
labels = []

labels_dist = set()

dataset = {}
# Labels handled below: LAV, MSA, EGY, GLF and NOR. Task description: training
# and testing data are released for Egyptian, Gulf, Levantine and North-African
# dialects and for Modern Standard Arabic (MSA).

with codecs.open('/home/disooqi/qcri/dialects/task/DSL-training/task2-train.txt') as training:
    LAV, MSA, EGY, GLF, NOR = [], [], [], [], []
    # Dispatch table: label text -> the list collecting that dialect's sentences.
    _by_label = {'LAV': LAV, 'MSA': MSA, 'EGY': EGY, 'GLF': GLF, 'NOR': NOR}

    for line in training:
        # Tab-separated record: field 0 is the sentence, field 2 is the label.
        sentence_label = line.strip().split('\t')
        utf8_sentence = from_buck_to_utf8(sentence_label[0])

        if not utf8_sentence.strip().split():
            # No tokens left: set the sentence aside together with its label.
            removed_sentences.append(utf8_sentence)
            removed_sentences_labels.append(sentence_label[2])
            continue

        sentences.append(utf8_sentence)
        bucket = _by_label.get(sentence_label[2])
        if bucket is None:
            # Unrecognised label: the sentence stays in `sentences` only and is echoed.
            print(utf8_sentence)
        else:
            bucket.append(utf8_sentence)

    dataset['LAV'] = LAV
    dataset['MSA'] = MSA
    dataset['EGY'] = EGY
    dataset['GLF'] = GLF
    dataset['NOR'] = NOR
    # Rebind the per-dialect names to fresh empty lists; `dataset` keeps the filled ones.
    LAV, MSA, EGY, GLF, NOR = [], [], [], [], []

target_names = dataset.keys()

def divide_dataset(dataset, CV=True, train_perc=80, CV_perc=0, test_perc=20):
    """Randomly split each dialect's sentences into train / CV / test parts.

    Parameters
    ----------
    dataset : dict
        Maps a dialect label to a list of sentences. The lists are not modified.
    CV : bool
        When True, the sentences left over after the training share are divided
        between the cross-validation and test sets. When False they all go to
        the test set and every cross-validation list is empty.
    train_perc, CV_perc, test_perc : number
        Percentages of each dialect assigned to the three parts; they must sum
        to 100.

    Returns
    -------
    (train, cv, test) : tuple of dict
        Each dict maps dialect -> list of sentences. Returns None, after
        printing a message, when the percentages do not sum to 100.
    """
    if train_perc + CV_perc + test_perc != 100:
        print('the sum of percs is not 100')
        return

    samples_train = dict()
    samples_cv = dict()
    samples_test = dict()

    # Fraction of the non-training remainder that goes to cross-validation.
    # Derived from the actual percentages (previously a hard-coded 100 - 60);
    # guarded so that train_perc == 100 does not divide by zero.
    held_out_perc = CV_perc + test_perc
    cv_share = CV_perc / float(held_out_perc) if held_out_perc else 0.0

    for dialect, dialect_sentences in dataset.items():
        # Shuffle through an index permutation so the result is a plain list of
        # the original objects. The slices below must be taken from the shuffled
        # copy, otherwise the split simply follows file order.
        order = permutation(len(dialect_sentences))
        samples = [dialect_sentences[i] for i in order]

        train_len = int(np.ceil(len(samples) * (train_perc / 100.0)))
        samples_train[dialect] = samples[:train_len]

        if CV:
            cv_len = int(np.ceil((len(samples) - train_len) * cv_share))
            samples_cv[dialect] = samples[train_len:train_len + cv_len]
            samples_test[dialect] = samples[train_len + cv_len:]
        else:
            samples_cv[dialect] = list()
            samples_test[dialect] = samples[train_len:]

    return samples_train, samples_cv, samples_test
            

train_set, cv_set, test_set = divide_dataset(dataset, CV=False, train_perc=80 ,CV_perc=0, test_perc=20)


t,c,ts = 0,0,0
for dial in ['LAV', 'MSA', 'EGY', 'GLF', 'NOR']:
    t += len(train_set[dial])
    c += len(cv_set[dial])
    ts+= len(test_set[dial])
    print dial, 'training dataset: ', len(train_set[dial]), ', cross-validation set: ', \
    len(cv_set[dial]),', test:', len(test_set[dial])
    
else:
    print 70*'-'
    print 'Total  ...  Training: ', t, ', cross-validation data', c, ', test: ', ts

dataset_train = train_set['LAV']+train_set['MSA']+train_set['EGY']+train_set['GLF']+train_set['NOR']
dataset_cv = cv_set['LAV']+cv_set['MSA']+cv_set['EGY']+cv_set['GLF']+cv_set['NOR']
dataset_test = test_set['LAV']+test_set['MSA']+test_set['EGY']+test_set['GLF']+test_set['NOR']


label_train = ['LAV' for x in train_set['LAV']] + ['MSA' for x in train_set['MSA']] +\
['EGY' for x in train_set['EGY']] + ['GLF' for x in train_set['GLF']]+['NOR' for x in train_set['NOR']]

label_cv = ['LAV' for x in cv_set['LAV']] + ['MSA' for x in cv_set['MSA']] +\
['EGY' for x in cv_set['EGY']] + ['GLF' for x in cv_set['GLF']]+['NOR' for x in cv_set['NOR']]

label_test = ['LAV' for x in test_set['LAV']] + ['MSA' for x in test_set['MSA']] +\
['EGY' for x in test_set['EGY']] + ['GLF' for x in test_set['GLF']]+['NOR' for x in test_set['NOR']]

train_set, cv_set, test_set = 0,0,0
#print len(label_train),len(label_cv),len(label_test)

train_zipped = zip(dataset_train, label_train)
random.shuffle(train_zipped)
dataset_train, label_train = zip(*train_zipped)

if dataset_cv:
    cv_zipped = zip(dataset_cv, label_cv)
    random.shuffle(cv_zipped)
    dataset_cv, label_cv = zip(*cv_zipped)

if dataset_test:
    dataset_test.extend(removed_sentences)
    label_test.extend(removed_sentences_labels)
    test_zipped = zip(dataset_test, label_test)
    random.shuffle(test_zipped)
    dataset_test, label_test = zip(*test_zipped)

print len(dataset_test), len(removed_sentences), len(label_test)

In [67]:
vectorizer = TfidfVectorizer(analyzer= 'char',lowercase=False, max_df=0.95,ngram_range=(2,5), smooth_idf=False,
                             sublinear_tf=True)

vectorizer.fit(sentences)
X_train = vectorizer.transform(dataset_train)
X_test = vectorizer.transform(dataset_test)

sgd_clf_02 = SGDClassifier()
sgd_clf_02.fit(X_train, label_train)

pred_train = sgd_clf_02.predict(X_train)
pred_test = sgd_clf_02.predict(X_test)

# pipeline_01.fit(dataset_train, label_train)

# pred_train = pipeline_01.predict(dataset_train)
# pred_test = pipeline_01.predict(dataset_test)

print 'Training Acc: ',np.around(np.mean(pred_train == label_train)*100,2), '%'
print 'Testing Acc: ',np.around(np.mean(pred_test == label_test)*100,2), '%'

NameError: name 'dataset_train' is not defined

# QCRI-closed-C-run1.txt -CANDIDATE 02