## Suggestions:
* http://scikit-learn.org/stable/modules/feature_extraction.html
* https://github.com/timshenkao/StringKernelSVM

### Use the following to show off the results
http://melissagymrek.com/python/2014/01/12/ipython-tables.html

In [1]:
%matplotlib inline
import codecs
from collections import Counter
import random 
import numpy as np
from numpy.random import permutation, shuffle, rand
from numpy.linalg import svd

import matplotlib.pyplot as plt

from scipy.optimize import minimize
from scipy.io import loadmat

from sklearn.pipeline import Pipeline
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn import svm
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

from sklearn import metrics
from sklearn.metrics import confusion_matrix

from sklearn.grid_search import GridSearchCV

In [2]:
def from_buck_to_utf8(text):
    """Transliterate Buckwalter-encoded text into Arabic script.

    The input is stripped and split on whitespace. Inside every word each
    character found in the mapping table is replaced by its Arabic code
    point; characters without an entry are kept as they are. The converted
    words are returned joined by single spaces.
    """
    buckwalter_to_arabic = {
        # hamza forms and alef variants
        "'": u'\u0621', '|': u'\u0622', '>': u'\u0623', '&': u'\u0624',
        '<': u'\u0625', '}': u'\u0626', 'A': u'\u0627',
        # consonants
        'b': u'\u0628', 'p': u'\u0629', 't': u'\u062a', 'v': u'\u062b',
        'j': u'\u062c', 'H': u'\u062d', 'x': u'\u062e', 'd': u'\u062f',
        '*': u'\u0630', 'r': u'\u0631', 'z': u'\u0632', 's': u'\u0633',
        '$': u'\u0634', 'S': u'\u0635', 'D': u'\u0636', 'T': u'\u0637',
        'Z': u'\u0638', 'E': u'\u0639', 'g': u'\u063a', 'f': u'\u0641',
        'q': u'\u0642', 'k': u'\u0643', 'l': u'\u0644', 'm': u'\u0645',
        'n': u'\u0646', 'h': u'\u0647', 'w': u'\u0648', 'Y': u'\u0649',
        'y': u'\u064a',
        # diacritics
        'F': u'\u064b', 'N': u'\u064c', 'K': u'\u064d', 'a': u'\u064e',
        'u': u'\u064f', 'i': u'\u0650', '~': u'\u0651', 'o': u'\u0652',
    }
    converted_words = [
        ''.join(buckwalter_to_arabic.get(symbol, symbol) for symbol in token)
        for token in text.strip().split()
    ]
    return ' '.join(converted_words)

In [3]:
# Load the training file and bucket its sentences by dialect label.
# `sentences` collects every non-empty sentence; sentences with no tokens are
# set aside in `removed_sentences` together with their labels.
sentences = list()
removed_sentences = list()
removed_sentences_labels = list()
# `labels` and `labels_dist` are created here but not populated in this cell.
labels = list()

labels_dist = set()

# Maps dialect label -> list of sentences carrying that label.
dataset = dict()
#We will release training and testing data for the following Arabic dialects: 
# Egyptian, Gulf, Levantine, and North-African, and Modern Standard Arabic (MSA)

# NOTE(review): no `encoding` is passed to codecs.open, so presumably the lines
# are read without decoding -- confirm this is intended.
with codecs.open('/home/disooqi/qcri/dialects/task/DSL-training/task2-train.txt') as training:
    LAV = list()
    MSA = list()
    EGY = list()
    GLF = list()
    NOR = list()
    for i, line in enumerate(training):
        # Tab-separated record: field 0 is the sentence, field 2 is the dialect label.
        sentence_label = line.strip().split('\t')
        utf8_sentence = sentence_label[0]
        
        # labels.append(sentence_label[2])
        
        # A sentence without any whitespace-separated token is kept aside, not used for training.
        if len(utf8_sentence.strip().split()) <= 0:
            removed_sentences.append(utf8_sentence)
            removed_sentences_labels.append(sentence_label[2])
#             print i, sentence_label[0]
            continue
        
        # The sentence is recorded before the label is checked, so a sentence with
        # an unrecognised label ends up in `sentences` but in none of the buckets.
        sentences.append(utf8_sentence)
        if sentence_label[2] == 'LAV':
            LAV.append(utf8_sentence)
        elif sentence_label[2] == 'MSA':
            MSA.append(utf8_sentence)
        elif sentence_label[2] == 'EGY':
            EGY.append(utf8_sentence)
        elif sentence_label[2] == 'GLF':
            GLF.append(utf8_sentence)
        elif sentence_label[2] == 'NOR':
            NOR.append(utf8_sentence)
        else:
            # Unrecognised label: just echo the sentence.
            print(utf8_sentence)
    else:
        # The loop has no `break`, so this branch always runs once the file is exhausted.
#         print 'sentence count:', len(sentences)
#         print set(labels)
        dataset['LAV'] = LAV
        dataset['MSA'] = MSA
        dataset['EGY'] = EGY
        dataset['GLF'] = GLF
        dataset['NOR'] = NOR
        # Rebind the bucket names to fresh lists; `dataset` keeps the filled ones.
        LAV = list()
        MSA = list()
        EGY = list()
        GLF = list()
        NOR = list()

# Key order of a plain dict; the recorded output shows it differs from insertion order.
target_names = dataset.keys()
print target_names

def divide_dataset(dataset, CV=True, train_perc=80, CV_perc=0, test_perc=20):
    """Randomly split every class of `dataset` into train / CV / test lists.

    Parameters
    ----------
    dataset : dict
        Maps a class label to a list of samples.
    CV : bool
        When true, a cross-validation part is carved out of the samples that
        are not used for training; otherwise the CV part is empty and all
        remaining samples go to the test part.
    train_perc, CV_perc, test_perc : number
        Percentages of each class assigned to the three parts; they must add
        up to 100.

    Returns
    -------
    tuple of three dicts ``(train, cv, test)`` keyed like `dataset`, each value
    being a plain list. If the percentages do not add up to 100 a message is
    printed and ``None`` is returned.

    The input lists are not modified. Each class is shuffled independently
    before it is sliced. The part sizes are rounded up with ``ceil``, so the
    test part receives whatever is left.
    """
    if train_perc + CV_perc + test_perc != 100:
        print('the sum of percs is not 100')
        return

    samples_train = dict()
    samples_cv = dict()
    samples_test = dict()

    # Fraction of the non-training remainder that belongs to the CV part.
    # Derived from train_perc instead of assuming a fixed 60% training share.
    remainder_perc = 100.0 - train_perc
    cv_share = CV_perc / remainder_perc if remainder_perc else 0.0

    for dialect, sentences in dataset.items():
        # Shuffle through an index permutation so the result stays a list of
        # the original objects and the caller's list is left untouched.
        order = permutation(len(sentences))
        samples = [sentences[idx] for idx in order]

        train_len = int(np.ceil(len(samples) * (train_perc / 100.0)))
        samples_train[dialect] = samples[:train_len]
        if CV:
            cv_len = int(np.ceil((len(samples) - train_len) * cv_share))
            samples_cv[dialect] = samples[train_len:train_len + cv_len]
            samples_test[dialect] = samples[train_len + cv_len:]
        else:
            samples_cv[dialect] = list()
            samples_test[dialect] = samples[train_len:]

    return samples_train, samples_cv, samples_test
            

# Split every dialect 80/20 into train/test (no cross-validation part), flatten the
# per-dialect lists into parallel sample/label sequences, shuffle them and dump the
# training part to a tab-separated file.
train_set, cv_set, test_set = divide_dataset(dataset, CV=False, train_perc=80 ,CV_perc=0, test_perc=20)

# Fixed label order; the position of a label in this list is the class index
# written to the LIBSVM_* files below.
target_names = ['LAV', 'MSA', 'EGY', 'GLF', 'NOR']

# Running totals over all dialects: train, cross-validation, test.
t,c,ts = 0,0,0
for dial in ['LAV', 'MSA', 'EGY', 'GLF', 'NOR']:
    t += len(train_set[dial])
    c += len(cv_set[dial])
    ts+= len(test_set[dial])
    print dial, 'training dataset: ', len(train_set[dial]), ', cross-validation set: ', \
    len(cv_set[dial]),', test:', len(test_set[dial])
    
else:
    # No `break` in the loop, so the summary is always printed.
    print 70*'-'
    print 'Total  ...  Training: ', t, ', cross-validation data', c, ', test: ', ts

# Concatenate the per-dialect lists in the order LAV, MSA, EGY, GLF, NOR.
dataset_train = train_set['LAV']+train_set['MSA']+train_set['EGY']+train_set['GLF']+train_set['NOR']
dataset_cv = cv_set['LAV']+cv_set['MSA']+cv_set['EGY']+cv_set['GLF']+cv_set['NOR']
dataset_test = test_set['LAV']+test_set['MSA']+test_set['EGY']+test_set['GLF']+test_set['NOR']


# Label sequences built in the same dialect order, one label per sample.
label_train = ['LAV' for x in train_set['LAV']] + ['MSA' for x in train_set['MSA']] +\
['EGY' for x in train_set['EGY']] + ['GLF' for x in train_set['GLF']]+['NOR' for x in train_set['NOR']]

label_cv = ['LAV' for x in cv_set['LAV']] + ['MSA' for x in cv_set['MSA']] +\
['EGY' for x in cv_set['EGY']] + ['GLF' for x in cv_set['GLF']]+['NOR' for x in cv_set['NOR']]

label_test = ['LAV' for x in test_set['LAV']] + ['MSA' for x in test_set['MSA']] +\
['EGY' for x in test_set['EGY']] + ['GLF' for x in test_set['GLF']]+['NOR' for x in test_set['NOR']]

# Drop the references to the per-dialect dicts; only the flat sequences are used from here on.
train_set, cv_set, test_set = 0,0,0
#print len(label_train),len(label_cv),len(label_test)

# Shuffle samples and labels together. random.shuffle works in place, so zip()
# must return a list here (Python 2 behaviour, consistent with the print statements).
train_zipped = zip(dataset_train, label_train)
random.shuffle(train_zipped)
# After unzipping, both names are bound to tuples.
dataset_train, label_train = zip(*train_zipped)

# Skipped when the CV part is empty (as with CV=False above).
if dataset_cv:
    cv_zipped = zip(dataset_cv, label_cv)
    random.shuffle(cv_zipped)
    dataset_cv, label_cv = zip(*cv_zipped)

if dataset_test:
    # The token-less sentences set aside while loading are added to the test part only.
    dataset_test.extend(removed_sentences)
    label_test.extend(removed_sentences_labels)
    test_zipped = zip(dataset_test, label_test)
    random.shuffle(test_zipped)
    dataset_test, label_test = zip(*test_zipped)

print len(dataset_test), len(removed_sentences), len(label_test)

# One line per training sample: "<class index>\t<sentence>\n".
with codecs.open('LIBSVM_train', mode='w', encoding='utf8') as of:
    for l, s in zip(label_train, dataset_train):
        of.write(str(target_names.index(l)))
        of.write('\t')
        of.write(s)
        of.write('\n')
        

['NOR', 'LAV', 'GLF', 'EGY', 'MSA']
LAV training dataset:  1407 , cross-validation set:  0 , test: 351
MSA training dataset:  800 , cross-validation set:  0 , test: 199
EGY training dataset:  1263 , cross-validation set:  0 , test: 315
GLF training dataset:  1338 , cross-validation set:  0 , test: 334
NOR training dataset:  1290 , cross-validation set:  0 , test: 322
----------------------------------------------------------------------
Total  ...  Training:  6098 , cross-validation data 0 , test:  1521
1521 0 1521


In [7]:
# Dump the test part in the same layout as the training file:
# "<index of the label in target_names>\t<sentence>\n", encoded as UTF-8.
with codecs.open('LIBSVM_test', mode='w', encoding='utf8') as of:
    for l, s in zip(label_test, dataset_test):
        of.write(str(target_names.index(l)))
        of.write('\t')
        of.write(s)
        of.write('\n')

In [4]:
# Sanity check: number of kept sentences, number of token-less sentences that
# were set aside, and a listing of the latter with their labels.
print len(sentences)
print len(removed_sentences), len(removed_sentences_labels)

for t in zip(removed_sentences, removed_sentences_labels):
    print t[0], t[1]

7619
0 0


In [None]:
-

In [5]:
def randInitializeWeights(L_in_size, L_out_size, epsilon_init=0.12):
    """Return random initial weights for one layer, including its bias column.

    Parameters
    ----------
    L_in_size : int
        Number of incoming connections (units in the previous layer).
    L_out_size : int
        Number of outgoing connections (units in this layer).
    epsilon_init : float
        Half-width of the sampling interval. The default 0.12 is the value
        the original code always ended up using; pass
        ``np.sqrt(6) / np.sqrt(L_in_size + L_out_size)`` for the
        size-dependent range that the original computed and then discarded.

    Returns
    -------
    numpy.ndarray of shape ``(L_out_size, L_in_size + 1)`` with entries drawn
    from ``rand`` and rescaled to lie between ``-epsilon_init`` and
    ``epsilon_init``.
    """
    # rand() yields values in [0, 1); stretch and shift them to be symmetric around 0.
    return rand(L_out_size, L_in_size + 1) * 2 * epsilon_init - epsilon_init

def sigmoid(z):
    """Logistic function 1 / (1 + e**-z), applied element-wise by numpy."""
    decay = np.exp(-z)
    return 1 / (1 + decay)

def sigmoidGradient(z):
    """Derivative of the logistic function at z: sigmoid(z) * (1 - sigmoid(z))."""
    activation = sigmoid(z)
    return activation * (1 - activation)

def nnCostFunction(nn_params, input_layer_size, hidden_layer_size, num_labels, features, incidence_y, reg_parameter):
    """Regularised cross-entropy cost of a three-layer sigmoid network.

    Parameters
    ----------
    nn_params : 1-D numpy.ndarray
        Both weight matrices flattened column-major (``order='F'``) and
        concatenated, first-layer weights first.
    input_layer_size, hidden_layer_size, num_labels : int
        Layer sizes, used to unroll `nn_params`.
    features : 2-D array, one row per sample
        Input matrix without the bias column (it is added here).
    incidence_y : 2-D array, shape (samples, num_labels)
        Target indicator matrix, multiplied element-wise with the network output.
    reg_parameter : number
        Regularisation strength applied to all non-bias weights.

    Returns
    -------
    The scalar cost (data term plus regularisation term), or ``None`` when
    `nn_params` is not one-dimensional. The unregularised data term is
    printed on every call.
    """
    if nn_params.ndim != 1:
        return

    # Unroll the flat parameter vector back into the two weight matrices.
    theta1_size = (input_layer_size+1) * hidden_layer_size
    Theta1 = nn_params[:theta1_size].reshape((hidden_layer_size, input_layer_size+1), order='F')  # (hidden, input+1)
    Theta2 = nn_params[theta1_size:].reshape((num_labels, hidden_layer_size+1), order='F')        # (labels, hidden+1)

    # Forward pass with samples stored as columns after the first layer.
    m, _ = features.shape
    a_1 = np.c_[np.ones((m)), features]                 # (m, input+1)
    z_2 = Theta1.dot(a_1.T)                             # (hidden, m)
    a_2 = np.vstack((np.ones((m)), sigmoid(z_2)))       # (hidden+1, m), bias row on top
    a_3 = sigmoid(Theta2.dot(a_2))                      # (labels, m)

    # Use the argument, not a module-level global, so the value passed by the
    # optimiser is the one that is applied. Bias columns are not penalised.
    reg_term = reg_parameter * (np.sum(Theta1[:, 1:]**2) + np.sum(Theta2[:, 1:]**2)) / (2.0 * m)
    error = np.sum(-incidence_y*np.log(a_3.T) - (1-incidence_y)*np.log(1 - a_3.T)) / float(m)
    # Same text as the former print statement, written so it also parses as a function call.
    print('Train error:  %s' % error)
    return error + reg_term


def nn_gradient(nn_params, input_layer_size, hidden_layer_size, num_labels,features, incidence_y, _lambda):
    """Gradient of the regularised cross-entropy cost of the three-layer network.

    Parameters
    ----------
    nn_params : 1-D numpy.ndarray
        Both weight matrices flattened column-major (``order='F'``) and
        concatenated, first-layer weights first.
    input_layer_size, hidden_layer_size, num_labels : int
        Layer sizes, used to unroll `nn_params`.
    features : 2-D array, one row per sample
        Input matrix without the bias column (it is added here).
    incidence_y : 2-D array, shape (samples, num_labels)
        Target indicator matrix.
    _lambda : number
        Regularisation strength applied to all non-bias weights.

    Returns
    -------
    1-D numpy.ndarray laid out exactly like `nn_params`, or ``None`` when
    `nn_params` is not one-dimensional.

    Back-propagation is done for all samples at once with matrix products;
    this accumulates the same per-sample outer products as an explicit loop
    over the rows.
    """
    if nn_params.ndim != 1:
        return

    m, _ = features.shape
    X = np.c_[np.ones((m)), features]                                   # (m, input+1)

    # Unroll the flat parameter vector back into the two weight matrices.
    theta1_size = (input_layer_size+1) * hidden_layer_size
    Theta1 = nn_params[:theta1_size].reshape((hidden_layer_size, input_layer_size+1), order='F')  # (hidden, input+1)
    Theta2 = nn_params[theta1_size:].reshape((num_labels, hidden_layer_size+1), order='F')        # (labels, hidden+1)

    # Forward pass, samples as rows.
    hidden_act = sigmoid(X.dot(Theta1.T))                               # (m, hidden)
    A2 = np.c_[np.ones((m)), hidden_act]                                # (m, hidden+1)
    H = sigmoid(A2.dot(Theta2.T))                                       # (m, labels)

    # "Error terms": how much each node contributed to the output error.
    delta3 = H - incidence_y                                            # (m, labels)
    # Drop the bias column, then multiply by the sigmoid derivative a * (1 - a).
    delta2 = delta3.dot(Theta2)[:, 1:] * hidden_act * (1 - hidden_act)  # (m, hidden)

    # Sums over samples of the per-sample outer products.
    Delta2 = delta3.T.dot(A2)                                           # (labels, hidden+1)
    Delta1 = delta2.T.dot(X)                                            # (hidden, input+1)

    # Force float division: with an integer _lambda, `_lambda/m` truncates to 0
    # under Python 2 and silently removes the regularisation from the gradient.
    reg_scale = float(_lambda) / m
    D2 = Delta2 / float(m) + reg_scale * np.c_[np.zeros((Theta2.shape[0])), Theta2[:, 1:]]
    D1 = Delta1 / float(m) + reg_scale * np.c_[np.zeros((Theta1.shape[0])), Theta1[:, 1:]]
    return np.r_[D1.ravel(order='F'), D2.ravel(order='F')]

    
def predict_from_three_layer_NN(Theta1, Theta2, X):
    """Run a forward pass through the three-layer sigmoid network.

    `X` holds one sample per row without a bias column; a column of ones is
    prepended before each weight matrix is applied. Returns the output-layer
    activations with one row per sample and one column per output unit.
    """
    n_samples, _ = X.shape
    bias = np.ones((n_samples, 1))

    inputs = np.hstack((bias, X))                        # (samples, input+1)
    hidden = sigmoid(Theta1.dot(inputs.T)).T             # (samples, hidden)
    hidden_with_bias = np.hstack((bias, hidden))         # (samples, hidden+1)

    return sigmoid(Theta2.dot(hidden_with_bias.T)).T     # (samples, labels)

# Feature Generation

In [None]:
# Word-level TF-IDF features and one-hot targets for the hand-written network.
tfidf_vect = TfidfVectorizer() #min_df=.0001, max_df=0.8, max_features=10000, ngram_range=(1,2)
# NOTE(review): the vectorizer is fitted on `sentences`, i.e. every loaded sentence,
# which includes those later used as test data -- confirm this is intended.
tfidf_vect.fit(sentences)
X_train = tfidf_vect.transform(dataset_train)
X_test = tfidf_vect.transform(dataset_test)

# One input unit per TF-IDF feature.
input_layer_size =  X_train.shape[1]
hidden_layer_size = 10
num_labels = 5

# Class index = position in this list. Note the order differs from `target_names`
# (MSA and LAV are swapped), so these indices do not match the LIBSVM_* files.
labels=[ 'MSA', 'LAV', 'EGY', 'GLF', 'NOR']
org_y = np.array([labels.index(d) for d in label_train])
tmp = org_y.copy()
# One-hot encode: row i gets a 1 in the column of its class index.
y = np.zeros((tmp.size, num_labels))
y[np.arange(tmp.size), tmp.ravel()] = 1  # (n_samples, num_labels)

# Earlier experiment on a handwritten-digits .mat file, kept for reference:
# input_layer_size  = 400;  # 20x20 Input Images of Digits
# hidden_layer_size = 10;   # 25 hidden units
# num_labels = 10;          # 10 labels, from 1 to 10

# handwritten_digits = loadmat('/home/disooqi/ml/machine-learning-ex4/ex4/ex4data1.mat')
# handwritten_digits.keys()

# X_train = handwritten_digits['X']
# #m, n = features.shape

# org_y = handwritten_digits['y']
# tmp = org_y.copy()
#tmp[tmp==10] = 0
# y = np.zeros((tmp.size, num_labels))
# y[np.arange(tmp.size), tmp.ravel()] = 1  # (5000, 10)

# Cell result: the two shapes, for a quick visual check.
X_train.shape, y.shape

In [None]:
# Train the three-layer network with conjugate gradient and report training accuracy.
# Regularisation strength; also read as a global by nnCostFunction.
_lambda = 1


# print X_train.shape
initial_Theta1 = randInitializeWeights(input_layer_size, hidden_layer_size)
initial_Theta2 = randInitializeWeights(hidden_layer_size, num_labels)
# print initial_Theta1.shape, initial_Theta2.shape

# Flatten both matrices column-major into the single vector the optimiser works on.
initial_weights = np.r_[initial_Theta1.ravel(order='F'), initial_Theta2.ravel(order='F')]
# print initial_weights.shape


#D = nn_gradient(initial_weights, input_layer_size, hidden_layer_size, num_labels, X_train, y, _lambda)

# Same expression as above, evaluated a second time.
initial_weights = np.r_[initial_Theta1.ravel(order='F'), initial_Theta2.ravel(order='F')]
# The sparse TF-IDF matrix is converted to a dense array because the cost and
# gradient functions use dense numpy operations.
res = minimize(fun=nnCostFunction, x0 =initial_weights, 
               args=(input_layer_size, hidden_layer_size, num_labels,X_train.toarray(), y, _lambda), method='CG', 
               jac=nn_gradient, options={'maxiter':200})

# Unroll the optimised vector with the same column-major layout used when flattening.
theta1_size = (input_layer_size+1) * hidden_layer_size
opt_Theta1 = res.x[:theta1_size].reshape((hidden_layer_size,input_layer_size+1), order='F') # (hidden, input+1)
opt_Theta2 = res.x[theta1_size:].reshape((num_labels, hidden_layer_size+1), order='F') # (labels, hidden+1)
pred = predict_from_three_layer_NN(opt_Theta1, opt_Theta2, X_train.toarray())
# Cell result: training accuracy in percent (predicted class = column with the largest activation).
np.mean(pred.argmax(axis=1) == tmp.ravel())*100

In [None]:
# Test accuracy of the trained network, in percent.
pred = predict_from_three_layer_NN(opt_Theta1, opt_Theta2, X_test.toarray())

# Map the test labels to class indices with the same `labels` order used for training.
y_test = np.array([labels.index(d) for d in label_test])
np.mean(pred.argmax(axis=1) == y_test.ravel())*100

In [None]:
# lambda = 3 and full features
# Same test-accuracy computation as the previous cell, kept for a separate run
# (the author's note on the line above refers to lambda = 3 and full features).
pred = predict_from_three_layer_NN(opt_Theta1, opt_Theta2, X_test.toarray())

# Map the test labels to class indices with the `labels` order.
y_test = np.array([labels.index(d) for d in label_test])
np.mean(pred.argmax(axis=1) == y_test.ravel())*100

In [None]:
# Training accuracy of the trained network, in percent; `tmp` holds the
# training class indices built in the feature-generation cell.
pred = predict_from_three_layer_NN(opt_Theta1, opt_Theta2, X_train.toarray())
np.mean(pred.argmax(axis=1) == tmp.ravel())*100

In [None]:
# Baseline scikit-learn classifiers; lr_clf and sgd_clf are fitted in later cells.
mnb_clf = MultinomialNB()
lr_clf = LogisticRegression()
# Linear model trained with SGD on the hinge loss, L2 penalty, fixed seed.
sgd_clf = SGDClassifier(loss='hinge', penalty='l2', alpha=1e-3, n_iter=5, random_state=42)
svm_clf = svm.LinearSVC()

# help (SGDClassifier)

In [6]:
# Linear discriminant analysis on character n-gram (2 to 5) TF-IDF features.
vectorizer = TfidfVectorizer(analyzer= 'char',lowercase=False, max_df=0.95,ngram_range=(2,5), smooth_idf=False,
                             sublinear_tf=True)

# NOTE(review): fitted on every loaded sentence, test sentences included -- confirm intended.
vectorizer.fit(sentences)
X_train = vectorizer.transform(dataset_train)
lda = LinearDiscriminantAnalysis()

# NOTE(review): the recorded output of this cell is a MemoryError, presumably raised
# while working on the dense copy produced by .toarray() -- confirm. The
# commented-out TruncatedSVD lines below suggest a dimensionality reduction was
# being considered.
lda.fit(X_train.toarray(), label_train)
# svd = TruncatedSVD(n_components=640, random_state=42)
# u,s,v = svd.fit(X_train) 
# train_pred = lda.predict(X_train)
# test_pred = lda.predict(X_test)

# print 'Training Acc: ',np.around(np.mean(pred_train == label_train)*100,2), '%'
# print 'Testing Acc: ',np.around(np.mean(pred_test == label_test)*100,2), '%'
print type(X_train)

MemoryError: 

In [None]:
vectorizer = TfidfVectorizer(analyzer= 'char',lowercase=False, max_df=0.95,ngram_range=(2,5), smooth_idf=False,
                             sublinear_tf=True)

pipeline_01 = Pipeline([('v_01', vectorizer),
                        ('clf_01', SGDClassifier())])
# lowercase=False, ngram_range=(1,2)
#min_df=.0001, max_df=0.8, max_features=10000, ngram_range=(1,2)
parameters = {
#      'v_01__ngram_range': [(1, 1), (1, 2),(1, 3),(2,5),(3,5)],
#     'v_01__analyzer':('word', 'char'),
#     'v_01__use_idf': (True, False),
#     'v_01__max_df' : np.round(np.linspace(.6,.7,10), 2),
#     'v_01__max_df' : (0.5,.6,.66,.7, 0.95),
#     'v_01__min_df':(1,2,3,4,5,6,7,8,9,10),
#     'v_01__lowercase' : (True, False),
#     'v_01__norm' : ('l1', 'l2', None),
#     'v_01__binary': (False,True),
#     'clf_01__alpha': (1e-2, 1e-3),
#     'v_01__smooth_idf':(True,False),
#     'v_01__sublinear_tf':(True,False),
    #'clf_01__penalty':('none', 'l2', 'l1', 'elasticnet'),
#     'clf_01__loss':('hinge', 'log', 'modified_huber', 'squared_hinge', 'perceptron', 'squared_loss', 'huber', 
#      'epsilon_insensitive', 'squared_epsilon_insensitive'),
}
gs_clf = GridSearchCV(pipeline_01, parameters, n_jobs=-1)
gs_clf.fit(dataset_train, label_train)


pred_train = gs_clf.predict(dataset_train)
pred_test = gs_clf.predict(dataset_test)

# pipeline_01.fit(dataset_train, label_train)

# pred_train = pipeline_01.predict(dataset_train)
# pred_test = pipeline_01.predict(dataset_test)

print 'Training Acc: ',np.around(np.mean(pred_train == label_train)*100,2), '%'
print 'Testing Acc: ',np.around(np.mean(pred_test == label_test)*100,2), '%'

print(metrics.classification_report(label_test, pred_test, target_names=['LAV', 'MSA', 'EGY', 'GLF', 'NOR']))

best_parameters, score, _ = max(gs_clf.grid_scores_, key=lambda x: x[1])
for param_name in sorted(parameters.keys()):
    print("%s: %r" % (param_name, best_parameters[param_name]))
score 

In [8]:
# SGD classifier with default settings on character n-gram (2 to 5) TF-IDF features.
vectorizer = TfidfVectorizer(analyzer= 'char',lowercase=False, max_df=0.95,ngram_range=(2,5), smooth_idf=False,
                             sublinear_tf=True)

# NOTE(review): fitted on every loaded sentence, test sentences included -- confirm intended.
vectorizer.fit(sentences)
X_train = vectorizer.transform(dataset_train)
X_test = vectorizer.transform(dataset_test)
# X_removed = vectorizer.transform(removed_sentences)

sgd_clf_02 = SGDClassifier()
sgd_clf_02.fit(X_train, label_train)

pred_train = sgd_clf_02.predict(X_train)
pred_test = sgd_clf_02.predict(X_test)
# pred_removed = sgd_clf_02.predict(X_removed)

# Accuracy = share of predictions equal to the reference label, in percent, two decimals.
print 'Training Acc: ',np.around(np.mean(pred_train == label_train)*100,2), '%'
print 'Testing Acc: ',np.around(np.mean(pred_test == label_test)*100,2), '%'
# print 'removed Acc: ',np.around(np.mean(pred_removed == removed_sentences_labels)*100,2), '%'

Training Acc:  98.49 %
Testing Acc:  65.94 %


In [None]:
# Logistic regression on whatever X_train / X_test the most recently executed
# feature cell produced; lr_clf is created in the classifier-instantiation cell.
lr_clf.fit(X_train, label_train)
y_pred = lr_clf.predict(X_test)                  # .score(X_train, label_train)


# score() returns the mean accuracy; shown in percent with two decimals.
print 'Training Acc: ',np.around(lr_clf.score(X_train, label_train)*100,2), '%'
print 'Testing Acc: ',np.around(lr_clf.score(X_test, label_test)*100,2), '%'

In [None]:
sgd_clf.fit(X_train, label_train)
train_pred = sgd_clf.predict(X_train) 
y_pred = sgd_clf.predict(X_test) 


print 'Training Acc: ',np.around(np.mean(train_pred == label_train)*100,2), '%'
print 'Testing Acc: ',np.around(np.mean(y_pred == label_test)*100,2), '%'

print(metrics.classification_report(label_test, y_pred, target_names=['LAV', 'MSA', 'EGY', 'GLF', 'NOR']))

In [None]:
# Confusion matrix of the latest `y_pred`, with rows and columns in the order MSA, LAV, EGY, GLF, NOR.
cm = confusion_matrix(label_test, y_pred, labels=[ 'MSA', 'LAV', 'EGY', 'GLF', 'NOR'])
# Divide each row by its row sum and scale to percent (three decimals), so every
# row shows how the samples of one reference label were distributed over the predictions.
cm_normalized = np.round(cm.astype('float') / cm.sum(axis=1)[:, np.newaxis] *100,3)

print cm_normalized

In [None]:
# Show the test sentences labelled LAV that were predicted as EGY, transliterated to Arabic script.
# NOTE(review): `A` is not used anywhere in this cell -- the dense copy looks unnecessary.
A = X_test.toarray()

# Boolean mask: True where the prediction differs from the reference label.
dd = y_pred!=label_test
for i in np.arange(dd.size):
    if dd[i]:
        
        if label_test[i]=='LAV' and y_pred[i]=='EGY':
            print from_buck_to_utf8(dataset_test[i]), label_test[i], y_pred[i]
        # NOTE(review): the separator is printed for every misclassified sample, not
        # only for the LAV->EGY ones shown above -- confirm whether that is intended.
        print '-' * 80
        

#dataset_test
#A[y_pred != label_test].shape

In [None]:
# Scratch cell: a list literal of Arabic words/phrases padded with empty strings.
# The value is not assigned to any name.
['زين', 'ما صار', 'اللي', 'شو', 'لحتى', 'زي ما بيقولوا', 'مش هيجي', 'هون','ما بينفعنا','','','','','','','','','']

In [None]:
# Scratch cell: evaluates to an empty list; the value is not assigned to any name.
list()