In [1]:
%matplotlib inline

In [38]:
import matplotlib.pyplot as plt
import numpy as np
from sklearn import metrics
from sklearn import svm
from sklearn.ensemble import VotingClassifier
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer, TfidfVectorizer
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import GridSearchCV
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.utils.multiclass import unique_labels

from preparedata import prepare_train_data_for_task_1, prepare_dev_data_for_task_1, prepare_test_data_for_task_1

In [2]:
# Corpus identifier handed to every loader below.
corpus = 'corpus-26'

# Load the train / dev / test splits. The train loader returns four values;
# only the source sentences and the target labels are used in this notebook.
_train_src, _, train_tgt, _ = prepare_train_data_for_task_1(corpus=corpus)
_dev_src, dev_tgt = prepare_dev_data_for_task_1(corpus=corpus)
_test_src = prepare_test_data_for_task_1(corpus=corpus)

# Work on full-slice copies so the objects returned by the loaders are not
# the ones the rest of the notebook operates on.
train_src, dev_src, test_src = _train_src[:], _dev_src[:], _test_src[:]

# Peek at the first three training sentences.
train_src[:3]

['مانقدرش ندير الأمتعة ديالي تحت الكرسي . تقدر تخليهم ، عافاك ؟',
 'ما عندك مانع أجي لعندك ؟',
 'هدول مش صناعة يابانية ، هما هيك ؟']

In [41]:
# Word-level unigram counts. With max_df=0.95, terms that occur in more than
# 95% of the training sentences are dropped from the vocabulary.
count_vec = CountVectorizer(analyzer='word', min_df=1, max_df=0.95, ngram_range=(1, 1))
X_train_counts = count_vec.fit_transform(train_src)
X_dev_counts = count_vec.transform(dev_src)

# Term-frequency weighting only (IDF disabled).
tf_vec = TfidfTransformer(use_idf=False)
X_train_tf = tf_vec.fit_transform(X_train_counts)
X_dev_tf = tf_vec.transform(X_dev_counts)

# TF-IDF weighting with sublinear term frequency and unsmoothed IDF.
tfidf_vect = TfidfTransformer(use_idf=True, smooth_idf=False, sublinear_tf=True)
X_train_tfidf = tfidf_vect.fit_transform(X_train_counts)
X_dev_tfidf = tfidf_vect.transform(X_dev_counts)

# Vocabulary size. `vocabulary_` (term -> column index) is read directly
# because `get_feature_names()` was deprecated in scikit-learn 1.0 and removed
# in 1.2; the dictionary length is the same number on every version.
len(count_vec.vocabulary_)

27364

In [32]:
def sort_coo(coo_matrix):
    """Return the matrix entries as (column, value) pairs, largest value first.

    Pairs are built from the `col` and `data` attributes of `coo_matrix`.
    Ties on the value are broken by column index, also in descending order.
    """
    pairs = list(zip(coo_matrix.col, coo_matrix.data))
    pairs.sort(key=lambda pair: (pair[1], pair[0]), reverse=True)
    return pairs

def extract_topn_from_vector(feature_names, sorted_items, topn=10):
    """Map the first `topn` scored items to their feature names.

    Parameters
    ----------
    feature_names : indexable
        Lookup from feature index to feature name.
    sorted_items : sequence of (index, score) pairs
        Items already ordered from most to least relevant.
    topn : int, default 10
        Number of leading items to keep.

    Returns
    -------
    dict
        Feature name -> score rounded to three decimals, in the order the
        items appear in `sorted_items`.
    """
    return {
        feature_names[index]: round(value, 3)
        for index, value in sorted_items[:topn]
    }

# Feature names ordered by column index, rebuilt from the fitted vocabulary
# (term -> column index). This yields the same list the old
# `get_feature_names()` returned, but also works on scikit-learn >= 1.2 where
# that method no longer exists.
feature_names = sorted(count_vec.vocabulary_, key=count_vec.vocabulary_.get)

# Sample sentence to extract keywords from.
doc="ذهب محمد الى المدرسة ."

# TF-IDF vector of the sample sentence, using the vectorizers fitted above.
tf_idf_vector = tfidf_vect.transform(count_vec.transform([doc]))

# (column, score) pairs ordered by descending score.
sorted_items = sort_coo(tf_idf_vector.tocoo())

# Keep only the two highest-scoring terms.
keywords = extract_topn_from_vector(feature_names, sorted_items, 2)

# Show the sentence followed by its keywords and scores.
print("\n=====Doc=====")
print(doc)
print("\n===Keywords===")
for k in keywords:
    print(k, keywords[k])


=====Doc=====
ذهب محمد الى المدرسة .

===Keywords===
محمد 0.623
المدرسة 0.617


In [35]:
# Dense ndarray copies of the sparse TF-IDF matrices.
# NOTE(review): the training matrix alone has 41600 x 27364 (about 1.1 billion)
# cells once densified, so this cell is very memory-hungry. If `tt` / `dd` are
# not needed later, `X_train_tfidf.shape, X_dev_tfidf.shape` shows the same
# shapes without allocating anything.
tt, dd = X_train_tfidf.toarray(), X_dev_tfidf.toarray()
tt.shape, dd.shape

((41600, 27364), (5200, 27364))

In [7]:
# Baseline: multinomial Naive Bayes on the word-level TF-IDF features.
clf_1 = MultinomialNB()
clf_1.fit(X_train_tfidf, train_tgt)

train_pred = clf_1.predict(X_train_tfidf)
dev_pred_1 = clf_1.predict(X_dev_tfidf)

# Accuracy as a percentage rounded to two decimals. The second figure is
# measured on the dev split, although its label reads "Testing".
print('Training Acc: ', np.around(100 * np.mean(train_pred == train_tgt), 2), '%')
print('Testing Acc: ', np.around(100 * np.mean(dev_pred_1 == dev_tgt), 2), '%')

# Per-class precision / recall / F1 on the dev split.
print(metrics.classification_report(dev_tgt, dev_pred_1))

Training Acc:  81.51 %
Testing Acc:  63.02 %
              precision    recall  f1-score   support

         ALE       0.62      0.56      0.59       200
         ALG       0.73      0.80      0.76       200
         ALX       0.71      0.78      0.74       200
         AMM       0.43      0.53      0.48       200
         ASW       0.47      0.60      0.53       200
         BAG       0.74      0.58      0.65       200
         BAS       0.68      0.64      0.66       200
         BEI       0.77      0.57      0.66       200
         BEN       0.66      0.70      0.68       200
         CAI       0.65      0.42      0.51       200
         DAM       0.65      0.51      0.57       200
         DOH       0.58      0.62      0.60       200
         FES       0.64      0.70      0.67       200
         JED       0.63      0.61      0.62       200
         JER       0.45      0.59      0.51       200
         KHA       0.49      0.69      0.57       200
         MOS       0.82      0.79   

In [19]:
# Character n-gram (2 to 5 characters) TF-IDF features built directly from the
# raw sentences; case is left untouched and n-grams present in more than 95%
# of the training sentences are dropped.
tfidf_ch_vect = TfidfVectorizer(
    analyzer='char',
    lowercase=False,
    max_df=0.95,
    ngram_range=(2, 5),
    smooth_idf=False,
    sublinear_tf=True,
)

X_train = tfidf_ch_vect.fit_transform(train_src)
X_dev = tfidf_ch_vect.transform(dev_src)

# Logistic-regression objective trained with SGD.
# - `loss` is passed by keyword so the call reads unambiguously.
# - `random_state` is fixed because SGD shuffles the training data between
#   epochs; without a seed the accuracies below change from run to run.
# NOTE: scikit-learn 1.1 renamed this loss to 'log_loss' and 1.3 removed the
# 'log' spelling; switch the string when upgrading.
clf_2 = SGDClassifier(loss='log', max_iter=1000, tol=1e-3, n_jobs=-1, random_state=42)
clf_2.fit(X_train, train_tgt)
train_pred = clf_2.predict(X_train)
dev_pred_2 = clf_2.predict(X_dev)

# Accuracy as a percentage rounded to two decimals. The second figure is
# measured on the dev split, although its label reads "Testing".
print('Training Acc: ',np.around(np.mean(train_pred == train_tgt)*100,2), '%')
print('Testing Acc: ',np.around(np.mean(dev_pred_2 == dev_tgt)*100,2), '%')

# Per-class report is kept in `t` but intentionally not displayed here.
t = metrics.classification_report(dev_tgt, dev_pred_2)

Training Acc:  65.57 %
Testing Acc:  55.92 %


In [21]:
# Soft-voting ensemble of the two classifiers defined above.
# (VotingClassifier is imported in the notebook's import cell.)
#
# VotingClassifier fits fresh clones of its estimators, so both members are
# retrained here on the word-level TF-IDF matrix; the character n-gram fit of
# clf_2 from the previous cell is not reused.
# NOTE(review): the estimator names 'lr' and 'rf' do not describe the models
# (clf_1 is MultinomialNB, clf_2 is SGDClassifier). They are left unchanged
# because they are the keys used by `named_estimators_` and by nested
# parameter names such as `lr__alpha`.
eclf1 = VotingClassifier(
    estimators=[('lr', clf_1), ('rf', clf_2)],
    voting='soft',
    flatten_transform=True,
)

eclf1.fit(X_train_tfidf, train_tgt)
train_pred = eclf1.predict(X_train_tfidf)
dev_pred_3 = eclf1.predict(X_dev_tfidf)

# Accuracy as a percentage rounded to two decimals. The second figure is
# measured on the dev split, although its label reads "Testing".
print('Training Acc: ',np.around(np.mean(train_pred == train_tgt)*100,2), '%')
print('Testing Acc: ',np.around(np.mean(dev_pred_3 == dev_tgt)*100,2), '%')

# Per-class precision / recall / F1 on the dev split.
t = metrics.classification_report(dev_tgt, dev_pred_3)
print(t)

Training Acc:  78.01 %
Testing Acc:  61.42 %
              precision    recall  f1-score   support

         ALE       0.61      0.57      0.59       200
         ALG       0.71      0.78      0.74       200
         ALX       0.71      0.77      0.74       200
         AMM       0.47      0.49      0.48       200
         ASW       0.49      0.59      0.54       200
         BAG       0.69      0.54      0.60       200
         BAS       0.66      0.65      0.65       200
         BEI       0.67      0.60      0.63       200
         BEN       0.62      0.65      0.63       200
         CAI       0.68      0.44      0.53       200
         DAM       0.56      0.47      0.51       200
         DOH       0.57      0.60      0.58       200
         FES       0.65      0.66      0.65       200
         JED       0.55      0.59      0.57       200
         JER       0.47      0.59      0.52       200
         KHA       0.49      0.68      0.57       200
         MOS       0.80      0.78   