In [1]:
import pandas                            as pd
import numpy                             as np
from typing                          import List
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.base                    import BaseEstimator
from sklearn.preprocessing           import LabelEncoder, LabelBinarizer
from sklearn.linear_model            import LogisticRegression
from sklearn.metrics                 import classification_report


from classify_dialects               import load_data

## Arabic $n$-gram analysis

Calculate the $k$ most informative/predictive $n$-grams per dialect class

#### Step 0: Install CAMeL Tools

In [2]:
#!pip install camel-tools

In [3]:
#!camel_data full

#### Step 1: define our tokenizer using a morphological analyzer from the CAMeL Tools

In [4]:
from camel_tools.tokenizers.morphological import MorphologicalTokenizer
from camel_tools.morphology.database      import MorphologyDB
from camel_tools.morphology.analyzer      import Analyzer
from camel_tools.disambig.mle             import MLEDisambiguator

# see https://camel-tools.readthedocs.io/en/latest/api/morphology/analyzer.html#camel_tools.morphology.analyzer.Analyzer

db            = MorphologyDB.builtin_db()

# Create analyzer with no backoff
#analyzer       = Analyzer(db)

# Create analyzer with NOAN_PROP backoff
analyzer      = Analyzer(db, backoff='NOAN_PROP')

disambiguator = MLEDisambiguator(analyzer, mle_path=None)
segmenter     = MorphologicalTokenizer(disambiguator, split=True, diac=True)

In [5]:
text = "بعد ما أخيرا زبط نومي بعد الnight rotation ونمت عال٩ اليوم بصحى عال٣ الصبح"

In [6]:
tokenize = lambda text: segmenter.tokenize(text)

#### Step 2: Load our dataset

In [7]:
training_data = "data/DA_train_labeled.tsv"
train_df      = load_data(training_data)

In [8]:
train_df.columns

Index(['#1_tweetid', '#2_tweet', '#3_country_label', '#4_province_label'], dtype='object')

In [9]:
x_column = "#2_tweet"
y_column = "#3_country_label"

In [10]:
train_df = train_df[[x_column, y_column]]

In [11]:
labels = set(train_df['#3_country_label'])
print(labels)
#regions   = set(train_df['#4_province_label'])

{'Morocco', 'Mauritania', 'Syria', 'Tunisia', 'Kuwait', 'Palestine', 'Oman', 'Jordan', 'Djibouti', 'United_Arab_Emirates', 'Somalia', 'Sudan', 'Yemen', 'Qatar', 'Algeria', 'Lebanon', 'Saudi_Arabia', 'Egypt', 'Bahrain', 'Libya', 'Iraq'}


#### Step 3: Preprocess our data and transform text as $n$-grams

In [12]:
#Vectorizer = CountVectorizer
Vectorizer = TfidfVectorizer

custom_vectorizer = Vectorizer(
    #analyzer=tokenize, 
    tokenizer=tokenize,
    binary=False, 
    ngram_range=(3,3)#(1, 3)
)

le        = LabelEncoder()
# calculate (thresholded) n-gram counts based on all training data
X         = custom_vectorizer.fit_transform(train_df[x_column])
y         = le.fit_transform(train_df[y_column].values)


# feature ID -> feature name
custom_vectorizer.id2feat = {i:f for (f,i) in custom_vectorizer.vocabulary_.items()}

# convenience method
label2id = lambda lbl: le.transform([lbl])[0]

#### Step 5: fit a classifier for each class (i.e., a series of binary classifiers)

In [13]:
res = {}
for label in labels:
    # binarize classes in 1 v. rest fashion
    y_subset   = np.array(y, copy=True)
    label_id   = label2id(label)
    y_subset[y_subset != label_id] = -1
    y_subset[y_subset == label_id] = 1
    #print(f"label: {label}")
    clf        = LogisticRegression()
    clf.fit(X, y_subset)
    res[label] = clf

#### Step 6: Calculate the top $k$ most informative features based on the learned model coefficients

In [14]:
def get_top_n_features(clf: LogisticRegression, n: int) -> np.ndarray:
    return np.argsort(clf.coef_[0])[:n]

In [15]:
def get_features_for(indices: np.ndarray, vectorizer: BaseEstimator = custom_vectorizer)  -> List[str]:
    return [vectorizer.id2feat[i] for i in indices]

In [16]:
def top_n_for(clf: LogisticRegression, n: int, vectorizer: BaseEstimator = custom_vectorizer) -> List[str]:
    feature_ids = get_top_n_features(clf, n)
    return get_features_for(feature_ids, vectorizer)

In [17]:
get_top_n_features(res["Iraq"], n=10)

array([ 1826,  1586, 25335, 22355, 26011, 28793, 11730, 22907,  1554,
       14913])

In [18]:
custom_vectorizer.id2feat[get_top_n_features(res["Iraq"], n=1)[0]]

'  وَ  '

In [19]:
top_n_for(res["Iraq"], n=10)

['  وَ  ',
 '  ف ي',
 'م ش  ',
 'ف ي  ',
 'ن أَ  ',
 'ي أَ  ',
 'أَ ن أَ',
 'ق وَ لِ',
 '  ف  ',
 'ح أَ ج']

In [20]:
top_n_for(res["Kuwait"], n=10)

['ه ه ه',
 '  بِ ي',
 'م أَ لِ',
 'م ش  ',
 'ه   أَ',
 'أَ ي ه',
 '  ف ي',
 'أَ ت  ',
 'ف ي  ',
 '  u r']