In [2]:
import pandas as pd
from keras.models import load_model
import keras
import tensorflow as tf
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report,confusion_matrix
from sklearn.utils import resample 

# Data Preperation

In [14]:
ts = pd.read_csv('/mnt/acropolis/akcigitlab/restricted/Publications/code/Data Analysis/Deniz_temp/LSTM_training_set.csv')
ts = ts[ts.Name.notna()]

In [15]:
ts.head()

Unnamed: 0,Name,family
0,a,Non Turkish
1,aa,Non Turkish
2,aab,Non Turkish
3,aabakken,Non Turkish
4,aabaye,Non Turkish


In [16]:
ts.Name = ts.Name.str.capitalize()

In [17]:
ts.family.value_counts()

Non Turkish                 532690
Turkish                       7154
Low Probability Turkish       1742
High Probability Turkish       452
Name: family, dtype: int64

In [18]:
ts.family = ts.family.replace('Low Probability Turkish', 'Non Turkish')
ts.family = ts.family.replace('High Probability Turkish', 'Non Turkish')

In [19]:
def upsample_dataframe(df, random_state, target, _class = None): 
    if _class is not None:
        df_minority = df[df[target] == _class]
        df_majority = df.drop(df_minority.index.to_list())
        df_minority_upsampled = resample(df_minority,
                                     replace=True,
                                     n_samples=df_majority.shape[0],
                                     random_state=random_state)
    else:
        labels = df[target].unique() 
        value_counts = {} 
        for label in labels: 
            value_counts[label] = df[target].value_counts()[label] 
        minority = min(value_counts.items(), key=lambda x: x[1])[0] 
        majority = max(value_counts.items(), key=lambda x: x[1])[0] 
        df_minority = df[df[target] == minority] 
        df_majority = df[df[target] == majority] 
        df_minority_upsampled = resample(df_minority,
                                         replace=True,
                                         n_samples=value_counts[majority],
                                         random_state=random_state) # reproducible results 
    df_upsampled = pd.concat([df_majority, df_minority_upsampled]) 
    return df_upsampled

In [20]:
ts = upsample_dataframe(ts[['Name', 'family']], 112233, 'family')

In [21]:
ts.family.value_counts()

Non Turkish    534884
Turkish        534884
Name: family, dtype: int64

# Model Preperation

In [None]:
NGRAMS = 2
SAMPLE = 1062360
EPOCHS = 15
vect = CountVectorizer(analyzer='char', max_df=0.3, min_df=3, ngram_range=(NGRAMS, NGRAMS), lowercase=False) 
a = vect.fit_transform(ts.Name)
vocab = vect.vocabulary_

# sort n-gram by freq (highest -> lowest)
words = []
for b in vocab:
    c = vocab[b]
    #print(b, c, a[:, c].sum())
    words.append((a[:, c].sum(), b))
    #break
words = sorted(words, reverse=True)
words_list = [w[1] for w in words]
num_words = len(words_list)
print("num_words = %d" % num_words)

In [23]:
def find_ngrams(text, n):
    a = zip(*[text[i:] for i in range(n)])
    wi = []
    for i in a:
        w = ''.join(i)
        try:
            idx = words_list.index(w)
        except:
            idx = 0
        wi.append(idx)
    return wi

In [28]:
X = np.array(ts.Name.apply(lambda c: find_ngrams(c, NGRAMS)))
X_len = []
for x in X:
    X_len.append(len(x))
max_feature_len = max(X_len)
avg_feature_len = int(np.mean(X_len))
print("Max feature len = %d, Avg. feature len = %d" % (max_feature_len, avg_feature_len))
y = np.array(ts.family.astype('category').cat.codes)
ts['category_codes'] = y

Max feature len = 39, Avg. feature len = 5


In [29]:
ts_categories = ts[['family', 'category_codes']].drop_duplicates()

In [30]:
X_train,  X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=21, stratify=y)

In [31]:
test = pd.DataFrame(columns = ['X', 'Y'])
test['X'] = X_test
test["Y"] = y_test

In [32]:
def concenate(row):
    rv = ''
    for i in row:
        rv += str(i)
    return rv

In [33]:
test['X1'] = test.X.apply(lambda row: concenate(row)) 
test = test.drop_duplicates(subset = ['X1', 'Y'])
X_test = np.array(test['X'])
y_test = np.array(test["Y"])

# Training

In [34]:
from keras.preprocessing import sequence
from keras.models import Sequential
from keras.layers import Dense, Embedding, Dropout, Activation
from keras.layers import LSTM
from keras.layers.convolutional import Conv1D
from keras.layers.convolutional import MaxPooling1D
from keras.models import load_model

In [35]:
max_features = num_words # 20000
feature_len = 25 # avg_feature_len # cut texts after this number of words (among top max_features most common words)
batch_size = 32

In [36]:
print(len(X_train), 'train sequences')
print(len(X_test), 'test sequences')

print('Pad sequences (samples x time)')
X_train = sequence.pad_sequences(X_train, maxlen=feature_len)
X_test = sequence.pad_sequences(X_test, maxlen=feature_len)
print('X_train shape:', X_train.shape)
print('X_test shape:', X_test.shape)

num_classes = np.max(y_train) + 1
print(num_classes, 'classes')

print('Convert class vector to binary class matrix '
      '(for use with categorical_crossentropy)')
y_train = tf.keras.utils.to_categorical(y_train, num_classes)
y_test = tf.keras.utils.to_categorical(y_test, num_classes)
print('y_train shape:', y_train.shape)
print('y_test shape:', y_test.shape)

855814 train sequences
114114 test sequences
Pad sequences (samples x time)
X_train shape: (855814, 25)
X_test shape: (114114, 25)
2 classes
Convert class vector to binary class matrix (for use with categorical_crossentropy)
y_train shape: (855814, 2)
y_test shape: (114114, 2)


In [37]:
print('Build model...')

model = Sequential()
model.add(Embedding(num_words, 32, input_length=feature_len))
model.add(LSTM(128, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(num_classes, activation='softmax'))

# try using different optimizers and different optimizer configs
model.compile(loss='categorical_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

print(model.summary())


Build model...
Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 25, 32)            38976     
_________________________________________________________________
lstm (LSTM)                  (None, 128)               82432     
_________________________________________________________________
dense (Dense)                (None, 2)                 258       
Total params: 121,666
Trainable params: 121,666
Non-trainable params: 0
_________________________________________________________________
None


In [38]:
print('Train...')
model.fit(X_train, y_train, batch_size=batch_size, epochs=EPOCHS,
          validation_split=0.1, verbose=1)
score, acc = model.evaluate(X_test, y_test,
                            batch_size=batch_size, verbose=1)
print('Test score:', score)
print('Test accuracy:', acc)

Train...
Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15
Test score: 0.08240486681461334
Test accuracy: 0.9764971733093262


In [47]:
model.save('./lstm_deniz.h5')
words_df = pd.DataFrame(words_list, columns=['vocab'])
words_df.to_csv('words_model.csv', index = False)

In [40]:
p = model.predict(X_test, verbose=2) # to predict probability
y_pred = np.argmax(p, axis=-1)
target_names = list(ts.family.astype('category').cat.categories)
print(classification_report(np.argmax(y_test, axis=1), y_pred, target_names=target_names))
print(confusion_matrix(np.argmax(y_test, axis=1), y_pred))

3567/3567 - 10s
              precision    recall  f1-score   support

 Non Turkish       1.00      0.97      0.99    106960
     Turkish       0.73      1.00      0.84      7154

    accuracy                           0.98    114114
   macro avg       0.86      0.99      0.91    114114
weighted avg       0.98      0.98      0.98    114114

[[104278   2682]
 [     0   7154]]


In [41]:
target_names = list(ts.family.astype('category').cat.categories)

In [42]:
def predict_origin(names_list, target_names):
    def find_ngrams(text, n):
        a = zip(*[str(text)[i:] for i in range(n)])
        wi = []
        for i in a:
            w = ''.join(i)
            try:
                idx = words_df.vocab.to_list().index(w)
            except Exception as e:
                idx = 0
            wi.append(idx)
        return wi

    feature_len = 25
    X = np.array(pd.Series(names_list).apply(lambda c: find_ngrams(c, 2)))
    X_test = sequence.pad_sequences(X, maxlen=feature_len)
    
    df_dict = {'Name': names_list, 
               'Origin' : model.predict_classes(X_test, verbose=1)}
    
    df = pd.DataFrame.from_dict(df_dict) 
    
    target_names = {0: target_names[0], 1: target_names[1]}

    df['Origin'] = df['Origin'].replace(target_names)
    
    return df

In [43]:
predict_origin(['deniz tokmakoglu'], target_names)

Instructions for updating:
Please use instead:* `np.argmax(model.predict(x), axis=-1)`,   if your model does multi-class classification   (e.g. if it uses a `softmax` last-layer activation).* `(model.predict(x) > 0.5).astype("int32")`,   if your model does binary classification   (e.g. if it uses a `sigmoid` last-layer activation).


Unnamed: 0,Name,Origin
0,deniz tokmakoglu,Turkish


In [44]:
predict_origin(['napeleon boneparte'], target_names)



Unnamed: 0,Name,Origin
0,napeleon boneparte,Non Turkish


In [45]:
predict_origin(['ufuk akcigit'], target_names)



Unnamed: 0,Name,Origin
0,ufuk akcigit,Turkish


In [46]:
predict_origin(['reyhan ayas'], target_names)



Unnamed: 0,Name,Origin
0,reyhan ayas,Turkish
