In [23]:
import pandas as pd
import numpy as np
from nltk.util import ngrams
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import GridSearchCV
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
np.random.seed(1322)
from keras.models import Sequential
from keras.layers.core import Dense, Activation, Dropout
from keras.layers.recurrent import LSTM

Using TensorFlow backend.


# HW3:  Классификация имен
В этом домашнем задании мы рассмотрим задачу бинарной классификации. Пусть дано два списка имен:  мужские и женские имена.  Требуется разработать классификатор, который по данному имени будет определять мужское оно или женское.
### Выполнили:  

* Булгаков Дмитрий

* Тефикова Алие

### Группа ИАД-2

## 0. Loading data

In [24]:
male_names = pd.read_csv('data/male.txt', header=-1, names=['name'], encoding = 'latin1')
female_names = pd.read_csv('data/female.txt', header=-1, names=['name'], encoding = 'latin1')

In [25]:
male_names_number = len(male_names)
female_names_number = len(female_names)
print('Количество мужских имен: ', male_names_number)
print('Количество женских имен: ', female_names_number)

Количество мужских имен:  2943
Количество женских имен:  5001


## 1. Filtering data

Предварительная обработка данных:<br>
* удалите неоднозначные имена (те имена, которые являются и мужскими, и женскими одновременно), если такие есть.<br>
* создайте тестовое множество по следующему принципу: 20% от общего количества имен на каждую букву (т.е. 20% от имен на букву А, 20% имен на букву B, и.т.д.)

### 1.1 Looking for same names in both male and female df

In [26]:
same_names = female_names.merge(male_names, on=['name'], how='inner')
print('Количество совпадающих имен: ', len(same_names))
print('Список имен, являющихся одновременно и мужскими, и женскими: (первые 10)')
same_names.head(n = 10)

Количество совпадающих имен:  365
Список имен, являющихся одновременно и мужскими, и женскими: (первые 10)


Unnamed: 0,name
0,Abbey
1,Abbie
2,Abby
3,Addie
4,Adrian
5,Adrien
6,Ajay
7,Alex
8,Alexis
9,Alfie


### 1.2 And deleting duplicated names

In [27]:
remove_criterion = lambda row: row['name'] not in same_names['name'].values
male_names = male_names[male_names.apply(remove_criterion, axis=1)]
female_names = female_names[female_names.apply(remove_criterion, axis=1)]

In [28]:
print('Количество мужских имен после удаления дубликатов: ', len(male_names))
print('Количество женских имен после удаления дубликатов: ', len(female_names))
print('Все ок? ', (male_names_number - len(same_names) == len(male_names)) & 
      (female_names_number - len(same_names) == len(female_names)))

Количество мужских имен после удаления дубликатов:  2578
Количество женских имен после удаления дубликатов:  4636
Все ок?  True


### 1.3 Creating test sample (20% of dataset)

In [29]:
def createTrainTestSample(dataframe, letters_list, field, field2, test_percentage=0.2, rnd_state=0):
    train_X, train_y, test_X, test_y = ([] for i in range(4))
    for letter_value in letters_list:
        letter_df = dataframe[dataframe[field].str.startswith(letter_value)]
        X_train, X_test, y_train, y_test = train_test_split(letter_df[field].tolist(), letter_df[field2].tolist(), 
                                                            test_size=test_percentage, random_state=rnd_state)
        train_X.extend(X_train)
        test_X.extend(X_test)
        train_y.extend(y_train)
        test_y.extend(y_test)
    return train_X, test_X, train_y, test_y

In [30]:
def getListOfFirstLetters(dataframe, field):
    letters = dataframe[field].astype(str).str[0]
    letters = np.unique(letters.tolist())
    return letters

In [31]:
all_names = pd.concat([male_names, female_names], axis=0, ignore_index=True)
all_names['gender'] = np.concatenate((np.ones(len(male_names)), np.zeros(len(female_names))), axis=0).astype(int)
print('Общее количество имен: ', len(all_names))
print('Все ок? ', len(all_names) == len(male_names) + len(female_names))

Общее количество имен:  7214
Все ок?  True


In [32]:
# getting list of fisrt letters of names
lettes_list = getListOfFirstLetters(all_names, 'name')
print('Список первых букв имен, присутствующих в датасете:')
lettes_list

Список первых букв имен, присутствующих в датасете:


array(['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M',
       'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z'], 
      dtype='<U1')

In [33]:
%%time
X_train, X_test, y_train, y_test = createTrainTestSample(all_names, lettes_list, 'name', 'gender') 

CPU times: user 265 ms, sys: 4.56 ms, total: 269 ms
Wall time: 277 ms


## 2. Fitting Naive Bayes classifier

Используйте метод наивного Байеса для классификации имен: в качестве признаков используйте символьные n-граммы. <br>Сравните результаты, получаемые при разных n= 2, 3, 4 по F-мере и аккуратности. В каких случаях метод ошибается? <br>Для генерации n-грамм используйте <br><b>from nltk.util import ngrams.<b>

In [34]:
def remove_duplicates(list_to_filter):
    return list(set(list_to_filter))

In [35]:
def tuple_to_string(tuple_to_convert):
    return ''.join(tuple_to_convert)

In [36]:
def generateNgrams(dataframe, field, n):
    all_ngrams_list = []
    for name_value in dataframe[field].str.lower():
        for ngram in ngrams(name_value, n):
            all_ngrams_list.append(ngram)
    return remove_duplicates(all_ngrams_list)

In [37]:
def genFitMatrix(names_list, all_features):
    matrix = np.zeros((len(names_list), len(all_features)))
    for i, name_value in enumerate(names_list):
        for j, feature_value in enumerate(all_features):
            if tuple_to_string(feature_value) in name_value:
                matrix[i][j] = 1
    return matrix

#### N = 2

In [38]:
list_ngrams = generateNgrams(all_names, 'name', 2)
X_train_matrix = genFitMatrix(X_train, list_ngrams)
X_test_matrix = genFitMatrix(X_test, list_ngrams)

In [39]:
clf = GaussianNB().fit(X_train_matrix, y_train)
y_pred = clf.predict(X_test_matrix)
print("accuracy =", accuracy_score(y_test, y_pred))
print("F-score =", f1_score(y_test, y_pred))

accuracy = 0.699382292382
F-score = 0.376068376068


#### N = 3

In [40]:
list_ngrams = generateNgrams(all_names, 'name', 3)
X_train_matrix = genFitMatrix(X_train, list_ngrams)
X_test_matrix = genFitMatrix(X_test, list_ngrams)

In [41]:
clf = GaussianNB().fit(X_train_matrix, y_train)
y_pred = clf.predict(X_test_matrix)
print("accuracy =", accuracy_score(y_test, y_pred))
print("F-score =", f1_score(y_test, y_pred))

accuracy = 0.743994509266
F-score = 0.524840764331


#### N = 4

In [42]:
list_ngrams = generateNgrams(all_names, 'name', 4)
X_train_matrix = genFitMatrix(X_train, list_ngrams)
X_test_matrix = genFitMatrix(X_test, list_ngrams)

In [43]:
clf = GaussianNB().fit(X_train_matrix, y_train)
y_pred = clf.predict(X_test_matrix)
print("accuracy =", accuracy_score(y_test, y_pred))
print("F-score =", f1_score(y_test, y_pred))

accuracy = 0.732326698696
F-score = 0.705882352941


## 3. Fitting Neural Network

In [44]:
#checking number of duplicated names
k = 0 
for i in all_names.index:
     if all_names.name.value_counts()[i] > 1: 
        k = k + 1
k

0

In [45]:
#max name length
max_len = 0
for i in all_names.index:
    if len(all_names.name.values[i]) > max_len:
        max_len = len(all_names.name.values[i])
max_len

15

In [46]:
#all names to lower case
male = male_names.name.str.lower()
female = female_names.name.str.lower()

In [47]:
#extracting the chars 
chars = set("".join(male) + "".join(female))
char_indices = dict((c, i) for i, c in enumerate(chars))
print('Number of chars:', len(chars))

Number of chars: 29


In [48]:
num_epochs = 5 #количество эпох обучения
weightsFileName = "gender_weights.h5"
batch_size = 16

X = np.zeros((len(all_names), max_len, len(chars)), dtype = np.bool)
y = np.zeros((len(all_names), 2), dtype = np.bool)

for i, name in enumerate(male):
    for t, char in enumerate(name):
        X[i, t, char_indices[char]] = 1
    y[i, 0 ] = 1

for i, name in enumerate(female):
    for t, char in enumerate(name):
        X[i + len(male), t, char_indices[char]] = 1
    y[i + len(male), 1] = 1

In [49]:
#bulding model
model = Sequential()
model.add(LSTM(512, return_sequences = True, input_shape = (max_len, len(chars)))) #first layer
model.add(Dropout(0.2)) #adding dropout to reduce overfitting
model.add(LSTM(512, return_sequences = False)) #second layer
model.add(Dropout(0.2))

#The output layer will be a softmax layer with two units (one for male, one for female). 
#In other words, target output is a 2-dimensional one hot vector.
model.add(Dense(2)) #linear unit
model.add(Activation('softmax')) 

model.compile(loss = 'binary_crossentropy',  
              metrics = ['fmeasure', 'accuracy'], #reporting the f-measure and accuracy
              optimizer = 'rmsprop')

json_string = model.to_json()
with open("model.json", "w") as text_file:
    text_file.write(json_string)

model.fit(X, y, batch_size = batch_size, nb_epoch = num_epochs,
          validation_split = 0.1, verbose = 1)

model.save_weights('my_model_weights.h5')
result = model.evaluate(X, y, batch_size = batch_size)
print("Results: ", result)

Train on 6492 samples, validate on 722 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Results:  [1.9776847926756964, 0.63681729684452415, 0.63681729971982448]


просто разные дропауты

In [51]:
#bulding model
model = Sequential()
model.add(LSTM(512, return_sequences = True, input_shape = (max_len, len(chars)))) #first layer
model.add(Dropout(0.2)) #adding dropout to reduce overfitting
model.add(LSTM(512, return_sequences = False)) #second layer
model.add(Dropout(0.2))

#The output layer will be a softmax layer with two units (one for male, one for female). 
#In other words, target output is a 2-dimensional one hot vector.
model.add(Dense(2)) #linear unit
model.add(Dropout(0.2))
model.add(Activation('softmax')) 

model.compile(loss = 'binary_crossentropy',  
              metrics = ['fmeasure', 'accuracy'], #reporting the f-measure and accuracy
              optimizer = 'rmsprop')

json_string = model.to_json()
with open("model.json", "w") as text_file:
    text_file.write(json_string)

model.fit(X, y, batch_size = batch_size, nb_epoch = num_epochs,
          validation_split = 0.1, verbose = 1)

model.save_weights('my_model_weights.h5')
result = model.evaluate(X, y, batch_size = batch_size)
print("Results: ", result)

Train on 6492 samples, validate on 722 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Results:  [1.9678837271338014, 0.64527307834557823, 0.64527308014677209]


In [52]:
#bulding model
model = Sequential()
model.add(LSTM(512, return_sequences = True, input_shape = (max_len, len(chars)))) #first layer
model.add(Dropout(0.3)) #adding dropout to reduce overfitting
model.add(LSTM(512, return_sequences = False)) #second layer
model.add(Dropout(0.3))

#The output layer will be a softmax layer with two units (one for male, one for female). 
#In other words, target output is a 2-dimensional one hot vector.
model.add(Dense(2)) #linear unit
model.add(Dropout(0.3))
model.add(Activation('softmax')) 

model.compile(loss = 'binary_crossentropy',  
              metrics = ['fmeasure', 'accuracy'], #reporting the f-measure and accuracy
              optimizer = 'rmsprop')

json_string = model.to_json()
with open("model.json", "w") as text_file:
    text_file.write(json_string)

model.fit(X, y, batch_size = batch_size, nb_epoch = num_epochs,
          validation_split = 0.1, verbose = 1)

model.save_weights('my_model_weights.h5')
result = model.evaluate(X, y, batch_size = batch_size)
print("Results: ", result)

Train on 6492 samples, validate on 722 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Results:  [1.9819161445937497, 0.63529248607928712, 0.63529248685594875]


In [53]:
#bulding model
model = Sequential()
model.add(LSTM(512, return_sequences = True, input_shape = (max_len, len(chars)))) #first layer
model.add(Dropout(0.5)) #adding dropout to reduce overfitting
model.add(LSTM(512, return_sequences = False)) #second layer
model.add(Dropout(0.5))

#The output layer will be a softmax layer with two units (one for male, one for female). 
#In other words, target output is a 2-dimensional one hot vector.
model.add(Dense(2)) #linear unit
model.add(Dropout(0.5))
model.add(Activation('softmax')) 

model.compile(loss = 'binary_crossentropy',  
              metrics = ['fmeasure', 'accuracy'], #reporting the f-measure and accuracy
              optimizer = 'rmsprop')

json_string = model.to_json()
with open("model.json", "w") as text_file:
    text_file.write(json_string)

model.fit(X, y, batch_size = batch_size, nb_epoch = num_epochs,
          validation_split = 0.1, verbose = 1)

model.save_weights('my_model_weights.h5')
result = model.evaluate(X, y, batch_size = batch_size)
print("Results: ", result)

Train on 6492 samples, validate on 722 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Results:  [1.9566740601947856, 0.63806487310451587, 0.63806487388117739]
