In [11]:
import pandas as pd
import numpy as np
from itertools import groupby
from operator import itemgetter

import nltk
from nltk.classify import NaiveBayesClassifier
from nltk.util import ngrams

from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split

from keras.models import Sequential
from keras.utils import np_utils
from keras.layers import (Dense, Dropout, Activation, Embedding, LSTM,
                            Convolution1D, MaxPooling1D)

In [2]:
female = pd.read_table('female.txt', header = None, squeeze = True)
male = pd.read_table('male.txt', header = None, squeeze = True)

In [3]:
# задание 1

female_only = sorted(list(set(female).difference(set(male))))
male_only = sorted(list(set(male).difference(set(female))))

female_only = [i.lower() for i in female_only]
male_only = [i.lower() for i in male_only]

In [4]:
names_f = pd.DataFrame({'names': female_only, 'gender': 'female'})
names_m = pd.DataFrame({'names': male_only, 'gender': 'male'})
names = pd.concat([names_f, names_m])

In [5]:
names.tail()

Unnamed: 0,gender,names
2573,male,zeus
2574,male,zippy
2575,male,zollie
2576,male,zolly
2577,male,zorro


In [6]:
names = names.sort_values(by = ['names'])
names.index = range(len(names))

In [7]:
def testarr(data):
    X_train, X_test, y_train, y_test = [], [], [], []
    for letters, words in groupby(data['names'], key=itemgetter(0)):
        names = []
        for word in words:
            names.append(word)
        data_n = data[data['names'].isin(names)]
        X_tr, X_t, y_tr, y_t = train_test_split(data_n['names'], data_n['gender'], test_size = 0.2, random_state = 42)
        X_train += list(X_tr)
        X_test += list(X_t)
        y_train += list(y_tr)
        y_test += list(y_t)
    return X_train, X_test, y_train, y_test

In [8]:
X_train, X_test, y_train, y_test = testarr(names)

In [9]:
# задание 2

def word_feats(word, n):
    return dict(['ngram', ngram] for ngram in ngrams((word), n))

In [12]:
for n in range(2, 5):
    trainfeats = [(word_feats(X_train[i], n), y_train[i]) for i in range(len(X_train))]
    classifier = NaiveBayesClassifier.train(trainfeats)
    testfeats = [(word_feats(X_test[i], n), y_test[i]) for i in range(len(X_test))]
    print('Accuracy ', n,'grams = ', nltk.classify.accuracy(classifier,   testfeats))
    y_pred = [classifier.classify(word_feats(i, n)) for i in X_test]
    print('F1_score', n,'grams = ', f1_score(y_test, y_pred, average='macro'), '\n')

Accuracy  2 grams =  0.8332189430336308
F1_score 2 grams =  0.810955928641 

Accuracy  3 grams =  0.8071379547014413
F1_score 3 grams =  0.776951687039 

Accuracy  4 grams =  0.7467398764584763
F1_score 4 grams =  0.675680224404 



чем больше n, тем больше делает ошибок метод

In [13]:
# задание 3

totalEntries_tr = len(X_train)
maxlen = len(max(list(names['names']) , key=len))

chars = set("".join(names['names']))
char_indices = dict((c, i) for i, c in enumerate(chars))
indices_char = dict((i, c) for i, c in enumerate(chars))

X_tr = np.zeros((totalEntries_tr , maxlen, len(chars) ), dtype=np.bool)
y_tr = np.zeros((totalEntries_tr , 2 ), dtype=np.bool)


for i, name in enumerate(X_train):
    for t, char in enumerate(name):
        X_tr[i, t, char_indices[char]] = 1
        
totalEntries_test = len(X_test)

X_tst = np.zeros((totalEntries_test , maxlen, len(chars)), dtype=np.bool)
y_tst = np.zeros((totalEntries_test , 2 ), dtype=np.bool)


for i, name in enumerate(X_test):
    for t, char in enumerate(name):
        X_tst[i, t, char_indices[char]] = 1

In [14]:
def one_hot_encode_object_array(arr):
    uniques, ids = np.unique(arr, return_inverse=True)
    return np_utils.to_categorical(ids, len(uniques))

train_y_ohe = one_hot_encode_object_array(y_train)
test_y_ohe = one_hot_encode_object_array(y_test)
test_y = [1 if i == 'male' else 0 for i in y_test]

In [36]:
lstm = [100, 300, 500]
drop = [0.2, 0.5, 0.8]

accuracy_df = pd.DataFrame(columns = lstm)
f1_score_df = pd.DataFrame(columns = lstm)
for i in drop:
    accuracy_df.loc[i] = 0
    f1_score_df.loc[i] = 0

for l in lstm:
    for d in drop:
        model = Sequential()
        model.add(LSTM(l, return_sequences=True, input_shape=(maxlen, len(chars))))
        model.add(Dropout(d))
        model.add(LSTM(l, return_sequences=False))
        model.add(Dropout(d))
        model.add(Dense(2))
        model.add(Activation('softmax'))
        model.compile(loss='binary_crossentropy', optimizer='rmsprop', metrics=["accuracy"])
        model.fit(X_tr, train_y_ohe, batch_size=16, nb_epoch=5)

        loss, accuracy = model.evaluate(X_tst, test_y_ohe, verbose=0)
        y_pred = model.predict_classes(X_tst)
        accuracy_df[l][d] = accuracy
        f1_score_df[l][d] = f1_score(test_y, y_pred, average='macro')

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [38]:
accuracy_df


Unnamed: 0,100,300,500
0.2,0.851064,0.79547,0.836651
0.5,0.836651,0.826356,0.844887
0.8,0.826356,0.811256,0.831846


In [39]:
f1_score_df

Unnamed: 0,100,300,500
0.2,0.835968,0.788689,0.82034
0.5,0.823263,0.818668,0.829554
0.8,0.812047,0.784427,0.813229


большой dropout ведет к переобучению. лучшее значение получилось при dropout = 0.5 и количестве узлов = 500

### задание 4

Лучше нейронная сеть, если подобрать параметры. Думаю, потому что она обучается, находя разные зависимости между X и y, а не просто смотрит на вероятности.