In [1]:
import pandas as pd
import numpy as np

## Часть 1. Предварительная обработка данных

In [2]:
# Load the male first names (the file has no header row) and attach the
# class label in the same step; sex == 0 marks a male name.
male_names = pd.read_csv("male.txt", header=None).assign(sex=0)
# Label both columns at once: the single parsed column becomes "name".
male_names.columns = ["name", "sex"]
male_names.head()

Unnamed: 0,name,sex
0,Aamir,0
1,Aaron,0
2,Abbey,0
3,Abbie,0
4,Abbot,0


In [3]:
# Load the female first names (the file has no header row) and attach the
# class label in the same step; sex == 1 marks a female name.
female_names = pd.read_csv("female.txt", header=None).assign(sex=1)
# Label both columns at once: the single parsed column becomes "name".
female_names.columns = ["name", "sex"]
female_names.head()

Unnamed: 0,name,sex
0,Abagael,1
1,Abagail,1
2,Abbe,1
3,Abbey,1
4,Abbi,1


In [4]:
# Stack the two labelled frames into one dataset.
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0;
# pd.concat is the supported equivalent and, like append, keeps each
# frame's original row labels (so index values repeat across the halves).
df = pd.concat([male_names, female_names])

# Normalise case so that the same name spelled with different casing is
# treated as one string by the later de-duplication and vectorisation.
df["name"] = df["name"].str.lower()

# DataFrame.info() prints its report and returns None, so it is called
# directly: wrapping it in display() only adds a stray "None" to the output.
male_names.info()
female_names.info()
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2943 entries, 0 to 2942
Data columns (total 2 columns):
name    2943 non-null object
sex     2943 non-null int64
dtypes: int64(1), object(1)
memory usage: 46.1+ KB


None

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5001 entries, 0 to 5000
Data columns (total 2 columns):
name    5001 non-null object
sex     5001 non-null int64
dtypes: int64(1), object(1)
memory usage: 78.2+ KB


None

<class 'pandas.core.frame.DataFrame'>
Int64Index: 7944 entries, 0 to 5000
Data columns (total 2 columns):
name    7944 non-null object
sex     7944 non-null int64
dtypes: int64(1), object(1)
memory usage: 186.2+ KB


None

In [5]:
# keep=False discards every row whose name occurs more than once, so a name
# that appears in both lists (and is therefore ambiguous) is removed entirely
# instead of keeping one arbitrary label for it.
df = df.drop_duplicates(subset=["name"], keep=False)
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 7208 entries, 0 to 5000
Data columns (total 2 columns):
name    7208 non-null object
sex     7208 non-null int64
dtypes: int64(1), object(1)
memory usage: 168.9+ KB


### Разбиваем на обучающее и тестовое множество:

In [6]:
from sklearn.model_selection import train_test_split

In [7]:
# Hold out 20% of the names for testing. The split is stratified on the label
# so both parts keep the same male/female ratio, and the seed is fixed so the
# partition is reproducible.
split_parts = train_test_split(
    df,
    test_size=0.2,
    random_state=111,
    stratify=df["sex"],
)
# Give each part a fresh 0..n-1 index, dropping the row labels inherited from df.
df_train, df_test = (part.reset_index(drop=True) for part in split_parts)

## Часть 2. Базовый метод классификации

In [8]:
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer

In [9]:
from sklearn.metrics import f1_score, accuracy_score

In [10]:
# Baseline: multinomial Naive Bayes over character n-gram counts, evaluated
# separately for each n-gram length n (only n-grams of exactly that length).
for n in (2, 3, 4):
    # lowercase=False: the vectorizer is told not to change the case of its input.
    vectorizer = CountVectorizer(lowercase=False, analyzer='char', ngram_range=(n, n))
    clf = Pipeline(steps=[('vect', vectorizer), ('clf', MultinomialNB())])

    # Pipeline.fit returns the fitted pipeline, so prediction can be chained.
    prediction = clf.fit(df_train.name, df_train.sex).predict(df_test.name)

    accuracy = accuracy_score(df_test.sex, prediction)
    # Macro averaging weights both classes equally in the F1 score.
    macro_f1 = f1_score(df_test.sex, prediction, average='macro')

    print("\nN-граммы: n = {}".format(n))
    print("Accuracy:    {0:.4f}".format(accuracy))
    print("F1-measure:  {0:.4f}".format(macro_f1))


N-граммы: n = 2
Accuracy:    0.7746
F1-measure:  0.7408

N-граммы: n = 3
Accuracy:    0.8093
F1-measure:  0.7847

N-граммы: n = 4
Accuracy:    0.8044
F1-measure:  0.7673


Лучший результат на тестовой выборке достигается при n = 3 (Accuracy = 0.8093, F1-мера = 0.7847).