In [1]:
from pandas import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn.tree import DecisionTreeClassifier, export_graphviz
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction import DictVectorizer

In [2]:
df = pd.read_csv('BanglaNames.csv')
print ("%d names in dataset" %len(df))
df = df.drop_duplicates(subset="Name")
df.head()

1369 names in dataset


Unnamed: 0,Id,Name,Gender
0,0,fairuz,female
1,1,maliha,female
2,2,mehedi,male
3,3,sohan,male
4,4,shatil,male


In [4]:
print (df[df.Gender == 'female'].count())

Id        573
Name      573
Gender    573
dtype: int64


In [5]:
print (df[df.Gender == 'male'].count())

Id        436
Name      436
Gender    436
dtype: int64


In [6]:
# Check if the name ends in vowel
def checkVowelEnd(name):
    if name[-1] in "aeiou":
        return "Vowel End"
    else:
        return "Consonant End"

In [7]:
df["Vowel/Consonant End"] = df["Name"].apply(checkVowelEnd)
df.head()

Unnamed: 0,Id,Name,Gender,Vowel/Consonant End
0,0,fairuz,female,Consonant End
1,1,maliha,female,Vowel End
2,2,mehedi,male,Vowel End
3,3,sohan,male,Consonant End
4,4,shatil,male,Consonant End


In [8]:
def checkGender(gender):
    if gender == "female":
        return 0
    else:
        return 1
    
df["Gender Value"] = df["Gender"].apply(checkGender)

df.head()

Unnamed: 0,Id,Name,Gender,Vowel/Consonant End,Gender Value
0,0,fairuz,female,Consonant End,0
1,1,maliha,female,Vowel End,0
2,2,mehedi,male,Vowel End,1
3,3,sohan,male,Consonant End,1
4,4,shatil,male,Consonant End,1


In [9]:
def compare(group):
    return df.groupby([group])["Gender Value"].sum()*100/df.groupby([group])["Gender Value"].count()

In [10]:
df.groupby(["Vowel/Consonant End"])['Gender Value'].count()
df.groupby(['Vowel/Consonant End','Gender Value']).size()

Vowel/Consonant End  Gender Value
Consonant End        0               186
                     1               376
Vowel End            0               387
                     1                60
dtype: int64

In [11]:
print (len(df))
# > 93889 = 43635 + 50254

1009


In [12]:
female_names = sum(df.groupby(["Vowel/Consonant End"])["Gender Value"].sum())
all_names = df.groupby(["Gender"])["Gender Value"].count()
print (all_names)
print ("\nBoth are equal? %s" % str(female_names == all_names["female"]))

Gender
female    573
male      436
Name: Gender Value, dtype: int64

Both are equal? False


In [13]:
print(df.groupby(["Vowel/Consonant End"])["Gender Value"].sum()*100/df.groupby(["Vowel/Consonant End"])["Gender Value"].count())

Vowel/Consonant End
Consonant End    66.903915
Vowel End        13.422819
Name: Gender Value, dtype: float64


In [14]:
print(compare("Vowel/Consonant End"))

Vowel/Consonant End
Consonant End    66.903915
Vowel End        13.422819
Name: Gender Value, dtype: float64


In [15]:
def vowelConsonantStart(name):
    if name[0] in "aeiou":
        return "Vowel Start"
    else:
        return "Consonant Start"

df["Vowel/Consonant Start"] = df["Name"].apply(vowelConsonantStart)
df.groupby(['Vowel/Consonant Start','Gender Value']).size()
#print("\n Comparison => %s", compare("Vowel/Consonant Start"))

#df.head()

Vowel/Consonant Start  Gender Value
Consonant Start        0               476
                       1               331
Vowel Start            0                97
                       1               105
dtype: int64

In [16]:
def shortLongName(name):
    if len(name) < 6:
        return "Short"
    else:
        return "Long"

df["Short/Long Name"] = df["Name"].apply(shortLongName)
df.groupby(['Short/Long Name','Gender Value']).size()
#print(compare("Short/Long Name"))
#df.head(20)

Short/Long Name  Gender Value
Long             0               349
                 1               226
Short            0               224
                 1               210
dtype: int64

In [17]:
# By Analogy most female names ends in 'A' or 'E' or has the sound of 'A'
def features(name):
    name = name.lower()
    return {
        'first-letter'  : name[0], # First letter
        'first2-letters': name[0:2], # First 2 letters
        'first3-letters': name[0:3], # First 3 letters
        'last-letter'   : name[-1],
        'last2-letters' : name[-2:],
        'last3-letters' : name[-3:],
    }

In [18]:
df["features"] = df["Name"].apply(features)
df.head()

Unnamed: 0,Id,Name,Gender,Vowel/Consonant End,Gender Value,Vowel/Consonant Start,Short/Long Name,features
0,0,fairuz,female,Consonant End,0,Consonant Start,Long,"{'first-letter': 'f', 'first2-letters': 'fa', ..."
1,1,maliha,female,Vowel End,0,Consonant Start,Long,"{'first-letter': 'm', 'first2-letters': 'ma', ..."
2,2,mehedi,male,Vowel End,1,Consonant Start,Long,"{'first-letter': 'm', 'first2-letters': 'me', ..."
3,3,sohan,male,Consonant End,1,Consonant Start,Short,"{'first-letter': 's', 'first2-letters': 'so', ..."
4,4,shatil,male,Consonant End,1,Consonant Start,Long,"{'first-letter': 's', 'first2-letters': 'sh', ..."


In [19]:
def checkfeature(name):
    if name[-1] in "a,e":
        return "a"
    return "b"
df["Feature End"] = df["Name"].apply(checkfeature)
df.head(-20)

Unnamed: 0,Id,Name,Gender,Vowel/Consonant End,Gender Value,Vowel/Consonant Start,Short/Long Name,features,Feature End
0,0,fairuz,female,Consonant End,0,Consonant Start,Long,"{'first-letter': 'f', 'first2-letters': 'fa', ...",b
1,1,maliha,female,Vowel End,0,Consonant Start,Long,"{'first-letter': 'm', 'first2-letters': 'ma', ...",a
2,2,mehedi,male,Vowel End,1,Consonant Start,Long,"{'first-letter': 'm', 'first2-letters': 'me', ...",b
3,3,sohan,male,Consonant End,1,Consonant Start,Short,"{'first-letter': 's', 'first2-letters': 'so', ...",b
4,4,shatil,male,Consonant End,1,Consonant Start,Long,"{'first-letter': 's', 'first2-letters': 'sh', ...",b
...,...,...,...,...,...,...,...,...,...
1333,1333,mounota,female,Vowel End,0,Consonant Start,Long,"{'first-letter': 'm', 'first2-letters': 'mo', ...",a
1334,1334,tanbin,female,Consonant End,0,Consonant Start,Long,"{'first-letter': 't', 'first2-letters': 'ta', ...",b
1335,1335,nafia,female,Vowel End,0,Consonant Start,Short,"{'first-letter': 'n', 'first2-letters': 'na', ...",a
1339,1339,disha,female,Vowel End,0,Consonant Start,Short,"{'first-letter': 'd', 'first2-letters': 'di', ...",a


In [20]:
g=df.groupby(['Feature End','Gender'])
g.head()

Unnamed: 0,Id,Name,Gender,Vowel/Consonant End,Gender Value,Vowel/Consonant Start,Short/Long Name,features,Feature End
0,0,fairuz,female,Consonant End,0,Consonant Start,Long,"{'first-letter': 'f', 'first2-letters': 'fa', ...",b
1,1,maliha,female,Vowel End,0,Consonant Start,Long,"{'first-letter': 'm', 'first2-letters': 'ma', ...",a
2,2,mehedi,male,Vowel End,1,Consonant Start,Long,"{'first-letter': 'm', 'first2-letters': 'me', ...",b
3,3,sohan,male,Consonant End,1,Consonant Start,Short,"{'first-letter': 's', 'first2-letters': 'so', ...",b
4,4,shatil,male,Consonant End,1,Consonant Start,Long,"{'first-letter': 's', 'first2-letters': 'sh', ...",b
5,5,akid,male,Consonant End,1,Vowel Start,Short,"{'first-letter': 'a', 'first2-letters': 'ak', ...",b
6,6,samia,female,Vowel End,0,Consonant Start,Short,"{'first-letter': 's', 'first2-letters': 'sa', ...",a
7,7,tanvir,male,Consonant End,1,Consonant Start,Long,"{'first-letter': 't', 'first2-letters': 'ta', ...",b
16,16,khaja,male,Vowel End,1,Consonant Start,Short,"{'first-letter': 'k', 'first2-letters': 'kh', ...",a
23,23,sadiqua,female,Vowel End,0,Consonant Start,Long,"{'first-letter': 's', 'first2-letters': 'sa', ...",a


In [21]:
names =(df.groupby(['Feature End','Gender'])).size()
#female_names =(df.groupby(['features','Gender Value'])).count()

#all_names = df.groupby(["Gender"])["Gender Value"].count()
print (names)

Feature End  Gender
a            female    316
             male       24
b            female    257
             male      412
dtype: int64


In [22]:
 training_data = df[["Gender Value", "Vowel/Consonant Start", "Short/Long Name", "Vowel/Consonant End","Feature End"]]
# training_data.head()

In [23]:
X= df[['Vowel/Consonant Start', 'Short/Long Name', 'Vowel/Consonant End','Feature End']]
y= df ['Gender Value']

In [26]:
def reprCategory(column):
    column = column.astype("category")
    return column.cat.codes
    
#training_data[["Vowel/Consonant End", "Short/Long Name", "Vowel/Consonant Start","Feature End"]] = training_data[["Vowel/Consonant End", "Short/Long Name", "Vowel/Consonant Start","Feature End"]].apply(reprCategory)
X[["Vowel/Consonant End", "Short/Long Name", "Vowel/Consonant Start","Feature End"]] = X[["Vowel/Consonant End", "Short/Long Name", "Vowel/Consonant Start","Feature End"]].apply(reprCategory)
#training_data.head()
X_train,X_test,y_train,y_test = train_test_split(X,y, test_size = 0.20)

#train,test = train_test_split(training_data, test_size = 0.20)


In [27]:
len(X_train)


807

In [28]:
len(X_test)


202

In [29]:
len(y_train)


807

In [30]:
len(y_test)

202

In [31]:
from sklearn.naive_bayes import MultinomialNB
MultinomialNB_clf = MultinomialNB()


In [32]:
MultinomialNB_clf.fit(X_train,y_train)


MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [33]:
MultinomialNB_clf.predict(X_test)

array([1, 1, 1, 1, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 1,
       0, 1, 1, 1, 0, 1, 1, 0, 0, 1, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1,
       1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0,
       1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 1, 1, 0,
       1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 1, 0, 1, 1,
       0, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1,
       0, 0, 1, 1, 0, 1, 1, 0, 1, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1,
       1, 1, 0, 1, 0, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 0, 1,
       0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0,
       0, 1, 1, 1], dtype=int64)

In [34]:
y_test

722     0
149     1
58      1
5       1
551     1
       ..
1146    0
322     0
1158    0
727     0
1012    1
Name: Gender Value, Length: 202, dtype: int64

In [35]:
MultinomialNB_clf.score(X_test,y_test)

0.7772277227722773

In [36]:
# Dicision Tree Classifier
DecisionTreeClassifier_clf = DecisionTreeClassifier()
DecisionTreeClassifier_clf = DecisionTreeClassifier_clf.fit(X_train,y_train)
#predictions = DecisionTreeClassifier_clf.predict(test[["Vowel/Consonant End", "Short/Long Name", "Vowel/Consonant Start"]])
#accuracy_score(test["Gender Value"], predictions)
DecisionTreeClassifier_clf.score(X_test,y_test)

0.7772277227722773

In [37]:
from sklearn.linear_model import LogisticRegression
logreg_clf = LogisticRegression()
logreg_clf.fit(X_train,y_train)
#predictions = logreg_clf.predict(test[["Vowel/Consonant End", "Short/Long Name", "Vowel/Consonant Start"]])
#accuracy_score(test["Gender Value"], predictions)
logreg_clf.score(X_test,y_test)



0.7772277227722773

In [38]:
from sklearn.svm import SVC
from sklearn.metrics import confusion_matrix
SVC_model = SVC()
SVC_model.fit(X_train,y_train)
#prediction = SVC_model.predict(test[["Vowel/Consonant End", "Short/Long Name", "Vowel/Consonant Start"]])
#accuracy_score(test["Gender Value"], prediction)
SVC_model.score(X_test,y_test)



0.7772277227722773

In [39]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report
KNN_model = KNeighborsClassifier(n_neighbors=5)
KNN_model.fit(X_train,y_train)
#predictions = KNN_model.predict(test[["Vowel/Consonant End", "Short/Long Name", "Vowel/Consonant Start"]])
#accuracy_score(test["Gender Value"], predictions)
KNN_model.score(X_test,y_test)

0.7079207920792079

In [40]:
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
lda_clf = LinearDiscriminantAnalysis()
lda_clf.fit(X_train,y_train)
lda_clf.score(X_test,y_test)

0.7772277227722773

In [41]:
from sklearn.naive_bayes import GaussianNB
gnb_clf = GaussianNB()

gnb_clf.fit(X_train,y_train)
gnb_clf.score(X_test,y_test)

0.7772277227722773

In [42]:
with open("decidenamesB.dot", "w") as dot_file:
    dot_file = export_graphviz(DecisionTreeClassifier_clf,
                            feature_names=["Vowel/Consonant End", "Short/Long Name", "Vowel/Consonant Start","Feature End"], out_file=dot_file)