# Python Name Probability

In [148]:
#set input source
sourceFileName = 'names.csv'

## Using Numpy

In [149]:
#create numpy array 
import numpy as np
ndf = np.genfromtxt(sourceFileName,dtype=None,encoding='ascii',skip_header=1)
#print("#1. Numpy Array")
#print("Type:\t\t",type(ndf))
#print("Shape:\t\t",ndf.shape)
#print("Numpy Array\n",ndf)

## Using Pandas

In [150]:
#create pandas dataframe
import pandas as pd
df = pd.read_csv(sourceFileName) #df is the master dataframe
names = df.values
names #master array of names
items = len(names) #number of items in the array
#print("\n\n#2. Pandas Dataframe")
#print("Type:\t\t",type(df))
#print("Shape:\t\t",df.shape) #how can this be converted to an 1xn array?
#print("Numpy Array\n",df)

### Distribution for First Alphabet (Count)

In [214]:
# create a distribution for starting alphabets of names
fahist = {} #histogram of count of first alphabet
for i in names:
    fa = i[0][0] #identify the first alphabet
    fahist[fa] = fahist.get(fa, 0) + 1#increment the count for element fa
#fahist

fadf = pd.DataFrame.from_dict(fahist, orient='index') #convert dictionary to dataframe
fadf.columns =['count of starting'] #rename the column header to 'starting with count'
fadf

Unnamed: 0,count of starting
a,284
b,107
c,82
d,144
e,34
f,34
g,77
h,81
i,58
j,98


### Probability of starting with an alphabet

In [215]:
pfahist = fahist #initialize histogram of probability of first alphabet
for i in pfahist:
    pfahist[i] = pfahist.get(i, 0)/items #devide count by maximum observations
    pfahist[i] = round(pfahist.get(i, 0),4) #roundinf off to four decimals
#pfahist

pfadf = pd.DataFrame.from_dict(pfahist, orient='index') #convert dictionary to dataframe
pfadf.columns =['prob of starting with'] #rename the column header to 'prob of starting with'
pfadf

Unnamed: 0,prob of starting with
a,0.1151
b,0.0434
c,0.0332
d,0.0583
e,0.0138
f,0.0138
g,0.0312
h,0.0328
i,0.0235
j,0.0397


In [216]:
#joing the starting with dataframes
swdf = pd.concat([fadf, pfadf], axis=1, sort=False)
swdf

Unnamed: 0,count of starting,prob of starting with
a,284,0.1151
b,107,0.0434
c,82,0.0332
d,144,0.0583
e,34,0.0138
f,34,0.0138
g,77,0.0312
h,81,0.0328
i,58,0.0235
j,98,0.0397


In [219]:
#joing the starting with dataframes
swdf = pd.concat([fadf, pfadf], axis=1, sort=False)
swdf

Unnamed: 0,count of starting,prob of starting with
a,284,0.1151
b,107,0.0434
c,82,0.0332
d,144,0.0583
e,34,0.0138
f,34,0.0138
g,77,0.0312
h,81,0.0328
i,58,0.0235
j,98,0.0397


### Distribution of alphabets across the entire name

In [217]:
eahist = {} #histogram of count of alphabet occuring at any part of the name
alenspace = 0
for i in names:
    for s in i[0]:
        ca = s #identify the current alphabet
        alenspace = alenspace + len(s) #total number of alphabets
        eahist[ca] = eahist.get(ca, 0) + 1 #increment the count for element ca
eahist

eadf = pd.DataFrame.from_dict(eahist, orient='index') #convert dictionary to dataframe
eadf.columns =['count of ever occuring'] #rename the column header to 'count of ever occuring'
eadf

Unnamed: 0,count of ever occuring
a,4034
c,169
h,1444
m,674
n,1498
d,776
e,669
s,1029
i,1273
f,49


### Probability of an alphabet ever occuring in a name

In [218]:
peahist = eahist #initialize histogram of probability of an alphabet ever ocurring
for i in peahist:
    peahist[i] = peahist.get(i, 0)/alenspace #devide count by maximum observations
    peahist[i] = round(peahist.get(i, 0),4) #roundinf off to four decimals
peahist

peadf = pd.DataFrame.from_dict(peahist, orient='index') #convert dictionary to dataframe
peadf.columns =['probability of ever occuring'] #rename the column header to 'count of ever occuring'
peadf

Unnamed: 0,probability of ever occuring
a,0.2253
c,0.0094
h,0.0807
m,0.0376
n,0.0837
d,0.0433
e,0.0374
s,0.0575
i,0.0711
f,0.0027


In [220]:
#joing the ever occuring dataframes
eodf = pd.concat([eadf, peadf], axis=1, sort=False)
eodf

Unnamed: 0,count of ever occuring,probability of ever occuring
a,4034,0.2253
c,169,0.0094
h,1444,0.0807
m,674,0.0376
n,1498,0.0837
d,776,0.0433
e,669,0.0374
s,1029,0.0575
i,1273,0.0711
f,49,0.0027


In [239]:
#joing the dataframes to create a master attributes data frame
madf = pd.concat([swdf, eodf], axis=1, sort=False)
madf

Unnamed: 0,count of starting,prob of starting with,count of ever occuring,probability of ever occuring
a,284.0,0.1151,4034,0.2253
b,107.0,0.0434,305,0.017
c,82.0,0.0332,169,0.0094
d,144.0,0.0583,776,0.0433
e,34.0,0.0138,669,0.0374
f,34.0,0.0138,49,0.0027
g,77.0,0.0312,267,0.0149
h,81.0,0.0328,1444,0.0807
i,58.0,0.0235,1273,0.0711
j,98.0,0.0397,349,0.0195


### Saving Master Attributes Data Frame to CSV

In [266]:
madf = madf.fillna(0) #filling 0 into nan
madf
madf.to_csv('master_attributes_dataframe.csv') #write dataframe to file

### Adding average length of names

In [275]:
# create a distribution for length of names starting with a particular alphabet
lenhist = {} #histogram of count of first alphabet
lengths = []
for i in names:
    fa = i[0][0] #identify the first alphabet
    lenhist[fa] = round((lenhist.get(fa, 0) + len(i[0]))/2,0) #increment element fa by the length of current word
lenhist

lendf = pd.DataFrame.from_dict(lenhist, orient='index') #convert dictionary to dataframe
lendf.columns =['avg name length'] #rename the column header to 'avg name length'
lendf

Unnamed: 0,avg name length
a,7.0
b,6.0
c,10.0
d,7.0
e,6.0
f,5.0
g,6.0
h,8.0
i,6.0
j,10.0


In [277]:
#joing the dataframes to create a master attributes data frame
madf = pd.concat([madf, lendf], axis=1, sort=False)
madf = madf.fillna(0) #filling 0 into nan
madf
madf.to_csv('master_attributes_dataframe.csv') #write dataframe to file