# Get actor data with celebrity divorce prediction

This notebook imports data of American film actor from the following wikipedia link:

https://en.wikipedia.org/wiki/Category:American_male_film_actors

I parse and wrangle the data of American male film actors on the wikipedia using beautifulsoap. The goal is to get the information of the actor and to see if we can determine and predict their divorce.

In [1]:
import requests
from bs4 import BeautifulSoup
import urllib.request as urllib2
import re

# Data manipulation
import pandas as pd
import numpy as np

import nltk 

## Data preprocessing and cleaning
Get links from wiki

In [2]:
def getHTMLContent(link):
    '''function that can parse the website'''
    html = urllib2.urlopen(link)
    soup = BeautifulSoup(html, 'html.parser')
    return soup

html_page = "https://en.wikipedia.org/wiki/Category:American_male_film_actors"

soap = getHTMLContent(html_page)

In [3]:
result_parse = soap.body.find('div', attrs={'style':'text-align:center'}).findAll('a')

links = [] # get all the actress' link from the website

for link in result_parse:
    links.append(link.get('href'))

In [4]:
"https:"+links[0]

'https://en.wikipedia.org/wiki/Category:American_male_film_actors?from=*'

Get links for actors' info

In [5]:
# get links for all actors' info
link_name = [] # get all the actress' link from the website

for url in links:
    get_page = getHTMLContent("https:"+ url)
    page = get_page.find('div', {'class': 'mw-category'}).findAll('a')
    
    for link in page:
        link_name.append(link.get('href'))

In [6]:
print('# of links: {}'.format(len(link_name)))

# of links: 26445


In [7]:
# save links to a csv file
df_links = pd.DataFrame(link_name, columns = ['link']);
df_links.to_csv ('export_links_actors.csv', header=True)

In [57]:
df_links.head(10)

Unnamed: 0,link
0,/wiki/50_Cent
1,/wiki/Lee_Aaker
2,/wiki/Willie_Aames
3,/wiki/Quinton_Aaron
4,/wiki/Victor_Aaron
5,/wiki/Abbott_and_Costello
6,/wiki/Bruce_Abbott
7,/wiki/Bud_Abbott
8,/wiki/Christopher_Abbott
9,/wiki/Philip_Abbott


Get information of each actor from the links, including *name, birthday, education, spouse, child, birthplace, role, their spouse's birthday, spouse's role, spouse's child*.


In [9]:
##### parse text data from wiki #####
name1 = []
bday1 = []
education1 = []
spouse1 = []
child1 =[] 
birthplace1 = []
role1=[]
bday1_spouse = []
birthplace1_spouse = []
role1_spouse = []
spouse1_spouse = []
child1_spouse = []

for url in df_links['link']:

    get_page = getHTMLContent('https://en.wikipedia.org' + url)
    table = get_page.find('table', {'class': 'infobox biography vcard'})
    try:
        name = table.find("div", { "class" : "fn" }).text
        name1.append(name)       
    except:
        name1.append(np.nan)
    try:
        bday = table.find("span", { "class" : "bday" }).text
        bday1.append(bday)
    except:
        bday1.append(np.nan)
    try:
        education = table.find("th", text="Education").find_next_sibling("td").text
        education1.append(education)
    except:
        education1.append(np.nan)
    try:
        spouse = table.find("span", { "class" : "nowrap"}).findNext('td').get_text(separator=" ")
        spouse1.append(spouse)
    except:
        spouse1.append(np.nan)
    try:
        child = table.find("th", text="Children").find_next_sibling("td").text
        child1.append(child)
    except:
        child1.append(np.nan)
    try:
        birthplace = table.find("div", { "class" : "birthplace"}).text
        birthplace1.append(birthplace)
    except:
        birthplace1.append(np.nan)
    try:
        role = table.find("td", { "class" : "role" }).text
        role1.append(role)
    except:
        role1.append(np.nan)
    
    # first spouse information
    
    try:
        spouse_1 = table.find("div", { "style" : "display:inline-block;line-height:normal;margin:2px 0px;"}).findAll('a')#.get('href')
        spouse_link = [link.get('href') for link in spouse_1]
        get_spouse_link = getHTMLContent('https://en.wikipedia.org' + spouse_link[0])
        table_spouse = get_spouse_link.find('table', {'class': 'infobox biography vcard'})
        
        bday_spouse = table_spouse.find("span", { "class" : "bday" }).text
        bday1_spouse.append(bday_spouse)
        
    except:
        bday1_spouse.append(np.nan)
        
    try:
        spouse_1 = table.find("div", { "style" : "display:inline-block;line-height:normal;margin:2px 0px;"}).findAll('a')#.get('href')
        spouse_link = [link.get('href') for link in spouse_1]
        get_spouse_link = getHTMLContent('https://en.wikipedia.org' + spouse_link[0])
        table_spouse = get_spouse_link.find('table', {'class': 'infobox biography vcard'})
        
        birthplace_spouse = table_spouse.find("div", { "class" : "birthplace" }).text
        birthplace1_spouse.append(birthplace_spouse)
    except:
        birthplace1_spouse.append(np.nan)
    try:
        spouse_1 = table.find("div", { "style" : "display:inline-block;line-height:normal;margin:2px 0px;"}).findAll('a')#.get('href')
        spouse_link = [link.get('href') for link in spouse_1]
        get_spouse_link = getHTMLContent('https://en.wikipedia.org' + spouse_link[0])
        table_spouse = get_spouse_link.find('table', {'class': 'infobox biography vcard'})
        
        role_spouse = table_spouse.find("td", { "class" : "role" }).text
        role1_spouse.append(role_spouse)
    except:
        role1_spouse.append(np.nan) 
    try:
        spouse_1 = table.find("div", { "style" : "display:inline-block;line-height:normal;margin:2px 0px;"}).findAll('a')#.get('href')
        spouse_link = [link.get('href') for link in spouse_1]
        get_spouse_link = getHTMLContent('https://en.wikipedia.org' + spouse_link[0])
        table_spouse = get_spouse_link.find('table', {'class': 'infobox biography vcard'})
        
        spouse_spouse = table_spouse.find("span", { "class" : "nowrap"}).findNext('td').get_text(separator=" ")
        spouse1_spouse.append(spouse_spouse)
    except:
        spouse1_spouse.append(np.nan) 
    try:
        spouse_1 = table.find("div", { "style" : "display:inline-block;line-height:normal;margin:2px 0px;"}).findAll('a')#.get('href')
        spouse_link = [link.get('href') for link in spouse_1]
        get_spouse_link = getHTMLContent('https://en.wikipedia.org' + spouse_link[0])
        table_spouse = get_spouse_link.find('table', {'class': 'infobox biography vcard'})
        
        child_spouse = table_spouse.find("th", text="Children").find_next_sibling("td").text
        child1_spouse.append(child_spouse)
    except:
        child1_spouse.append(np.nan) 

In [19]:
# convert to pandas dataformat
df = pd.DataFrame({'name': name1, 'bday':bday1, 'name_sp':spouse1, 'num_of_child':child1,\
                   'education':education1,'role':role1,\
                   'birthplace':birthplace1,\
                  'bday_sp':bday1_spouse, 'birthplace_sp': birthplace1_spouse, 'role_sp':role1_spouse,\
                  'name_sp_sp':spouse1_spouse, 'num_of_child_sp':child1_spouse});

In [20]:
# clean nan data and reset index
df = df.dropna(axis=0, how = 'all') # drop the row has all nan
df.head()

Unnamed: 0,name,bday,name_sp,num_of_child,education,role,birthplace,bday_sp,birthplace_sp,role_sp,name_sp_sp,num_of_child_sp
0,50 Cent,1975-07-06,\n G-Unit \n Dr. Dre \n Eminem \n The Game \n ...,2.0,,\nRapper\nsinger\nsongwriter\nactor\nentrepren...,"New York City, New York, U.S.",,,,,
1,Lee Aaker,1943-09-25,Sharon Ann Hamilton (1969-71) (divorced),,,"Actor, producer, carpenter, ski instructor","Los Angeles, California, U.S.",,,,,
2,Willie Aames,1960-07-15,Vicki Weatherman ( m. 1979–1984) Maylo McCasl...,2.0,,"Actor, director, television producer, screenwr...","Newport Beach, California, U.S.",,,,,
3,Quinton Aaron,1984-08-15,,,,Actor,"The Bronx, New York, U.S.",,,,,
4,Victor Aaron,1956-09-11,Eduvina Matta (m. 1974–1984) divorced,2.0,,"Actor, voice actor","Odessa, Texas, U.S.",,,,,


In [21]:
# count missing data
print(df.isnull().sum())
print("total number of actress : {}".format(len(df)) )

name                   8
bday                2524
name_sp            11889
num_of_child       16104
education          21189
role                2155
birthplace          1076
bday_sp            21270
birthplace_sp      21193
role_sp            21305
name_sp_sp         21175
num_of_child_sp    21602
dtype: int64
total number of actress : 22947


lots of missing data!

In [59]:
# save links to a csv file
df.to_csv ('export_df_actor.csv', header=True)

In [2]:
df = pd.read_csv('export_df_actor.csv')

### Calculate age of actor and age difference with his spouse
Age difference between the spouses at their first marriage

In [22]:
# Calculate the ages
from time import gmtime, strftime
time_now = pd.to_datetime(strftime("%Y-%m-%d", gmtime()))

df['bday'] = pd.to_datetime(df['bday']);
df['bday_sp'] = pd.to_datetime(df['bday_sp']);

df['age_diff'] = (df['bday']-df['bday_sp'])/np.timedelta64(1,'Y')
df['age']= (time_now - df['bday'])/np.timedelta64(1,'Y')
df['age_spouse']= (time_now - df['bday_sp'])/np.timedelta64(1,'Y')

In [15]:
df.head()

Unnamed: 0,name,bday,name_sp,num_of_child,education,role,birthplace,bday_sp,birthplace_sp,role_sp,name_sp_sp,num_of_child_sp,age_diff,age,age_spouse
0,50 Cent,1975-07-06,\n G-Unit \n Dr. Dre \n Eminem \n The Game \n ...,2.0,,\nRapper\nsinger\nsongwriter\nactor\nentrepren...,"New York City, New York, U.S.",,,,,,,43.836629,
1,Lee Aaker,1943-09-25,Sharon Ann Hamilton (1969-71) (divorced),,,"Actor, producer, carpenter, ski instructor","Los Angeles, California, U.S.",,,,,,,75.615516,
2,Willie Aames,1960-07-15,Vicki Weatherman ( m. 1979–1984) Maylo McCasl...,2.0,,"Actor, director, television producer, screenwr...","Newport Beach, California, U.S.",,,,,,,58.810243,
3,Quinton Aaron,1984-08-15,,,,Actor,"The Bronx, New York, U.S.",,,,,,,34.724875,
4,Victor Aaron,1956-09-11,Eduvina Matta (m. 1974–1984) divorced,2.0,,"Actor, voice actor","Odessa, Texas, U.S.",,,,,,,62.651526,


### Clean text
Helper functions for cleaning the text

In [18]:
REPLACE_BY_SPACE_RE = re.compile('[/{}\[\]\|.,;]')
BAD_SYMBOLS_RE = re.compile('[^0-9a-z #-+_]')
REPLACE_BY_a = re.compile('[(]')
REPLACE_BY_b = re.compile('[)]')
REPLACE_BY_c = re.compile('[–]')
REPLACE_BY_c = re.compile('[–-]')

def clean_text(text):
    """
        text: a string of the spouse column
        return: modified initial string
    """
    #text = text.lower() # lowercase text
    text = REPLACE_BY_SPACE_RE.sub(' ', text) # replace REPLACE_BY_SPACE_RE symbols by space in text. substitute the matched string in REPLACE_BY_SPACE_RE with space.
    #text = BAD_SYMBOLS_RE.sub(' ', text) # remove symbols which are in BAD_SYMBOLS_RE from text. substitute the matched string in BAD_SYMBOLS_RE with nothing. 
    text = REPLACE_BY_a.sub(' ( ', text)
    text = REPLACE_BY_b.sub(' ) ', text)
    text = REPLACE_BY_c.sub(' ', text)
    text = re.sub(r"divorced", " div ", text)
    text = re.sub(r"divorce", " div ", text)
    text = re.sub(r"annulled", " div ", text)
    text = re.sub(r"separated", " div ", text)
    text = re.sub(r" sep ", " div ", text)
    text = re.sub(r" ann ", " div ", text)
    text = re.sub(r"his death", " de ", text)
    text = re.sub(r"died", " de ", text)
    
    #text = ' '.join(word for word in text.split() if word not in STOPWORDS) # remove stopwors from text
    return text

def clean_text_role(text):
    """
        text: a string of the role column
        remove stopword, lower case
        return: modified initial string
    """
    import nltk
    from nltk.corpus import stopwords
    try:
        text = text.lower() # lowercase text
        text = re.sub(r"and", " ", text)
        text = re.sub(r"U.S.", " US ", text)
        text = re.sub(r"divorce", " div ", text)
    except:text = np.nan
    return text

def clean_text_child(text):
    """
        text: a string of child column
        remove stopword, lower case
        return: modified initial string
    """
    import nltk
    from nltk.corpus import stopwords
    try:
        
        text = str(text)
        text = text.lower() # lowercase text
        text = re.compile('[^0-9]').sub(' ', text) # replace REPLACE_BY_SPACE_RE symbols by space in text. substitute the matched string in REPLACE_BY_SPACE_RE with space.
        text = int(text)
        if text > 100:
            text = np.nan
        else: text = int(text)
            
    except:
        text = np.nan 

    return text

In [19]:
def spouse_m_d(data):
    '''
    clean text and return spouse name, marraged year, and divoice year
    only return the result of marriage less than 5 times
    input: array for spouse_f column before 'clean_text' function
    output: spouse name, marraged year, divoice year (list)
    '''
    try:
        text = clean_text(data).split()
        indices_m = [i for i, x in enumerate(text) if x == "m"]
        indices_div = [i for i, x in enumerate(text) if x == "div"]
        indices1 = [i for i, x in enumerate(text) if x == "("]
        indices2 = [i for i, x in enumerate(text) if x == ")"]

        if len(indices_m) == 0:
            name = np.nan
            year_m = np.nan
        elif len(indices_m) == 1:
            name = [" ".join(text[0:indices1[0]])]
            year_m =[int(text[indices_m[0]+1])]
        elif len(indices_m) == 2:
            name = [" ".join(text[0:indices1[0]]), " ".join(text[indices2[0]+1:indices1[1]])]
            year_m = [int(text[indices_m[0]+1]), int(text[indices_m[1]+1])]
        elif len(indices_m) == 3:
            name = [" ".join(text[0:indices1[0]]), " ".join(text[indices2[0]+1:indices1[1]]),\
                    " ".join(text[indices2[1]+1:indices1[2]])]
                             
            year_m = [int(text[indices_m[0]+1]), int(text[indices_m[1]+1]), int(text[indices_m[2]+1])]
        elif len(indices_m) == 4:
            name = [" ".join(text[0:indices1[0]]), " ".join(text[indices2[0]+1:indices1[1]]),\
                    " ".join(text[indices2[1]+1:indices1[2]]), " ".join(text[indices2[2]+1:indices1[3]])]
            year_m = [int(text[indices_m[0]+1]), int(text[indices_m[1]+1]), int(text[indices_m[2]+1]), int(text[indices_m[3]+1])]
        elif len(indices_m) == 5:
            name = [" ".join(text[0:indices1[0]]), " ".join(text[indices2[0]+1:indices1[1]]),\
                    " ".join(text[indices2[1]+1:indices1[2]]), " ".join(text[indices2[2]+1:indices1[3]]),\
                   " ".join(text[indices2[3]+1:indices1[4]])]
            year_m = [int(text[indices_m[0]+1]), int(text[indices_m[1]+1]), int(text[indices_m[2]+1]), \
                     int(text[indices_m[3]+1]), int(text[indices_m[4]+1])]
        else:
            name = "more than 5"
            year_m = np.nan
    
        
        if len(indices_div) == 0:
            year_d = np.nan
        elif len(indices_div) == 1:
            year_d = [int(text[indices_div[0]+1])]
        elif len(indices_div) == 2:
            year_d = [int(text[indices_div[0]+1]), int(text[indices_div[1]+1])]
        elif len(indices_div) == 3:
            year_d = [int(text[indices_div[0]+1]), int(text[indices_div[1]+1]), int(text[indices_div[2]+1])]
        elif len(indices_div) == 4:
            year_d = [int(text[indices_div[0]+1]), int(text[indices_div[1]+1]), int(text[indices_div[2]+1]), int(text[indices_div[3]+1])]                                                                                       
        elif len(indices_div) == 5:
            year_d = [int(text[indices_div[0]+1]), int(text[indices_div[1]+1]), int(text[indices_div[2]+1]), \
                     int(text[indices_div[3]+1]),  int(text[indices_div[4]+1])]  
        else:
            year_d = np.nan

    except:
        name = np.nan
        year_m = np.nan
        year_d = np.nan
    
    return (name, year_m, year_d)

Clean text of *spouse name* and calculate *year of marriaged*, *year of divoice*, and *number of childs*

In [20]:
name_spouse = []
year_m = []
year_div = []
name_sp_sp = []
year_m_sp = []
year_div_sp = []
num_of_child = []
num_of_child_sp = []


for i in range(len(df)):
    a_name, a_year_m, a_year_d = spouse_m_d(df['name_sp'][i])
    a_name_sp, a_year_m_sp, a_year_d_sp = spouse_m_d(df['name_sp_sp'][i])
    
    a_child = clean_text_child(df['num_of_child'][i])
    a_child_sp  = clean_text_child(df['num_of_child_sp'][i])
    
    name_spouse.append(a_name); year_m.append(a_year_m);year_div.append(a_year_d);
    name_sp_sp.append(a_name_sp); year_m_sp.append(a_year_m_sp);year_div_sp.append(a_year_d_sp);
    num_of_child.append(a_child); 
    num_of_child_sp.append(a_child_sp);


dict = {'name_sp_cl':name_spouse, 'year_m':year_m, 'year_div': year_div, \
        'name_sp_sp_cl':name_sp_sp, 'year_m_sp':year_m_sp, 'year_div_sp':year_div_sp,\
        'num_of_child_cl':num_of_child , 'num_of_child_sp_cl':num_of_child_sp}

df_spouse_info = pd.DataFrame(dict) 

In [21]:
# combine dataframe
df = pd.concat([df, df_spouse_info], axis=1)
df.head()

Unnamed: 0,name,bday,name_sp,num_of_child,education,role,birthplace,bday_sp,birthplace_sp,role_sp,...,age,age_spouse,name_sp_cl,year_m,year_div,name_sp_sp_cl,year_m_sp,year_div_sp,num_of_child_cl,num_of_child_sp_cl
0,50 Cent,1975-07-06,\n G-Unit \n Dr. Dre \n Eminem \n The Game \n ...,2.0,,\nRapper\nsinger\nsongwriter\nactor\nentrepren...,"New York City, New York, U.S.",,,,...,43.836629,,,,,,,,2.0,
1,Lee Aaker,1943-09-25,Sharon Ann Hamilton (1969-71) (divorced),,,"Actor, producer, carpenter, ski instructor","Los Angeles, California, U.S.",,,,...,75.615516,,,,,,,,,
2,Willie Aames,1960-07-15,Vicki Weatherman ( m. 1979–1984) Maylo McCasl...,2.0,,"Actor, director, television producer, screenwr...","Newport Beach, California, U.S.",,,,...,58.810243,,"[Vicki Weatherman, Maylo McCaslin, Winnie Hung]","[1979, 1986, 2014]",,,,,2.0,
3,Quinton Aaron,1984-08-15,,,,Actor,"The Bronx, New York, U.S.",,,,...,34.724875,,,,,,,,,
4,Victor Aaron,1956-09-11,Eduvina Matta (m. 1974–1984) divorced,2.0,,"Actor, voice actor","Odessa, Texas, U.S.",,,,...,62.651526,,,,,,,,2.0,


### number of marriage
Calculate number of marriage of actor and his spouse. This was calculated based on number of his spouse.

In [25]:
df_s = pd.DataFrame()
df_s['name_sp_cl'] = df['name_sp_cl']

df_s.dropna(inplace = True)
df_s['num_of_m'] = df['name_sp_cl'].str.len()
df_s.head()

Unnamed: 0,name_sp_cl,num_of_m
2,"[Vicki Weatherman, Maylo McCaslin, Winnie Hung]",3.0
5,"[Linda Hamilton, Kathleen Quinlan]",2.0
6,[Betty Smith],1.0
8,[Jane Dufrayne],1.0
10,[Allie Wood],1.0


In [26]:
df_ss = pd.DataFrame()
df_ss['name_sp_sp_cl'] = df['name_sp_sp_cl']

df_ss.dropna(inplace = True)
df_ss['num_of_m_sp'] = df['name_sp_sp_cl'].str.len()
df_ss.head()

Unnamed: 0,name_sp_sp_cl,num_of_m_sp
5,"[Bruce Abbott, James Cameron]",2.0
17,"[Jonathan Lemkin, Kirk Acevedo]",2.0
18,[Jensen Ackles 1],1.0
30,[Nick Adams],1.0
40,[Paul Adelstein],1.0


In [27]:
df = pd.concat([df, df_s, df_ss], axis=1)
df.head()

Unnamed: 0,name,bday,name_sp,num_of_child,education,role,birthplace,bday_sp,birthplace_sp,role_sp,...,year_div,name_sp_sp_cl,year_m_sp,year_div_sp,num_of_child_cl,num_of_child_sp_cl,name_sp_cl,num_of_m,name_sp_sp_cl.1,num_of_m_sp
0,50 Cent,1975-07-06,\n G-Unit \n Dr. Dre \n Eminem \n The Game \n ...,2.0,,\nRapper\nsinger\nsongwriter\nactor\nentrepren...,"New York City, New York, U.S.",,,,...,,,,,2.0,,,,,
1,Lee Aaker,1943-09-25,Sharon Ann Hamilton (1969-71) (divorced),,,"Actor, producer, carpenter, ski instructor","Los Angeles, California, U.S.",,,,...,,,,,,,,,,
2,Willie Aames,1960-07-15,Vicki Weatherman ( m. 1979–1984) Maylo McCasl...,2.0,,"Actor, director, television producer, screenwr...","Newport Beach, California, U.S.",,,,...,,,,,2.0,,"[Vicki Weatherman, Maylo McCaslin, Winnie Hung]",3.0,,
3,Quinton Aaron,1984-08-15,,,,Actor,"The Bronx, New York, U.S.",,,,...,,,,,,,,,,
4,Victor Aaron,1956-09-11,Eduvina Matta (m. 1974–1984) divorced,2.0,,"Actor, voice actor","Odessa, Texas, U.S.",,,,...,,,,,2.0,,,,,


### number of role
calculate the number of role of actor and his spouse

In [28]:
num_of_role = []
for i in range(len(df)):
    try:
        x = len(df['role'].values.tolist()[i].split())
        num_of_role.append(x)
    except:
        num_of_role.append(np.nan)
        
df_nor = pd.DataFrame({'num_of_role': num_of_role})#

In [29]:
num_of_role_sp = []

for i in range(len(df)):
    try:
        x = len(df['role_sp'].values.tolist()[i].split())
        num_of_role_sp.append(x)
    except:
        num_of_role_sp.append(np.nan)
df_nor_sp = pd.DataFrame({'num_of_role_sp': num_of_role_sp})#

In [30]:
df = pd.concat([df, df_nor, df_nor_sp], axis=1)
df.head()

Unnamed: 0,name,bday,name_sp,num_of_child,education,role,birthplace,bday_sp,birthplace_sp,role_sp,...,year_m_sp,year_div_sp,num_of_child_cl,num_of_child_sp_cl,name_sp_cl,num_of_m,name_sp_sp_cl,num_of_m_sp,num_of_role,num_of_role_sp
0,50 Cent,1975-07-06,\n G-Unit \n Dr. Dre \n Eminem \n The Game \n ...,2.0,,\nRapper\nsinger\nsongwriter\nactor\nentrepren...,"New York City, New York, U.S.",,,,...,,,2.0,,,,,,8.0,
1,Lee Aaker,1943-09-25,Sharon Ann Hamilton (1969-71) (divorced),,,"Actor, producer, carpenter, ski instructor","Los Angeles, California, U.S.",,,,...,,,,,,,,,5.0,
2,Willie Aames,1960-07-15,Vicki Weatherman ( m. 1979–1984) Maylo McCasl...,2.0,,"Actor, director, television producer, screenwr...","Newport Beach, California, U.S.",,,,...,,,2.0,,"[Vicki Weatherman, Maylo McCaslin, Winnie Hung]",3.0,,,5.0,
3,Quinton Aaron,1984-08-15,,,,Actor,"The Bronx, New York, U.S.",,,,...,,,,,,,,,1.0,
4,Victor Aaron,1956-09-11,Eduvina Matta (m. 1974–1984) divorced,2.0,,"Actor, voice actor","Odessa, Texas, U.S.",,,,...,,,2.0,,,,,,3.0,


### Birthplace
Get "latitude" and "longitude" from birthplace and convert them to geohash

In [32]:
def get_geohash(address, api_key=None, return_full_response=False):
    """
    Get geocode results from Google Maps Geocoding API.
    
    Note, that in the case of multiple google geocode reuslts, this function returns details of the FIRST result.
    
    @param address: String address as accurate as possible. For Example "18 Grafton Street, Dublin, Ireland"
    @param api_key: String API key if present from google. 
                    If supplied, requests will use your allowance from the Google API. If not, you
                    will be limited to the free usage of 2500 requests per day.
    @param return_full_response: Boolean to indicate if you'd like to return the full response from google. This
                    is useful if you'd like additional location details for storage or parsing later.
    """
    import geohash
    if address != np.nan:

        # Set up your Geocoding url
        geocode_url = "https://maps.googleapis.com/maps/api/geocode/json?address={}".format(address)
        api_key = 'AIzaSyA7SMzj0ewhMeObO2EtLDRFDDJcDEUaj4E'
        if api_key is not None:
            geocode_url = geocode_url + "&key={}".format(api_key)
        
        # Ping google for the reuslts:
        results = requests.get(geocode_url)
        # Results will be in JSON format - convert to dict using requests functionality
        results = results.json()
    
        # if there's no results or an error, return empty results.
        if len(results['results']) == 0:
            output = {
                "formatted_address" : np.nan,
                "latitude": np.nan,
                "longitude": np.nan,
            }
        else:    
            answer = results['results'][0]
            output = {
                "latitude": answer.get('geometry').get('location').get('lat'),
                "longitude": answer.get('geometry').get('location').get('lng'),
            }
        

        if output["latitude"]== np.nan and output["longitude"] == np.nan:
            geohash = np.nan
        else:
            geohash = geohash.encode(output["latitude"],output["longitude"],7)
    else:
        geohash = np.nan
    return geohash

In [33]:
# convert birthplace to geohash
geohash = []
for i in range(len(df)):
    try:
        geohash.append(get_geohash(df['birthplace'].values.tolist()[i]))
    except:
        geohash.append(np.nan)        
df_geo1 = pd.DataFrame({'geohash':geohash})

In [37]:
# convert birthplace (spouse) to geohash
geohash_sp = []
for i in range(len(df)):
    try:
        geohash_sp.append(get_geohash(df['birthplace_sp'].values.tolist()[i]))
    except:
        geohash_sp.append(np.nan)

df_geo2 = pd.DataFrame({'geohash_sp':geohash_sp})

In [44]:
df_geo1.loc[df_geo1['geohash'] == 'dn5bpxt'] = np.nan # fill missing value with np.nan
df_geo2.loc[df_geo2['geohash_sp'] == 'dn5bpxt'] = np.nan # fill missing value with np.nan
df= pd.concat([df, df_geo1, df_geo2], axis=1)

In [45]:
import pygeohash as pgh

# find distance between birthplace of two spouses
df_geo = df[["geohash","geohash_sp"]]
df_geo = df_geo.dropna(how='any', axis=0)

geo_distance = []
for i in range(len(df)):
    try:
        x = pgh.geohash_approximate_distance(df_geo['geohash'][i], df_geo['geohash_sp'][i]) # in meter
        geo_distance.append(x)
    except:
        geo_distance.append(np.nan)

geo_distance = pd.DataFrame({'geo_distance ': geo_distance })#
df= pd.concat([df, geo_distance], axis=1)

In [48]:
df.head()

Unnamed: 0,name,bday,name_sp,num_of_child,education,role,birthplace,bday_sp,birthplace_sp,role_sp,...,num_of_child_sp_cl,name_sp_cl,num_of_m,name_sp_sp_cl,num_of_m_sp,num_of_role,num_of_role_sp,geohash,geohash_sp,geo_distance
0,50 Cent,1975-07-06,\n G-Unit \n Dr. Dre \n Eminem \n The Game \n ...,2.0,,\nRapper\nsinger\nsongwriter\nactor\nentrepren...,"New York City, New York, U.S.",,,,...,,,,,,8.0,,dr5regw,,
1,Lee Aaker,1943-09-25,Sharon Ann Hamilton (1969-71) (divorced),,,"Actor, producer, carpenter, ski instructor","Los Angeles, California, U.S.",,,,...,,,,,,5.0,,9q5ctr1,,
2,Willie Aames,1960-07-15,Vicki Weatherman ( m. 1979–1984) Maylo McCasl...,2.0,,"Actor, director, television producer, screenwr...","Newport Beach, California, U.S.",,,,...,,"[Vicki Weatherman, Maylo McCaslin, Winnie Hung]",3.0,,,5.0,,9mupk2n,,
3,Quinton Aaron,1984-08-15,,,,Actor,"The Bronx, New York, U.S.",,,,...,,,,,,1.0,,dr72r5t,,
4,Victor Aaron,1956-09-11,Eduvina Matta (m. 1974–1984) divorced,2.0,,"Actor, voice actor","Odessa, Texas, U.S.",,,,...,,,,,,3.0,,9txjnt6,,


### Zodiacal sign 
Get Zodiacal sign of actor and his spouse from birthday

In [50]:
import os
from datetime import date
def Zodiacal(Year, Month, Day):

    os.system("cls")

    '''Capricorn:0, aquarium:1,Pices:2, Aries:3, Taurus:4, Gemini:5, Cancer:6, Leo:7, Virgo:8,\
        Libra: 9, Scorpio:10, Sagittarius:11
    '''
    try:
        if ((int(Month)==12 and int(Day) >= 22)or(int(Month)==1 and int(Day)<= 19)):
            zodiac_sign = int(0)#("\n Capricorn")
        elif ((int(Month)==1 and int(Day) >= 20)or(int(Month)==2 and int(Day)<= 17)):
            zodiac_sign = int(1)#("\n aquarium")
        elif ((int(Month)==2 and int(Day) >= 18)or(int(Month)==3 and int(Day)<= 19)):
            zodiac_sign = int(2)#("\n Pices")
        elif ((int(Month)==3 and int(Day) >= 20)or(int(Month)==4 and int(Day)<= 19)):
            zodiac_sign = int(3)#("\n Aries")
        elif ((int(Month)==4 and int(Day) >= 20)or(int(Month)==5 and int(Day)<= 20)):
            zodiac_sign = int(4)#("\n Taurus")
        elif ((int(Month)==5 and int(Day) >= 21)or(int(Month)==6 and int(Day)<= 20)):
            zodiac_sign = int(5)#("\n Gemini")
        elif ((int(Month)==6 and int(Day) >= 21)or(int(Month)==7 and int(Day)<= 22)):
            zodiac_sign = int(6)#("\n Cancer")
        elif ((int(Month)==7 and int(Day) >= 23)or(int(Month)==8 and int(Day)<= 22)): 
            zodiac_sign = int(7)#("\n Leo")
        elif ((int(Month)==8 and int(Day) >= 23)or(int(Month)==9 and int(Day)<= 22)): 
            zodiac_sign = int(8)#("\n Virgo")
        elif ((int(Month)==9 and int(Day) >= 23)or(int(Month)==10 and int(Day)<= 22)):
            zodiac_sign = int(9)#("\n Libra")
        elif ((int(Month)==10 and int(Day) >= 23)or(int(Month)==11 and int(Day)<= 21)): 
            zodiac_sign = int(10)#("\n Scorpio")
        elif ((int(Month)==11 and int(Day) >= 22)or(int(Month)==12 and int(Day)<= 21)):
            zodiac_sign = int(11)#("\n Sagittarius")
    except:
        zodiac_sign = np.nan
    return(zodiac_sign)

In [51]:
# find the zodiac sign from birthday
zodiac = []

for i in range(len(df['bday'])):
    Year = pd.to_datetime(df['bday'].values[i]).year
    Month = pd.to_datetime(df['bday'].values[i]).month
    Day = pd.to_datetime(df['bday'].values[i]).day

    zodiac.append(Zodiacal(Year, Month, Day))

df_zod = pd.DataFrame({'zodiac':zodiac})

In [52]:
# find the zodiac sign from birthday (spouse)
zodiac_sp = []

for i in range(len(df['bday_sp'])):
    Year = pd.to_datetime(df['bday_sp'].values[i]).year
    Month = pd.to_datetime(df['bday_sp'].values[i]).month
    Day = pd.to_datetime(df['bday_sp'].values[i]).day

    zodiac_sp.append(Zodiacal(Year, Month, Day))

df_zod_sp = pd.DataFrame({'zodiac_sp':zodiac_sp})

In [53]:
df= pd.concat([df, df_zod, df_zod_sp], axis=1)

In [54]:
df.columns

Index(['name', 'bday', 'name_sp', 'num_of_child', 'education', 'role',
       'birthplace', 'bday_sp', 'birthplace_sp', 'role_sp', 'name_sp_sp',
       'num_of_child_sp', 'age_diff', 'age', 'age_spouse', 'name_sp_cl',
       'year_m', 'year_div', 'name_sp_sp_cl', 'year_m_sp', 'year_div_sp',
       'num_of_child_cl', 'num_of_child_sp_cl', 'name_sp_cl', 'num_of_m',
       'name_sp_sp_cl', 'num_of_m_sp', 'num_of_role', 'num_of_role_sp',
       'geohash', 'geohash_sp', 'geo_distance ', 'zodiac', 'zodiac_sp'],
      dtype='object')

In [55]:
df['sex'] = 'M' # add sex as a column 

In [59]:
# save links to a csv file
df.to_csv ('export_df_actor.csv', header=True)

In [60]:
df.describe()

Unnamed: 0,age_diff,age,age_spouse,num_of_child_cl,num_of_child_sp_cl,num_of_m,num_of_m_sp,num_of_role,num_of_role_sp,geo_distance,zodiac,zodiac_sp
count,1647.0,20423.0,1677.0,5570.0,1136.0,5624.0,1488.0,20792.0,1642.0,1743.0,20423.0,1677.0
mean,-3.013727,74.454506,69.427794,2.47684,2.108275,1.53325,1.661962,2.23509,1.989038,11820280.0,5.568722,5.443649
std,6.169619,32.732983,26.787046,1.421907,0.967162,0.967163,1.264317,1.740636,1.404225,8558806.0,3.393534,3.339744
min,-30.418147,12.64913,23.2065,1.0,1.0,1.0,1.0,1.0,1.0,118.0,0.0,0.0
25%,-6.813282,47.743622,46.292532,2.0,1.0,1.0,1.0,1.0,1.0,5003530.0,3.0,2.0
50%,-2.672197,68.505171,64.718646,2.0,2.0,1.0,1.0,1.0,1.0,20000000.0,6.0,5.0
75%,0.659836,98.73714,87.881339,3.0,3.0,2.0,2.0,3.0,3.0,20000000.0,8.0,9.0
max,17.016092,175.740775,145.026934,12.0,6.0,11.0,11.0,23.0,11.0,20000000.0,11.0,11.0


### Work on divorce (as a label)
Determine whether actor is divorce or not by the number of marrage. If 'num_of_m'=1, we can assume they have never divorced. If 'num_of_m' >1, we can assume they have divorced.

In [90]:
marriage1 = df[df['num_of_m']==1][['name_sp']]

In [91]:
marriage1 = marriage1.reset_index()

In [94]:
marriage1.head()

Unnamed: 0,index,name_sp
0,6,Betty Smith ( m. 1918)
1,8,Jane Dufrayne ( m. 1950–1998)
2,10,Allie Wood ( m. 2013)
3,13,Kate Hannan ( m. 1962)
4,17,Kiersten Warren ( m. 2005)


In [95]:
# search keyword to determine whether divorce or not
import re
divorce =[]
for i in range(len(marriage1)):
    if re.search("div.", marriage1['name_sp'][i]) or \
    re.search("–", marriage1['name_sp'][i]) or \
    re.search("divorced", marriage1['name_sp'][i]) or\
    re.search("sep.", marriage1['name_sp'][i]) or\
    re.search("separated:", marriage1['name_sp'][i]):
        divorce.append(1)
    else:                     
        divorce.append(0)

In [96]:
divorce = pd.DataFrame({'divorce':divorce})

In [97]:
#x = x.drop(['divorce'], axis=1)
marriage1= pd.concat([marriage1, divorce], axis=1)

In [98]:
marriage1a = marriage1.set_index('index')

In [99]:
marriage1a=marriage1a.drop(['name_sp'], axis=1 )
marriage1a.head(10)

Unnamed: 0_level_0,divorce
index,Unnamed: 1_level_1
6,0
8,1
10,0
13,0
17,0
18,0
19,0
29,0
30,1
31,0


In [100]:
df1 = df
df1 = pd.concat([df1, marriage1a], axis=1, sort=False)

In [103]:
df1.loc[df1['num_of_m']>1, ['divorce']] = 1

In [108]:
df1[['divorce', 'name_sp', 'name_sp_cl','num_of_m']].head(20)

Unnamed: 0,divorce,name_sp,name_sp_cl,name_sp_cl.1,num_of_m
0,,\n G-Unit \n Dr. Dre \n Eminem \n The Game \n ...,,,
1,,Sharon Ann Hamilton (1969-71) (divorced),,,
2,1.0,Vicki Weatherman ( m. 1979–1984) Maylo McCasl...,"[Vicki Weatherman, Maylo McCaslin, Winnie Hung]","[Vicki Weatherman, Maylo McCaslin, Winnie Hung]",3.0
3,,,,,
4,,Eduvina Matta (m. 1974–1984) divorced,,,
5,1.0,Linda Hamilton ( m. 1982; div. 1989) Kathle...,"[Linda Hamilton, Kathleen Quinlan]","[Linda Hamilton, Kathleen Quinlan]",2.0
6,0.0,Betty Smith ( m. 1918),[Betty Smith],[Betty Smith],1.0
7,,,,,
8,1.0,Jane Dufrayne ( m. 1950–1998),[Jane Dufrayne],[Jane Dufrayne],1.0
9,,,,,


In [112]:
# save links to a csv file
df1.to_csv('export_df_actor_div.csv', header=True)

In [113]:
df1.head()

Unnamed: 0,name,bday,name_sp,num_of_child,education,role,birthplace,bday_sp,birthplace_sp,role_sp,...,num_of_m_sp,num_of_role,num_of_role_sp,geohash,geohash_sp,geo_distance,zodiac,zodiac_sp,sex,divorce
0,50 Cent,1975-07-06,\n G-Unit \n Dr. Dre \n Eminem \n The Game \n ...,2.0,,\nRapper\nsinger\nsongwriter\nactor\nentrepren...,"New York City, New York, U.S.",,,,...,,8.0,,dr5regw,,,6.0,,M,
1,Lee Aaker,1943-09-25,Sharon Ann Hamilton (1969-71) (divorced),,,"Actor, producer, carpenter, ski instructor","Los Angeles, California, U.S.",,,,...,,5.0,,9q5ctr1,,,9.0,,M,
2,Willie Aames,1960-07-15,Vicki Weatherman ( m. 1979–1984) Maylo McCasl...,2.0,,"Actor, director, television producer, screenwr...","Newport Beach, California, U.S.",,,,...,,5.0,,9mupk2n,,,6.0,,M,1.0
3,Quinton Aaron,1984-08-15,,,,Actor,"The Bronx, New York, U.S.",,,,...,,1.0,,dr72r5t,,,7.0,,M,
4,Victor Aaron,1956-09-11,Eduvina Matta (m. 1974–1984) divorced,2.0,,"Actor, voice actor","Odessa, Texas, U.S.",,,,...,,3.0,,9txjnt6,,,8.0,,M,
