# Acquiring Neighborhood URLs.

In [356]:
import pandas as pd
import numpy as np
import time
import requests
from bs4 import BeautifulSoup

In [357]:
# Creates one list of neighborhood names and another list of URLs for those neighborhoods.
def get_neighborhood_list(url, *, timeout=30):
    """Collect the Fort Worth, TX neighborhoods listed on one index page.

    Args:
        url: Address of a neighborhood index page.
        timeout: Seconds to wait for the server before giving up, so a
            stalled connection cannot hang the scrape indefinitely.

    Returns:
        A list of ``{'Name': ..., 'URL': ...}`` dictionaries, one for each
        inspected entry whose text contains "Fort Worth, TX".

    Raises:
        requests.RequestException: If the request times out or the server
            answers with an error status.
        IndexError: If the page has no element with class ``index``.
    """
    page = requests.get(url, timeout=timeout)
    # Fail loudly on 4xx/5xx responses instead of parsing an error page.
    page.raise_for_status()
    soup = BeautifulSoup(page.content, "html.parser")
    entries = soup.find_all(class_="index")[0].contents
    final_list = []
    # Only every second child is inspected (presumably the ones in between
    # are separators -- TODO confirm against the page markup).
    for entry in entries[::2]:
        text = entry.text
        if "Fort Worth, TX" not in text:
            continue
        final_list.append({
            # Strip the descriptive suffix so only the neighborhood name remains.
            'Name': text.replace(" neighborhood in Fort Worth, TX", ""),
            # The link is site-relative, so prefix it with the site address.
            'URL': "https://www.city-data.com" + entry.a['href'],
        })
    return final_list

In [358]:
# Accumulate the neighborhood records from index pages 1 through 33.
registry = []

for page_number in range(1, 34):
    index_url = "http://www.city-data.com/indexes/neighborhoods/TX/{}/".format(page_number)
    registry.extend(get_neighborhood_list(index_url))
    # Pause between consecutive requests.
    time.sleep(2)

In [359]:
# Tabulate the records, key the rows by neighborhood name, and save to CSV.
df_nameURL = pd.DataFrame(registry).set_index('Name')
df_nameURL.to_csv('names_urls.csv')

# Scraping Neighborhood Data.

In [360]:
# Reload the saved name/URL table, using the 'Name' column as the index.
df_read = pd.read_csv('names_urls.csv', index_col = 'Name')
# Preview the first rows (shown as the notebook cell's output).
df_read.head()

Unnamed: 0_level_0,URL
Name,Unnamed: 1_level_1
Alamo Heights,https://www.city-data.com/neighborhood/Alamo-H...
Arlington Heights,https://www.city-data.com/neighborhood/Arlingt...
Arlington Heights Sector,https://www.city-data.com/neighborhood/Arlingt...
Basswood Park,https://www.city-data.com/neighborhood/Basswoo...
Belmont Terrace,https://www.city-data.com/neighborhood/Belmont...


In [361]:
def scrape_neighborhood(url, *, timeout=30):
    """Download one neighborhood page and extract its statistics.

    Args:
        url: Address of the neighborhood page to scrape.
        timeout: Seconds to wait for the server before giving up, so a
            stalled connection cannot hang the scrape indefinitely.

    Returns:
        A new list holding the record dictionaries produced by
        ``gather_data`` for this page.

    Raises:
        requests.RequestException: If the request times out or the server
            answers with an error status.
    """
    page = requests.get(url, timeout=timeout)
    # Fail loudly on 4xx/5xx responses instead of parsing an error page.
    page.raise_for_status()
    soup = BeautifulSoup(page.content, "html.parser")
    # Copy into a fresh list so callers can modify the result freely.
    return list(gather_data(soup))

In [362]:
def gather_data(soup):
    """Assemble every scraped statistic for one neighborhood page.

    Args:
        soup: Parsed page; it is queried for elements with class ``row`` and
            passed to ``get_name``.

    Returns:
        A one-element list containing a dictionary that maps each column
        label to the value extracted for this neighborhood.
    """
    name = get_name(soup)
    rows = soup.find_all(class_="row")

    # Each helper reads one section of the page and returns its raw strings.
    area, population, density = get_pop(rows, name)
    income, rent = get_income(rows, name)
    men, women, men_median_age, women_median_age = get_age(rows, name)
    (household_size, family_share, married_share,
     married_children_share, single_mother_share) = get_households(rows, name)
    never_married_men, never_married_women = get_marriages(rows, name)
    poor_english = get_poorEnglish(rows, name)
    born_texas, born_us, born_citizen_abroad, born_foreign = get_locality(rows, name)

    record = {
        "Neighborhood Name": name,
        "Area": area,
        "Population": population,
        "Population Density": density,
        "Median Income": income,
        "Median Rent": rent,
        "Male Population": men,
        "Female Population": women,
        "Median Age Among Men": men_median_age,
        "Median Age Among Women": women_median_age,
        "Average Household Size": household_size,
        "Percentage of Family Households": family_share,
        "Percentage of Married Families": married_share,
        "Percentage of Married Families with Children": married_children_share,
        "Percentage of Single Mother Families": single_mother_share,
        "Percentage of Never-Married Men": never_married_men,
        "Percentage of Never-Married Women": never_married_women,
        "Percentage of People Who Speak English Poorly or Not at All": poor_english,
        "Residents Born in Texas": born_texas,
        "Residents Born in the U.S.": born_us,
        "Residents Born as U.S. Citizens Outside the U.S.": born_citizen_abroad,
        "Residents Born Foreign": born_foreign,
    }
    # Callers concatenate lists of records, so wrap the single record.
    return [record]

In [363]:
def get_pop(row, neighName):
    """Extract area, population and population density.

    Args:
        row: Sequence of page elements; only the first is read.
        neighName: Neighborhood name, used as part of the density label.

    Returns:
        A tuple ``(area, population, density)`` of extracted strings.
    """
    text = row[0].find(class_="content-item").text
    # The density figure is introduced by the neighborhood's own name.
    density_label = "Population density:" + neighName + ":"
    return (
        extract_stat(text, "Area: ", "Population: "),
        extract_stat(text, "Population: ", "Population density:"),
        extract_stat(text, density_label, "Fort Worth:"),
    )

In [364]:
def get_income(rows, neighName):
    """Extract median income and median rent.

    Args:
        rows: Sequence of page elements; only the first is read.
        neighName: Neighborhood name that labels the local figures.

    Returns:
        A tuple ``(income, rent)`` of extracted strings.
    """
    items = rows[0].find_all(class_="content-item")
    label = neighName + ":"
    # Content item 1 holds the income text and item 2 the rent text.
    income, rent = (
        extract_stat(items[position].text, label, "Fort Worth:")
        for position in (1, 2)
    )
    return income, rent

In [365]:
def get_age(rows, neighName):
    """Extract male/female population counts and median ages.

    Args:
        rows: Sequence of page elements; only the first is read.
        neighName: Unused here; kept so all getters share one signature.

    Returns:
        A tuple ``(males, females, male_median_age, female_median_age)`` of
        extracted strings.
    """
    text = rows[0].find_all(class_="content-item")[4].text
    # (opening marker, closing marker) for each statistic, in return order.
    # NOTE(review): the last two closing markers embed the literal values
    # 31.8 and 33.0 -- presumably the city-wide medians at scrape time; verify.
    marker_pairs = (
        ("Males:", "Females:"),
        ("Females:", "Median age:"),
        ("Males:This neighborhood:", "Whole city:31.8"),
        ("Females:This neighborhood:", "Whole city:33.0"),
    )
    return tuple(extract_stat(text, start, end) for start, end in marker_pairs)

In [366]:
def get_households(rows, neighName):
    """Extract the household statistics.

    Args:
        rows: Sequence of page elements; only the first is read.
        neighName: Neighborhood name that labels most local figures.

    Returns:
        A tuple ``(household_size, family_households, married_families,
        married_with_children, single_mother)`` of extracted strings.
    """
    graphs = rows[0].find_all(class_="content-item")[5].find_all(class_="hgraph")
    own_label = neighName + ":"
    # Opening marker for each of the five graphs; the third one is labelled
    # "Here:" rather than with the neighborhood name.
    openers = (own_label, own_label, "Here:", own_label, own_label)
    return tuple(
        extract_stat(graphs[position].text, opener, "Fort Worth:")
        for position, opener in enumerate(openers)
    )

In [367]:
def get_marriages(rows, neighName):
    """Extract the never-married percentages for men and women.

    Args:
        rows: Sequence of page elements; only the first is read.
        neighName: Neighborhood name that labels the local figures.

    Returns:
        A tuple ``(unmarried_men, unmarried_women)`` of extracted strings.
    """
    section = rows[0].find_all(class_="content-item")[7]
    graphs = section.find_all(class_="hgraph")
    opener = neighName + ":"
    men = extract_stat(graphs[0].text, opener, "Fort Worth:")
    # The women's graph is closed by the shorter "city:" marker.
    women = extract_stat(graphs[1].text, opener, "city:")
    return men, women

In [368]:
def get_poorEnglish(rows, neighName):
    """Extract the poor-English statistic.

    Args:
        rows: Sequence of page elements; only the first is read.
        neighName: Neighborhood name that labels the local figure.

    Returns:
        The extracted string from the first graph of content item 8.
    """
    section = rows[0].find_all(class_="content-item")[8]
    first_graph = section.find_all(class_="hgraph")[0]
    return extract_stat(first_graph.text, neighName + ":", "Fort Worth:")

In [369]:
def get_locality(rows, neighName):
    """Extract the place-of-birth statistics.

    Args:
        rows: Sequence of page elements; only the first is read.
        neighName: Neighborhood name that labels one of the figures.

    Returns:
        A tuple ``(texas_born, us_born, born_citizen, foreign_born)`` of
        extracted strings.
    """
    graphs = rows[0].find_all(class_="content-item")[9].find_all(class_="hgraph")
    # (opening marker, closing marker) for each graph, in return order.
    markers = (
        ("Here:", "Fort Worth:"),
        (neighName + ":", "Fort Worth:"),
        ("Here:", "city:"),
        ("Here:", "Fort Worth:"),
    )
    return tuple(
        extract_stat(graphs[position].text, opener, closer)
        for position, (opener, closer) in enumerate(markers)
    )

In [370]:
def extract_stat(string, openStr, closeStr):
    """Return the text found between two marker substrings.

    Args:
        string: Text to search.
        openStr: Marker that immediately precedes the wanted value.
        closeStr: Marker that immediately follows the wanted value.

    Returns:
        The text between the first ``openStr`` and the first ``closeStr``
        that follows it, with trailing whitespace removed. If ``openStr`` is
        absent the result is ``""``; if no ``closeStr`` follows, the value
        runs to the end of ``string``.
    """
    openAt = string.find(openStr)
    if openAt == -1:
        # No opening marker: report "no value" rather than slicing from a
        # position derived from find()'s -1 sentinel.
        return ""
    statStart = openAt + len(openStr)
    # Look for the closing marker only after the opening one, so an earlier
    # occurrence of it cannot produce an empty or inverted slice.
    statEnd = string.find(closeStr, statStart)
    if statEnd == -1:
        # No closing marker: keep the remainder instead of dropping the
        # final character via a -1 slice bound.
        statEnd = len(string)
    return string[statStart:statEnd].rstrip()

In [371]:
def get_name(page):
    """Return the neighborhood name taken from the page title.

    Args:
        page: Parsed page; its first element with class ``city`` supplies
            the title text.

    Returns:
        The title text up to " neighborhood in Fort Worth, Texas", or the
        whole title text when that phrase does not occur.

    Raises:
        IndexError: If the page has no element with class ``city``.
    """
    titleText = page.find_all(class_="city")[0].text
    # partition() keeps the full title when the suffix is missing, whereas
    # slicing at find()'s -1 result would cut off the last character.
    name, _, _ = titleText.partition(" neighborhood in Fort Worth, Texas")
    return name

In [372]:
# Scrape every neighborhood page listed in df_read, skipping pages that fail.
master_list = []
# Iterate the URL column directly; there is no need to wrap the whole
# DataFrame in an Index and walk it with a manual counter.
for neigh_url in df_read['URL']:
    try:
        master_list.extend(scrape_neighborhood(neigh_url))
    except Exception as exc:
        # Best-effort scrape: a page that cannot be fetched or parsed is
        # reported and skipped. Catching Exception rather than using a bare
        # except lets KeyboardInterrupt stop a long run.
        print("Skipping {}: {!r}".format(neigh_url, exc))
    # Pause between consecutive requests, whether or not the page succeeded.
    time.sleep(2)

In [376]:
# Tabulate the scraped records keyed by neighborhood name, save them to CSV,
# and preview the first rows.
df_roughData = pd.DataFrame(master_list).set_index("Neighborhood Name")
df_roughData.to_csv("roughData.csv")
df_roughData.head()

Unnamed: 0_level_0,Area,Average Household Size,Female Population,Male Population,Median Age Among Men,Median Age Among Women,Median Income,Median Rent,Percentage of Family Households,Percentage of Married Families,...,Percentage of Never-Married Men,Percentage of Never-Married Women,Percentage of People Who Speak English Poorly or Not at All,Percentage of Single Mother Families,Population,Population Density,Residents Born Foreign,Residents Born as U.S. Citizens Outside the U.S.,Residents Born in Texas,Residents Born in the U.S.
Neighborhood Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Alamo Heights,0.503 square miles,1.6 people,232,242,33.3 years,31.1 years,"$80,729","$1,516",17.9%,18.9%,...,23.4%,23.0%,0.0%,35.1%,475,945 people per square mile,20.0%,2.2%,48.1%,29.7%
Arlington Heights,2.031 square miles,4.8 people,5589,4815,39.8 years,35.7 years,"$86,764",$887,38.6%,38.3%,...,16.8%,18.8%,0.8%,12.0%,10405,"5,124 people per square mile",4.6%,1.3%,65.3%,28.9%
Arlington Heights Sector,11.136 square miles,3.1 people,17089,15559,38.4 years,37.4 years,"$69,104",$849,35.2%,32.9%,...,17.9%,19.1%,2.7%,21.0%,32648,"2,932 people per square mile",9.6%,1.6%,64.5%,24.4%
Belmont Terrace,0.202 square miles,7.5 people,320,379,28.5 years,30.7 years,"$38,563",$578,60.1%,45.7%,...,24.7%,16.5%,28.7%,45.5%,699,"3,461 people per square mile",35.8%,1.6%,54.3%,8.3%
Benbrook,12.514 square miles,3.9 people,11443,11050,40.1 years,44.7 years,"$63,316",$841,48.9%,54.2%,...,13.3%,9.9%,2.3%,9.3%,22494,"1,797 people per square mile",4.7%,1.7%,63.9%,29.7%
