In [80]:
from bs4 import BeautifulSoup as bs
from splinter import Browser
import time
import pandas as pd
import requests

In [81]:
# WINDOWS/PC - Import splinter and set the cromedriver path
executable_path = {'executable_path': 'chromedriver.exe'}
browser = Browser('chrome', **executable_path, headless=False)

# MAC - Import splinter and set the cromedriver path
#executable_path = {'executable_path': '/usr/local/bin/chromedriver'}
#browser = Browser('chrome', **executable_path, headless=False)

# Scraping Subreddits for Subscriber Count

In [82]:
subreddits = ["AZCardinals", "falcons", "ravens", "buffalobills", "panthers","CHIBears", "bengals", "Browns",
              "cowboys", "DenverBroncos", "detroitlions", "GreenBayPackers", "Texans", "Colts", "Jaguars",
              "KansasCityChiefs", "miamidolphins", "minnesotavikings", "Patriots", "Saints", "NYGiants", "nyjets",
              "oaklandraiders", "eagles", "steelers", "LosAngelesRams", "Chargers", "49ers", "Seahawks", 
              "buccaneers", "Tennesseetitans", "Redskins"]

subcounts = []

In [83]:
for sub in subreddits:
    time.sleep(1)
    url = f"https://www.reddit.com/r/{sub}"
    browser.visit(url)
    time.sleep(1)
    html = browser.html
    soup = bs(html, "html.parser")
    subscribers = int(float(soup.find("p", class_="s1bd5ppi-10").text.split("k")[0])*1000)
    dictionary = {"Subreddit URL (https://www.reddit.com/r/)":sub, "Subs":subscribers}
    subcounts.append(dictionary)

In [85]:
df = pd.DataFrame(subcounts)
teams = pd.read_csv("Teams.csv")

In [86]:
combined = pd.merge(df, teams, on="Subreddit URL (https://www.reddit.com/r/)")
combined = combined.drop(["Subreddit URL (https://www.reddit.com/r/)"], axis=1)
combined.head(32)

Unnamed: 0,Subs,Team,Stadium City,Real City
0,31700,Arizona Cardinals,"Glendale, Arizona","Phoenix, Arizona"
1,47400,Atlanta Falcons,"Atlanta, Georgia","Atlanta, Georgia"
2,36000,Baltimore Ravens,"Baltimore, Maryland","Baltimore, Maryland"
3,36700,Buffalo Bills,"Orchard Park, New York","Buffalo, New York"
4,36900,Carolina Panthers,"Charlotte, North Carolina","Charlotte, North Carolina"
5,62500,Chicago Bears,"Chicago, Illinois","Chicago, Illinois"
6,37800,Cincinnati Bengals,"Cincinnati, Ohio","Cincinnati, Ohio"
7,53300,Cleveland Browns,"Cleveland, Ohio","Cleveland, Ohio"
8,63500,Dallas Cowboys,"Arlington, Texas","Dallas, Texas"
9,51700,Denver Broncos,"Denver, Colorado","Denver, Colorado"


In [87]:
combined.to_csv("TeamsSubs.csv", index=False, header=True)

# Scraping Team Values

In [88]:
values_url = "https://www.reddit.com/r/nfl/comments/9hflml/forbes_nfl_2018_team_valuations_most_valuable/"

In [89]:
tables = pd.read_html(values_url)
tables

[                    Team Value(Billions) Revenue(Millions)  \
 0         Dallas Cowboys              $5              $864   
 1   New England Patriots            $3.8              $593   
 2        New York Giants            $3.3              $493   
 3       Los Angeles Rams            $3.2              $366   
 4    Washington Redskins            $3.1              $491   
 5    San Francisco 49ers           $3.05              $470   
 6          Chicago Bears            $2.9              $431   
 7          New York Jets           $2.85              $443   
 8         Houston Texans            $2.8              $464   
 9    Philadelphia Eagles           $2.75              $458   
 10        Denver Broncos           $2.65              $427   
 11     Green Bay Packers          $2.625              $434   
 12       Atlanta Falcons            $2.6              $451   
 13      Baltimore Ravens           $2.59              $417   
 14   Pittsburgh Steelers          $2.585              

In [90]:
values_df = tables[0]
values_df.columns = ["Team", "Value ($B)", "Revenue ($M)", "Operating Income ($M)"]

In [91]:
values_combined = pd.merge(combined, values_df, on="Team")
values_combined.head(32)

Unnamed: 0,Subs,Team,Stadium City,Real City,Value ($B),Revenue ($M),Operating Income ($M)
0,31700,Arizona Cardinals,"Glendale, Arizona","Phoenix, Arizona",$2.15,$380,$74
1,47400,Atlanta Falcons,"Atlanta, Georgia","Atlanta, Georgia",$2.6,$451,$113
2,36000,Baltimore Ravens,"Baltimore, Maryland","Baltimore, Maryland",$2.59,$417,$107
3,36700,Buffalo Bills,"Orchard Park, New York","Buffalo, New York",$1.6,$364,$67
4,36900,Carolina Panthers,"Charlotte, North Carolina","Charlotte, North Carolina",$2.3,$396,$62
5,62500,Chicago Bears,"Chicago, Illinois","Chicago, Illinois",$2.9,$431,$100
6,37800,Cincinnati Bengals,"Cincinnati, Ohio","Cincinnati, Ohio",$1.8,$359,$60
7,53300,Cleveland Browns,"Cleveland, Ohio","Cleveland, Ohio",$1.95,$375,$31
8,63500,Dallas Cowboys,"Arlington, Texas","Dallas, Texas",$5,$864,$365
9,51700,Denver Broncos,"Denver, Colorado","Denver, Colorado",$2.65,$427,$106


In [92]:
values_combined.to_csv("TeamsSubsValues.csv", index=False, header=True)

# Scraping City Populations

In [93]:
cities = values_combined["Real City"].tolist()
cities

['Phoenix, Arizona',
 'Atlanta, Georgia',
 'Baltimore, Maryland',
 'Buffalo, New York',
 'Charlotte, North Carolina',
 'Chicago, Illinois',
 'Cincinnati, Ohio',
 'Cleveland, Ohio',
 'Dallas, Texas',
 'Denver, Colorado',
 'Detroit, Michigan',
 'Green Bay, Wisconsin',
 'Houston, Texas',
 'Indianapolis, Indiana',
 'Jacksonville, Florida',
 'Kansas City, Missouri',
 'Miami, Florida',
 'Minneapolis, Minnesota',
 'Boston, Massachusetts',
 'New Orleans, Louisiana',
 'New York City, New York',
 'New York City, New York',
 'Oakland, California',
 'Philadelphia, Pennsylvania',
 'Pittsburgh, Pennsylvania',
 'Los Angeles, California',
 'Los Angeles, California',
 'San Francisco, California',
 'Seattle, Washington',
 'Tampa, Florida',
 'Nashville, Tennessee',
 'Washington, DC']

In [94]:
cities = [item.replace(", ", "-") for item in cities]
cities = [item.replace("New York City", "New York") for item in cities]
cities = [item.replace(" ", "-") for item in cities]
cities = [item.replace("Nashville-Tennessee", "Nashville-Davidson-Tennessee") for item in cities]
cities = [item.replace("Washington-DC", "Washington-District-of-Columbia") for item in cities]
cities_urls = [item + ".html" for item in cities]
cities_urls

['Phoenix-Arizona.html',
 'Atlanta-Georgia.html',
 'Baltimore-Maryland.html',
 'Buffalo-New-York.html',
 'Charlotte-North-Carolina.html',
 'Chicago-Illinois.html',
 'Cincinnati-Ohio.html',
 'Cleveland-Ohio.html',
 'Dallas-Texas.html',
 'Denver-Colorado.html',
 'Detroit-Michigan.html',
 'Green-Bay-Wisconsin.html',
 'Houston-Texas.html',
 'Indianapolis-Indiana.html',
 'Jacksonville-Florida.html',
 'Kansas-City-Missouri.html',
 'Miami-Florida.html',
 'Minneapolis-Minnesota.html',
 'Boston-Massachusetts.html',
 'New-Orleans-Louisiana.html',
 'New-York-New-York.html',
 'New-York-New-York.html',
 'Oakland-California.html',
 'Philadelphia-Pennsylvania.html',
 'Pittsburgh-Pennsylvania.html',
 'Los-Angeles-California.html',
 'Los-Angeles-California.html',
 'San-Francisco-California.html',
 'Seattle-Washington.html',
 'Tampa-Florida.html',
 'Nashville-Davidson-Tennessee.html',
 'Washington-District-of-Columbia.html']

In [95]:
values_combined['City URL'] = pd.Series(cities_urls)
values_combined.head(32)

Unnamed: 0,Subs,Team,Stadium City,Real City,Value ($B),Revenue ($M),Operating Income ($M),City URL
0,31700,Arizona Cardinals,"Glendale, Arizona","Phoenix, Arizona",$2.15,$380,$74,Phoenix-Arizona.html
1,47400,Atlanta Falcons,"Atlanta, Georgia","Atlanta, Georgia",$2.6,$451,$113,Atlanta-Georgia.html
2,36000,Baltimore Ravens,"Baltimore, Maryland","Baltimore, Maryland",$2.59,$417,$107,Baltimore-Maryland.html
3,36700,Buffalo Bills,"Orchard Park, New York","Buffalo, New York",$1.6,$364,$67,Buffalo-New-York.html
4,36900,Carolina Panthers,"Charlotte, North Carolina","Charlotte, North Carolina",$2.3,$396,$62,Charlotte-North-Carolina.html
5,62500,Chicago Bears,"Chicago, Illinois","Chicago, Illinois",$2.9,$431,$100,Chicago-Illinois.html
6,37800,Cincinnati Bengals,"Cincinnati, Ohio","Cincinnati, Ohio",$1.8,$359,$60,Cincinnati-Ohio.html
7,53300,Cleveland Browns,"Cleveland, Ohio","Cleveland, Ohio",$1.95,$375,$31,Cleveland-Ohio.html
8,63500,Dallas Cowboys,"Arlington, Texas","Dallas, Texas",$5,$864,$365,Dallas-Texas.html
9,51700,Denver Broncos,"Denver, Colorado","Denver, Colorado",$2.65,$427,$106,Denver-Colorado.html


In [96]:
populations = []

for city in cities_urls:
    pop_url = f"http://www.city-data.com/city/{city}"
    browser.visit(pop_url)
    time.sleep(1)
    pop_html = browser.html
    pop_soup = bs(pop_html, "html.parser")
    population = int(float(pop_soup.find("section", class_="city-population").text.split(":")[1].strip().replace(",","")))
    pop_dictionary = {"City URL":city, "Population (2016)":population}
    populations.append(pop_dictionary)

In [98]:
pop_df = pd.DataFrame(populations)
final_df = pd.merge(values_combined, pop_df, on="City URL")
values_pop_df = final_df.drop(["City URL"], axis=1)
values_pop_df.head(32)

Unnamed: 0,Subs,Team,Stadium City,Real City,Value ($B),Revenue ($M),Operating Income ($M),Population (2016)
0,31700,Arizona Cardinals,"Glendale, Arizona","Phoenix, Arizona",$2.15,$380,$74,1615041
1,47400,Atlanta Falcons,"Atlanta, Georgia","Atlanta, Georgia",$2.6,$451,$113,472506
2,36000,Baltimore Ravens,"Baltimore, Maryland","Baltimore, Maryland",$2.59,$417,$107,614664
3,36700,Buffalo Bills,"Orchard Park, New York","Buffalo, New York",$1.6,$364,$67,256908
4,36900,Carolina Panthers,"Charlotte, North Carolina","Charlotte, North Carolina",$2.3,$396,$62,842029
5,62500,Chicago Bears,"Chicago, Illinois","Chicago, Illinois",$2.9,$431,$100,2704965
6,37800,Cincinnati Bengals,"Cincinnati, Ohio","Cincinnati, Ohio",$1.8,$359,$60,298802
7,53300,Cleveland Browns,"Cleveland, Ohio","Cleveland, Ohio",$1.95,$375,$31,385810
8,63500,Dallas Cowboys,"Arlington, Texas","Dallas, Texas",$5,$864,$365,1317942
9,51700,Denver Broncos,"Denver, Colorado","Denver, Colorado",$2.65,$427,$106,693060


In [99]:
values_pop_df.to_csv("TeamsSubsValuesPops.csv", index=False, header=True)

# Scraping Team Ages

In [100]:
ages_url = "https://en.wikipedia.org/wiki/National_Football_League"

In [101]:
ages_tables = pd.read_html(ages_url)
ages_tables

[                                                 0  \
 0   Upcoming season or competition: 2019 NFL draft   
 1                                              NaN   
 2                                         Formerly   
 3                                            Sport   
 4                                          Founded   
 5                                 Inaugural season   
 6                                     Commissioner   
 7                                     No. of teams   
 8                                          Country   
 9                                     Headquarters   
 10                          Most recentchampion(s)   
 11                                     Most titles   
 12                                   TV partner(s)   
 13                                Official website   
 
                                                     1  
 0                                                 NaN  
 1                                                 NaN  
 2

In [113]:
ages_df = ages_tables[2]
ages_df = ages_df[["Club[57]", "First season[59]"]]
ages_df.columns = ["Team", "Founded"]
years = ages_df["Founded"].tolist()
years = [item[:4] for item in years]
del years[16]
years = years[:-1]
years = [int(item) for item in years]

KeyError: "['Club[57]' 'First season[59]'] not in index"

In [103]:
teams = ages_df["Team"].tolist()
del teams[16]
del teams[32]

KeyError: 'Team'

In [None]:
cleanteams = []

for foo in teams:
    try:
        team = foo.split("*")[0]
        cleanteams.append(team)
    except:
        cleanteams.append(foo)

In [None]:
import datetime
currentyear = datetime.date.today().year

teamages = []

for x in years:
    teamage = currentyear - x
    teamages.append(teamage)

In [None]:
ages_df = pd.DataFrame({
    "Team": cleanteams,
    "Team Age": teamages
})

In [None]:
ages_df.head(32)

In [None]:
final_df = pd.merge(values_pop_df, ages_df, on="Team")
final_df.head(32)