In [12]:
import re
import datetime
from bs4 import BeautifulSoup
import urllib.request
import pandas as pd
import numpy as np
from time import sleep
import os
import calendar

In [128]:
# Read both seasons' game tables from the data directory; the "date" column is
# parsed into datetimes at load time because later cells call .date() on it.
season_2015, season_2016 = (
    pd.read_csv(os.path.join("data", csv_name), parse_dates=["date"])
    for csv_name in ("season-2015-16.csv", "game_raw.csv")
)

In [145]:
# Collect every distinct name found in the "home" or "guest" column of either
# season. The set removes repeats, so the resulting list has no defined order.
all_teams = list(set(np.hstack([
    games[side].unique()
    for games in (season_2015, season_2016)
    for side in ("home", "guest")
])))

In [152]:
# Lower-case the team names: they are later spliced into the scrape URL and
# used as lookup keys via data[team].lower().
# The names were de-duplicated *before* lower-casing, so two spellings that
# differ only by case would otherwise both survive and be fetched twice.
# Collapsing through a set removes those repeats, and sorting gives the same
# order on every run.
all_teams = sorted({s.lower() for s in all_teams})

In [116]:
# Raw strings keep the regex escapes (\w, \s, \d) out of Python's own
# string-escape processing, which warns about unrecognised escape sequences.
# The compiled patterns are unchanged.

# Three word characters, whitespace, a 1-2 digit day, a comma, whitespace and
# a four-digit year, e.g. "Oct 27, 2015".
pattern_date = re.compile(r"\w{3}\s\d{1,2},\s\d{4}")

# Group 1: a run of three or four word characters; group 2: the number that
# follows it (presumably texts like "Won 3" / "Lost 2" - confirm against the page).
pattern_streak = re.compile(r"(\w{3,4})\s(\d+)")

In [182]:
# Season start year -> scraped streak data. Each season starts out as the
# placeholder 0 and is overwritten with a per-team dict by the scraping loop.
all_streaks = dict.fromkeys((2015, 2016), 0)

In [197]:
# Scrape each team's results page for both seasons and fill
# all_streaks[year][team] with {game date: signed streak}: the number is
# positive when the streak text contains "won" and negative otherwise.
for year in [2015, 2016]:
    # The season prefix is identical for every team, so build it once per year.
    year_str = str(year) + "_" + str(year+1) + "_"
    team_dict = {}
    for team_ in all_teams:
        url_ = "http://www.landofbasketball.com/results_by_team/" + year_str + team_ + ".htm"
        # Read the body inside a context manager so the HTTP response is
        # closed right away instead of being left open for every team.
        with urllib.request.urlopen(url_) as response:
            r = response.read()
        soup = BeautifulSoup(r, "lxml")

        # Look the streak cells up once and reuse them below.
        streak_cells = soup.find_all("td", class_="a-center", text=pattern_streak)
        if not streak_cells:
            # No streak cells on this page: the team gets no entry this season.
            continue

        all_date = [content.text.strip() for content in soup.find_all("td", text=pattern_date)]
        conv_date = [datetime.datetime.strptime(date_, "%b %d, %Y").date() for date_ in all_date]

        streak = []
        for content in streak_cells:
            match = pattern_streak.search(content.text)
            length = int(match.group(2))
            # Compare case-insensitively: a capitalised "Won 3" must count as
            # a winning streak just like "won 3".
            streak.append(length if "won" in match.group(0).lower() else -length)

        # NOTE(review): dates and streaks are paired purely by position, and
        # zip() silently drops the surplus if the two lists differ in length -
        # confirm every dated cell on the page has a matching streak cell.
        team_dict[team_] = dict(zip(conv_date, streak))
    all_streaks[year] = team_dict

In [249]:
def streak_map(data, year, team):
    """Return the streak a team carries into the game described by ``data``.

    The lookup uses the module-level ``all_streaks`` mapping
    (``all_streaks[year][team_name][game_date] -> signed streak``) and picks
    the entry for the team's most recent game strictly before the game date.
    Looking only at the calendar day before the game would return 0 whenever
    the team had a rest day, even though its streak is unchanged; for games
    played on consecutive days the result is the same either way.

    Parameters
    ----------
    data : mapping-like row (e.g. a row passed by ``DataFrame.apply(axis=1)``)
        ``data[team]`` must be the team name as a string and ``data["date"]``
        an object exposing ``.date()``.
    year : hashable
        Key of the season inside ``all_streaks``.
    team : str
        Name of the field in ``data`` that holds the team (the notebook uses
        ``"home"`` and ``"guest"``).

    Returns
    -------
    int
        The recorded streak after the team's latest earlier game, or 0 when
        the season, the team, or any earlier game is missing.

    Raises
    ------
    KeyError
        If ``data`` lacks the ``team`` or ``"date"`` field. This is no longer
        swallowed, so a wrong column name is not silently turned into 0.
    """
    # A season may be absent or still hold a non-dict placeholder such as 0.
    season_streaks = all_streaks.get(year) or {}
    team_streaks = season_streaks.get(data[team].lower())
    if not team_streaks:
        return 0

    game_day = data["date"].date()
    # Only games strictly before this one count; the entry for the game day
    # itself would already include this game's result.
    earlier_days = [day for day in team_streaks if day < game_day]
    if not earlier_days:
        return 0
    return team_streaks[max(earlier_days)]

In [252]:
# Add the streak lookup for the home and the guest team to each season's table,
# one row at a time; streak_map receives the row plus (season year, side).
for frame, season_year in ((season_2015, 2015), (season_2016, 2016)):
    for side, column in (("home", "streak_home"), ("guest", "streak_guest")):
        frame[column] = frame.apply(streak_map, axis=1, args=(season_year, side))

In [257]:
# Write both season tables, now carrying the streak columns, back to disk as CSV.
for frame, out_path in (
    (season_2015, "data/input_2015.csv"),
    (season_2016, "data/input_2016.csv"),
):
    frame.to_csv(out_path)