In [1]:
import os
from bs4 import BeautifulSoup
from selenium import webdriver
import time
import pandas as pd  
import numpy as np
import re
import urllib

import sqlite3




info about sql: http://www.sqlitetutorial.net/sqlite-select-distinct  
http://stackoverflow.com/questions/20963887/deleting-duplicate-rows-in-sqlite  
http://yznotes.com/write-pandas-dataframe-to-sqlite/  
https://www.dataquest.io/blog/python-pandas-databases/  


#### A few parameters to set

In [3]:
# Seasons to scrape: 2013 through 2017 inclusive (the stop value 2018 is excluded).
years_to_get = range(2013, 2018)

#  Overview  

  
This file will scrape data from two websites and put it in a SQLite database. It also does some cleaning of the team names so the data can be matched up.

# Scrape season efficiency numbers from Ken Pomeroy's Website  

http://kenpom.com/index.php

#### Define function to get table from page

In [4]:
def get_table_from_url(souper):
    """Convert the last ``<table>`` of a parsed HTML document into a DataFrame.

    Parameters
    ----------
    souper : object exposing ``find_all`` (e.g. a ``BeautifulSoup`` document)
        The parsed page to search for tables.

    Returns
    -------
    pandas.DataFrame
        One row per ``<tr>`` and one column per ``<td>`` of the last table on
        the page, holding the cell text. Rows without any ``<td>`` (for
        example header rows built from ``<th>``) become all-null rows.

    Raises
    ------
    ValueError
        If the document contains no ``<table>`` element.
    """
    tables = souper.find_all("table")
    if not tables:
        # Previously this surfaced as an UnboundLocalError on the return line.
        raise ValueError("no <table> elements found in the page")

    # Only the final table has ever been returned to callers, so earlier
    # tables are no longer parsed and thrown away.
    list_of_rows = [
        [col.get_text() for col in row.find_all("td")]
        for row in tables[-1].find_all("tr")
    ]
    return pd.DataFrame(list_of_rows)

#### Define function to get the data and do some cleaning

In [5]:
def grab_the_url(url):
    """Download ``url`` and return the last HTML table on the page as a DataFrame.

    The page is fetched with ``urllib.urlopen``, parsed by BeautifulSoup using
    the ``lxml`` parser, and handed to ``get_table_from_url``.

    Parameters
    ----------
    url : str
        Address of the page to download.

    Returns
    -------
    pandas.DataFrame
        Whatever ``get_table_from_url`` returns for the parsed page.
    """
    response = urllib.urlopen(url)
    try:
        html = response.read()
    finally:
        # Release the connection even when the read fails part-way through.
        response.close()

    soup = BeautifulSoup(html, "lxml")
    return get_table_from_url(soup)

#####  The next two cells are commented out in favor of an alternative method that pulls from the Wayback Machine rather than directly from Pomeroy's site. They are kept here for future reference.

def get_season_data(year):
    """Scrape and clean Ken Pomeroy's season ratings table for one season.

    Parameters
    ----------
    year : int
        Season to fetch. 2017 is read from the site's front page; any other
        year is requested with the ``?y=<year>`` query string.

    Returns
    -------
    pandas.DataFrame
        One row per team with the rating columns converted to numbers where
        possible, ``W-L`` split into ``Wins`` and ``Losses``, and a ``Year``
        column set to ``year``.
    """
    def _numeric_if_possible(column):
        # Same outcome as the deprecated pd.to_numeric(errors='ignore'):
        # columns that cannot be parsed are returned untouched.
        try:
            return pd.to_numeric(column)
        except (ValueError, TypeError):
            return column

    if year == 2017:
        url = "http://kenpom.com/index.php"
    else:
        url = "http://kenpom.com/index.php?y="+str(year)

    df = grab_the_url(url)

    df.columns = ["Rank","Team","Conf","W-L","AdjEM","AdjO","AdjO_Rank","AdjD","AdjD_Rank","AdjT","AdjT_Rank","Luck","Luck_Rank","SOSAdjEM","SOSAdjEM_Rank","OppO","OppO_Rank","OppD","OppD_Rank","NCSOSAdjEM","NCSOSAdjEM_Rank"]
    # Drop rows with fewer than 10 populated cells (the empty rows at the top).
    df = df.dropna(axis=0, thresh=10).copy()
    # Strip digits from the team name (presumably tournament seeds appended
    # to the name - TODO confirm), then the surrounding whitespace.
    df["Team"] = df["Team"].replace(to_replace=r"\d", value="", regex=True).str.strip()
    # `n` must be passed by keyword: it is keyword-only from pandas 2.0 on.
    record = df["W-L"].str.split("-", n=1, expand=True).rename(columns={0: "Wins", 1: "Losses"})
    df = df.join(record).drop("W-L", axis=1)
    df = df.apply(_numeric_if_possible)
    df["Year"] = year
    return df

conn = sqlite3.connect("basketball_data.db")

season_data = {}
for year in years_to_get:
    #season_data[year] = get_season_data(year)
    #get_season_data(year).to_sql("team_data",conn,if_exists="append")
    pass  # direct scraping is disabled; a loop body holding only comments is a SyntaxError

conn.close()

### Put 2013 to 2017 season level data in a table

In [1]:
def get_season_data_alternateway(year,url):
    """Scrape and clean a Ken Pomeroy ratings table from an explicit URL.

    Parameters
    ----------
    year : int
        Season label written to the ``Year`` column of the result.
    url : str
        Page to download, e.g. an archived snapshot of the ratings page.

    Returns
    -------
    pandas.DataFrame
        One row per team with the rating columns converted to numbers where
        possible, ``W-L`` split into ``Wins`` and ``Losses``, and a ``Year``
        column set to ``year``.
    """
    def _numeric_if_possible(column):
        # Same outcome as the deprecated pd.to_numeric(errors='ignore'):
        # columns that cannot be parsed are returned untouched.
        try:
            return pd.to_numeric(column)
        except (ValueError, TypeError):
            return column

    df = grab_the_url(url)

    df.columns = ["Rank","Team","Conf","W-L","AdjEM","AdjO","AdjO_Rank","AdjD","AdjD_Rank","AdjT","AdjT_Rank","Luck","Luck_Rank","SOSAdjEM","SOSAdjEM_Rank","OppO","OppO_Rank","OppD","OppD_Rank","NCSOSAdjEM","NCSOSAdjEM_Rank"]
    # Drop rows with fewer than 10 populated cells (the empty rows at the top).
    df = df.dropna(axis=0, thresh=10).copy()
    # Strip digits from the team name (presumably tournament seeds appended
    # to the name - TODO confirm), then the surrounding whitespace.
    df["Team"] = df["Team"].replace(to_replace=r"\d", value="", regex=True).str.strip()
    # `n` must be passed by keyword: it is keyword-only from pandas 2.0 on.
    record = df["W-L"].str.split("-", n=1, expand=True).rename(columns={0: "Wins", 1: "Losses"})
    df = df.join(record).drop("W-L", axis=1)
    df = df.apply(_numeric_if_possible)
    df["Year"] = year
    return df

#### Selection Sunday Dates

Use these dates as a reference for choosing which archived snapshot to pull from the Wayback Machine.

In [8]:
# Parallel lists: each season and the date of its Selection Sunday, newest first.
year = list(range(2017, 2010, -1))
ss = [
    "2017-03-12",
    "2016-03-13",
    "2015-03-15",
    "2014-03-16",
    "2013-03-17",
    "2012-03-11",
    "2011-03-13",
]
# Build the lookup table and parse the date strings in a single chained expression.
selection_sunday = pd.DataFrame({"Year": year, "SS": ss}).assign(
    SS=lambda frame: pd.to_datetime(frame["SS"], format='%Y-%m-%d')
)
selection_sunday

Unnamed: 0,SS,Year
0,2017-03-12,2017
1,2016-03-13,2016
2,2015-03-15,2015
3,2014-03-16,2014
4,2013-03-17,2013
5,2012-03-11,2012
6,2011-03-13,2011


#### Ken Pomeroy updates his season numbers through the championship game, so I need to use the Wayback Machine to get the numbers as they stood before the tournament. I use the last available archive after Selection Sunday and before the tournament starts.

In [26]:
# (season, page) pairs in the order they are written to the database.
# The first entry replaces any existing team_data table; the rest append to it.
season_sources = [
    (2017, "http://kenpom.com/index.php"),
    (2016, "https://web.archive.org/web/20160313185626/http://kenpom.com/index.php"),
    (2015, "https://web.archive.org/web/20150317134006/http://kenpom.com/index.php"),
    (2014, "https://web.archive.org/web/20140319032435/http://kenpom.com/index.php"),
    (2013, "https://web.archive.org/web/20130316035756/http://kenpom.com/index.php"),
    (2012, "https://web.archive.org/web/20120315142211/http://kenpom.com/index.php"),
]

conn = sqlite3.connect("basketball_data.db")
try:
    for position, (season, season_url) in enumerate(season_sources):
        write_mode = "replace" if position == 0 else "append"
        get_season_data_alternateway(season, season_url).to_sql("team_data", conn, if_exists=write_mode)
finally:
    # Close the connection even if a download or a write fails.
    conn.close()

# Scrape game level data from Sports-Reference.com  
http://www.sports-reference.com/cbb/schools/brigham-young/2017-gamelogs.html

#### Function to scrape and clean data for a team in a given year

In [10]:
def get_season_for_team(team,year):
    """Scrape and clean one team's game log for one season from Sports-Reference.

    Parameters
    ----------
    team : str
        School identifier as it appears in the Sports-Reference URL
        (e.g. ``"brigham-young"``).
    year : int
        Season whose ``<year>-gamelogs.html`` page is requested.

    Returns
    -------
    pandas.DataFrame
        One row per game. ``Win`` is 1 for "W" and 0 for "L", ``Location`` is
        "Home", "Neutral" or "Away", ``Date`` is parsed to datetimes, numeric
        columns are converted where possible, and ``Tm`` / ``Year`` record the
        arguments.
    """
    def _numeric_if_possible(column):
        # Same outcome as the deprecated pd.to_numeric(errors='ignore'):
        # columns that cannot be parsed are returned untouched.
        try:
            return pd.to_numeric(column)
        except (ValueError, TypeError):
            return column

    url = "http://www.sports-reference.com/cbb/schools/" + team + "/" + str(year) + "-gamelogs.html"
    df = grab_the_url(url)

    # add column names
    df.columns = ['Date','Location','Opp','Win','Tm_Pts','Opp_Pts','Tm_FG','Tm_FGA','Tm_FG_pct','Tm_3P','Tm_3PA','Tm_3P_pct','Tm_FT','Tm_FTA','Tm_FT_pct','Tm_ORB','Tm_TRB','Tm_AST','Tm_STL','Tm_BLK','Tm_TOV','Tm_PF','','Opp_FG','Opp_FGA','Opp_FG_pct','Opp_3P','Opp_3PA','Opp_3P_pct','Opp_FT','Opp_FTA','Opp_FT_pct','Opp_ORB','Opp_TRB','Opp_AST','Opp_STL','Opp_BLK','Opp_TOV','Opp_PF']
    # remove rows that are blank (fewer than 10 populated cells)
    df = df.dropna(axis=0, thresh=10).copy()

    # Remove an overtime marker such as "(1 OT)" before encoding the result;
    # values other than "W"/"L" are left as they are.
    win_codes = {"W": 1, "L": 0}
    outcome = df["Win"].replace(to_replace=r"\(\d OT\)", value="", regex=True).str.strip()
    df["Win"] = outcome.map(lambda result: win_codes.get(result, result))

    # An empty location cell is treated as a home game.
    location_labels = {"": "Home", "N": "Neutral", "@": "Away"}
    df["Location"] = df["Location"].map(lambda code: location_labels.get(code, code))

    # The unnamed column is the spacer between the team and opponent stats.
    df = df.drop('', axis=1)
    df = df.apply(_numeric_if_possible)
    df["Date"] = pd.to_datetime(df["Date"], format='%Y-%m-%d')
    df["Tm"] = team
    df["Year"] = year
    return df


In [11]:
#year = 2017
#columns_for_big_df = list(get_season_for_team("brigham-young",2017).columns)

#### Get list of all current schools

In [13]:
schools_page_url = "http://www.sports-reference.com/cbb/schools/"


def _numeric_if_possible(column):
    """Return ``column`` converted by ``pd.to_numeric``, or unchanged if that fails."""
    try:
        return pd.to_numeric(column)
    except (ValueError, TypeError):
        return column


schools_to_include = grab_the_url(schools_page_url)

# remove rows that are blank (fewer than 10 populated cells)
schools_to_include = schools_to_include.dropna(axis=0, thresh=10)
#df.drop('',axis=1,inplace=True)
# Replaces the deprecated pd.to_numeric(errors='ignore') with the same outcome.
schools_to_include = schools_to_include.apply(_numeric_if_possible)
# Copy the slice so renaming its columns does not touch the scraped frame.
schools_to_include = schools_to_include.iloc[:, 0:4].copy()
schools_to_include.columns = ['School','Location','From','To']
schools_to_include = schools_to_include[schools_to_include.To > 2006] #only keep teams that are still D1
schools_to_include = schools_to_include[['School']]
schools_to_include.head()

Unnamed: 0,School
1,Abilene Christian Wildcats
2,Air Force Falcons
3,Akron Zips
4,Alabama A&M Bulldogs
5,Alabama Crimson Tide


### Get url and name for all schools and then subset to only current D1

In [14]:
# Download the schools index page again, this time keeping the parsed document
# so the link targets can be read from it.
response = urllib.urlopen(schools_page_url)
try:
    html = response.read()
finally:
    # Release the connection even when the read fails.
    response.close()
names_soup = BeautifulSoup(html,"lxml")

In [15]:
# A school link ends in /cbb/schools/<url-name>/ ; the group captures the
# url-name without the trailing slash.
school_link_pattern = re.compile(r"/cbb/schools/([a-z-]+)/$")

school_urls = []
for link in names_soup.find_all('a', href=True):
    match = school_link_pattern.search(link['href'])
    if match is None:
        # Not a school page link. The explicit check replaces a bare `except:`
        # that would also have hidden unrelated errors.
        continue
    school_urls.append([match.group(1), link.get_text()])
schools_df = pd.DataFrame(school_urls,columns=["url_name","School"])

##### Subset to only schools that are currently division 1

In [17]:
# Right join: keep every school listed in schools_to_include and attach its
# url_name where the display names match.
schools = schools_df.merge(schools_to_include, on="School", how="right")

## Scrape yearly game data for every D1 team

In [21]:
# (rows, columns) of the merged school table - a quick size check before scraping.
schools.shape

(352, 2)

In [22]:
def get_info_for_chunk(schools,error_list):
    """Scrape every season in ``years_to_get`` for each school and store the games.

    Each school/season game log is appended to the ``game_data`` table through
    the module-level ``conn`` connection. A failure for one school/season is
    reported and recorded, and the loop carries on with the next one.

    Parameters
    ----------
    schools : pandas.DataFrame
        Must contain a ``url_name`` column with the Sports-Reference school
        identifiers.
    error_list : list
        Receives a ``[school, year]`` entry for every combination that failed.
        It is modified in place.

    Returns
    -------
    list
        The same ``error_list`` object that was passed in.
    """
    for school in schools['url_name']:
        for year in years_to_get:
            try:
                get_season_for_team(school,year).to_sql("game_data",conn,if_exists="append")
            except Exception:
                # str() keeps the report itself from raising when url_name is
                # missing (NaN) for a school that had no matching link.
                print("could not get "+str(school)+str(year))
                error_list.append([school,year])

        print("finished "+str(school))
    return error_list

#### Use this for getting data for multiple years (currently commented out so it doesn't overwrite the database)

year = [2016,2015,2014,2013,2012,2011] #2017 was not included since the regular season wasn't finished yet.
# NOTE(review): get_info_for_chunk loops over the global `years_to_get`, not
# over this `year` list - confirm which set of seasons is intended.
game_level_data = {}

error_list = []

conn = sqlite3.connect("basketball_data.db")
try:
    error_list = get_info_for_chunk(schools,error_list)
finally:
    # Close the connection even if the scrape is interrupted by an error.
    conn.close()

#### Use this for getting the 2017 data once the regular season has ended (currently commented out so it doesn't overwrite the database).

# Append the 2017 game logs for every school, recording the ones that fail.
conn = sqlite3.connect("basketball_data.db")
error_list = []
try:
    for school in schools['url_name']:
        try:
            get_season_for_team(school,2017).to_sql("game_data",conn,if_exists="append")
        except Exception:
            # str() keeps the report itself from raising when url_name is NaN.
            print("could not get "+str(school)+str(2017))
            error_list.append([school,2017])
finally:
    # Close the connection even if the loop is interrupted by an error.
    conn.close()

##### Use this to inspect any errors that occurred

In [27]:
#error_list = pd.DataFrame(error_list,columns=['School','Year'])
#error_list

# Create crosswalk for names  

The team names differ between the two data sources. Everything that follows matches up the names so that the data sets can be merged.

#### Make changes to names pulled from Sports Reference

In [28]:
# Exact-match renames that bring Sports-Reference url names in line with the
# CleanName values used for the Pomeroy data. No replacement value is itself a
# key, so applying them together matches applying them one after another.
sportsref_renames = {
    "CENTRAL CONNECTICUT STATE": "CENTRAL CONNECTICUT",
    "CENTENARY LA": "CENTENARY",
    "DETROIT MERCY": "DETROIT",
    "BOWLING GREEN STATE": "BOWLING GREEN",
    "NEVADA LAS VEGAS": "UNLV",
    "VIRGINIA COMMONWEALTH": "VCU",
    "TEXAS CHRISTIAN": "TCU",
    "SOUTHERN CALIFORNIA": "USC",
    "SOUTH CAROLINA UPSTATE": "USC UPSTATE",
    "TEXAS SAN ANTONIO": "UTSA",
    "TEXAS EL PASO": "UTEP",
    "SOUTHERN METHODIST": "SMU",
    "LOUISIANA STATE": "LSU",
    "SAINT MARYS CA": "SAINT MARYS",
    "MARYLAND BALTIMORE COUNTY": "UMBC",
    "LOYOLA IL": "LOYOLA CHICAGO",
    "PENNSYLVANIA": "PENN",
    "SAINT FRANCIS PA": "ST FRANCIS PA",
    "MISSOURI KANSAS CITY": "UMKC",
    "MASSACHUSETTS LOWELL": "UMASS LOWELL",
    "TEXAS ARLINGTON": "UT ARLINGTON",
}

# url_name -> hyphens to spaces -> upper case -> exact renames.
# `.ix` was removed in pandas 1.0, and an in-place replace on an attribute-
# accessed column is not guaranteed to write back, so the result is assigned.
schools["CleanName"] = (
    schools["url_name"]
    .replace(to_replace="-", value=" ", regex=True)
    .str.upper()
    .replace(sportsref_renames)
)
sportsref_names = schools.copy()
#sportsref_names
#sportsref_names


#### Import the data scraped from Pomeroy and clean the names

In [29]:
# Load the season-level table written by the Pomeroy scrape.
conn = sqlite3.connect("basketball_data.db")
try:
    team_data = pd.read_sql_query("select * from team_data;", conn)
finally:
    # Close the connection even if the query fails (e.g. the table is missing).
    conn.close()

In [30]:
# Exact-match renames applied while periods are still present in the names.
pomeroy_renames_before_period_strip = {
    "BYU": "BRIGHAM YOUNG",
    "ALBANY": "ALBANY NY",
    "TEXAS AM CORPUS CHRIS": "TEXAS AM CORPUS CHRISTI",
    "GRAMBLING STATE": "GRAMBLING",
    "FIU": "FLORIDA INTERNATIONAL",
    "LIU BROOKLYN": "LONG ISLAND UNIVERSITY",
    "LONG ISLAND": "LONG ISLAND UNIVERSITY",
    "STEPHEN F. AUSTIN": "STEPHEN F AUSTIN",
    "SOUTHERN MISS": "SOUTHERN MISSISSIPPI",
    "SIU EDWARDSVILLE": "SOUTHERN ILLINOIS EDWARDSVILLE",
    "VMI": "VIRGINIA MILITARY INSTITUTE",
    "UAB": "ALABAMA BIRMINGHAM",
    "UCF": "CENTRAL FLORIDA",
    "THE CITADEL": "CITADEL",
    "PRAIRIE VIEW AM": "PRAIRIE VIEW",
    "MOUNT STATE MARYS": "MOUNT ST MARYS",
    "ST JOHNS": "ST JOHNS NY",
    "WILLIAM  MARY": "WILLIAM MARY",  # two spaces are left where "&" was removed
    "LITTLE ROCK": "ARKANSAS LITTLE ROCK",
    "UT RIO GRANDE VALLEY": "TEXAS PAN AMERICAN",
}

# Exact-match renames applied after every period has been removed.
pomeroy_renames_after_period_strip = {
    "FORT WAYNE": "IUPU FORT WAYNE",
    "LOUISIANA STATE": "LSU",
    "MD BALTIMORE COUNTY": "UMBC",
    "MISSOURI KANSAS CITY": "UMKC",
    "NEVADA LAS VEGAS": "UNLV",
    "NJ INST OF TECHNOLOGY": "NJIT",
    "PENNSYLVANIA": "PENN",
    "SOUTH CAROLINA UPSTATE": "USC UPSTATE",
    "SOUTHERN CALIFORNIA": "USC",
    "SOUTHERN METHODIST": "SMU",
    "TEXAS ARLINGTON": "UT ARLINGTON",
    "TEXAS CHRISTIAN": "TCU",
    "TEXAS EL PASO": "UTEP",
    "TEXAS SAN ANTONIO": "UTSA",
    "VIRGINIA COMMONWEALTH": "VCU",
    "VIRGINIA MILITARY INST": "VIRGINIA MILITARY INSTITUTE",
    "ST LOUIS": "SAINT LOUIS",
    "ST MARYS": "SAINT MARYS",
    "ST PETERS": "SAINT PETERS",
    "WISCONSIN GREEN BAY": "GREEN BAY",
    "WISCONSIN MILWAUKEE": "MILWAUKEE",
}

# The steps below run in the same order as before; later steps depend on the
# output of earlier ones (e.g. " ST." becomes " STATE" before the exact renames).
# Results are assigned back instead of relying on `.ix` (removed in pandas 1.0)
# and on in-place replaces through attribute access.
pomeroy_clean = team_data["Team"].str.upper()
pomeroy_clean = pomeroy_clean.replace(to_replace=r"\&", value="", regex=True)
pomeroy_clean = pomeroy_clean.replace(to_replace="'", value="", regex=True)
pomeroy_clean = pomeroy_clean.replace({"ST. JOHNS": "ST JOHNS NY"})
pomeroy_clean = pomeroy_clean.replace(to_replace=r" ST\.", value=" STATE", regex=True)
# NOTE(review): the next three patterns are not anchored, so they match
# anywhere in a name, not only at its start.
pomeroy_clean = pomeroy_clean.replace(to_replace="UC ", value="CALIFORNIA ", regex=True)
pomeroy_clean = pomeroy_clean.replace(to_replace="UNC ", value="NORTH CAROLINA ", regex=True)
pomeroy_clean = pomeroy_clean.replace(to_replace="NC ", value="NORTH CAROLINA ", regex=True)
pomeroy_clean = pomeroy_clean.replace(pomeroy_renames_before_period_strip)
pomeroy_clean = pomeroy_clean.replace(to_replace=r"\.", value="", regex=True)
pomeroy_clean = pomeroy_clean.replace(pomeroy_renames_after_period_strip)
team_data["CleanName"] = pomeroy_clean

team_data = team_data.sort_values('CleanName')
pomeroy_names = team_data[['Team','CleanName']].drop_duplicates(['Team','CleanName'], keep='first').copy()

#pomeroy_names

#pomeroy_names

#### Merge the sports ref and pomeroy names to create a crosswalk for merging the datasets

In [32]:
# Outer join on CleanName so names present in only one source stay visible.
names_crosswalk = sportsref_names.merge(pomeroy_names, on='CleanName', how='outer')

### Clean up Opp names in the game data and then add it to the names crosswalk

In [35]:
# Load the game-level table written by the Sports-Reference scrape.
conn = sqlite3.connect("basketball_data.db")
try:
    game_data = pd.read_sql_query("select * from game_data;", conn)
finally:
    # Close the connection even if the query fails (e.g. the table is missing).
    conn.close()

In [36]:
# List the column names read back from the game_data table.
game_data.columns.values

array(['index', 'Date', 'Location', 'Opp', 'Win', 'Tm_Pts', 'Opp_Pts',
       'Tm_FG', 'Tm_FGA', 'Tm_FG_pct', 'Tm_3P', 'Tm_3PA', 'Tm_3P_pct',
       'Tm_FT', 'Tm_FTA', 'Tm_FT_pct', 'Tm_ORB', 'Tm_TRB', 'Tm_AST',
       'Tm_STL', 'Tm_BLK', 'Tm_TOV', 'Tm_PF', 'Opp_FG', 'Opp_FGA',
       'Opp_FG_pct', 'Opp_3P', 'Opp_3PA', 'Opp_3P_pct', 'Opp_FT',
       'Opp_FTA', 'Opp_FT_pct', 'Opp_ORB', 'Opp_TRB', 'Opp_AST', 'Opp_STL',
       'Opp_BLK', 'Opp_TOV', 'Opp_PF', 'Tm', 'Year'], dtype=object)

In [37]:
# Exact-match renames for opponent names, applied after the regex clean-up and
# before periods are removed. No replacement value is itself a key, so applying
# them together matches applying them one after another.
opp_renames = {
    "CENTRAL CONNECTICUT STATE": "CENTRAL CONNECTICUT",
    "CENTENARY LA": "CENTENARY",
    #"ALBANY NY": "ALBANY",
    "UCF": "CENTRAL FLORIDA",
    "UNC": "NORTH CAROLINA",
    "UCONN": "CONNECTICUT",
    "DETROIT MERCY": "DETROIT",
    "BOWLING GREEN STATE": "BOWLING GREEN",
    "NEVADA LAS VEGAS": "UNLV",
    "VIRGINIA COMMONWEALTH": "VCU",
    "TEXAS CHRISTIAN": "TCU",
    "SOUTHERN CALIFORNIA": "USC",
    "SOUTH CAROLINA UPSTATE": "USC UPSTATE",
    "TEXAS SAN ANTONIO": "UTSA",
    "TEXAS EL PASO": "UTEP",
    "SOUTHERN METHODIST": "SMU",
    "LOUISIANA STATE": "LSU",
    "SAINT MARYS CA": "SAINT MARYS",
    "MARYLAND BALTIMORE COUNTY": "UMBC",
    "LOYOLA IL": "LOYOLA CHICAGO",
    "PENNSYLVANIA": "PENN",
    "SAINT FRANCIS PA": "ST FRANCIS PA",
    "MISSOURI KANSAS CITY": "UMKC",
    "UMASS": "MASSACHUSETTS",
    "MASSACHUSETTS LOWELL": "UMASS LOWELL",
    "TEXAS ARLINGTON": "UT ARLINGTON",
    # The two parenthesised keys cannot match, because parentheses have been
    # stripped by the time the renames run; kept to mirror the original rules.
    "LOYOLA (IL)": "LOYOLA CHICAGO",
    "LOYOLA (MD)": "LOYOLA MD",
    "BYU": "BRIGHAM YOUNG",
    "UNIVERSITY OF CALIFORNIA": "CALIFORNIA",
    "UCSB": "CALIFORNIA SANTA BARBARA",
    "TEXAS RIO GRANDE VALLEY": "TEXAS PAN AMERICAN",
    "ETSU": "EAST TENNESSEE STATE",
    "UIC": "ILLINOIS CHICAGO",
    "PITT": "PITTSBURGH",
    "OLE MISS": "MISSISSIPPI",
    "LIU BROOKLYN": "LONG ISLAND UNIVERSITY",
    "MOUNT STATE MARYS": "MOUNT ST MARYS",
    "SOUTHERN MISS": "SOUTHERN MISSISSIPPI",
    "UT MARTIN": "TENNESSEE MARTIN",
    "ST. JOSEPHS": "SAINT JOSEPHS",
    "ST. PETERS": "SAINT PETERS",
    "WILLIAM  MARY": "WILLIAM MARY",  # two spaces are left where "&" was removed
}

# Copy the de-duplicated slice so adding a column does not write into a view
# of game_data.
testing_opp_names = game_data[['Opp']].drop_duplicates(keep='first').copy()

# The steps below run in the same order as before. Results are assigned back
# instead of relying on `.ix` (removed in pandas 1.0) and on in-place replaces
# through attribute access.
opp_clean = testing_opp_names["Opp"].replace(to_replace="-", value=" ", regex=True)
opp_clean = opp_clean.str.upper()
opp_clean = opp_clean.replace(to_replace=r"\&", value="", regex=True)
opp_clean = opp_clean.replace(to_replace="'", value="", regex=True)
opp_clean = opp_clean.replace(to_replace=r"[\(\)]", value="", regex=True)
opp_clean = opp_clean.replace({"ST. JOHNS": "ST JOHNS NY"})
opp_clean = opp_clean.replace(to_replace=r" ST\.", value=" STATE", regex=True)
# NOTE(review): this pattern is not anchored, so it matches anywhere in a name.
opp_clean = opp_clean.replace(to_replace="UC ", value="CALIFORNIA ", regex=True)
opp_clean = opp_clean.replace(opp_renames)
opp_clean = opp_clean.replace(to_replace=r"\.", value="", regex=True)
testing_opp_names["CleanName"] = opp_clean


full_xwalk = pd.merge(testing_opp_names,names_crosswalk,on='CleanName',how='outer') #there are schools that show up in Opp
# but not url_name or Team. There is no game or team level data available for these teams.
full_xwalk = full_xwalk[['CleanName','url_name','Opp','Team']].copy()

In [40]:
# Store the crosswalk, replacing any earlier version of the table.
conn = sqlite3.connect("basketball_data.db")
try:
    full_xwalk.to_sql("name_xwalk",conn,if_exists="replace")
finally:
    # Close the connection even if the write fails.
    conn.close()