In [1]:
import pandas as pd
import numpy as np
from nameparser import HumanName
import re


In [21]:
# Input file is the roster download from the college website,
# compiled across all of the available roster years into one csv.
df = pd.read_csv('CornellW.csv')

In [22]:
# Examine the column labels: the export uses generic names (Field4, Field7_Text, ...)
# and a few different layouts are used by most colleges
df.columns

Index(['Field4', 'Field7_Text', 'Field7_Link', 'Field1', 'Field8', 'Field9',
       'Field11', 'Field2_Text', 'Field3_Text'],
      dtype='object')

In [23]:
# Replace the generic export labels with descriptive column names.
# Field4 is not renamed here.
column_labels = {
    'Field7_Text': 'Name1',
    'Field7_Link': 'RosterLink',
    'Field1': 'ClassYear',
    'Field8': 'Hometown',
    'Field9': 'HS',
    'Field11': 'RosterYear',
    'Field2_Text': 'Name2',
    'Field3_Text': 'Name3',
}
df = df.rename(columns=column_labels)

In [24]:
# The player name can appear in any of three columns (Name1, Name2, Name3) because
# the per-year roster downloads use a few different formats.
# Field4 is a duplicate name column and will be ignored.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 414 entries, 0 to 413
Data columns (total 9 columns):
Field4        0 non-null float64
Name1         115 non-null object
RosterLink    115 non-null object
ClassYear     414 non-null object
Hometown      414 non-null object
HS            414 non-null object
RosterYear    414 non-null object
Name2         115 non-null object
Name3         299 non-null object
dtypes: float64(1), object(8)
memory usage: 29.2+ KB


In [25]:
# Preview the first rows of the renamed data
df.head()

Unnamed: 0,Field4,Name1,RosterLink,ClassYear,Hometown,HS,RosterYear,Name2,Name3
0,,Maria Adiaconitei,https://cornellbigred.com/sports/womens-tennis...,Sophomore,"Bethesda, Md.",Laurel Springs HS,2019-20 Women's Tennis Roster,Maria Adiaconitei,
1,,Sarah Campbell,https://cornellbigred.com/sports/womens-tennis...,Sophomore,"Sudbury, Mass.",Lincoln Sudbury HS,2019-20 Women's Tennis Roster,Sarah Campbell,
2,,Michelle Deng,https://cornellbigred.com/sports/womens-tennis...,Freshman,"Temple City, Calif.",Arcadia HS,2019-20 Women's Tennis Roster,Michelle Deng,
3,,Fatima El Ashram,https://cornellbigred.com/sports/womens-tennis...,Freshman,"Zamalek, Cairo, Egypt",Weil College Prep,2019-20 Women's Tennis Roster,Fatima El Ashram,
4,,Valerie Ho,https://cornellbigred.com/sports/womens-tennis...,Sophomore,"Silver Spring, Md.",Montgomery Blair HS,2019-20 Women's Tennis Roster,Valerie Ho,


In [26]:
# Build a single 'Name' column with no missing values:
# use Name1 where it is present and fall back to Name3 otherwise.
# Name2 is redundant, so it is dropped along with the other source columns and Field4.
combined_name = df['Name1'].fillna(df['Name3'])
df = df.assign(Name=combined_name).drop(['Name1', 'Name2', 'Name3', 'Field4'], axis=1)

In [27]:
# Preview again now that the name columns are consolidated into 'Name'
df.head()

Unnamed: 0,RosterLink,ClassYear,Hometown,HS,RosterYear,Name
0,https://cornellbigred.com/sports/womens-tennis...,Sophomore,"Bethesda, Md.",Laurel Springs HS,2019-20 Women's Tennis Roster,Maria Adiaconitei
1,https://cornellbigred.com/sports/womens-tennis...,Sophomore,"Sudbury, Mass.",Lincoln Sudbury HS,2019-20 Women's Tennis Roster,Sarah Campbell
2,https://cornellbigred.com/sports/womens-tennis...,Freshman,"Temple City, Calif.",Arcadia HS,2019-20 Women's Tennis Roster,Michelle Deng
3,https://cornellbigred.com/sports/womens-tennis...,Freshman,"Zamalek, Cairo, Egypt",Weil College Prep,2019-20 Women's Tennis Roster,Fatima El Ashram
4,https://cornellbigred.com/sports/womens-tennis...,Sophomore,"Silver Spring, Md.",Montgomery Blair HS,2019-20 Women's Tennis Roster,Valerie Ho


In [28]:
# Some roster downloads contain duplicate rows because of workarounds for the
# Octoparse roster looping. Remove exact duplicates, then check the row count.
df = df.drop_duplicates()
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 414 entries, 0 to 413
Data columns (total 6 columns):
RosterLink    115 non-null object
ClassYear     414 non-null object
Hometown      414 non-null object
HS            414 non-null object
RosterYear    414 non-null object
Name          414 non-null object
dtypes: object(6)
memory usage: 22.6+ KB


In [29]:
# Put the columns in reading order, then sort the rows by player and roster year
column_order = ['Name', 'RosterYear', 'ClassYear', 'Hometown', 'HS', 'RosterLink']
df = df[column_order].sort_values(by=['Name', 'RosterYear'])
df.head()

Unnamed: 0,Name,RosterYear,ClassYear,Hometown,HS,RosterLink
363,Ainslee Haffner,1987-88 Women's Tennis Roster,Freshman,"Schenectady, N.Y.",Niskayuna HS,
354,Ainslee Haffner,1988-89 Women's Tennis Roster,Sophomore,"Schenectady, N.Y.",Niskayuna HS,
343,Ainslee Haffner,1989-90 Women's Tennis Roster,Junior,"Schenectady, N.Y.",Niskayuna HS,
186,Akane Kokubo,2001-02 Women's Tennis Roster,Freshman,"Battle Creek, Mich.",Lakeview HS,
171,Akane Kokubo,2002-03 Women's Tennis Roster,Sophomore,"Battle Creek, Mich.",Lakeview HS,


In [30]:
# describe() is a quick way to see the count, number of unique values,
# most frequent value and its frequency for each column
df.describe()

Unnamed: 0,Name,RosterYear,ClassYear,Hometown,HS,RosterLink
count,414,414,414,414,414,115
unique,167,37,6,150,151,115
top,Jenn Boyer,1994-95 Women's Tennis Roster,Freshman,"Ithaca, N.Y.",Roslyn HS,https://cornellbigred.com/sports/womens-tennis...
freq,5,16,145,8,10,1


In [31]:
# Preview the groupby() stats, which will be used for aggregating to one row per unique name
df.groupby(['Name']).describe()

Unnamed: 0_level_0,RosterYear,RosterYear,RosterYear,RosterYear,ClassYear,ClassYear,ClassYear,ClassYear,Hometown,Hometown,Hometown,Hometown,HS,HS,HS,HS,RosterLink,RosterLink,RosterLink,RosterLink
Unnamed: 0_level_1,count,unique,top,freq,count,unique,top,freq,count,unique,top,freq,count,unique,top,freq,count,unique,top,freq
Name,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2
Ainslee Haffner,3,3,1989-90 Women's Tennis Roster,1,3,3,Sophomore,1,3,1,"Schenectady, N.Y.",3,3,1,Niskayuna HS,3,0,0,,
Akane Kokubo,4,4,2004-05 Women's Tennis Roster,1,4,4,Sophomore,1,4,1,"Battle Creek, Mich.",4,4,1,Lakeview HS,4,0,0,,
Alex DelPrete,3,3,2000-01 Women's Tennis Roster,1,3,3,Sophomore,1,3,1,"Rye, N.Y.",3,3,1,Horace Mann HS,3,0,0,,
Alexandra D'Ascenzo,4,4,2016-17 Women's Tennis Roster,1,4,4,Sophomore,1,4,1,"West Bloomfield, Mich.",4,4,1,Frankel Jewish Academy,4,4,4,https://cornellbigred.com/sports/womens-tennis...,1
Ali Oshinky,1,1,1989-90 Women's Tennis Roster,1,1,1,Freshman,1,1,1,"Fairfax, Va.",1,1,1,W.T. Woodson HS,1,0,0,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Tamara John,3,3,2006-07 Women's Tennis Roster,1,3,3,Sophomore,1,3,2,"Macedon, N.Y.",2,3,1,Penfield HS,3,1,1,https://cornellbigred.com/sports/womens-tennis...,1
Valerie Ho,2,2,2019-20 Women's Tennis Roster,1,2,2,Sophomore,1,2,1,"Silver Spring, Md.",2,2,1,Montgomery Blair HS,2,2,2,https://cornellbigred.com/sports/womens-tennis...,1
Vivian Sam,1,1,1997-98 Women's Tennis Roster,1,1,1,Freshman,1,1,1,"Shanghai, China",1,1,1,Robert Louis Stevenson HS (Calif.),1,0,0,,
Wan Chen,4,4,1991-92 Women's Tennis Roster,1,4,4,Sophomore,1,4,1,"Kissimee, Fla.",4,4,1,Osceola HS,4,0,0,,


In [32]:
# Duplicate ClassYear as 'YearsPlayed' so that grouping can count the copy
# while still collecting the unique values of the original column
df = df.assign(YearsPlayed=df['ClassYear'])

In [34]:
# Create 'Year' from RosterYear labels such as "2019-20 Women's Tennis Roster":
# 1) Extract the first four-digit run (the starting year of the season) with a
#    vectorized regex instead of looping over a Python list
# 2) Raise a ValueError naming the offending labels if any label has no year,
#    rather than failing on an empty string inside int()
# 3) Convert to integer and add 1, so the value is the ending year of the season
#    (2019-20 -> 2020, 1999-2000 -> 2000)
start_year = df['RosterYear'].str.extract(r'(\d{4})', expand=False)
if start_year.isna().any():
    bad_labels = df.loc[start_year.isna(), 'RosterYear'].unique().tolist()
    raise ValueError('RosterYear labels without a four-digit year: {}'.format(bad_labels))
df['Year'] = start_year.astype('int64') + 1

In [35]:
# Condense to one row per Name. Each column gets its own aggregation:
# the latest Year, a count of YearsPlayed, the unique RosterYear / ClassYear values,
# the first Hometown / HS, and the last RosterLink.
per_player_rules = {
    'Name': 'first',
    'Year': 'max',
    'YearsPlayed': 'count',
    'RosterYear': 'unique',
    'ClassYear': 'unique',
    'Hometown': 'first',
    'HS': 'first',
    'RosterLink': 'last',
}
dfgroup = df.groupby('Name').agg(per_player_rules)

In [36]:
# Add the constant College, Criteria and Gender columns (same value on every row)
dfgroup = dfgroup.assign(College='Cornell University', Criteria=' Cornell', Gender='F')

In [37]:
# Check the column summary of the grouped frame
dfgroup.info()

<class 'pandas.core.frame.DataFrame'>
Index: 167 entries, Ainslee Haffner to Weatherly Schwab
Data columns (total 11 columns):
Name           167 non-null object
Year           167 non-null int64
YearsPlayed    167 non-null int64
RosterYear     167 non-null object
ClassYear      167 non-null object
Hometown       167 non-null object
HS             167 non-null object
RosterLink     56 non-null object
College        167 non-null object
Criteria       167 non-null object
Gender         167 non-null object
dtypes: int64(2), object(9)
memory usage: 15.7+ KB


In [38]:
# Display the grouped frame (one row per player name)
dfgroup

Unnamed: 0_level_0,Name,Year,YearsPlayed,RosterYear,ClassYear,Hometown,HS,RosterLink,College,Criteria,Gender
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
Ainslee Haffner,Ainslee Haffner,1990,3,"[1987-88 Women's Tennis Roster, 1988-89 Women'...","[Freshman, Sophomore, Junior]","Schenectady, N.Y.",Niskayuna HS,,Cornell University,Cornell,F
Akane Kokubo,Akane Kokubo,2005,4,"[2001-02 Women's Tennis Roster, 2002-03 Women'...","[Freshman, Sophomore, Junior, Senior]","Battle Creek, Mich.",Lakeview HS,,Cornell University,Cornell,F
Alex DelPrete,Alex DelPrete,2002,3,"[1999-2000 Women's Tennis Roster, 2000-01 Wome...","[Freshman, Sophomore, Junior]","Rye, N.Y.",Horace Mann HS,,Cornell University,Cornell,F
Alexandra D'Ascenzo,Alexandra D'Ascenzo,2017,4,"[2013-14 Women's Tennis Roster, 2014-15 Women'...","[Freshman, Sophomore, Junior, Senior]","West Bloomfield, Mich.",Frankel Jewish Academy,https://cornellbigred.com/sports/womens-tennis...,Cornell University,Cornell,F
Ali Oshinky,Ali Oshinky,1990,1,[1989-90 Women's Tennis Roster],[Freshman],"Fairfax, Va.",W.T. Woodson HS,,Cornell University,Cornell,F
...,...,...,...,...,...,...,...,...,...,...,...
Tamara John,Tamara John,2008,3,"[2005-06 Women's Tennis Roster, 2006-07 Women'...","[Freshman, Sophomore, Junior]","Macedon, N.Y.",Penfield HS,https://cornellbigred.com/sports/womens-tennis...,Cornell University,Cornell,F
Valerie Ho,Valerie Ho,2020,2,"[2018-19 Women's Tennis Roster, 2019-20 Women'...","[Freshman, Sophomore]","Silver Spring, Md.",Montgomery Blair HS,https://cornellbigred.com/sports/womens-tennis...,Cornell University,Cornell,F
Vivian Sam,Vivian Sam,1998,1,[1997-98 Women's Tennis Roster],[Freshman],"Shanghai, China",Robert Louis Stevenson HS (Calif.),,Cornell University,Cornell,F
Wan Chen,Wan Chen,1994,4,"[1990-91 Women's Tennis Roster, 1991-92 Women'...","[Freshman, Sophomore, Junior, Senior]","Kissimee, Fla.",Osceola HS,,Cornell University,Cornell,F


In [39]:
# Parse every full name with the nameparser utility HumanName and collect the
# first, middle and last components in three parallel lists
namelist = dfgroup['Name'].tolist()
parsed_names = [HumanName(full_name) for full_name in namelist]
First = [parsed.first for parsed in parsed_names]
Last = [parsed.last for parsed in parsed_names]
Middle = [parsed.middle for parsed in parsed_names]
if parsed_names:
    # 'parsename' holds the last parsed entry so it can be inspected afterwards
    parsename = parsed_names[-1]

In [40]:
# Display the HumanName parsed for the last name in the list.
# title, suffix and nickname are not applicable here.
parsename

<HumanName : [
	title: '' 
	first: 'Weatherly' 
	middle: '' 
	last: 'Schwab' 
	suffix: ''
	nickname: ''
]>

In [41]:
# Reduce each parsed middle value to its first token.
# The split pattern is "zero or more periods followed by one or more non-word
# characters", so an initial such as "A." yields "A" and a multi-word middle
# keeps only its first word.
MidSplit = [re.split(r'\.*\W+', middle_value)[0] for middle_value in Middle]


In [42]:
# Keep a middle token only when it is longer than one character, prefixed with a
# space so it can be concatenated directly; single letters and blanks become ""
MiddleName = [" " + token if len(token) > 1 else "" for token in MidSplit]


In [43]:
# Attach the parsed name parts to the grouped frame.
# MidInit stores the full parsed middle value; MidName stores the space-prefixed
# middle name (or "" when there is none to keep).
dfgroup = dfgroup.assign(First=First, MidInit=Middle, MidName=MiddleName, Last=Last)
# FirstLast = first name + optional middle name + space + last name
first_part = dfgroup['First'].map(str)
last_part = dfgroup['Last'].map(str)
dfgroup['FirstLast'] = first_part + dfgroup['MidName'] + ' ' + last_part

In [49]:
# Check the column summary after adding the parsed-name columns
dfgroup.info()

<class 'pandas.core.frame.DataFrame'>
Index: 167 entries, Katherine Nguyen to Jeneane Schmidt
Data columns (total 16 columns):
Name           167 non-null object
Year           167 non-null int64
YearsPlayed    167 non-null int64
RosterYear     167 non-null object
ClassYear      167 non-null object
Hometown       167 non-null object
HS             167 non-null object
RosterLink     56 non-null object
College        167 non-null object
Criteria       167 non-null object
Gender         167 non-null object
First          167 non-null object
MidInit        167 non-null object
MidName        167 non-null object
Last           167 non-null object
FirstLast      167 non-null object
dtypes: int64(2), object(14)
memory usage: 22.2+ KB


In [None]:
# Write the new format to a csv file
#dfgroup.to_csv('CornellW_group.csv', index=False)

In [None]:
# Unique name count is 229, with Roster year count 39. There is a gap missing 1966-67 through 2000-2001, or 34 missing years. 
# 227 LinkedIn URL values were found, but many are likely to be non-matching. 
# The earliest roster year is 1946-47, and there will be a drop-off of found URLs at some point
# Collapse to unique names, and create boolean for roster years (39 columns)

In [51]:
# Load the second roster file.
# NOTE(review): presumably names taken from PDF rosters, judging by the file name — confirm
dfpdf = pd.read_csv('CornellWpdf.csv')

In [52]:
# Inspect which columns the second file provides
dfpdf.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 105 entries, 0 to 104
Data columns (total 5 columns):
FirstLast      105 non-null object
Last           105 non-null object
First          105 non-null object
Year           105 non-null int64
YearsPlayed    105 non-null int64
dtypes: int64(2), object(3)
memory usage: 4.2+ KB


In [70]:
# Stack the rows loaded from CornellWpdf.csv under the grouped roster rows.
# DataFrame.append is deprecated and was removed in pandas 2.0; pd.concat is the
# supported equivalent. sort=True keeps the alphabetical column order that the
# original append produced for frames whose columns differ.
dfappend = pd.concat([dfgroup, dfpdf], sort=True)

In [71]:
#pd.set_option('display.max_rows', None)
#pd.set_option('display.max_columns', None)

In [72]:
# Check the combined frame: total row count and which columns are only partly populated
dfappend.info()

<class 'pandas.core.frame.DataFrame'>
Index: 272 entries, Katherine Nguyen to 104
Data columns (total 16 columns):
ClassYear      167 non-null object
College        167 non-null object
Criteria       167 non-null object
First          272 non-null object
FirstLast      272 non-null object
Gender         167 non-null object
HS             167 non-null object
Hometown       167 non-null object
Last           272 non-null object
MidInit        167 non-null object
MidName        167 non-null object
Name           167 non-null object
RosterLink     56 non-null object
RosterYear     167 non-null object
Year           272 non-null int64
YearsPlayed    272 non-null int64
dtypes: int64(2), object(14)
memory usage: 36.1+ KB


In [73]:
# Index the combined frame by player name, keeping 'FirstLast' as a regular column too.
# No separate reset_index() is needed first: set_index replaces the existing
# mixed name/integer index outright, and a bare reset_index() call would only
# return a new frame without changing this one.
dfappend = dfappend.set_index('FirstLast', drop=False)

In [74]:
# Set the constant columns on every row, so the rows that came from dfpdf
# (which lack these columns) are filled in as well
constant_columns = {'College': 'Cornell University', 'Criteria': ' Cornell', 'Gender': 'F'}
dfappend = dfappend.assign(**constant_columns)

In [75]:
# Move the key columns to the front and keep every other column after them,
# in their existing order
cols_to_order = ['Criteria', 'FirstLast', 'Year', 'YearsPlayed', 'RosterLink', 'Hometown', 'HS']
remaining_columns = dfappend.columns.drop(cols_to_order).tolist()
new_columns = cols_to_order + remaining_columns
dfappend = dfappend[new_columns]

In [76]:
# Write the combined roster to a csv file.
# NOTE(review): index=True also writes the 'FirstLast' index, which duplicates the
# 'FirstLast' column kept by set_index(..., drop=False) — confirm this is intended
dfappend.to_csv('CornellW_RosterGroup.csv', index=True)

In [77]:
# Export the name list and the criteria list as separate single-column csv files
# (header row included, index omitted)
dfappend['FirstLast'].to_csv('NamelistCornellW.csv', index=False, header=True)
dfappend['Criteria'].to_csv('CriterialistCornellW.csv', index=False, header=True)

In [80]:
# Lift the display limits so the whole frame is rendered, then show the
# combined roster ordered by Year
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
dfappend = dfappend.sort_values(by=['Year'])
dfappend

Unnamed: 0_level_0,Criteria,FirstLast,Year,YearsPlayed,RosterLink,Hometown,HS,ClassYear,College,First,Gender,Last,MidInit,MidName,Name,RosterYear
FirstLast,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
Blythe Tracy,Cornell,Blythe Tracy,1972,1,,,,,Cornell University,Blythe,F,Tracy,,,,
Valerie DeMelville,Cornell,Valerie DeMelville,1972,1,,,,,Cornell University,Valerie,F,DeMelville,,,,
Wendy Jennis,Cornell,Wendy Jennis,1973,1,,,,,Cornell University,Wendy,F,Jennis,,,,
Sue Coan,Cornell,Sue Coan,1973,1,,,,,Cornell University,Sue,F,Coan,,,,
Moira Hearne,Cornell,Moira Hearne,1974,1,,,,,Cornell University,Moira,F,Hearne,,,,
Roberta Frank,Cornell,Roberta Frank,1974,1,,,,,Cornell University,Roberta,F,Frank,,,,
Alice Bron,Cornell,Alice Bron,1974,1,,,,,Cornell University,Alice,F,Bron,,,,
Lucy Babcox,Cornell,Lucy Babcox,1974,1,,,,,Cornell University,Lucy,F,Babcox,,,,
Sherllyn Burnett,Cornell,Sherllyn Burnett,1975,1,,,,,Cornell University,Sherllyn,F,Burnett,,,,
Martha Golar,Cornell,Martha Golar,1975,1,,,,,Cornell University,Martha,F,Golar,,,,
