In [174]:
import pandas as pd
import numpy as np
from nameparser import HumanName
import re


In [242]:
# Input file is the roster download from the college website: a single sheet that
# compiles every available roster year.
df = pd.read_excel('YaleW2010.xlsx')

In [243]:
# Examine the column names: roster downloads come in a few different layouts depending
# on the college site, so check which generic 'FieldN' columns this file contains.
df.columns

Index(['Field1_Text', 'Field1_Link', 'Field2', 'Field3', 'Field4', 'Field5',
       'Field6'],
      dtype='object')

In [244]:
# Give the generic scraper columns meaningful labels.
# 'Field2' has no entry in the mapping, so it keeps its original name.
column_labels = {
    'Field1_Text': 'Name',
    'Field1_Link': 'RosterLink',
    'Field3': 'ClassYear',
    'Field4': 'Hometown',
    'Field5': 'HS',
    'Field6': 'RosterYear',
}
df = df.rename(columns=column_labels)

In [245]:
# Check dtypes and non-null counts after the rename.
# NOTE(review): the earlier note here (the name spread over 3 possible columns, with
# 'Field4' as a duplicate name column to ignore) appears to describe a different roster
# layout. In this file there is a single 'Name' column and 'Field4' was renamed to
# 'Hometown' in the previous cell.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 102 entries, 0 to 101
Data columns (total 7 columns):
Name          102 non-null object
RosterLink    102 non-null object
Field2        0 non-null float64
ClassYear     102 non-null object
Hometown      102 non-null object
HS            102 non-null object
RosterYear    102 non-null object
dtypes: float64(1), object(6)
memory usage: 5.7+ KB


In [246]:
# Preview the first rows of the renamed data
df.head()

Unnamed: 0,Name,RosterLink,Field2,ClassYear,Hometown,HS,RosterYear
0,Vicky Brook,https://yalebulldogs.com/sports/womens-tennis/...,,Sophomore,"London, England",Seven Oaks School,2009-10 Women's Tennis
1,Lindsay Clark,https://yalebulldogs.com/sports/womens-tennis/...,,Junior,"Bernardsville, NJ",Bernardsville High School,2009-10 Women's Tennis
2,Silia DeFilippis,https://yalebulldogs.com/sports/womens-tennis/...,,Junior,"Short Hills, NJ",Millburn High School,2009-10 Women's Tennis
3,Elizabeth Epstein,https://yalebulldogs.com/sports/womens-tennis/...,,First Year,"Chicago, Ill.",Francis Parker School,2009-10 Women's Tennis
4,Stephanie Kent,https://yalebulldogs.com/sports/womens-tennis/...,,Sophomore,"Washington Depot, Conn.",Shepaug Valley High School,2009-10 Women's Tennis


In [247]:
# Combine and rename the 'Name' columns, to get a single column with no missing values
# The Name2 field was redundant so can be ignored
# For CornellM there is no 'Field4'
#df = df.assign(**{
#    'Name': df['Name1'].fillna(df['Name3'])})
#df = df.drop(['Name1', 'Name2', 'Name3'] ,axis=1)

In [248]:
# Preview again. The name-consolidation step in the previous cell is commented out for
# this roster, so this preview is unchanged from the one above.
df.head()

Unnamed: 0,Name,RosterLink,Field2,ClassYear,Hometown,HS,RosterYear
0,Vicky Brook,https://yalebulldogs.com/sports/womens-tennis/...,,Sophomore,"London, England",Seven Oaks School,2009-10 Women's Tennis
1,Lindsay Clark,https://yalebulldogs.com/sports/womens-tennis/...,,Junior,"Bernardsville, NJ",Bernardsville High School,2009-10 Women's Tennis
2,Silia DeFilippis,https://yalebulldogs.com/sports/womens-tennis/...,,Junior,"Short Hills, NJ",Millburn High School,2009-10 Women's Tennis
3,Elizabeth Epstein,https://yalebulldogs.com/sports/womens-tennis/...,,First Year,"Chicago, Ill.",Francis Parker School,2009-10 Women's Tennis
4,Stephanie Kent,https://yalebulldogs.com/sports/womens-tennis/...,,Sophomore,"Washington Depot, Conn.",Shepaug Valley High School,2009-10 Women's Tennis


In [249]:
#pd.set_option('display.max_rows', None)
#pd.set_option('display.max_columns', None)

In [250]:
#This name was on different rosters, once without an MI
#df['Name'][477] = 'James P. Shoffner'

In [251]:
# Some roster downloads contain duplicate rows because of workarounds for the
# Octoparse roster looping. Remove exact duplicates, then check the remaining row count.
df = df.drop_duplicates()
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 102 entries, 0 to 101
Data columns (total 7 columns):
Name          102 non-null object
RosterLink    102 non-null object
Field2        0 non-null float64
ClassYear     102 non-null object
Hometown      102 non-null object
HS            102 non-null object
RosterYear    102 non-null object
dtypes: float64(1), object(6)
memory usage: 6.4+ KB


In [252]:
# Keep only the columns of interest, in display order, then sort the rows by player
# name and roster season.
ordered_columns = ['Name', 'RosterYear', 'ClassYear', 'Hometown', 'HS', 'RosterLink']
df = df[ordered_columns].sort_values(by=['Name', 'RosterYear'])
df.head()

Unnamed: 0,Name,RosterYear,ClassYear,Hometown,HS,RosterLink
89,Amber Li,2011-12 Women's Tennis,First Year,"Tampa, Fla.",HB Plant,https://yalebulldogs.com/sports/womens-tennis/...
81,Amber Li,2012-13 Women's Tennis,Sophomore,"Tampa, Fla.",HB Plant,https://yalebulldogs.com/sports/womens-tennis/...
71,Amber Li,2013-14 Women's Tennis,Junior,"Tampa, Fla.",HB Plant,https://yalebulldogs.com/sports/womens-tennis/...
57,Amy Yang,2015-16 Women's Tennis,First Year,"John's Creek, Ga.",Northview,https://yalebulldogs.com/sports/womens-tennis/...
44,Amy Yang,2016-17 Women's Tennis,Sophomore,"John's Creek, Ga.",Northview,https://yalebulldogs.com/sports/womens-tennis/...


In [258]:
# Display the sorted frame to compare each player's rows across seasons.
# NOTE(review): in the output below, Amy Yang's 2018-19 row shows 'Economics' under
# Hometown and her hometown under HS, so the source columns appear shifted for that
# season -- verify before relying on Hometown/HS values.
df

Unnamed: 0,Name,RosterYear,ClassYear,Hometown,HS,RosterLink
89,Amber Li,2011-12 Women's Tennis,First Year,"Tampa, Fla.",HB Plant,https://yalebulldogs.com/sports/womens-tennis/...
81,Amber Li,2012-13 Women's Tennis,Sophomore,"Tampa, Fla.",HB Plant,https://yalebulldogs.com/sports/womens-tennis/...
71,Amber Li,2013-14 Women's Tennis,Junior,"Tampa, Fla.",HB Plant,https://yalebulldogs.com/sports/womens-tennis/...
57,Amy Yang,2015-16 Women's Tennis,First Year,"John's Creek, Ga.",Northview,https://yalebulldogs.com/sports/womens-tennis/...
44,Amy Yang,2016-17 Women's Tennis,Sophomore,"John's Creek, Ga.",Northview,https://yalebulldogs.com/sports/womens-tennis/...
35,Amy Yang,2017-18 Women's Tennis,Junior,"John's Creek, Ga.",Northview,https://yalebulldogs.com/sports/womens-tennis/...
28,Amy Yang,2018-19 Women's Tennis,Senior,Economics,"John's Creek, Ga.",https://yalebulldogs.com/sports/womens-tennis/...
100,Annie Sullivan,2010-11 Women's Tennis,First Year,"Deerfield, Ill.",Woodlands Academy,https://yalebulldogs.com/sports/womens-tennis/...
91,Annie Sullivan,2011-12 Women's Tennis,Sophomore,"Deerfield, Ill.",Woodlands Academy,https://yalebulldogs.com/sports/womens-tennis/...
83,Annie Sullivan,2012-13 Women's Tennis,Junior,"Deerfield, Ill.",Woodlands Academy,https://yalebulldogs.com/sports/womens-tennis/...


In [257]:
# The same person is listed with a different "spelling" on the roster for a different
# year; map 'Stephanie Kent' to 'Steph Kent' in the Name column so she groups as one player.
df = df.replace({'Name': {'Stephanie Kent': 'Steph Kent'}})

In [259]:
# describe() is a quick way to see how many unique values are in each column
# (it also reports the row count and the most frequent value with its frequency)
df.describe()

Unnamed: 0,Name,RosterYear,ClassYear,Hometown,HS,RosterLink
count,102,102,102,102,102,102
unique,40,11,4,50,47,102
top,Elizabeth Zordani,2015-16 Women's Tennis,First Year,"Tucson, Ariz.",Salpointe Catholic,https://yalebulldogs.com/sports/womens-tennis/...
freq,4,13,30,4,7,1


In [260]:
# Preview the groupby() stats, which will be used for aggregating to one row per unique name.
# A 'unique' count above 1 for Hometown or HS flags a player whose value differs between seasons.
df.groupby(['Name']).describe()

Unnamed: 0_level_0,RosterYear,RosterYear,RosterYear,RosterYear,ClassYear,ClassYear,ClassYear,ClassYear,Hometown,Hometown,Hometown,Hometown,HS,HS,HS,HS,RosterLink,RosterLink,RosterLink,RosterLink
Unnamed: 0_level_1,count,unique,top,freq,count,unique,top,freq,count,unique,top,freq,count,unique,top,freq,count,unique,top,freq
Name,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2
Amber Li,3,3,2011-12 Women's Tennis,1,3,3,Junior,1,3,1,"Tampa, Fla.",3,3,1,HB Plant,3,3,3,https://yalebulldogs.com/sports/womens-tennis/...,1
Amy Yang,4,4,2016-17 Women's Tennis,1,4,4,Senior,1,4,2,"John's Creek, Ga.",3,4,2,Northview,3,4,4,https://yalebulldogs.com/sports/womens-tennis/...,1
Annie Sullivan,4,4,2012-13 Women's Tennis,1,4,4,Senior,1,4,1,"Deerfield, Ill.",4,4,1,Woodlands Academy,4,4,4,https://yalebulldogs.com/sports/womens-tennis/...,1
Blair Seideman,2,2,2011-12 Women's Tennis,1,2,2,Sophomore,1,2,2,"Brookville, N.Y.",1,2,1,Jericho,2,2,2,https://yalebulldogs.com/sports/womens-tennis/...,1
Carol Finke,3,3,2014-15 Women's Tennis,1,3,3,Junior,1,3,1,"Glencoe, Ill.",3,3,1,New Trier,3,3,3,https://yalebulldogs.com/sports/womens-tennis/...,1
Caroline Amos,4,4,2016-17 Women's Tennis,1,4,4,Senior,1,4,2,"Tucson, Ariz.",3,4,2,Salpointe Catholic,3,4,4,https://yalebulldogs.com/sports/womens-tennis/...,1
Caroline Dunleavy,2,2,2018-19 Women's Tennis,1,2,2,Junior,1,2,2,"Darien, Conn.",1,2,2,Greenwich Academy,1,2,2,https://yalebulldogs.com/sports/womens-tennis/...,1
Caroline Lynch,4,4,2016-17 Women's Tennis,1,4,4,Senior,1,4,2,"Slane, Ireland",2,4,1,Rathdown School,4,4,4,https://yalebulldogs.com/sports/womens-tennis/...,1
Chelsea Kung,1,1,2019-20 Women's Tennis Roster,1,1,1,First Year,1,1,1,"Fort Worth, TX",1,1,1,Keller Central High School,1,1,1,https://yalebulldogs.com/sports/womens-tennis/...,1
Courtney Amos,4,4,2015-16 Women's Tennis,1,4,4,Senior,1,4,2,"Tucson, Ari.",3,4,1,Salpointe Catholic,4,4,4,https://yalebulldogs.com/sports/womens-tennis/...,1


In [261]:
# Duplicate ClassYear as 'YearsPlayed' so that, when grouping, one copy can be counted
# while the original is still aggregated to its unique values.
df = df.assign(YearsPlayed=df['ClassYear'])

In [262]:
# Derive 'Year' from RosterYear labels such as "2009-10 Women's Tennis":
# 1) split each label on runs of non-digit characters,
# 2) take the first piece (the four-digit year before the dash in yyyy-yy),
# 3) convert it to an integer and add 1.
df['Year'] = [int(re.split(r'\D+', label)[0]) + 1 for label in df['RosterYear']]

In [263]:
# Condense to one row per Name. Per player: keep the name, the latest Year, the number
# of roster rows (YearsPlayed), the distinct RosterYear and ClassYear values, the first
# Hometown and HS seen, and the last RosterLink.
aggregations = {
    'Name': 'first',
    'Year': 'max',
    'YearsPlayed': 'count',
    'RosterYear': 'unique',
    'ClassYear': 'unique',
    'Hometown': 'first',
    'HS': 'first',
    'RosterLink': 'last',
}
dfgroup = df.groupby('Name').agg(aggregations)

In [264]:
# Add constant columns for College, Criteria, and Gender
# (the leading space in the Criteria value is part of the stored string)
dfgroup = dfgroup.assign(
    College='Yale University',
    Criteria=' Yale',
    Gender='W',
)

In [265]:
# Check the row count and non-null counts of the grouped frame
dfgroup.info()

<class 'pandas.core.frame.DataFrame'>
Index: 40 entries, Amber Li to Vivian Cheng
Data columns (total 11 columns):
Name           40 non-null object
Year           40 non-null int64
YearsPlayed    40 non-null int64
RosterYear     40 non-null object
ClassYear      40 non-null object
Hometown       40 non-null object
HS             40 non-null object
RosterLink     40 non-null object
College        40 non-null object
Criteria       40 non-null object
Gender         40 non-null object
dtypes: int64(2), object(9)
memory usage: 3.8+ KB


In [266]:
# Display the grouped players ordered by Year. The result is not assigned, so dfgroup
# itself keeps its original row order.
dfgroup.sort_values('Year')

Unnamed: 0_level_0,Name,Year,YearsPlayed,RosterYear,ClassYear,Hometown,HS,RosterLink,College,Criteria,Gender
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
Sarah Lederhandler,Sarah Lederhandler,2010,1,[2009-10 Women's Tennis],[Senior],"Miami, FL",Ransom Everglades Upper School,https://yalebulldogs.com/sports/womens-tennis/...,Yale University,Yale,W
Jessica Rhee,Jessica Rhee,2010,1,[2009-10 Women's Tennis],[Senior],"Dayton, OH",Centerville High School,https://yalebulldogs.com/sports/womens-tennis/...,Yale University,Yale,W
Jenna Ritz,Jenna Ritz,2010,1,[2009-10 Women's Tennis],[First Year],"Thousand Oaks, Calif.",Thousand Oaks High School,https://yalebulldogs.com/sports/womens-tennis/...,Yale University,Yale,W
Lindsay Clark,Lindsay Clark,2011,2,"[2009-10 Women's Tennis, 2010-11 Women's Tennis]","[Junior, Senior]","Bernardsville, NJ",Bernardsville High School,https://yalebulldogs.com/sports/womens-tennis/...,Yale University,Yale,W
Stevi Petrelli,Stevi Petrelli,2011,2,"[2009-10 Women's Tennis, 2010-11 Women's Tennis]","[Junior, Senior]","Harrison, NY",Greenwich Academy,https://yalebulldogs.com/sports/womens-tennis/...,Yale University,Yale,W
Silia DeFilippis,Silia DeFilippis,2011,2,"[2009-10 Women's Tennis, 2010-11 Women's Tennis]","[Junior, Senior]","Short Hills, NJ",Millburn High School,https://yalebulldogs.com/sports/womens-tennis/...,Yale University,Yale,W
Kim Szokol,Kim Szokol,2011,1,[2010-11 Women's Tennis],[First Year],"Winnetka, Ill.",New Trier,https://yalebulldogs.com/sports/womens-tennis/...,Yale University,Yale,W
Steph Kent,Steph Kent,2012,3,"[2010-11 Women's Tennis, 2011-12 Women's Tenni...","[Junior, Senior, Sophomore]","Washington Depot, Conn.",Shepaug Valley High School,https://yalebulldogs.com/sports/womens-tennis/...,Yale University,Yale,W
Blair Seideman,Blair Seideman,2012,2,"[2010-11 Women's Tennis, 2011-12 Women's Tennis]","[First Year, Sophomore]","Glen Head, N.Y.",Jericho,https://yalebulldogs.com/sports/womens-tennis/...,Yale University,Yale,W
Vicky Brook,Vicky Brook,2012,3,"[2009-10 Women's Tennis, 2010-11 Women's Tenni...","[Sophomore, Junior, Senior]","London, England",Seven Oaks School,https://yalebulldogs.com/sports/womens-tennis/...,Yale University,Yale,W


In [268]:
# Parse each full name with the nameparser utility HumanName and collect the first,
# middle and last components in three parallel lists (one entry per row of dfgroup).
First, Middle, Last = [], [], []
for name in dfgroup['Name']:
    # `parsename` stays a plain loop variable because the next cell displays it
    parsename = HumanName(name)
    First.append(parsename.first)
    Middle.append(parsename.middle)
    Last.append(parsename.last)

In [269]:
# Display the HumanName fields for the last row: `parsename` is the loop variable left
# over from the parsing cell above. title, suffix, nickname are not applicable.
parsename

<HumanName : [
	title: '' 
	first: 'Vivian' 
	middle: '' 
	last: 'Cheng' 
	suffix: ''
	nickname: ''
]>

In [270]:
# Reduce each parsed middle value to its leading token by splitting on runs of
# non-word characters and keeping the first piece: an initial such as "L." becomes "L",
# a longer name is kept up to its first separator, and an empty middle stays empty.
MidSplit = [re.split(r'\.*\W+', middle)[0] for middle in Middle]


In [271]:
# Keep only full middle names: tokens longer than one character are stored with a
# leading space (ready to be joined after the first name); initials and blanks become "".
MiddleName = [" " + token if len(token) > 1 else "" for token in MidSplit]


In [272]:
# Attach the parsed name parts to the grouped frame (MidInit holds the raw parsed
# middle value, MidName the space-prefixed full middle name or "").
dfgroup = dfgroup.assign(First=First, MidInit=Middle, MidName=MiddleName, Last=Last)
# FirstLast is "First[ Middle] Last"; MidName already carries its own leading space
first_part = dfgroup['First'].map(str)
last_part = dfgroup['Last'].map(str)
dfgroup['FirstLast'] = first_part + dfgroup['MidName'] + ' ' + last_part

In [273]:
# Confirm the name-part columns were added to the grouped frame
dfgroup.info()

<class 'pandas.core.frame.DataFrame'>
Index: 40 entries, Amber Li to Vivian Cheng
Data columns (total 16 columns):
Name           40 non-null object
Year           40 non-null int64
YearsPlayed    40 non-null int64
RosterYear     40 non-null object
ClassYear      40 non-null object
Hometown       40 non-null object
HS             40 non-null object
RosterLink     40 non-null object
College        40 non-null object
Criteria       40 non-null object
Gender         40 non-null object
First          40 non-null object
MidInit        40 non-null object
MidName        40 non-null object
Last           40 non-null object
FirstLast      40 non-null object
dtypes: int64(2), object(14)
memory usage: 5.3+ KB


In [274]:
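# Unique name count is 229, with Roster year count 39. There is a gap missing 1966-67 through 2000-2001, or 34 missing years. 
# 227 LinkedIn URL values were found, but many are likely to be non-matching. 
# The earliest roster year is 1946-47, and there will be a dropoff of found URLs at some point.
# Collapse to unique names, and create boolean for roster years (39 columns)
# NOTE: these figures do not match this notebook's data (40 unique names across 11 roster years
# in the describe() output above) and appear to be carried over from another roster notebook.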

In [275]:
# Load the second player list from 'YaleWpdf.csv'.
# NOTE(review): presumably names transcribed from PDF rosters that are not available on
# the website -- confirm the source of this file.
dfpdf = pd.read_csv('YaleWpdf.csv')

In [276]:
# Check the columns and non-null counts of the loaded list
dfpdf.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 219 entries, 0 to 218
Data columns (total 6 columns):
FirstLast      219 non-null object
Last           219 non-null object
First          219 non-null object
MidInit        105 non-null object
Year           219 non-null int64
YearsPlayed    219 non-null int64
dtypes: int64(2), object(4)
memory usage: 10.4+ KB


In [277]:
# Display the loaded list
dfpdf

Unnamed: 0,FirstLast,Last,First,MidInit,Year,YearsPlayed
0,Elizabeth Auchincloss,Auchincloss,Elizabeth,L.,1973,2
1,Sheila Ford,Ford,Sheila,F.,1973,1
2,Sarah Fox,Fox,Sarah,M.,1973,1
3,Sally MacPartland,MacPartland,Sally,A.,1973,2
4,Diane Straus,Straus,Diane,E.,1973,2
5,Lucy Daggett,Daggett,Lucy,L.,1974,2
6,A. Julianna Gulya,Gulya,A. Julianna,,1974,1
7,Linden Havemeyer,Havemeyer,Linden,,1974,3
8,Louise Lippincott,Lippincott,Louise,,1974,1
9,Margaret Mercer,Mercer,Margaret,C.,1974,3


In [278]:
#dfpdf['FirstLastlower'] = dfpdf['FirstLast'].str.lower()

In [279]:
#dfname = dfgroup[['FirstLast']]
#dfname['namelower'] = dfgroup['FirstLast'].str.lower()

In [280]:
#df2 = pd.merge(dfname, dfpdf,
#                     left_on='namelower', right_on='FirstLastlower', how='outer',suffixes=('_R','_L'))
#df2.rename(columns={'FirstLast_L': 'FirstLast'}, inplace=True)

In [281]:
#df2 = df2[(df2['namelower'] != df2['FirstLastlower'])]

In [282]:
#df2.dropna(subset = ['FirstLast'], inplace=True)

In [283]:
#df2.drop(columns=['FirstLast_R', 'namelower', 'FirstLastlower'], inplace=True)

In [284]:
#df2 = df2[df2.FirstLast != 'Tom RATCHFORD']

In [285]:
#df2 = df2[df2.FirstLast != 'Zach DEAN']

In [286]:
#df2.info()

In [287]:
#df2

In [288]:
#dfappend = dfgroup.append(df2)

In [289]:
# Stack the grouped website-roster players (dfgroup) on top of the rows loaded from
# 'YaleWpdf.csv' (dfpdf). DataFrame.append was deprecated in pandas 1.4 and removed in
# 2.0; pd.concat is the supported replacement. sort=True orders the combined columns
# alphabetically, which matches the column order recorded in this notebook's output.
# Columns that exist on only one side (e.g. RosterLink) are filled with NaN for the
# rows coming from the other frame.
dfappend = pd.concat([dfgroup, dfpdf], sort=True)

In [290]:
# Lift the display limits so full frames are shown without truncation
for display_option in ('display.max_rows', 'display.max_columns'):
    pd.set_option(display_option, None)

In [291]:
#dfappend.info()

In [292]:
# Index the combined frame by 'FirstLast' while also keeping it as a regular column
# (drop=False). The former `dfappend.reset_index(drop=True)` statement discarded its
# return value and therefore had no effect, so it has been removed.
dfappend = dfappend.set_index(['FirstLast'], drop=False)

In [293]:
# Set College, Criteria, and Gender on every row, so the appended names get the same
# constant values as the grouped website rows.
dfappend = dfappend.assign(
    College='Yale University',
    Criteria=' Yale',
    Gender='W',
)

In [294]:
# Move the key columns to the front and keep all remaining columns in their current order
cols_to_order = ['Criteria','FirstLast', 'Year', 'YearsPlayed', 'RosterLink', 'Hometown', 'HS']
remaining_columns = [column for column in dfappend.columns if column not in cols_to_order]
new_columns = cols_to_order + remaining_columns
dfappend = dfappend[new_columns]

In [295]:
# Write the new format to a csv file
#dfappend.to_csv('CornellM_RosterGroup.csv', index=True)

In [296]:
# Order the combined players by 'Year', lift the display limits, and show the full frame
dfappend = dfappend.sort_values(by=['Year'])
for display_option in ('display.max_rows', 'display.max_columns'):
    pd.set_option(display_option, None)
dfappend

Unnamed: 0_level_0,Criteria,FirstLast,Year,YearsPlayed,RosterLink,Hometown,HS,ClassYear,College,First,Gender,Last,MidInit,MidName,Name,RosterYear
FirstLast,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
Sally MacPartland,Yale,Sally MacPartland,1973,2,,,,,Yale University,Sally,W,MacPartland,A.,,,
Elizabeth Auchincloss,Yale,Elizabeth Auchincloss,1973,2,,,,,Yale University,Elizabeth,W,Auchincloss,L.,,,
Sheila Ford,Yale,Sheila Ford,1973,1,,,,,Yale University,Sheila,W,Ford,F.,,,
Sarah Fox,Yale,Sarah Fox,1973,1,,,,,Yale University,Sarah,W,Fox,M.,,,
Diane Straus,Yale,Diane Straus,1973,2,,,,,Yale University,Diane,W,Straus,E.,,,
Lucy Daggett,Yale,Lucy Daggett,1974,2,,,,,Yale University,Lucy,W,Daggett,L.,,,
Linden Havemeyer,Yale,Linden Havemeyer,1974,3,,,,,Yale University,Linden,W,Havemeyer,,,,
Louise Lippincott,Yale,Louise Lippincott,1974,1,,,,,Yale University,Louise,W,Lippincott,,,,
Margaret Mercer,Yale,Margaret Mercer,1974,3,,,,,Yale University,Margaret,W,Mercer,C.,,,
Deborah Rhode,Yale,Deborah Rhode,1974,1,,,,,Yale University,Deborah,W,Rhode,L.,,,


In [297]:
# Final check of the row count and non-null counts of the combined frame; columns that
# exist only in dfgroup (e.g. RosterLink, Hometown, HS) are null for the rows from dfpdf.
dfappend.info()

<class 'pandas.core.frame.DataFrame'>
Index: 259 entries, Sally MacPartland to Kathy Wang
Data columns (total 16 columns):
Criteria       259 non-null object
FirstLast      259 non-null object
Year           259 non-null int64
YearsPlayed    259 non-null int64
RosterLink     40 non-null object
Hometown       40 non-null object
HS             40 non-null object
ClassYear      40 non-null object
College        259 non-null object
First          259 non-null object
Gender         259 non-null object
Last           259 non-null object
MidInit        145 non-null object
MidName        40 non-null object
Name           40 non-null object
RosterYear     40 non-null object
dtypes: int64(2), object(14)
memory usage: 34.4+ KB


In [298]:
# Write the new format to a csv file.
# index=True writes the 'FirstLast' index as the first csv column; because the index was
# set with drop=False, 'FirstLast' also appears again as a regular column in the file.
dfappend.to_csv('YaleW2_group.csv', index=True)

In [299]:
# Export the name list and the criteria list as separate single-column csv files
# (header row included, index omitted).
single_column_exports = (
    ('FirstLast', 'NamelistYaleW2.csv'),
    ('Criteria', 'CriterialistYaleW2.csv'),
)
for column, path in single_column_exports:
    dfappend[column].to_csv(path, index=False, header=True)