In [27]:
import pandas as pd
from glob import glob

In [28]:
# Read Data
# Collect every CSV under data/. glob() makes no ordering guarantee (it depends
# on the file system), so sort the paths: the row order -- and therefore the
# continuous index built by ignore_index=True below -- is then the same on
# every run and every machine.
# https://stackoverflow.com/questions/65132425/how-to-read-all-csv-files-in-a-folder-in-pandas
files = sorted(glob('data/*.csv'))
if not files:
    # Fail early with a clear message; otherwise pd.concat raises the opaque
    # "No objects to concatenate".
    raise FileNotFoundError("No CSV files found matching 'data/*.csv'")

# index_col=False forces pandas not to use the first column as the index.
data = [pd.read_csv(path, index_col=False) for path in files]

# Ignore index to have continuous index after concatenation.
# https://stackoverflow.com/questions/35528119/pandas-recalculate-index-after-a-concatenation
uil = pd.concat(data, ignore_index=True)


In [29]:
# Cast each column to an explicit dtype.
# The mapping is column name -> target dtype and is applied in one astype() call.
# NOTE(review): int8 only holds -128..127 -- confirm 'place' never exceeds 127.
dtypes = dict(
    contestant='string',
    school='category',
    score='float32',
    place='int8',
    points='float32',
    medal='category',
    advance='category',
    meet_level='category',
    level_num='category',
    year='int16',
    event='category',
    conference='category',
)
uil = uil.astype(dtypes)

In [30]:
# Summarise the frame: per-column non-null counts, dtypes and total memory usage.
uil.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 434189 entries, 0 to 434188
Data columns (total 12 columns):
 #   Column      Non-Null Count   Dtype   
---  ------      --------------   -----   
 0   contestant  434189 non-null  string  
 1   school      434189 non-null  category
 2   score       434189 non-null  float32 
 3   place       434189 non-null  int8    
 4   points      106895 non-null  float32 
 5   medal       106683 non-null  category
 6   advance     70765 non-null   category
 7   meet_level  434189 non-null  category
 8   level_num   434189 non-null  category
 9   year        434189 non-null  int16   
 10  event       434189 non-null  category
 11  conference  434189 non-null  category
dtypes: category(7), float32(2), int16(1), int8(1), string(1)
memory usage: 11.3 MB


### Check for Duplicate Rows

In [35]:
# duplicated() already returns a boolean mask, so it can index the frame
# directly (no "== True" needed). With the default keep='first', the first
# occurrence of each repeated row is not flagged; only the later copies appear.
uil[uil.duplicated()]

Unnamed: 0,contestant,school,score,place,points,medal,advance,meet_level,level_num,year,event,conference
6918,"Staten, Stephanie","Lit Cypress-Mrceville H S, Orange",6.0,20,,,,D,20,2004,NUM,4A
6919,"Staten, Stephanie","Lit Cypress-Mrceville H S, Orange",6.0,20,,,,D,20,2004,NUM,4A
15210,"Trevino, Steve","Moody H S, Corpus Christi",4.0,31,,,,D,29,2004,SCI,5A
22345,"Trevino, Jimmy","Olton H S, Olton",0.0,13,,,,R,1,2005,CSC,2A
39882,"Wright, Cameron","Falls City H S, Falls City",0.0,27,,,,D,30,2006,NUM,1A
74304,",",",",0.0,16,,,,D,29,2007,NUM,4A
76413,"Jesus, Saucedo","Del Valle H S, El Paso",0.0,12,,,,D,2,2007,CSC,4A
90565,"Orwig, Kaitlyn","Valley Mills HS, Valley Mills",0.0,24,,,,D,25,2008,MTH,1A
146391,"Sanders, Brittany","Mabank HS, Mabank",10.0,10,,,,D,15,2010,NUM,4A
157195,"Capps, Rachel","Vernon H S, Vernon",26.0,32,,,,R,1,2010,SCI,3A


### Check for Missing Values

In [25]:
# Count the missing entries in each column.
missing_per_column = uil.isna().sum()
missing_per_column

contestant         0
school             0
score              0
place              0
points        327294
medal         327506
advance       363424
meet_level         0
level_num          0
year               0
event              0
conference         0
dtype: int64

#### Points

In [26]:
# isna() already yields a boolean mask; use it directly to select the rows
# whose 'points' value is missing.
uil[uil['points'].isna()]

Unnamed: 0,contestant,school,score,place,points,medal,advance,meet_level,level_num,year,event,conference
6,"Taliaferro, Julianna","Stratford H S, Stratford",58.0,7,,,,D,2,2004,NUM,1A
7,"Johnson, Lucas","Wellington H S, Wellington",54.0,8,,,,D,2,2004,NUM,1A
8,"Hampton, Heather","Wheeler H S, Wheeler",54.0,8,,,,D,2,2004,NUM,1A
9,"Morris, Shay","Wellington H S, Wellington",53.0,10,,,,D,2,2004,NUM,1A
10,"Parsley, Andrew","White Deer H S, White Deer",52.0,11,,,,D,2,2004,NUM,1A
...,...,...,...,...,...,...,...,...,...,...,...,...
434184,"Becker, Michael","Midway H S, Waco",86.0,30,,,,S,1,2022,SCI,6A
434185,"Wang, Richard","Clear Lake HS, Houston",80.0,31,,,,S,1,2022,SCI,6A
434186,"Homiller, David","Vista Ridge H S, Cedar Park",80.0,31,,,,S,1,2022,SCI,6A
434187,"Anderson, Caroline","Central HS, San Angelo",62.0,33,,,,S,1,2022,SCI,6A
