In [1]:
# Import dependencies
import pandas as pd

In [2]:
# Load the historical contestant data. pandas labels a header-less CSV
# column 'Unnamed: 0'; rename that column to 'result' so it can be used
# as each contestant's placement.
df = (
    pd.read_csv('resources/historical_data.csv')
      .rename(columns={'Unnamed: 0': 'result'})
)

In [3]:
# Combine Job and Occupation into a single column: wherever Occupation
# is null, take the value from Job on the same row.
df = df.assign(Occupation=df['Occupation'].fillna(df['Job']))

df.head()

Unnamed: 0,result,Name,Age,Hometown,Job,Eliminated,season,Occupation,Outcome,Place,Ref,Arrived
0,0,Amanda Marsh,23.0,"Chanute, Kansas",Event Planner,Winner,1,Event Planner,,,,
1,1,Trista Rehn,29.0,"St. Louis, Missouri",Miami Heat Dancer,Runner-up,1,Miami Heat Dancer,,,,
2,2,Shannon Oliver,24.0,"Dallas, Texas",Financial Management Consultant,Week 5,1,Financial Management Consultant,,,,
3,3,Kimberly Karels,24.0,"Tempe, Arizona",Nanny,Week 4,1,Nanny,,,,
4,4,Cathy Grimes,22.0,"Terre Haute, Indiana",Graduate Student,Week 3,1,Graduate Student,,,,


In [4]:
# Keep only the six columns listed below (everything else is discarded)
# and lowercase the capitalised headers; 'season' and 'result' are
# already lowercase so they need no mapping.
df = (
    df.loc[:, ['Name', 'Age', 'season', 'Hometown', 'Occupation', 'result']]
      .rename(columns={
          'Name': 'name',
          'Age': 'age',
          'Hometown': 'hometown',
          'Occupation': 'occupation',
      })
)

# Check the dtypes and non-null counts to see what needs converting
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 612 entries, 0 to 611
Data columns (total 6 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   name        612 non-null    object
 1   age         611 non-null    object
 2   season      612 non-null    int64 
 3   hometown    612 non-null    object
 4   occupation  612 non-null    object
 5   result      612 non-null    int64 
dtypes: int64(2), object(4)
memory usage: 28.8+ KB


In [6]:
# Fix the column dtypes:
#   age    -> numeric; errors='coerce' turns any value that cannot be
#             parsed as a number into NaN instead of raising
#   season -> string
df = df.assign(
    age=pd.to_numeric(df['age'], errors='coerce'),
    season=df['season'].astype(str),
)

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 612 entries, 0 to 611
Data columns (total 6 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   name        612 non-null    object 
 1   age         608 non-null    float64
 2   season      612 non-null    object 
 3   hometown    612 non-null    object 
 4   occupation  612 non-null    object 
 5   result      612 non-null    int64  
dtypes: float64(1), int64(1), object(4)
memory usage: 28.8+ KB


In [7]:
# The numeric conversion raised the number of null ages (611 non-null
# before, 608 after), so pull out every row whose age is now NaN to see
# which entries were affected.
null_df = df.loc[df['age'].isnull()]

null_df

Unnamed: 0,name,age,season,hometown,occupation,result
42,Cosetta Blanca,,9,Italy,Dancer,17
408,Maquel Cooper,,22,(Returned to competition),(Returned to competition),15
439,Alayah Benavidez,,24,(Returned to competition),(Returned to competition),16
494,Susie Evans,,26,(Returned to competition),(Returned to competition),3


In [8]:
# Drop the rows whose age is missing. The '(Returned to competition)'
# rows are duplicate entries and are safe to remove. Cosetta Blanca is
# NOT a duplicate; she is dropped as well only because her age is
# unknown (sorry).
# Restricting dropna to 'age' makes the intent explicit and keeps this
# step from silently discarding rows that have a null in another column.
df = df.dropna(subset=['age'])

df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 608 entries, 0 to 611
Data columns (total 6 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   name        608 non-null    object 
 1   age         608 non-null    float64
 2   season      608 non-null    object 
 3   hometown    608 non-null    object 
 4   occupation  608 non-null    object 
 5   result      608 non-null    int64  
dtypes: float64(1), int64(1), object(4)
memory usage: 33.2+ KB
