In [1]:
# Import dependencies
import pandas as pd

In [2]:
# Load the historical contestant data. The CSV's leading column is read in
# under the label 'Unnamed: 0'; it is relabeled 'result' and used as each
# contestant's placement.
df = (
    pd.read_csv('resources/historical_data.csv')
    .rename(columns={'Unnamed: 0': 'result'})
)

In [3]:
# Consolidate the two job-title columns: wherever 'Occupation' is missing,
# fall back to the value recorded in 'Job'.
job_fallback = df['Job']
df['Occupation'] = df['Occupation'].fillna(value=job_fallback)

df.head()

Unnamed: 0,result,Name,Age,Hometown,Job,Eliminated,season,Occupation,Outcome,Place,Ref,Arrived
0,0,Amanda Marsh,23.0,"Chanute, Kansas",Event Planner,Winner,1,Event Planner,,,,
1,1,Trista Rehn,29.0,"St. Louis, Missouri",Miami Heat Dancer,Runner-up,1,Miami Heat Dancer,,,,
2,2,Shannon Oliver,24.0,"Dallas, Texas",Financial Management Consultant,Week 5,1,Financial Management Consultant,,,,
3,3,Kimberly Karels,24.0,"Tempe, Arizona",Nanny,Week 4,1,Nanny,,,,
4,4,Cathy Grimes,22.0,"Terre Haute, Indiana",Graduate Student,Week 3,1,Graduate Student,,,,


In [4]:
# Keep only the six columns used from here on and lowercase their labels
# ('season' and 'result' are already lowercase, so they need no mapping).
kept_columns = ['Name', 'Age', 'season', 'Hometown', 'Occupation', 'result']
lowercase_labels = {
    'Name': 'name',
    'Age': 'age',
    'Hometown': 'hometown',
    'Occupation': 'occupation',
}
df = df[kept_columns].rename(columns=lowercase_labels)

# Check dtypes and non-null counts to see which columns need converting
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 612 entries, 0 to 611
Data columns (total 6 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   name        612 non-null    object
 1   age         611 non-null    object
 2   season      612 non-null    int64 
 3   hometown    612 non-null    object
 4   occupation  612 non-null    object
 5   result      612 non-null    int64 
dtypes: int64(2), object(4)
memory usage: 28.8+ KB


In [5]:
# Convert the datatypes:
#   - season is cast to str
#   - age is parsed as a number; errors='coerce' turns any entry that cannot
#     be parsed into NaN instead of raising
season_as_text = df['season'].astype(str)
age_as_number = pd.to_numeric(df['age'], errors='coerce')
df['season'] = season_as_text
df['age'] = age_as_number

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 612 entries, 0 to 611
Data columns (total 6 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   name        612 non-null    object 
 1   age         608 non-null    float64
 2   season      612 non-null    object 
 3   hometown    612 non-null    object 
 4   occupation  612 non-null    object 
 5   result      612 non-null    int64  
dtypes: float64(1), int64(1), object(4)
memory usage: 28.8+ KB


In [6]:
# Pull out the rows whose age is NaN after the numeric conversion so the
# newly introduced nulls can be inspected.
age_is_missing = df['age'].isna()
null_df = df[age_is_missing]

null_df

Unnamed: 0,name,age,season,hometown,occupation,result
42,Cosetta Blanca,,9,Italy,Dancer,17
408,Maquel Cooper,,22,(Returned to competition),(Returned to competition),15
439,Alayah Benavidez,,24,(Returned to competition),(Returned to competition),16
494,Susie Evans,,26,(Returned to competition),(Returned to competition),3


In [7]:
# Drop the rows that have no usable age. The '(Returned to competition)' rows
# are duplicates and can be dropped; Cosetta is not a duplicate, but she is
# dropped along with them because her age is missing (sorry).
# subset=['age'] limits the drop to the column that was investigated, so a
# null in any other column is not silently discarded along with these rows.
df = df.dropna(subset=['age'])

df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 608 entries, 0 to 611
Data columns (total 6 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   name        608 non-null    object 
 1   age         608 non-null    float64
 2   season      608 non-null    object 
 3   hometown    608 non-null    object 
 4   occupation  608 non-null    object 
 5   result      608 non-null    int64  
dtypes: float64(1), int64(1), object(4)
memory usage: 33.2+ KB


In [8]:
# Look at the first 50 rows to spot what still needs cleaning
df.head(n=50)

Unnamed: 0,name,age,season,hometown,occupation,result
0,Amanda Marsh,23.0,1,"Chanute, Kansas",Event Planner,0
1,Trista Rehn,29.0,1,"St. Louis, Missouri",Miami Heat Dancer,1
2,Shannon Oliver,24.0,1,"Dallas, Texas",Financial Management Consultant,2
3,Kimberly Karels,24.0,1,"Tempe, Arizona",Nanny,3
4,Cathy Grimes,22.0,1,"Terre Haute, Indiana",Graduate Student,4
5,Christina Stencil,28.0,1,"Bonita, California",Attorney,5
6,LaNease Adams,23.0,1,"Playa Del Rey, California",Actress,6
7,Rhonda Rittenhouse,28.0,1,"Woodward, Oklahoma",Commercial Real Estate Agent,7
8,Alexa Jurgielewicz,27.0,1,"Beverly Hills, California",Special Ed. Teacher,8
9,Amy Anzel,28.0,1,"Yonkers, New York",Production Coordinator,9


In [9]:
# Remove the bracketed numeric link references (e.g. "[12]") from the names.
# regex=True is passed explicitly: `pattern` is a regular expression, and
# without the flag pandas >= 2.0 treats it as a literal string and removes
# nothing (older versions emit a FutureWarning about the changing default).
pattern = r'\[\d+\]'
df['name'] = df['name'].str.replace(pattern, '', regex=True)

df

  df['name'] = df['name'].str.replace(pattern, '')


Unnamed: 0,name,age,season,hometown,occupation,result
0,Amanda Marsh,23.0,1,"Chanute, Kansas",Event Planner,0
1,Trista Rehn,29.0,1,"St. Louis, Missouri",Miami Heat Dancer,1
2,Shannon Oliver,24.0,1,"Dallas, Texas",Financial Management Consultant,2
3,Kimberly Karels,24.0,1,"Tempe, Arizona",Nanny,3
4,Cathy Grimes,22.0,1,"Terre Haute, Indiana",Graduate Student,4
...,...,...,...,...,...,...
607,Erin Landry,28.0,23,"Plano, Texas",Cinderella,25
608,"Adrianne ""Jane"" Averbukh",26.0,23,"West Hollywood, California",Social Worker,26
609,Laura Pellerito,26.0,23,"Dallas, Texas",Accountant,27
610,Revian Chang,24.0,23,"Plano, Texas",Nurse,28


In [11]:
# Split the hometown on commas into city, state, and a third 'other' piece
# (only populated when a hometown has three comma-separated parts).
df[['city', 'state', 'other']] = df['hometown'].str.split(',', expand=True)

# Splitting on ',' leaves the space that follows each comma attached to the
# next piece (e.g. ' Kansas'), so trim city and state. 'other' is left exactly
# as split, leading space included.
df['city'] = df['city'].str.strip()
df['state'] = df['state'].str.strip()

df.head()

Unnamed: 0,name,age,season,hometown,occupation,result,city,state,other
0,Amanda Marsh,23.0,1,"Chanute, Kansas",Event Planner,0,Chanute,Kansas,
1,Trista Rehn,29.0,1,"St. Louis, Missouri",Miami Heat Dancer,1,St. Louis,Missouri,
2,Shannon Oliver,24.0,1,"Dallas, Texas",Financial Management Consultant,2,Dallas,Texas,
3,Kimberly Karels,24.0,1,"Tempe, Arizona",Nanny,3,Tempe,Arizona,
4,Cathy Grimes,22.0,1,"Terre Haute, Indiana",Graduate Student,4,Terre Haute,Indiana,


In [15]:
# Inspect the unique values in 'other'
# (this is not the cell's last expression, so the notebook does not show it)
df['other'].unique()

# Build a DataFrame of the rows whose 'other' piece is ' Illinois'
# (the value carries a leading space) so they can be viewed
other_is_illinois = df['other'] == ' Illinois'
x = df[other_is_illinois]

x

Unnamed: 0,name,age,season,hometown,occupation,result,city,state,other
37,April Jacobs,23.0,9,"Hyde Park, Chicago, Illinois",Model,12,Hyde Park,Chicago,Illinois


In [19]:
# Row 37's hometown has three parts ('Hyde Park, Chicago, Illinois'), so the
# split put the neighborhood in city and the city in state. Overwrite the
# hometown, city and state for that row with the Chicago / Illinois values.
df.loc[37, ['hometown', 'city', 'state']] = [
    'Chicago, Illinois',
    'Chicago',
    'Illinois',
]

In [23]:
# Remove the 'other' column from the frame in place
df.drop(labels='other', axis='columns', inplace=True)

In [29]:
# Break the name on single spaces into three pieces, the same way hometown
# was split. 'x' temporarily holds the middle piece.
name_pieces = df['name'].str.split(' ', expand=True)
df[['first', 'x', 'last']] = name_pieces

In [32]:
# Where a name split into only two pieces, 'last' is empty and the second
# piece sits in 'x', so copy it into 'last'. Where there were three pieces,
# 'last' is already set and the middle piece in 'x' (e.g. a quoted nickname)
# is discarded when the helper column is dropped.
middle_piece = df['x']
df['last'] = df['last'].fillna(value=middle_piece)
df.drop(labels='x', axis='columns', inplace=True)

In [34]:
# Show the first five rows of the frame with the new columns
df.head(n=5)

Unnamed: 0,name,age,season,hometown,occupation,result,city,state,first,last
0,Amanda Marsh,23.0,1,"Chanute, Kansas",Event Planner,0,Chanute,Kansas,Amanda,Marsh
1,Trista Rehn,29.0,1,"St. Louis, Missouri",Miami Heat Dancer,1,St. Louis,Missouri,Trista,Rehn
2,Shannon Oliver,24.0,1,"Dallas, Texas",Financial Management Consultant,2,Dallas,Texas,Shannon,Oliver
3,Kimberly Karels,24.0,1,"Tempe, Arizona",Nanny,3,Tempe,Arizona,Kimberly,Karels
4,Cathy Grimes,22.0,1,"Terre Haute, Indiana",Graduate Student,4,Terre Haute,Indiana,Cathy,Grimes
