## Python Mini-Project: Data Clean-Up, Pt. 1

Pay close attention to the prompts to help guide you through this task.

In [1]:
# Dependencies
import pandas as pd
import os

In [2]:
# load CSV
filePath = os.path.join('Resources','2016-FCC-New-Coders-Survey-Data.csv')

In [3]:
# Read with pandas
fcc_df = pd.read_csv(filePath, low_memory=False)
fcc_df.head(3)

Unnamed: 0,Age,AttendedBootcamp,BootcampFinish,BootcampFullJobAfter,BootcampLoanYesNo,BootcampMonthsAgo,BootcampName,BootcampPostSalary,BootcampRecommend,ChildrenNumber,...,ResourceSoloLearn,ResourceStackOverflow,ResourceTreehouse,ResourceUdacity,ResourceUdemy,ResourceW3Schools,ResourceYouTube,SchoolDegree,SchoolMajor,StudentDebtOwe
0,28.0,0.0,,,,,,,,,...,,,,,,,,"some college credit, no degree",,20000.0
1,22.0,0.0,,,,,,,,,...,,,,,1.0,,,"some college credit, no degree",,
2,19.0,0.0,,,,,,,,,...,,,,,,,,high school diploma or equivalent (GED),,


In [4]:
# Inspect all columns
len(fcc_df.columns)

113

In [5]:
# Extract only columns 0, 1, 2, 3, 4, 7, 8, 9, 10,11, 29, 30, 32, 36, 37, 45, 48, 56, 110, 111
# Use iloc to accomplish this (remember that you need to take the position of the rows and columns into account)
lim_fcc_df = fcc_df.iloc[:, [0, 1, 2, 3, 4, 7, 8, 9, 10,11, 29, 30, 32, 36, 37, 45, 48, 56, 110, 111]]
print(len(lim_fcc_df.columns))
lim_fcc_df.columns

20


Index(['Age', 'AttendedBootcamp', 'BootcampFinish', 'BootcampFullJobAfter',
       'BootcampLoanYesNo', 'BootcampPostSalary', 'BootcampRecommend',
       'ChildrenNumber', 'CityPopulation', 'CodeEventBootcamp', 'CountryLive',
       'EmploymentField', 'EmploymentStatus', 'Gender', 'HasChildren',
       'HoursLearning', 'Income', 'JobRoleInterest', 'SchoolDegree',
       'SchoolMajor'],
      dtype='object')

In [6]:
# Change "0" to "No" and "1" to "Yes" in response columns
#fixed_fcc_df.replace(to_replace='AttendedBootcamp', value={0.0 :"No", 1.:"Yes"}, inplace=True)
#fixed_fcc_df = lim_fcc_df.replace(to_replace='AttendedBootcamp', value={0.0 :"No", 1.0:"Yes"}, inplace=True, method="None")
fixed_fcc_df = lim_fcc_df.replace({0.0 :"No", 1.0:"Yes"})

#fixed_fcc_df = lim_fcc_df['AttendedBootcamp'].replace({0.0 :"No", 1.:"Yes"})
# Hint use the df.replace function
#fixed_fcc_df.head(3)

In [7]:
# Calculate total number of respondents in survey
# consider using the len() function
total_students = len(fixed_fcc_df)
#summary_df

In [8]:
# Extract rows corresponding only to people who attended a bootcamp
# Filter using df.loc on the AttendedBootcamp column
bc_only_df = fixed_fcc_df.loc[fixed_fcc_df['AttendedBootcamp'] == "Yes"]
#bc_only_df.head(3)

In [9]:
# Calculate average age of attendees
# Pull out the `Age` series and take the average of it
avg_age = round(bc_only_df['Age'].mean())

In [10]:
# Calculate how many people attended a bootcamp
# Count the values of the AttendedBootcamp column
total_at_bc = len(bc_only_df)
total_at_bc

953

In [11]:
# Calculate how many attendees hold degrees
# It is easier to do this with `value_counts() count values for the SchoolDegree column
#print(bc_only_df['SchoolDegree'].unique())
bc_only_df['SchoolDegree'].value_counts()

bachelor's degree                           462
some college credit, no degree              116
master's degree (non-professional)           96
professional degree (MBA, MD, JD, etc.)      39
high school diploma or equivalent (GED)      38
associate's degree                           32
trade, technical, or vocational training     24
some high school                             10
Ph.D.                                         8
no high school (secondary school)             7
Name: SchoolDegree, dtype: int64

In [12]:
count_boots = bc_only_df.replace({
    'Ph.D.' : "Yes","bachelor's degree" : "Yes",
     'professional degree (MBA, MD, JD, etc.)' : "Yes",
     "master's degree (non-professional)" : "Yes","associate's degree" : "Yes"
    })

clean_degree_df = fixed_fcc_df.replace({
    'Ph.D.' : "Yes","bachelor's degree" : "Yes",
     'professional degree (MBA, MD, JD, etc.)' : "Yes",
     "master's degree (non-professional)" : "Yes","associate's degree" : "Yes"
    })

In [13]:
# Count the number of records where the person is a degree holder
# There are several ways to approach this. You can look for people who have degrees
# or for people who don't have degrees depending on the value of the SchoolDegree column
print(f"{len(count_boots.loc[count_boots['SchoolDegree'] == 'Yes'])} of {len(count_boots)} bootcamp students hold higher education degrees.")

637 of 953 bootcamp students hold higher education degrees.


In [14]:
# Count number of attendees who self-identify as male; female; or are of non-binary gender identification
total_bc_students = len(bc_only_df)
bc_male_count = bc_only_df['AttendedBootcamp'].loc[bc_only_df['Gender'] == 'male'].count()
bc_female_count = bc_only_df['AttendedBootcamp'].loc[bc_only_df['Gender'] == 'female'].count()
bc_nb_count = total_bc_students - bc_male_count - bc_female_count
print(f" male: {bc_male_count} female: {bc_female_count} non-binary: {bc_nb_count} of {total_bc_students} total")
bc_only_df['Gender'].unique()


 male: 496 female: 326 non-binary: 131 of 953 total


array(['male', 'female', 'genderqueer', nan, 'trans', 'agender'],
      dtype=object)

In [15]:
# Calculate percentage of respondents who attended a bootcamp
percent_bc = total_bc_students / total_students * 100

print(f"{percent_bc}% of students attended a bootcamp")

6.101152368758003% of students attended a bootcamp


In [16]:
# Calculate percentage of respondents belonging to each gender
filled_fcc_df = pd.DataFrame()
filled_fcc_df = fixed_fcc_df.fillna('na')
total_male = len(filled_fcc_df.loc[filled_fcc_df['Gender'] == 'male'])
total_female = len(filled_fcc_df.loc[filled_fcc_df['Gender'] == 'female'])
total_na = len(filled_fcc_df.loc[filled_fcc_df['Gender'] == 'na'])
total_students = total_students - total_na
total_nb = total_students - total_male - total_female
percent_male = total_male / total_students * 100
percent_female = total_female / total_students * 100
percent_nb = total_nb / total_students * 100
print(f" male: {total_male}({percent_male}) female: {total_female}({percent_female}) non-binary: {total_nb}({percent_nb}) of {total_students} total")
total_na

 male: 10766(78.32096609922885) female: 2840(20.660555798050343) non-binary: 140(1.0184781027207914) of 13746 total


1874

In [17]:
# Calculate percentage of attendees with a college degree
print(f"{len(count_boots.loc[count_boots['SchoolDegree'] == 'Yes'])} of {len(count_boots)} bootcamp students hold higher education degrees.")
total_degree_holders = len(clean_degree_df.loc[clean_degree_df['SchoolDegree'] == 'Yes'])
percent_degree_holders = total_degree_holders / total_students *100

637 of 953 bootcamp students hold higher education degrees.


In [18]:
# Calculate average post-bootcamp salary

mean_post_salary = count_boots['BootcampPostSalary'].mean()
mean_post_salary

63740.50606060606

In [19]:
# Create a new table consolidating above calculations
summary_df = pd.DataFrame({
    'Total Surveyed' : [total_students],
    'Avg. Age' : [avg_age],
    'Total Bootcamp Attendees' : [total_at_bc],
    '% Attended Bootcamp' : [float(percent_bc)],
    '% Male' : [float(percent_male)],
    '% Female' : [float(percent_female)],
    '% Non-Binary Gender' : [float(percent_nb)],
    'Has a Degree' : [percent_degree_holders],
    'Average Post Bootcamp Salary' : [float(mean_post_salary)]
}).round(2)
#avg_age
summary_df

Unnamed: 0,Total Surveyed,Avg. Age,Total Bootcamp Attendees,% Attended Bootcamp,% Male,% Female,% Non-Binary Gender,Has a Degree,Average Post Bootcamp Salary
0,13746,31,953,6.1,78.32,20.66,1.02,62.49,63740.51


In [20]:
# Improve formatting before outputting spreadsheet
summary_df['Average Post Bootcamp Salary'] = summary_df['Average Post Bootcamp Salary'].map("${:,.2f}".format)
summary_df['% Male'] = summary_df['% Male'].map("{:,.2f}%".format)
summary_df['% Female'] = summary_df['% Female'].map("{:,.2f}%".format)
summary_df['% Non-Binary Gender'] = summary_df['% Non-Binary Gender'].map("{:,.2f}%".format)
summary_df['% Attended Bootcamp'] = summary_df['% Attended Bootcamp'].map("{:,.2f}%".format)
summary_df

Unnamed: 0,Total Surveyed,Avg. Age,Total Bootcamp Attendees,% Attended Bootcamp,% Male,% Female,% Non-Binary Gender,Has a Degree,Average Post Bootcamp Salary
0,13746,31,953,6.10%,78.32%,20.66%,1.02%,62.49,"$63,740.51"


In [21]:
# Export to Excel
# Use df.to_excel to export to excel. Don't include the indexes
summary_df.to_excel("./output/bootcamp_summary_report.xls", index=False)
