In [1]:
#Members only

#Throughout the course so far, you've been exposed to some common problems that you may encounter with your data, from data type
#constraints, data range constraints, uniqueness constraints, and now membership constraints for categorical values.

#In this exercise, you will map hypothetical problems to their respective categories.

#Drag the items into the correct bucket

#Membership Constraint                                Other Constraint

#A month column with the value 14.                    An age column with values above 130.
#A has_loan column with the value 12.                 A revenue column represented as a string.
#A day_of_week column with the value Suntermonday.    A birthdate column with values in the future.
#A GPA column containing a Z- grade.

In [2]:
#Finding consistency

import pandas as pd
# Load the survey responses, using the first CSV column as the row index.
airlines = pd.read_csv('datasets/airlines_final.csv', index_col=0)

# Overwrite the cleanliness answer of three rows with a value that is not part of
# the reference categories, so the exercise has an inconsistent category to find.
for row_label in (4, 18, 100):
    airlines.loc[row_label, 'cleanliness'] = 'Unacceptable'

# Store each survey answer column with the categorical dtype.
for survey_col in ('cleanliness', 'safety', 'satisfaction'):
    airlines[survey_col] = airlines[survey_col].astype('category')

# Reference table read from datasets/categories.csv.
categories = pd.read_csv('datasets/categories.csv')

#In this exercise and throughout this chapter, you'll be working with the airlines DataFrame which contains survey responses on
#the San Francisco Airport from airline customers.

#The DataFrame contains flight metadata such as the airline, the destination, waiting times as well as answers to key questions
#regarding cleanliness, safety, and satisfaction. Another DataFrame named categories was created, containing all correct
#possible values for the survey columns.

#In this exercise, you will use both of these DataFrames to find survey answers with inconsistent values, and drop them,
#effectively performing an outer and inner join on both these DataFrames as seen in the video exercise. The pandas package has
#been imported as pd, and the airlines and categories DataFrames are in your environment.

# Show the reference table of allowed answers
print(categories, '\n')

# Show the distinct answers recorded in each survey column of airlines;
# the label is the column name with its first letter capitalised.
for survey_col in ('cleanliness', 'safety', 'satisfaction'):
    print(survey_col.capitalize() + ': ', airlines[survey_col].unique(), "\n")

      cleanliness           safety          satisfaction
0           Clean          Neutral        Very satisfied
1         Average        Very safe               Neutral
2  Somewhat clean    Somewhat safe    Somewhat satisfied
3  Somewhat dirty      Very unsafe  Somewhat unsatisfied
4           Dirty  Somewhat unsafe      Very unsatisfied 

Cleanliness:  ['Clean', 'Average', 'Unacceptable', 'Somewhat clean', 'Somewhat dirty', 'Dirty']
Categories (6, object): ['Clean', 'Average', 'Unacceptable', 'Somewhat clean', 'Somewhat dirty', 'Dirty'] 

Safety:  ['Neutral', 'Very safe', 'Somewhat safe', 'Very unsafe', 'Somewhat unsafe']
Categories (5, object): ['Neutral', 'Very safe', 'Somewhat safe', 'Very unsafe', 'Somewhat unsafe'] 

Satisfaction:  ['Very satisfied', 'Neutral', 'Somewhat satsified', 'Somewhat unsatisfied', 'Very unsatisfied']
Categories (5, object): ['Very satisfied', 'Neutral', 'Somewhat satsified', 'Somewhat unsatisfied', 'Very unsatisfied'] 



In [3]:
#Question

#Take a look at the output. Out of the cleanliness, safety and satisfaction columns, which one has an inconsistent category and
#what is it?

#Possible Answers

#cleanliness because it has an Unacceptable category.*

#cleanliness because it has a Terribly dirty category.

#satisfaction because it has a Very satisfied category.

#safety because it has a Neutral category.

In [4]:
# Cleanliness answers that appear in airlines but are absent from categories
cat_clean = set(airlines['cleanliness']) - set(categories['cleanliness'])

# Boolean mask flagging the rows that carry one of those answers
cat_clean_rows = airlines['cleanliness'].isin(cat_clean)

# Show the rows with an inconsistent category
print(airlines.loc[cat_clean_rows], '\n')

# Show only the rows with consistent categories.
# ~ negates the mask, so everything except the inconsistent rows is returned
# (in this case, rows whose "cleanliness" is "Unacceptable" are left out).
print(airlines.loc[~cat_clean_rows])

       id        day           airline  destination  dest_region dest_size  \
4    2992  Wednesday          AMERICAN        MIAMI      East US       Hub   
18   2913     Friday  TURKISH AIRLINES     ISTANBUL  Middle East       Hub   
100  2321  Wednesday         SOUTHWEST  LOS ANGELES      West US       Hub   

    boarding_area   dept_time  wait_min   cleanliness         safety  \
4     Gates 50-59  2018-12-31     559.0  Unacceptable      Very safe   
18   Gates 91-102  2018-12-31     225.0  Unacceptable      Very safe   
100   Gates 20-39  2018-12-31     130.0  Unacceptable  Somewhat safe   

           satisfaction  
4    Somewhat satsified  
18   Somewhat satsified  
100  Somewhat satsified   

        id       day        airline        destination    dest_region  \
0     1351   Tuesday    UNITED INTL             KANSAI           Asia   
1      373    Friday         ALASKA  SAN JOSE DEL CABO  Canada/Mexico   
2     2820  Thursday          DELTA        LOS ANGELES        West US   


In [5]:
#Categories of errors

#In the video exercise, you saw how to address common problems affecting categorical variables in your data, including white
#spaces and inconsistencies in your categories, and the problem of creating new categories and mapping existing ones to new
#ones.

#To get a better idea of the toolkit at your disposal, you will be mapping functions and methods from pandas and Python used to
#address each type of problem.

#Drag the items into the correct bucket

#White spaces and inconsistency    Creating or remapping categories

#.str.strip()                      pandas.cut()
#.str.upper()                      .replace()
#.str.lower()                      pandas.qcut()

In [6]:
#Inconsistent categories

import pandas as pd
# Start again from a fresh copy of the survey data; the first CSV column becomes the row index
airlines = pd.read_csv(
    'datasets/airlines_final.csv',
    index_col=0,
)

#In this exercise, you'll be revisiting the airlines DataFrame from the previous lesson.

#As a reminder, the DataFrame contains flight metadata such as the airline, the destination, waiting times as well as answers to
#key questions regarding cleanliness, safety, and satisfaction on the San Francisco Airport.

#In this exercise, you will examine two categorical columns from this DataFrame, dest_region and dest_size respectively, assess
#how to address them and make sure that they are cleaned and ready for analysis. The pandas package has been imported as pd, and
#the airlines DataFrame is in your environment.

# Collect the distinct values of both columns, then show them
region_values = airlines['dest_region'].unique()
size_values = airlines['dest_size'].unique()
print(region_values, '\n')
print(size_values)

['Asia' 'Canada/Mexico' 'West US' 'East US' 'Midwest US' 'EAST US'
 'Middle East' 'Europe' 'eur' 'Central/South America'
 'Australia/New Zealand' 'middle east'] 

['Hub' 'Small' '    Hub' 'Medium' 'Large' 'Hub     ' '    Small'
 'Medium     ' '    Medium' 'Small     ' '    Large' 'Large     ']


In [7]:
#Question

#From looking at the output, what do you think is the problem with these columns?

#Possible Answers

#The dest_region column has only inconsistent values due to capitalization.

#The dest_region column has inconsistent values due to capitalization and has one value that needs to be remapped.

#The dest_size column has only inconsistent values due to leading and trailing spaces.

#Both 2 and 3 are correct.*

In [8]:
# Lower-case dest_region, then map the abbreviation "eur" to "europe"
airlines['dest_region'] = (
    airlines['dest_region']
    .str.lower()
    .replace({'eur': 'europe'})
)

# Trim leading and trailing white space from `dest_size`
airlines['dest_size'] = airlines['dest_size'].str.strip()

# Confirm that the changes took effect
for column, trailing in (('dest_region', ['\n']), ('dest_size', [])):
    print(airlines[column].unique(), *trailing)

['asia' 'canada/mexico' 'west us' 'east us' 'midwest us' 'middle east'
 'europe' 'central/south america' 'australia/new zealand'] 

['Hub' 'Small' 'Medium' 'Large']


In [9]:
#Remapping categories

import numpy as np
import pandas as pd

# Reload the survey data (first CSV column as the row index) and store the
# day column with the categorical dtype.
airlines = (
    pd.read_csv('datasets/airlines_final.csv', index_col=0)
    .astype({'day': 'category'})
)

#To better understand survey respondents from airlines, you want to find out if there is a relationship between certain
#responses and the day of the week and wait time at the gate.

#The airlines DataFrame contains the day and wait_min columns, which are categorical and numerical respectively. The day column
#contains the exact day a flight took place, and wait_min contains the amount of minutes it took travelers to wait at the gate.
#To make your analysis easier, you want to create two new categorical variables:

#wait_type: 'short' for 0-60 min, 'medium' for 60-180 and 'long' for 180+
#day_week: 'weekday' if day is in the weekday, 'weekend' if day is in the weekend.

#The pandas and numpy packages have been imported as pd and np. Let's create some new categorical data!

# Bin edges for wait_min (in minutes) and the label given to each bin
label_ranges = [0, 60, 180, np.inf]
label_names = ['short', 'medium', 'long']

# Create wait_type column.
# pd.cut builds right-closed bins by default: (0, 60], (60, 180], (180, inf].
# include_lowest=True also closes the first bin on the left, so a wait of
# exactly 0 minutes is labelled 'short' instead of silently becoming NaN.
airlines['wait_type'] = pd.cut(airlines['wait_min'], bins=label_ranges,
                               labels=label_names, include_lowest=True)

# Map every day name to either 'weekday' or 'weekend'
mappings = {'Monday': 'weekday', 'Tuesday': 'weekday', 'Wednesday': 'weekday',
            'Thursday': 'weekday', 'Friday': 'weekday',
            'Saturday': 'weekend', 'Sunday': 'weekend'}

# Create day_week column.
# The day column is cast to the categorical dtype when the data is loaded, and
# calling replace() on categorical data is deprecated in recent pandas. The values
# are therefore replaced on an object-dtype copy; day names missing from
# `mappings` are left unchanged, exactly as before.
airlines['day_week'] = airlines['day'].astype(object).replace(mappings)

In [10]:
# Show the distinct labels produced for each of the two derived columns
wait_type_values = airlines['wait_type'].unique()
day_week_values = airlines['day_week'].unique()
print(wait_type_values, '\n')
print(day_week_values)

['medium', 'long', 'short']
Categories (3, object): ['short' < 'medium' < 'long'] 

['weekday' 'weekend']


In [11]:
# Check that exactly the expected labels were produced.
# unique() lists values in order of first appearance, which depends on the row
# order of the data, so the labels are compared as sets rather than ordered lists.
assert set(airlines['wait_type'].unique()) == {'short', 'medium', 'long'}
assert set(airlines['day_week'].unique()) == {'weekday', 'weekend'}

In [12]:
#Removing titles and taking names

import pandas as pd
# Survey data that includes the respondents' full_name column
airlines = pd.read_csv(
    'datasets/airlines_full_name.csv',
)

#While collecting survey respondent metadata in the airlines DataFrame, the full name of respondents was saved in the full_name
#column. However upon closer inspection, you found that a lot of the different names are prefixed by honorifics such as "Dr.",
#"Mr.", "Ms." and "Miss".

#Your ultimate objective is to create two new columns named first_name and last_name, containing the first and last names of
#respondents respectively. Before doing so however, you need to remove honorifics.

#The airlines DataFrame is in your environment, alongside pandas as pd.

# Honorifics to strip from full_name, in the order they are removed
honorifics = ("Dr.", "Mr.", "Miss", "Ms.")

# Replace every honorific with the empty string "".
# regex=False makes the match literal: read as a regular expression, the "." would
# match any character, so "Dr." would also remove the start of a name such as "Drew".
# Being explicit also keeps the result identical across pandas versions, since the
# default of `regex` differs between them.
for honorific in honorifics:
    airlines['full_name'] = airlines['full_name'].str.replace(honorific, "", regex=False)

# Assert that full_name has no honorifics (literal match, for the same reason)
assert not any(
    airlines['full_name'].str.contains(honorific, regex=False).any()
    for honorific in honorifics
)

  app.launch_new_instance()


In [13]:
#Keeping it descriptive

import pandas as pd
# The survey response file uses ';' as its field separator
airlines = pd.read_csv(
    'datasets/airlines_survey_response.csv',
    sep=';',
)

#To further understand travelers' experiences in the San Francisco Airport, the quality assurance department sent out a
#qualitative questionnaire to all travelers who gave the airport the worst score on all possible categories. The objective
#behind this questionnaire is to identify common patterns in what travelers are saying about the airport.

#Their response is stored in the survey_response column. Upon a closer look, you realized a few of the answers gave the shortest
#possible character amount without much substance. In this exercise, you will isolate the responses with a character count
#higher than 40, and make sure your new DataFrame contains only responses longer than 40 characters using an assert statement.

#The airlines DataFrame is in your environment, and pandas is imported as pd.

# Character count of every entry in the survey_response column
resp_length = airlines['survey_response'].str.len()

# Keep only the rows whose response is longer than 40 characters
is_long_enough = resp_length > 40
airlines_survey = airlines[is_long_enough]

# Assert that even the shortest remaining response is longer than 40 characters
shortest_kept = airlines_survey['survey_response'].str.len().min()
assert shortest_kept > 40

# Print the survey_response column of the filtered DataFrame
print(airlines_survey['survey_response'])

18    The airport personnell forgot to alert us of d...
19    The food in the airport was really really expe...
20    One of the other travelers was really loud and...
21    I don't remember answering the survey with the...
22    The airport personnel kept ignoring my request...
23    The chair I sat in was extremely uncomfortable...
24    I wish you were more like other airports, the ...
25    I was really unsatisfied with the wait times b...
27    The flight was okay, but I didn't really like ...
28    We were really slowed down by security measure...
29    There was a spill on the aisle next to the bat...
30    I felt very unsatisfied by how long the flight...
Name: survey_response, dtype: object
