## Membership constraints


In [1]:
import pandas as pd
import numpy as np

### Finding consistency

In [3]:
airlines = pd.read_csv('./dataset/airlines_new.csv')
categories = pd.DataFrame(
    {
        'cleanliness': ['Clean', 'Average', 'Somewhat clean', 'Somewhat dirty', 'Dirty'],
        'safety': ['Neutral', 'Very safe', 'Somewhat safe', 'Very unsafe', 'Somewhat unsafe'],
        'satisfaction': ['Very satisfied', 'Neutral', 'Somewhat satisfied', 'Somewhat unsatisfied', 'Very unsatisfied']
    }
)

airlines = airlines.drop(labels=['Unnamed: 0'], axis=1)

In [4]:
airlines.head()

Unnamed: 0,id,full_name,day,airline,destination,dest_region,dest_size,boarding_area,dept_time,wait_min,cleanliness,safety,satisfaction
0,1351,Melodie Stuart,Tuesday,UNITED INTL,KANSAI,Asia,Hub,Gates 91-102,2018-12-31,115.0,Clean,Neutral,Very satisfied
1,373,Dominic Shannon,Friday,ALASKA,SAN JOSE DEL CABO,Canada/Mexico,Small,Gates 50-59,2018-12-31,135.0,Clean,Very safe,Very satisfied
2,2820,Quintessa Tillman,Thursday,DELTA,LOS ANGELES,West US,Hub,Gates 40-48,2018-12-31,70.0,Average,Somewhat safe,Neutral
3,1157,Dr. Christine Nicholson,Tuesday,SOUTHWEST,LOS ANGELES,West US,Hub,Gates 20-39,2018-12-31,190.0,Clean,Very safe,Somewhat satsified
4,2992,Regina Clements,Wednesday,AMERICAN,MIAMI,East US,Hub,Gates 50-59,2018-12-31,559.0,Somewhat clean,Very safe,Somewhat satsified


In [5]:
# Print categories DataFrame
print(categories)

# Print unique values of survey columns in airlines
print('Cleanliness: ', airlines['cleanliness'].unique(), "\n")
print('Safety: ', airlines['safety'].unique(), '\n')
print('Satisfaction: ', airlines['satisfaction'].unique(), '\n')

      cleanliness           safety          satisfaction
0           Clean          Neutral        Very satisfied
1         Average        Very safe               Neutral
2  Somewhat clean    Somewhat safe    Somewhat satisfied
3  Somewhat dirty      Very unsafe  Somewhat unsatisfied
4           Dirty  Somewhat unsafe      Very unsatisfied
Cleanliness:  ['Clean' 'Average' 'Somewhat clean' 'Somewhat dirty'] 

Safety:  ['Neutral' 'Very safe' 'Somewhat safe' 'Very unsafe'] 

Satisfaction:  ['Very satisfied' 'Neutral' 'Somewhat satsified' 'Somewhat unsatisfied'] 



In [6]:
# Find the cleanliness category in airlines not in categories
cat_clean = set(airlines['cleanliness']).difference(categories['cleanliness'])

# Find rows with that category
cat_clean_rows = airlines['cleanliness'].isin(cat_clean)

# Print rows with inconsistent category
print(airlines[cat_clean_rows])

# Print rows with consistent categories only
print(airlines[~cat_clean_rows])

Empty DataFrame
Columns: [id, full_name, day, airline, destination, dest_region, dest_size, boarding_area, dept_time, wait_min, cleanliness, safety, satisfaction]
Index: []
       id                full_name        day           airline  \
0    1351           Melodie Stuart    Tuesday       UNITED INTL   
1     373          Dominic Shannon     Friday            ALASKA   
2    2820        Quintessa Tillman   Thursday             DELTA   
3    1157  Dr. Christine Nicholson    Tuesday         SOUTHWEST   
4    2992          Regina Clements  Wednesday          AMERICAN   
..    ...                      ...        ...               ...   
195   819         Ms. Vanna Rivera     Sunday            ALASKA   
196  2924          Miss Venus Lowe     Friday  TURKISH AIRLINES   
197  2245          Amethyst Nieves   Thursday         SOUTHWEST   
198   238      Miss Vivian Foreman  Wednesday          AMERICAN   
199  1077       Miss Wendy Griffin     Friday        AIR CANADA   

           destination

## Categorical Variable

### Inconsistent categories

In [7]:
# Print unique values of both columns
print(airlines['dest_region'].unique())
print(airlines['dest_size'].unique())

['Asia' 'Canada/Mexico' 'West US' 'East US' 'Midwest US' 'EAST US'
 'Middle East' 'Europe' 'eur' 'Central/South America'
 'Australia/New Zealand' 'middle east']
['Hub' 'Small' '    Hub' 'Medium' 'Large' 'Hub     ' '    Small'
 'Medium     ' '    Medium']


In [8]:
# Lower dest_region column and then replace "eur" with "europe"
airlines['dest_region'] = airlines['dest_region'].str.lower()
airlines['dest_region'] = airlines['dest_region'].replace({'eur':'europe'})

In [9]:
# Remove white spaces from 'dest_size'
airlines['dest_size'] = airlines['dest_size'].str.strip()

# Verify changes have been effected
print(airlines['dest_region'].unique())
print(airlines['dest_size'].unique())

['asia' 'canada/mexico' 'west us' 'east us' 'midwest us' 'middle east'
 'europe' 'central/south america' 'australia/new zealand']
['Hub' 'Small' 'Medium' 'Large']


### Remapping categories

In [10]:
# Create ranges for categories
label_ranges = [0, 60, 180, np.inf]
label_names = ['short', 'medium', 'long']

# Create wait_type column
airlines['wait_type'] = pd.cut(airlines['wait_min'], 
                               bins=label_ranges,
                               labels=label_names)

# Create mappings and replace
mappings = {
    'Monday': 'weekday',
    'Tuesday': 'weekday',
    'Wednesday': 'weekday',
    'Thursday': 'weekday',
    'Friday': 'weekday',
    'Saturday': 'weekend',
    'Sunday': 'weekend'
}

airlines['day_week'] = airlines['day'].replace(mappings)

## Cleaning text data

### Removing titles and taking names

In [15]:
# Replace 'Dr.' with empty string ''
airlines['full_name'] = airlines['full_name'].str.replace('Dr.', '')

# Replace 'Mr.' with empty string ''
airlines['full_name'] = airlines['full_name'].str.replace('Mr.', '')

# Replace 'Miss' with empty string ''
airlines['full_name'] = airlines['full_name'].str.replace('Miss', '')

# Replace 'Ms.' with empty string ''
airlines['full_name'] = airlines['full_name'].str.replace('Ms.', '')

# Assert that full_name has no honorifics
assert airlines['full_name'].str.contains('Ms.|Mr.|Miss|Dr.').any() == False

### Keeping it descriptive

In [16]:
# Store length of each row in survey_response column
resp_length = airlines['survey_response'].str.len()

# Find rows in airlines where resp_length > 40
airlines_survey = airlines[resp_length > 40]

# Assert minimum survey_response length is > 40
assert airlines_survey['survey_response'].str.len().min() > 40

# Print new survey_response column
print(airlines_survey['survey_response'])

KeyError: 'survey_response'