In [46]:
import re
from pathlib import Path

import pandas as pd

# Load the raw airline-review export and display it for a first look at the
# available columns.
df = pd.read_csv("data/airline.csv")
df

Unnamed: 0,airline_name,link,title,author,author_country,date,content,aircraft,type_traveller,cabin_flown,route,overall_rating,seat_comfort_rating,cabin_staff_rating,food_beverages_rating,inflight_entertainment_rating,ground_service_rating,wifi_connectivity_rating,value_money_rating,recommended
0,adria-airways,/airline-reviews/adria-airways,Adria Airways customer review,D Ito,Germany,2015-04-10,Outbound flight FRA/PRN A319. 2 hours 10 min f...,,,Economy,,7.0,4.0,4.0,4.0,0.0,,,4.0,1
1,adria-airways,/airline-reviews/adria-airways,Adria Airways customer review,Ron Kuhlmann,United States,2015-01-05,Two short hops ZRH-LJU and LJU-VIE. Very fast ...,,,Business Class,,10.0,4.0,5.0,4.0,1.0,,,5.0,1
2,adria-airways,/airline-reviews/adria-airways,Adria Airways customer review,E Albin,Switzerland,2014-09-14,Flew Zurich-Ljubljana on JP365 newish CRJ900. ...,,,Economy,,9.0,5.0,5.0,4.0,0.0,,,5.0,1
3,adria-airways,/airline-reviews/adria-airways,Adria Airways customer review,Tercon Bojan,Singapore,2014-09-06,Adria serves this 100 min flight from Ljubljan...,,,Business Class,,8.0,4.0,4.0,3.0,1.0,,,4.0,1
4,adria-airways,/airline-reviews/adria-airways,Adria Airways customer review,L James,Poland,2014-06-16,WAW-SKJ Economy. No free snacks or drinks on t...,,,Economy,,4.0,4.0,2.0,1.0,2.0,,,2.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
41391,wizz-air,/airline-reviews/wizz-air,Wizz Air customer review,A Bland,United Kingdom,2010-07-15,This airline is terrible! Timetable changes (m...,,,Economy,,,,,,,,,1.0,0
41392,wizz-air,/airline-reviews/wizz-air,Wizz Air customer review,S Scoots,Belgium,2010-06-23,We often fly with Wizzair to/from Charleroi/Bu...,,,Economy,,8.0,,,,,,,4.0,1
41393,wizz-air,/airline-reviews/wizz-air,Wizz Air customer review,C O'Keeffe,Ireland,2010-06-19,Avoid Wizzair! A group of us had our outgoing ...,,,Economy,,,,,,,,,1.0,0
41394,wizz-air,/airline-reviews/wizz-air,Wizz Air customer review,L Jahoda,Czech Republic,2010-06-16,PRG-LTN and LTN-PRG were rather good flights. ...,,,Economy,,5.0,,,,,,,4.0,1


In [15]:
# Keep only the review text, the two categorical descriptors, and the target
# rating.
col1 = ['content', 'type_traveller', 'cabin_flown', 'overall_rating']

# Built as a single chain so `df` is a fresh frame rather than a slice of the
# raw data (column assignment on a slice can raise SettingWithCopyWarning):
#   * rows missing the rating or the review text are dropped;
#   * missing traveller type / cabin are labelled "Unknown";
#   * the index is reset to 0..n-1 so row labels match row positions. The
#     frame is later written with index=False and read back, and anything
#     derived from it must still line up row-for-row after that round-trip.
df = (
    pd.read_csv('data/airline.csv')[col1]
    .dropna(subset=['overall_rating', 'content'])
    .fillna({'type_traveller': 'Unknown', 'cabin_flown': 'Unknown'})
    .reset_index(drop=True)
)
# Normalize whitespace and collapse repeated punctuation.
def clean_content(response):
    """Return ``response`` with whitespace normalized and repeated punctuation collapsed.

    Parameters
    ----------
    response : object
        A single cell value. Missing values (anything ``pd.isna`` reports as
        missing, e.g. ``None`` or ``NaN``) are returned unchanged; every other
        value is converted with ``str`` before cleaning.

    Returns
    -------
    object
        The untouched missing value, or a ``str`` in which every run of
        whitespace (including newlines and tabs) is a single space, leading and
        trailing whitespace is stripped, and every run of the *same*
        punctuation mark from ``.!?,:;`` is reduced to one occurrence.
    """
    if pd.isna(response):
        return response
    # `\s` already matches newlines, so one pass covers line breaks too.
    text = re.sub(r'\s+', ' ', str(response)).strip()
    # The backreference restricts the collapse to identical repeated marks
    # ("!!!" -> "!"). Mixed sequences such as "etc.," or "?!" are legitimate
    # punctuation and are left intact.
    return re.sub(r'([.!?,:;])\1+', r'\1', text)

# Run the text cleaner over the free-text column and both categorical columns.
for text_col in ('content', 'type_traveller', 'cabin_flown'):
    df[text_col] = df[text_col].apply(clean_content)

# Persist the cleaned frame (row labels are not written).
df.to_csv('airline_cleaned.csv', index=False)

In [20]:
# One-hot encode the two categorical columns (traveller type and cabin flown)
# into integer 0/1 indicator columns named "traveler_<value>" / "cabin_<value>".
d_vars = pd.get_dummies(
    df[['type_traveller', 'cabin_flown']],
    columns=['type_traveller', 'cabin_flown'],
    prefix={'type_traveller': 'traveler', 'cabin_flown': 'cabin'},
    dtype=int,
)

# Persist the indicator columns on their own.
d_vars.to_csv('airline_dummies_only.csv', index=False)

In [25]:
# Complete cleaned data with the dummy columns appended.
df2 = pd.read_csv('airline_cleaned.csv')
new = df2.copy()

# pd.concat(axis=1) pairs rows by index label, not by position. df2 comes
# straight from read_csv and so has a fresh 0..n-1 index, while d_vars may
# still carry the row labels of the frame it was built from (non-contiguous
# once rows have been dropped). Resetting the dummy index makes the join
# positional; without it rows are mispaired and padded with NaN.
dummies_aligned = d_vars.reset_index(drop=True)
if len(new) != len(dummies_aligned):
    raise ValueError(
        f'Row count mismatch: airline_cleaned.csv has {len(new)} rows but '
        f'd_vars has {len(dummies_aligned)}; re-run the cleaning and dummy cells.'
    )
combined = pd.concat([new, dummies_aligned], axis=1)

# Persist the combined frame.
combined.to_csv('airline_all_cols.csv', index=False)

In [42]:
# Write one dummy file per possible traveller-type baseline: each file is the
# full dummy frame with exactly one "traveler_<value>" column removed, so that
# value acts as the (n-1) reference category.
cat_traveler = df2['type_traveller'].unique()

# to_csv does not create missing folders, so make sure the target exists.
traveler_dir = Path('bases/Traveler')
traveler_dir.mkdir(parents=True, exist_ok=True)

for base in cat_traveler:
    col2 = f'traveler_{base}'
    # drop() returns a new frame, so d_vars itself is left untouched.
    df3 = d_vars.drop(columns=[col2])
    file = f'bases/Traveler/traveler_base_{col2}.csv'
    df3.to_csv(file, index=False)
    

In [43]:
# Write one dummy file per possible cabin baseline: each file is the full
# dummy frame with exactly one "cabin_<value>" column removed, so that value
# acts as the (n-1) reference category.
cat_cabin = df2['cabin_flown'].unique()

# to_csv does not create missing folders, so make sure the target exists.
cabin_dir = Path('bases/Cabin')
cabin_dir.mkdir(parents=True, exist_ok=True)

for base in cat_cabin:
    col3 = f'cabin_{base}'
    # drop() returns a new frame, so d_vars itself is left untouched.
    df3 = d_vars.drop(columns=[col3])
    file2 = f'bases/Cabin/cabin_base_{col3}.csv'
    df3.to_csv(file2, index=False)