In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from scipy import sparse
from sklearn.metrics.pairwise import pairwise_distances

%matplotlib inline

In [2]:
df = pd.read_csv('Data/airline_final.csv',parse_dates=['date'],keep_default_na=False)

In [3]:
df.tail()

Unnamed: 0,airline_name,author,date,content,type_traveller,cabin_flown,overall_rating,seat_comfort_rating,cabin_staff_rating,food_beverages_rating,inflight_entertainment_rating,ground_service_rating,value_money_rating,recommended,hubs,country
80515,xl airways france,r dorlen,2017-09-02,miami to paris. so reading most of the other...,solo leisure,economy class,7,3,5,4,1,3,5,1,paris charles de gaulle,france
80516,xl airways france,m lazado,2017-08-31,paris to los angeles. i was skeptical about ...,couple leisure,economy class,10,3,5,5,5,5,5,1,paris charles de gaulle,france
80517,xl airways france,p layvar,2017-08-11,san francisco to paris cdg. worst experience...,family leisure,economy class,1,1,5,1,1,1,1,0,paris charles de gaulle,france
80518,xl airways france,h speccisi,2017-08-11,"paris to san francisco, the aircraft was dir...",couple leisure,economy class,3,1,3,1,1,4,2,0,paris charles de gaulle,france
80519,xl airways france,m ramiro,2017-07-31,san francisco to paris. i was a bit nervous ...,couple leisure,economy class,10,3,5,4,3,5,5,1,paris charles de gaulle,france


In [4]:
# Check shape
df.shape

(80520, 16)

In [5]:
# Check null
df.isnull().sum()

airline_name                     0
author                           0
date                             0
content                          0
type_traveller                   0
cabin_flown                      0
overall_rating                   0
seat_comfort_rating              0
cabin_staff_rating               0
food_beverages_rating            0
inflight_entertainment_rating    0
ground_service_rating            0
value_money_rating               0
recommended                      0
hubs                             0
country                          0
dtype: int64

## Item-Based Collaborative Filtering

- The Airline will be the index
- The Author will be the column
- The Overall Rating will be the Value

### Create Pivot Table

In [6]:
# Using Pivot Table
pivot = pd.pivot_table(df,index="airline_name",columns="author",values="overall_rating")
pivot.head()

author,moam ben-shalom,-,- t mcdonough,a aamir,a abbado,a abdelbadee,a abdo,a abril,a acosta,a adam,...,zunyi xie,zuozhou li,zuqi zhao,zust roland,zuzana daridova,zvi rosen,zy cao,ã–mer yazä±cä±,ã–zgã¼n gã¼ndoäÿdu,ã‰amonn mac an bheatha
airline_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
adria airways,,,,,,,,,,,...,,,,,,,,,,
aegean airlines,,,,,,,,,,,...,,,,,,,,,,
aer lingus,,,,,,,,,,,...,,,,,,,,,,
aerocaribbean,,,,,,,,,,,...,,,,,,,,,,
aeroflot russian airlines,,,,,,,,,,,...,,,,,,4.0,,,,


In [7]:
pivot.shape

(371, 55019)

### Create Sparse Matrix

In [8]:
sparse_pivot = sparse.csr_matrix(pivot.fillna(0))
print(sparse_pivot)

  (0, 840)	5.0
  (0, 903)	2.0
  (0, 1453)	9.0
  (0, 1847)	5.0
  (0, 2763)	8.333333333333334
  (0, 3359)	8.0
  (0, 3531)	7.0
  (0, 4510)	5.0
  (0, 4527)	9.0
  (0, 4739)	1.0
  (0, 4984)	3.0
  (0, 5545)	2.0
  (0, 6489)	8.0
  (0, 8290)	3.0
  (0, 10555)	1.0
  (0, 11133)	7.0
  (0, 11415)	3.0
  (0, 11477)	1.0
  (0, 11674)	1.0
  (0, 13949)	9.0
  (0, 13961)	9.0
  (0, 17373)	3.0
  (0, 17982)	10.0
  (0, 17984)	9.333333333333334
  (0, 18095)	1.0
  :	:
  (367, 49396)	1.0
  (367, 50503)	9.0
  (367, 51615)	8.0
  (367, 53143)	1.0
  (367, 54625)	2.0
  (368, 5723)	6.0
  (368, 24092)	6.0
  (369, 24736)	10.0
  (369, 43845)	7.0
  (369, 44284)	9.0
  (370, 1962)	5.0
  (370, 2027)	6.0
  (370, 6281)	5.0
  (370, 15493)	4.0
  (370, 15628)	3.0
  (370, 20808)	4.0
  (370, 21130)	5.0
  (370, 23841)	4.0
  (370, 31766)	6.0
  (370, 38505)	2.0
  (370, 40387)	5.0
  (370, 44982)	7.0
  (370, 46714)	7.0
  (370, 47665)	2.0
  (370, 54789)	1.0


### Cosine Similarity

In [9]:
recommender = pairwise_distances(sparse_pivot,metric='cosine')

In [10]:
recommender.shape

(371, 371)

### Create Distance Dataframe

In [11]:
recommender_df = pd.DataFrame(recommender,columns=pivot.index,index=pivot.index)
recommender_df.head()

airline_name,adria airways,aegean airlines,aer lingus,aerocaribbean,aeroflot russian airlines,aerogal aerolineas galapagos,aerolineas argentinas,aeromexico,aerosur,afriqiyah airways,...,westjet,wideroe,wings air,wizz air,wow air,xiamen airlines,xl airways france,yakutia airlines,yangon airways,yemenia
airline_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
adria airways,0.0,0.949077,0.987885,1.0,0.978923,1.0,0.983616,0.978396,1.0,1.0,...,1.0,1.0,1.0,0.988961,1.0,1.0,1.0,1.0,1.0,1.0
aegean airlines,0.949077,0.0,0.982295,1.0,0.985923,1.0,0.98881,0.986616,1.0,1.0,...,1.0,1.0,1.0,0.989405,0.984729,0.9887,1.0,1.0,1.0,1.0
aer lingus,0.987885,0.982295,0.0,1.0,0.993144,1.0,0.993086,0.995912,1.0,1.0,...,1.0,1.0,1.0,0.980754,0.998471,0.993536,1.0,1.0,1.0,1.0
aerocaribbean,1.0,1.0,1.0,0.0,0.94609,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
aeroflot russian airlines,0.978923,0.985923,0.993144,0.94609,0.0,1.0,0.994266,0.992623,1.0,1.0,...,1.0,1.0,1.0,0.990583,0.998587,0.970777,0.986437,1.0,1.0,1.0


### Evaluate Recommender Performance

In [12]:
df.head()

Unnamed: 0,airline_name,author,date,content,type_traveller,cabin_flown,overall_rating,seat_comfort_rating,cabin_staff_rating,food_beverages_rating,inflight_entertainment_rating,ground_service_rating,value_money_rating,recommended,hubs,country
0,adria airways,d ito,2015-04-10,outbound flight fra/prn a319. 2 hours 10 min f...,,economy,7,4,4,4,0,4,4,1,pristina international,kosovo
1,adria airways,ron kuhlmann,2015-01-05,two short hops zrh-lju and lju-vie. very fast ...,,business class,10,4,5,4,1,5,5,1,pristina international,kosovo
2,adria airways,e albin,2014-09-14,flew zurich-ljubljana on jp365 newish crj900. ...,,economy,9,5,5,4,0,5,5,1,pristina international,kosovo
3,adria airways,tercon bojan,2014-09-06,adria serves this 100 min flight from ljubljan...,,business class,8,4,4,3,1,4,4,1,pristina international,kosovo
4,adria airways,l james,2014-06-16,waw-skj economy. no free snacks or drinks on t...,,economy,4,4,2,1,2,2,2,0,pristina international,kosovo


In [15]:
search = "singapore airline"

airlines = (df.loc[df['airline_name'].str.contains(search),'airline_name']).unique()

for airline in airlines:
    print(airline)
    print('Average overall rating: ',pivot.loc[airline,:].mean())
    print('Number of review: ',pivot.loc[airline,:].count())
    
    print('')
    print('5 recommended airline: ')
    recommended_airline = recommender_df[airline].sort_values()[1:6]
    print(recommended_airline)
    print('')
    print('************************')

singapore airlines
Average overall rating:  7.334841357926819
Number of review:  901

5 recommended airline: 
airline_name
silkair                   0.876184
cathay pacific airways    0.934092
garuda indonesia          0.935895
qantas airways            0.936182
thai airways              0.940914
Name: singapore airlines, dtype: float64

************************


In [None]:
# SVD = user to airlines
# U = 'User to Concept'
# E = 'Str of concept'
# Vt = 'Movie to Concept'
'movie', 'user','concept'

## Content-based filtering

In [16]:
# Make a copy of df
content_df = df.copy()

In [18]:
content_df.head()

Unnamed: 0,airline_name,author,date,content,type_traveller,cabin_flown,overall_rating,seat_comfort_rating,cabin_staff_rating,food_beverages_rating,inflight_entertainment_rating,ground_service_rating,value_money_rating,recommended,hubs,country
0,adria airways,d ito,2015-04-10,outbound flight fra/prn a319. 2 hours 10 min f...,,economy,7,4,4,4,0,4,4,1,pristina international,kosovo
1,adria airways,ron kuhlmann,2015-01-05,two short hops zrh-lju and lju-vie. very fast ...,,business class,10,4,5,4,1,5,5,1,pristina international,kosovo
2,adria airways,e albin,2014-09-14,flew zurich-ljubljana on jp365 newish crj900. ...,,economy,9,5,5,4,0,5,5,1,pristina international,kosovo
3,adria airways,tercon bojan,2014-09-06,adria serves this 100 min flight from ljubljan...,,business class,8,4,4,3,1,4,4,1,pristina international,kosovo
4,adria airways,l james,2014-06-16,waw-skj economy. no free snacks or drinks on t...,,economy,4,4,2,1,2,2,2,0,pristina international,kosovo


In [19]:
content_df.shape

(80520, 16)

heathrow                                   3524
fort lauderdale hollywood international    3032
o'hare international                       2921
dallas/fort worth international            2918
dublin airport                             2498
                                           ... 
ted stevens anchorage international           1
rosario islas malvinas international          1
halifax stanfield international               1
nantes atlantique                             1
dushanbe international                        1
Name: hubs, Length: 282, dtype: int64