# Construct Dataset from API request - Tourpedia Dataset

In [0]:
import pandas as pd
import numpy as np
import requests
import json

In [0]:
class CONSTRUCT_DATASETS(object):
  """
    Build the places and reviews CSV datasets of one city from the Tourpedia API.

    Attributes set by the constructor:
      city             - city name used in the API query and in the file names
      places_url       - getPlaces API URL for the city's points of interest
      places_filename  - CSV file the places dataset is written to
      reviews_filename - CSV file the reviews dataset is written to
      places_df        - places dataframe (empty until construct_places_dataset runs)
      reviews_df       - reviews dataframe (empty until construct_reviews_dataset runs)
  """

  # Columns kept from the getPlaces response
  PLACES_COLUMNS = ['id', 'originalId', 'name', 'details', 'reviews']
  # Columns kept from each per-place reviews response
  REVIEW_COLUMNS = ['language', 'text', 'time']
  # Seconds to wait for the API before giving up on a request
  REQUEST_TIMEOUT = 30

  def __init__(self, city):
    """
      Store the city and derive the API URL and output file names from it
      city - name of the city as expected by the Tourpedia API (e.g. 'London')
    """
    self.city = city
    self.places_url = 'http://tour-pedia.org/api/getPlaces?location={}&category=poi'.format(city)
    self.places_filename = 'tourpedia_{}_poi.csv'.format(city)
    self.reviews_filename = 'tourpedia_{}_reviews.csv'.format(city)
    self.places_df = pd.DataFrame()
    self.reviews_df = pd.DataFrame()

  def get_api_response(self, url):
    """
      Function to execute an API request and return the response as a dataframe
      url - API request URL
      Returns the tuple (success, df):
        success - True only if the request returned HTTP 200 and the body could
                  be converted to a dataframe
        df - the JSON response as a pandas dataframe (empty when success is False)
    """
    try:
      response = requests.get(url, timeout=self.REQUEST_TIMEOUT)
    except requests.RequestException:
      # Network failure, timeout or malformed URL: report it as an unsuccessful
      # request so that one bad place does not abort a whole crawl
      return False, pd.DataFrame()
    # Check if the API request was successful
    if response.status_code != 200:
      return False, pd.DataFrame()
    try:
      # Load the response as json data and convert it to a pandas dataframe
      df = pd.DataFrame(json.loads(response.text))
    except ValueError:
      # Body is not valid JSON, or not a shape pandas can tabulate
      return False, pd.DataFrame()
    return True, df

  def construct_places_dataset(self):
    """
      Function to construct list of places dataset for the city
      Stores the cleaned dataframe in places_df and saves it to places_filename;
      prints a message and saves nothing when the API returns no data
    """
    success, self.places_df = self.get_api_response(self.places_url)
    if not success or self.places_df.empty:
      print('No Dataset for {} exists!'.format(self.city))
      return
    # Let us clean the dataset
    # We want to select only relevant columns: id, originalId, name, details, reviews
    places = self.places_df[self.PLACES_COLUMNS]
    # Few rows have misaligned values
    # We want to select the rows that only have unique numeric value for the place id
    places = places[places['id'].astype(str).str.isdigit()]
    # Drop the rows that contain null/nan values
    self.places_df = places.dropna()
    # Save the Places dataframe to csv file
    self.places_df.to_csv(self.places_filename, index=False, header=True)

  def construct_reviews_dataset(self):
    """
      Function to construct reviews dataset for all the places of the city
      Requires places_df to be populated (see construct_places_dataset)
      Rebuilds reviews_df from scratch on every call and saves it to
      reviews_filename; prints a message and saves nothing when no review exists
    """
    # Collect the per-place frames and concatenate once at the end
    frames = []
    # Iterate through each row and get the reviews of that place
    for place in self.places_df.itertuples(index=False):
      success, place_reviews = self.get_api_response(place.reviews)
      # Only if API call is success & not empty construct reviews dataframe for that place
      if not success or place_reviews.empty:
        continue
      # Copy so that the columns added below do not write into a slice
      place_reviews = place_reviews[self.REVIEW_COLUMNS].copy()
      place_reviews['id'] = place.id
      place_reviews['originalId'] = place.originalId
      place_reviews['details'] = place.details
      frames.append(place_reviews)
    # Construct a reviews dataframe of all the places
    self.reviews_df = pd.concat(frames) if frames else pd.DataFrame()
    # Check if reviews dataframe is empty
    if self.reviews_df.empty:
      print('No Reviews dataset for {}!'.format(self.city))
    else:
      # Save the Reviews dataframe to the reviews csv file
      self.reviews_df.to_csv(self.reviews_filename, index=False, header=True)

## Construct places dataset for London as "tourpedia_London_poi.csv" using the class CONSTRUCT_DATASETS

In [0]:
# Create the dataset builder for London and fetch its places from the API.
# The builder object is kept in a notebook-level variable because later cells
# reuse it (places_df inspection, reviews construction).
construct_datasets = CONSTRUCT_DATASETS('London')
construct_datasets.construct_places_dataset()

### Open and explore "tourpedia_London_poi.csv" dataset

In [79]:
# Load the places CSV written by construct_places_dataset(); the class names it
# 'tourpedia_<city>_poi.csv', so for London the file is tourpedia_London_poi.csv.
df = pd.read_csv('tourpedia_London_poi.csv')
df.keys()
df.head()
# Only this last expression is displayed as the cell output
df.shape

(2547, 5)

In [80]:
# Shape of the in-memory places dataframe, to compare with the CSV loaded from disk
construct_datasets.places_df.shape

(2547, 5)

#### Check if the places data is clean!

In [86]:
# Sanity checks on the places CSV: no missing values anywhere, and both
# identifier columns hold one distinct value per row.
print('Are there any null values in my dataset now?', df.isnull().values.any())
print('Does id column contain only unique values?', df['id'].is_unique)
print('Does originalId column contain only unique values?', df['originalId'].is_unique)

Are there any null values in my dataset now? False
Does id column contain only unique values? True
Does originalId column contain only unique values? True


## Construct reviews dataset for London as "tourpedia_London_reviews.csv"

In [81]:
# Fetch the reviews of every place in places_df (one API request per place)
# and save them to the reviews CSV; this is the slow step of the notebook.
construct_datasets.construct_reviews_dataset()



### Let us explore the reviews dataset

In [83]:
# Load the reviews CSV written by construct_reviews_dataset() and list its columns
df_rev = pd.read_csv('tourpedia_London_reviews.csv')
df_rev.keys()

Index(['language', 'text', 'time', 'id', 'originalId', 'details'], dtype='object')

In [84]:
# (number of reviews, number of columns)
df_rev.shape

(10934, 6)

In [75]:
# Get all the available language codes for the reviews
# NOTE(review): the recorded output contains the string 'False' among the codes,
# which is not a language code - presumably reviews whose language could not be
# detected upstream; confirm and consider filtering them out.
df_rev['language'].unique()

array(['en', 'ca', 'nl', 'es', 'ja', 'it', 'pt', 'ru', 'af', 'ar', 'ro',
       'fr', 'no', 'ko', 'da', 'lt', 'lv', 'de', 'tr', 'pl', 'vi', 'hr',
       'et', 'eu', 'id', 'so', 'fa', 'False', 'tl', 'bg', 'sv', 'sq',
       'sw', 'cs', 'fi', 'sl', 'hu', 'th', 'zh-cn', 'sk', 'el'],
      dtype=object)

In [94]:
# Number of reviews tagged with language code 'ja'
df_rev[df_rev['language'] == 'ja'].shape

(11, 6)

In [95]:
# Number of reviews tagged with language code 'en'
df_rev[df_rev['language'] == 'en'].shape

(10127, 6)

In [99]:
# Look at the first few reviews tagged 'ca' to judge how reliable the language tag is.
# NOTE(review): the variable is named df_sk although the filter is 'ca'; the
# recorded output shows English text in these rows, so the tag looks unreliable
# for short reviews.
df_sk = df_rev[df_rev['language'] == 'ca'].head()
df_sk['text']

3               Its a pub / restaurant
467                     Urgh... Excel.
707                          EUSTON!!!
2364    AVOID TERMINAL 4 at all costs.
2855                       Great café!
Name: text, dtype: object

In [91]:
import nltk
# Download the WordNet corpus that the next cell reads through nltk.corpus.wordnet
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


True

In [93]:
from nltk.corpus import wordnet 
# Experiment: treat a text as English when WordNet knows it.
# NOTE(review): the whole sentence is passed as one lookup key; wordnet.synsets
# appears to expect a single word/lemma, so a multi-word text would always come
# back empty and be reported as 'not english' - confirm against the NLTK docs
# and look words up individually if this check is to be used for filtering.
if not wordnet.synsets('Hi hello deepi!'):
  print('not english')
else:
  print('english')

not english
