# Day 3
## Scraping tables

In [1]:
import pandas as pd
import numpy as np
from selenium import webdriver

In [60]:
browser = webdriver.Chrome('/Users/Softwares/chromedriver')

In [16]:
url = 'https://www.imdb.com/india/top-rated-indian-movies/?pf_rd_m=A2FGELUUNOQJNL&pf_rd_p=600ca544-31f5-4bd8-ae38-ea4014c93bab&pf_rd_r=3A1GWZGCTDEBZJESJJCS&pf_rd_s=right-4&pf_rd_t=15061&pf_rd_i=homepage&ref_=hm_india_tr_rhs_1'

In [9]:
tables = pd.read_html(url)

In [10]:
len(tables)

2

In [14]:
movies = tables[0]

## Method 2: Using Selenium

In [67]:
browser.get(url)

In [62]:
sel = 'table.chart.full-width[data-caller-name="imdb-featured-india"]'
table_tag = browser.find_element_by_css_selector(sel)
table_tag

<selenium.webdriver.remote.webelement.WebElement (session="ce3faca80d94f6f909be053616b71d8b", element="0.6763218785445602-1")>

### Get column names

In [63]:
thead = table_tag.find_element_by_css_selector('thead')
tr = thead.find_element_by_css_selector('tr')
th_tags = tr.find_elements_by_css_selector('th')
len(th_tags)

5

In [32]:
th_tags = table_tag.find_elements_by_css_selector('thead > tr > th')
len(th_tags)

5

In [34]:
col_names = [th.text for th in th_tags]
col_names

['', 'Rank & Title', 'IMDb Rating', 'Your Rating', '']

### Get data 

In [57]:
col_names = [th.text for th in th_tags]
col_names[0] = 'Poster'
col_names[-1] = 'Watchlist'
imdb = pd.DataFrame(columns=col_names)
tr_tags = table_tag.find_elements_by_css_selector('tbody > tr')

for tr in tr_tags:
    td_tags = tr.find_elements_by_css_selector('td')
    td_data = [td.text for td in td_tags]
    row_series = pd.Series(td_data, index=col_names)
    imdb = imdb.append(row_series, ignore_index=True)

In [58]:
imdb.head()

Unnamed: 0,Poster,Rank & Title,IMDb Rating,Your Rating,Watchlist
0,,1. Anand (1971),8.7,,
1,,2. Drishyam (2013),8.6,,
2,,3. Nayakan (1987),8.6,,
3,,4. Anbe Sivam (2003),8.5,,
4,,5. Gol Maal (1979),8.5,,


In [65]:
col_names = [th.text for th in th_tags]
col_names[0] = 'Poster'
col_names[-1] = 'Watchlist'
imdb = pd.DataFrame(columns=col_names)
tr_tags = table_tag.find_elements_by_css_selector('tbody > tr')

for tr in tr_tags:
    td_tags = tr.find_elements_by_css_selector('td')
    td_data = [td.text for td in td_tags]

    img_src = td_tags[0].find_element_by_css_selector('img')
    img_src = img_src.get_attribute('src')
    td_data[0] = img_src
    
    row_series = pd.Series(td_data, index=col_names)
    imdb = imdb.append(row_series, ignore_index=True)
imdb.head()

Unnamed: 0,Poster,Rank & Title,IMDb Rating,Your Rating,Watchlist
0,https://m.media-amazon.com/images/M/MV5BNmZkMT...,1. Anand (1971),8.7,,
1,https://m.media-amazon.com/images/M/MV5BYmY3Mz...,2. Drishyam (2013),8.6,,
2,https://m.media-amazon.com/images/M/MV5BZDhkMT...,3. Nayakan (1987),8.6,,
3,https://m.media-amazon.com/images/M/MV5BNWQxY2...,4. Anbe Sivam (2003),8.5,,
4,https://m.media-amazon.com/images/M/MV5BMjA4OT...,5. Gol Maal (1979),8.5,,


In [83]:
import urllib
import os
img_url = imdb.loc[0, 'Poster']
movie_name = imdb.loc[0, 'Rank & Title']
img_extn = img_url[img_url.rfind('.'):]
folder_name = 'movies_images'

if not folder_name in glob.glob('*'):
    os.makedirs(folder_name)

img_name = folder_name + '/' + movie_name + img_extn
    
print(img_url, img_name)
urllib.request.urlretrieve(img_url, img_name)

https://m.media-amazon.com/images/M/MV5BNmZkMTMzNmEtMWU5NC00MjEzLWE5MzktYzRlMmQyMzk0YmM1XkEyXkFqcGdeQXVyNTA4NzY1MzY@._V1_UX45_CR0,0,45,67_AL_.jpg movies_images/1. Anand (1971).jpg


('movies_images/1. Anand (1971).jpg', <http.client.HTTPMessage at 0x112bb50b8>)

In [89]:
folder_name = 'movies_images'
if not folder_name in glob.glob('*'):
    os.makedirs(folder_name)
    
for index, row in imdb.iterrows():
    img_url = row['Poster']
    movie_name = row['Rank & Title']
    img_extn = img_url[img_url.rfind('.'):]
    img_name = folder_name + '/' + movie_name + img_extn
    urllib.request.urlretrieve(img_url, img_name)

In [90]:
imdb.to_csv('/datasets/imdb_top250_indian_movies.csv', index=False)

In [77]:
x = 'abc.news.abc.JPG'
x.split('.')[-1]

'JPG'

## API

In [95]:
import requests

In [113]:
weather_api_key = '703aee13b24a4a7ea6683632191103'
loc = 'Shimoga'
url = 'http://api.apixu.com/v1/current.json?key=703aee13b24a4a7ea6683632191103&q=%s' % loc
result = requests.get(url).json()

In [114]:
temp = result['current']['temp_c']
temp

35.9

In [116]:
cities = ['Chennai', 'Bangalore', 'Hyderabad', 'Pune',
          'London', 'Coimbatore', 'Kashmir']

In [117]:
df_weather = pd.DataFrame()
for loc in cities:
    url = 'http://api.apixu.com/v1/current.json?key=703aee13b24a4a7ea6683632191103&q=%s' % loc
    result = requests.get(url).json()
    curr_row = {
        'Place': loc,
        'Country': result['location']['country'],
        'Temp_in_cel': result['current']['temp_c'],
        'wind_in_kph': result['current']['wind_kph'],
        'lat': result['location']['lat'],
        'lon': result['location']['lon']
    }
    df_weather = df_weather.append(curr_row, ignore_index=True)

In [118]:
df_weather

Unnamed: 0,Country,Place,Temp_in_cel,lat,lon,wind_in_kph
0,India,Chennai,35.0,13.08,80.28,16.9
1,India,Bangalore,33.0,12.98,77.58,13.0
2,India,Hyderabad,38.0,17.38,78.47,15.1
3,India,Pune,31.7,18.53,73.87,11.9
4,United Kingdom,London,6.0,51.52,-0.11,25.9
5,India,Coimbatore,35.0,10.99,76.96,15.1
6,Pakistan,Kashmir,21.5,29.03,67.95,1.8


## Get resturants names in Bangalore

In [129]:
key = 
keys = {'user-key': key}
url = 'https://developers.zomato.com/api/v2.1/categories'
categories = requests.get(url, headers=keys).json()

In [134]:
df_category = pd.DataFrame()
for category in categories['categories']:
    df_category = df_category.append(
        category['categories'], ignore_index=True
    )
df_category

Unnamed: 0,id,name
0,1.0,Delivery
1,2.0,Dine-out
2,3.0,Nightlife
3,4.0,Catching-up
4,5.0,Takeaway
5,6.0,Cafes
6,7.0,Daily Menus
7,8.0,Breakfast
8,9.0,Lunch
9,10.0,Dinner
