# Reading CSV and Json Files

In [100]:
import csv
import json
import ast

In [101]:
path = "/home/gary/OneDrive/DataScience/Python-Data-Products-for-Predictive-Analytics-Specialization/Basic Data Processing and Visualization/Week2/datasets/amazon_reviews_us_Gift_Card_v1_00.tsv"

In [102]:
ls

[0m[01;31m2010_SAT.tsv.gz[0m  amazon_reviews_us_Gift_Card_v1_00.tsv  example.json


In [103]:
f = open(path)

In [104]:
#specified delimiter to be used, "\tab"
reader = csv.reader(f, delimiter = '\t')

In [105]:
#first line is the header/column names of the file
next(reader)

['marketplace',
 'customer_id',
 'review_id',
 'product_id',
 'product_parent',
 'product_title',
 'product_category',
 'star_rating',
 'helpful_votes',
 'total_votes',
 'vine',
 'verified_purchase',
 'review_headline',
 'review_body',
 'review_date']

In [106]:
next(reader)

['US',
 '24371595',
 'R27ZP1F1CD0C3Y',
 'B004LLIL5A',
 '346014806',
 'Amazon eGift Card – Celebrate',
 'Gift Card',
 '5',
 '0',
 '0',
 'N',
 'Y',
 'Five Stars',
 'Great birthday gift for a young adult.',
 '2015-08-31']

## Reading CSV & Json Files

In [107]:
path = "/home/gary/OneDrive/DataScience/Python-Data-Products-for-Predictive-Analytics-Specialization/Basic Data Processing and Visualization/Week2/datasets/example.json"

In [108]:
f = open(path)

In [109]:
line = f.readline()

In [110]:
line

'{"_id":"5c1a010ae61b49b43c4b4864","index":0,"age":35,"eyeColor":"green","name":"Wiggins Holman","address":"247 Thatford Avenue, Oneida,Puerto Rico, 7233","friends":[{"id":0,"name":"Carmela Hampton"},{"id":1,"name":"Lynda Pittman"},{"id":2,"name":"Cleveland Noble"}]}\n'

In [111]:
# while this works, it is not the safest way to run code, since it's easy to run malicious code with eval.
d = eval(line)

In [112]:
d

{'_id': '5c1a010ae61b49b43c4b4864',
 'index': 0,
 'age': 35,
 'eyeColor': 'green',
 'name': 'Wiggins Holman',
 'address': '247 Thatford Avenue, Oneida,Puerto Rico, 7233',
 'friends': [{'id': 0, 'name': 'Carmela Hampton'},
  {'id': 1, 'name': 'Lynda Pittman'},
  {'id': 2, 'name': 'Cleveland Noble'}]}

In [113]:
d['_id']

'5c1a010ae61b49b43c4b4864'

# AST library
An Abstract Syntax Tree is a simplified syntactic tree representation of a programming language’s source code. Each node of the tree stands for an statement occurring in the code. This trees don’t show the entire syntactic clutter, just the important information for analyzing the code. 

If it showed the entire structure it would be a Concrete Syntax Tree, but it’s usually better to simplify it because the information we use when building compilers can be found on an abstract syntax tree.

Python comes with a library built-in that makes it easier to work with Abstract Syntax Trees. The ast library helps processing Python abstract syntactic trees. The main purpose of the module is helping showing how the current grammar looks like.

source:more info @ https://medium.com/@SergioPaniego/abstract-syntax-trees-in-python-ast-library-9bfd705ef9f1

In [114]:
#ast, abstract syntax tree library prevent execution of code with unintended output
#ast validates we are reading only dictionary objects.
ast.literal_eval(line)

{'_id': '5c1a010ae61b49b43c4b4864',
 'index': 0,
 'age': 35,
 'eyeColor': 'green',
 'name': 'Wiggins Holman',
 'address': '247 Thatford Avenue, Oneida,Puerto Rico, 7233',
 'friends': [{'id': 0, 'name': 'Carmela Hampton'},
  {'id': 1, 'name': 'Lynda Pittman'},
  {'id': 2, 'name': 'Cleveland Noble'}]}

In [115]:
#similarly, the json object is another safer option
import json

In [116]:
json.loads(line)

{'_id': '5c1a010ae61b49b43c4b4864',
 'index': 0,
 'age': 35,
 'eyeColor': 'green',
 'name': 'Wiggins Holman',
 'address': '247 Thatford Avenue, Oneida,Puerto Rico, 7233',
 'friends': [{'id': 0, 'name': 'Carmela Hampton'},
  {'id': 1, 'name': 'Lynda Pittman'},
  {'id': 2, 'name': 'Cleveland Noble'}]}

# Processing Structured Data in Python

In [122]:
pwd

'/home/gary/OneDrive/DataScience/Python-Data-Products-for-Predictive-Analytics-Specialization/Basic Data Processing and Visualization/Week2/datasets'

In [124]:
ls

[0m[01;31m2010_SAT.tsv.gz[0m                        [01;31mamazon_reviews_us_Gift_Card_v1_00.tsv.gz[0m
amazon_reviews_us_Gift_Card_v1_00.tsv  example.json


In [126]:
import gzip
path = "/home/gary/OneDrive/DataScience/Python-Data-Products-for-Predictive-Analytics-Specialization/Basic Data Processing and Visualization/Week2/datasets/amazon_reviews_us_Gift_Card_v1_00.tsv.gz"

# zipped file is 12mb, unzipped 39mb.
f = gzip.open(path, 'rt')

In [127]:
import csv
reader = csv.reader(f, delimiter = '\t')

In [128]:
header = next(reader)

In [129]:
header

['marketplace',
 'customer_id',
 'review_id',
 'product_id',
 'product_parent',
 'product_title',
 'product_category',
 'star_rating',
 'helpful_votes',
 'total_votes',
 'vine',
 'verified_purchase',
 'review_headline',
 'review_body',
 'review_date']

In [131]:
dataset = []

In [132]:
for line in reader:              #file is read one line at a time
    line = line[:-3]              # drop the text fields
    if line[-1] == 'Y':           #dicard unverified reviews 
        dataset.append(line)

In [133]:
dataset[0]

['US',
 '24371595',
 'R27ZP1F1CD0C3Y',
 'B004LLIL5A',
 '346014806',
 'Amazon eGift Card - Celebrate',
 'Gift Card',
 '5',
 '0',
 '0',
 'N',
 'Y']

### The above method works, but an even more robust and effecient way to do this is with the dict module

In [135]:
for line in reader:
    d = dict(zip(header, line))
    for field in ['helpful_votes', 'star_rating', 'total_votes']:
        d[field] = int(d[field])
    for field in ['verified_purchase', 'vine']:
        if d[field] == 'Y':
            d[field] = True
        else:
            d[field] = False
    dataset.append(d)

In [136]:
dataset[0]

['US',
 '24371595',
 'R27ZP1F1CD0C3Y',
 'B004LLIL5A',
 '346014806',
 'Amazon eGift Card - Celebrate',
 'Gift Card',
 '5',
 '0',
 '0',
 'N',
 'Y']

In [139]:
ls

[0m[01;31m2010_SAT.tsv.gz[0m                           example.json
amazon_reviews_us_Gift_Card_v1_00.tsv     yelp_academic_dataset_business.json
[01;31mamazon_reviews_us_Gift_Card_v1_00.tsv.gz[0m


In [140]:
pwd

'/home/gary/OneDrive/DataScience/Python-Data-Products-for-Predictive-Analytics-Specialization/Basic Data Processing and Visualization/Week2/datasets'

In [143]:
path = '/home/gary/OneDrive/DataScience/Python-Data-Products-for-Predictive-Analytics-Specialization/Basic Data Processing and Visualization/Week2/datasets/yelp_academic_dataset_business.json'

In [144]:
f = open(path, 'r')

In [146]:
lines = []
for i in range(50000):
    lines.append(f.readline())

In [147]:
lines[0]

'{"business_id":"f9NumwFMBDn751xgFiRbNA","name":"The Range At Lake Norman","address":"10913 Bailey Rd","city":"Cornelius","state":"NC","postal_code":"28031","latitude":35.4627242,"longitude":-80.8526119,"stars":3.5,"review_count":36,"is_open":1,"attributes":{"BusinessAcceptsCreditCards":"True","BikeParking":"True","GoodForKids":"False","BusinessParking":"{\'garage\': False, \'street\': False, \'validated\': False, \'lot\': True, \'valet\': False}","ByAppointmentOnly":"False","RestaurantsPriceRange2":"3"},"categories":"Active Life, Gun\\/Rifle Ranges, Guns & Ammo, Shopping","hours":{"Monday":"10:0-18:0","Tuesday":"11:0-20:0","Wednesday":"10:0-18:0","Thursday":"11:0-20:0","Friday":"11:0-20:0","Saturday":"11:0-20:0","Sunday":"13:0-18:0"}}\n'

In [148]:
d = eval(lines[0])

In [149]:
d

{'business_id': 'f9NumwFMBDn751xgFiRbNA',
 'name': 'The Range At Lake Norman',
 'address': '10913 Bailey Rd',
 'city': 'Cornelius',
 'state': 'NC',
 'postal_code': '28031',
 'latitude': 35.4627242,
 'longitude': -80.8526119,
 'stars': 3.5,
 'review_count': 36,
 'is_open': 1,
 'attributes': {'BusinessAcceptsCreditCards': 'True',
  'BikeParking': 'True',
  'GoodForKids': 'False',
  'BusinessParking': "{'garage': False, 'street': False, 'validated': False, 'lot': True, 'valet': False}",
  'ByAppointmentOnly': 'False',
  'RestaurantsPriceRange2': '3'},
 'categories': 'Active Life, Gun\\/Rifle Ranges, Guns & Ammo, Shopping',
 'hours': {'Monday': '10:0-18:0',
  'Tuesday': '11:0-20:0',
  'Wednesday': '10:0-18:0',
  'Thursday': '11:0-20:0',
  'Friday': '11:0-20:0',
  'Saturday': '11:0-20:0',
  'Sunday': '13:0-18:0'}}

In [150]:
d['business_id']

'f9NumwFMBDn751xgFiRbNA'

In [151]:
d['stars']

3.5

In [152]:
import json

In [153]:
d = json.loads(lines[0])

In [154]:
d

{'business_id': 'f9NumwFMBDn751xgFiRbNA',
 'name': 'The Range At Lake Norman',
 'address': '10913 Bailey Rd',
 'city': 'Cornelius',
 'state': 'NC',
 'postal_code': '28031',
 'latitude': 35.4627242,
 'longitude': -80.8526119,
 'stars': 3.5,
 'review_count': 36,
 'is_open': 1,
 'attributes': {'BusinessAcceptsCreditCards': 'True',
  'BikeParking': 'True',
  'GoodForKids': 'False',
  'BusinessParking': "{'garage': False, 'street': False, 'validated': False, 'lot': True, 'valet': False}",
  'ByAppointmentOnly': 'False',
  'RestaurantsPriceRange2': '3'},
 'categories': 'Active Life, Gun/Rifle Ranges, Guns & Ammo, Shopping',
 'hours': {'Monday': '10:0-18:0',
  'Tuesday': '11:0-20:0',
  'Wednesday': '10:0-18:0',
  'Thursday': '11:0-20:0',
  'Friday': '11:0-20:0',
  'Saturday': '11:0-20:0',
  'Sunday': '13:0-18:0'}}

In [156]:
d['business_id']

'f9NumwFMBDn751xgFiRbNA'

### Reading Json Files in Python using json library objects

In [157]:
path = '/home/gary/OneDrive/DataScience/Python-Data-Products-for-Predictive-Analytics-Specialization/Basic Data Processing and Visualization/Week2/datasets/yelp_academic_dataset_business.json'

In [159]:
f = open(path, 'r', encoding = 'utf8')

In [162]:
dataset = []
for i in range(50000):
    dataset.append(json.loads(f.readline()))

In [163]:
dataset[0]

{'business_id': 'f9NumwFMBDn751xgFiRbNA',
 'name': 'The Range At Lake Norman',
 'address': '10913 Bailey Rd',
 'city': 'Cornelius',
 'state': 'NC',
 'postal_code': '28031',
 'latitude': 35.4627242,
 'longitude': -80.8526119,
 'stars': 3.5,
 'review_count': 36,
 'is_open': 1,
 'attributes': {'BusinessAcceptsCreditCards': 'True',
  'BikeParking': 'True',
  'GoodForKids': 'False',
  'BusinessParking': "{'garage': False, 'street': False, 'validated': False, 'lot': True, 'valet': False}",
  'ByAppointmentOnly': 'False',
  'RestaurantsPriceRange2': '3'},
 'categories': 'Active Life, Gun/Rifle Ranges, Guns & Ammo, Shopping',
 'hours': {'Monday': '10:0-18:0',
  'Tuesday': '11:0-20:0',
  'Wednesday': '10:0-18:0',
  'Thursday': '11:0-20:0',
  'Friday': '11:0-20:0',
  'Saturday': '11:0-20:0',
  'Sunday': '13:0-18:0'}}

In [166]:
dataset[0]['hours']

{'Monday': '10:0-18:0',
 'Tuesday': '11:0-20:0',
 'Wednesday': '10:0-18:0',
 'Thursday': '11:0-20:0',
 'Friday': '11:0-20:0',
 'Saturday': '11:0-20:0',
 'Sunday': '13:0-18:0'}