In [1]:
import json
import xml.etree.ElementTree as ET
import csv

In [29]:
def find_oldest_movie(file_path):
    try:
        res = []
        with open(file_path) as f:
            for line in f:
                res.append(line.split(';')[0])
        return res
    except IOError:
        print(f'Unable to read file {file_path}')
    
file_path = 'data_sample/movies.txt'
find_oldest_movie(file_path)

['The Dark Knight',
 'Heat',
 'Inception',
 'Kill Bill: Vol. 1',
 'Gladiator',
 'Saving Private Ryan',
 'Terminator 2: Judgment Day',
 'The Bourne Ultimatum',
 'The Dark Knight Rises',
 'The Matrix']

In [30]:
def remove_useless_data(file_path):
    try:
        res = []
        with open(file_path) as f:
            print(f'Reading file {file_path}')
            head = ['Name', 'Candidate', 'Time', 'State']
            for line in f:
                col_val = line.split(',')
                useless_data = False
                for i, val in enumerate(col_val):
                    if (head[i] != 'Time') and (val == ''):
                        print('Useless data' + '>' * 5 + line)
                        useless_data = True
                        break
                if useless_data:
                    continue
                res.append(line)
        return ''.join(res)
    except IOError:
        print(f'Unable to read file {file_path}')
        
file_path2 = 'data_sample/election.txt'
print(remove_useless_data(file_path2))

Reading file data_sample/election.txt
Useless data>>>>>Richard Gere,,2018-09-21 11:41:53,FL

Useless data>>>>>,John Doe,2018-09-21 12:50:29,WA

Tom Cruise,John Doe,2018-09-21 14:23:03,CA
Helen Hunt,Jane Doe,2018-09-21 17:02:12,AL
Barack Obama,Jane Doe,2018-09-21 10:00:37,IL
Michael Jackson,Jane Doe,2018-09-21 16:11:57
Bill Gates,John Doe,2018-09-21 13:25:18,NY



### Find the post with the most popular comments

In [31]:
def find_largest_post(comment_like):
    largest_like_post = list(comment_like.keys())[0]
    max_likes = 0
    for post, like in comment_like.items():
        if like > max_likes:
            largest_like_post = post
    return largest_like_post
            
def count_comment_like(post):
    post_id = post['id']
    comment_like = 0
    if not post['comments']:
        print(f'In post id {post_id}, it gets {comment_like} comment likes')
        return (post_id, 0)
    for comment in post['comments']:
        # iterate each comment
        if 'like_count' in comment.keys():
            comment_like += comment['like_count']
    print(f'In post id {post_id}, it gets {comment_like} comment likes')
    return (post_id, comment_like)
        
def find_most_popular_comments(file_path):
    try:
        with open(file_path) as json_file:
            post_data = json.load(json_file) # load json file
            if not post_data:
                return 
            comment_like = {} # a dict to store result: post_id : comment_count 
            for post in post_data: # traversal json file
                post_id, likes = count_comment_like(post) # return post_id and comment like count
                if post['id'] in comment_like.keys():
                       comment_like[post_id] += likes
                else:
                    comment_like[post_id] = likes
            largest_like_post = find_largest_post(comment_like)
            print(f'The most popular post id is {largest_like_post}, it got {comment_like[largest_like_post]} comment likes')
            return {largest_like_post, comment_like[largest_like_post]}
    except IOError:
        print(f'Unable to read file {file_paht}')
        
file_path3 = 'data_sample/posts.txt'
find_most_popular_comments(file_path3)

In post id 56, it gets 123 comment likes
In post id 145, it gets 320 comment likes
In post id 73, it gets 65 comment likes
In post id 976, it gets 0 comment likes
In post id 38, it gets 107 comment likes
In post id 21, it gets 27 comment likes
In post id 80, it gets 94 comment likes
The most popular post id is 80, it got 94 comment likes


{80, 94}

### USD transactions

In [32]:
def print_transaction(elems_USD):
    if not elems_USD:
        print('-' * 5 + 'No USD transaction!' + '-' * 5)
        return
    for elem in elems_USD.values():
        print('-' * 10 + 'Wow! USD transaction' + '-' *10)
        for sub_elem in elem:
            print(f'{sub_elem.tag}: {sub_elem.text}')
        print('-' * 40)
        
def remove_transaction(tree, elems_USD):
    tree_root = tree.getroot()
    for node in elems_USD.keys():
        print(f'node to remove {node}')
        tree_root.remove(tree_root[node])
    return tree

def print_xml(tree):
    for elem in tree.getroot():
        for sub_elem in elem:
            print(f'<tag>: {sub_elem.tag}, <attrib>: {sub_elem.attrib}, <text>: {sub_elem.text}')
    
def filter_transaction(file_path):
    try:
        with open(file_path) as f:
            tree = ET.parse(file_path4)
            elems_USD = {}
            tree_root = tree.getroot()
            for node, elem in enumerate(tree_root):
                for sub_elem in elem:
                    if sub_elem.tag == 'amount':
                        if sub_elem.attrib['currency'] == 'USD':
                            elems_USD[node] = elem  
                            print(f'node to remove {node}')
                            tree_root.remove(elem)
            print_transaction(elems_USD)
            print_xml(tree)
    except IOError:
        print(f'Unable to read file {file_path}')
        
file_path4 = 'data_sample/transactions.txt'
filter_transaction(file_path4)

node to remove 0
node to remove 2
----------Wow! USD transaction----------
from: 465345
to: 46548743
amount: 2350
----------------------------------------
----------Wow! USD transaction----------
from: 38644
to: 8756113
amount: 8000
----------------------------------------
<tag>: from, <attrib>: {}, <text>: 12334545
<tag>: to, <attrib>: {}, <text>: 8676
<tag>: amount, <attrib>: {'currency': 'CAD'}, <text>: 3600
<tag>: from, <attrib>: {}, <text>: 8547467
<tag>: to, <attrib>: {}, <text>: 958687
<tag>: amount, <attrib>: {'currency': 'EUR'}, <text>: 1200
<tag>: from, <attrib>: {}, <text>: 167465
<tag>: to, <attrib>: {}, <text>: 9233425
<tag>: amount, <attrib>: {'currency': 'HUF'}, <text>: 9500000
<tag>: from, <attrib>: {}, <text>: 232635
<tag>: to, <attrib>: {}, <text>: 4864623
<tag>: amount, <attrib>: {'currency': 'RMB'}, <text>: 20000


In [33]:
# file_path4 = 'transactions.txt'
# tree = ET.ElementTree(ET.fromstring(read_file(file_path4)))
# for elem in tree.getroot():
#     for sub_elem in elem:
#         print(f'tag: {sub_elem.tag}, attrib: {sub_elem.attrib}, text: {sub_elem.text}')

### Exam Performance

In [34]:
def int_float_convert(a_str):
#     print(f'current str: {a_str}')
    return int(a_str) if '.' not in a_str else float(a_str)

def time_convert(time_str):
    if (not time_str) or (time_str == '0'):
        return 0
    second = 0
    tmp_str = ''
    for ch in time_str:
        if ch == ':':
            return sum(x * int(t) for x, t in zip([1, 60, 3600], time_str[::-1].split(":")))
        elif ch == 'h':
            second += int_float_convert(tmp_str) * 3600
            tmp_str = ''
        elif ch == 'm':
            second += int_float_convert(tmp_str) * 60
            tmp_str = ''
        elif ch == 's':
            second += int_float_convert(tmp_str)
            tmp_str = ''
        else:
#             print(f'adding {ch}')
            tmp_str += ch
    return second

def compare_user(user_score):
    highest = list(user_score.keys())[0]
    highest_score = user_score[highest]
    for key, val in user_score.items():
        if val > highest_score:
            highest_score = val
            highest = key
    return (highest, highest_score)

def find_best(file_path):
    try:
        with open(file_path) as tsv_file:
            reader = csv.DictReader(tsv_file, dialect= 'excel-tab')
            user_score = {}
            for row in reader:
                user_time = time_convert(row['time'])
                user_point = row['points']
                if not user_time:
                    continue
                # TODO: To check if user has other exam score
                user_score[row['user']] = round(int(user_point) / user_time * 60, 5) 
            return compare_user(user_score)
    except IOError:
        print(f'Unable to read file {file_path}')
        
file_path5 = 'data_sample/exams.txt'
find_best(file_path5)

('5', 0.63333)

### Transform data

In [35]:
def export_to_json(dict_data):
    with open('users-json.txt', 'w') as export_file:
        json.dump(dict_data, export_file)

def convert_csv_to_json(file_path):
    try:
        with open(file_path) as csv_file:
            csv_reader = csv.reader(csv_file, delimiter= ',')
            res = []
            for i, row in enumerate(csv_reader):
                if i == 0:
                    head = row
                    json_keys = dict.fromkeys(row)
#                     print('head ' + ' '.join(row))
                    continue
#                 print(row)
                res.append({head[x] : row[x] for x in range(len(row))})
            export_to_json(res)
            return res
    except IOError:
        print(f'Unable to read file {file_path}')
        
file_path6 = 'data_sample/users.txt'
convert_csv_to_json(file_path6)

[{'id': '1',
  'first_name': 'Rick',
  'last_name': 'Stoney',
  'email': 'rstoney0@bloomberg.com',
  'gender': 'Male',
  'ip_address': '187.75.94.108'},
 {'id': '2',
  'first_name': 'Birch',
  'last_name': 'Urling',
  'email': 'burling1@japanpost.jp',
  'gender': 'Male',
  'ip_address': '230.122.244.143'},
 {'id': '3',
  'first_name': 'Stephenie',
  'last_name': 'Hatherell',
  'email': 'shatherell2@google.es',
  'gender': 'Female',
  'ip_address': '202.103.94.126'},
 {'id': '4',
  'first_name': 'Rip',
  'last_name': 'Ference',
  'email': 'rference3@deliciousdays.com',
  'gender': 'Male',
  'ip_address': '57.20.120.136'},
 {'id': '5',
  'first_name': 'Howard',
  'last_name': 'Vedyaev',
  'email': 'hvedyaev4@tamu.edu',
  'gender': 'Male',
  'ip_address': '118.154.112.159'},
 {'id': '6',
  'first_name': 'Bjorn',
  'last_name': 'Hablot',
  'email': 'bhablot5@linkedin.com',
  'gender': 'Male',
  'ip_address': '30.99.4.199'},
 {'id': '7',
  'first_name': 'Avram',
  'last_name': 'Wilds',
  'e

In [23]:
def export_to_xml(dict_data):
    pass

def build_elem(node, padding= ''):
    list_type = isinstance(node, list)
    dict_type = isinstance(node, dict)

    res_sub_elem = []
    if list_type:
        for sub_elem in node:
            res_sub_elem.append(build_elem(sub_elem, padding))
        return '\n'.join(res_sub_elem)
        
    if dict_type:
        for tag_name, text in node.items():
            sub_text = build_elem(text, '\t' + padding)
            # elem_format <tag>text</tag
            sub_elem = f'<{tag_name}>{text}</{tag_name}>'
            res_sub_elem.append(sub_elem)
        return '\n'.join(res_sub_elem)
    
def convert_json_to_xml(file_path, delimiter= ''):
    """
    TODO: Optimize this function and make it robust
    """
    try:
        with open(file_path) as json_file:
            flower_data = json.load(json_file)
            res = ['<?xml version="1.0" encoding="UTF-8"?>']
            if not flower_data:
                return 
            res.append(build_elem(flower_data))
            return '\n'.join(res)
    except IOError:
        print(f'Unable to read file {file_path}')
        
file_path7 = 'data_sample/flowers.txt'
print(convert_json_to_xml(file_path7))

<?xml version="1.0" encoding="UTF-8"?>
<id>1</id>
<name>Pretty Sneezeweed</name>
<color>Maroon</color>
<origin>Serbia</origin>
<id>2</id>
<name>Tryon's Woodsia</name>
<color>Khaki</color>
<origin>Portugal</origin>
<id>3</id>
<name>Sycamore Fig</name>
<color>Crimson</color>
<origin>Sweden</origin>
<id>4</id>
<name>Brown Bentgrass</name>
<color>Purple</color>
<origin>Philippines</origin>
<id>5</id>
<name>Rinodina Lichen</name>
<color>Red</color>
<origin>Nigeria</origin>
<id>6</id>
<name>Angelon</name>
<color>Khaki</color>
<origin>Luxembourg</origin>
<id>7</id>
<name>Nodding Beggartick</name>
<color>Violet</color>
<origin>Indonesia</origin>
<id>8</id>
<name>Encinitis False Willow</name>
<color>Pink</color>
<origin>Morocco</origin>
<id>9</id>
<name>Japanese Sweet Coltsfoot</name>
<color>Aquamarine</color>
<origin>Indonesia</origin>
<id>10</id>
<name>American Beachgrass</name>
<color>Orange</color>
<origin>Russia</origin>
<id>11</id>
<name>Clustered Lady's Mantle</name>
<color>Mauv</color>


In [16]:
def extract_time(time_data):
    return time_data.split(' ')[0]

def export_to_json(dict_data):
    with open('logs-count-A66.json', 'w') as export_file:
        json.dump(dict_data, export_file)

def extract_info(file_path):
    try:
        with open(file_path) as csv_file:
            csv_reader = csv.reader(csv_file, delimiter= ',')
            target_info = 'A66 - 04 FÕBEJÁRAT (F-1) Door #1'
            head_index = [1, 5, 12]
            res = {}
            for row in csv_reader:
                card_number = row[12]
                chip = row[5]
                date_stamp = extract_time(row[1])
                if row[5] == target_info:
                    if card_number in res.keys():
                        if date_stamp in res[card_number].keys():
                            res[card_number][date_stamp] += 1
                        else:
                            res[card_number][date_stamp] = 1
                    else:
                        res[card_number] = {date_stamp : 1}
            export_to_json(res)
            return res
    except IOError:
        print(f'Unable to read file {file_path}')
        
        #                 print(extract_time(row[1]) + ' ' + row[5] + ' ' + row[12])
file_path8 = 'data_sample/logs.csv'
extract_info(file_path8)

{'00215:09895': {'2019.01.02.': 2,
  '2019.01.04.': 3,
  '2019.01.05.': 1,
  '2019.01.07.': 2,
  '2019.01.08.': 1,
  '2019.01.09.': 1,
  '2019.01.10.': 1,
  '2019.01.11.': 2,
  '2019.01.14.': 1,
  '2019.01.15.': 3,
  '2019.01.16.': 2,
  '2019.01.17.': 3,
  '2019.01.18.': 1,
  '2019.01.21.': 3,
  '2019.01.22.': 1,
  '2019.01.23.': 1,
  '2019.01.24.': 1,
  '2019.01.25.': 1,
  '2019.01.28.': 3,
  '2019.01.29.': 1,
  '2019.01.30.': 1,
  '2019.01.31.': 2},
 '00110:57041': {'2019.01.02.': 1,
  '2019.01.03.': 2,
  '2019.01.04.': 1,
  '2019.01.07.': 2,
  '2019.01.08.': 1,
  '2019.01.09.': 2,
  '2019.01.10.': 2,
  '2019.01.11.': 1,
  '2019.01.14.': 2,
  '2019.01.15.': 1,
  '2019.01.16.': 2,
  '2019.01.17.': 1,
  '2019.01.18.': 2,
  '2019.01.21.': 2,
  '2019.01.22.': 1,
  '2019.01.23.': 1,
  '2019.01.24.': 1,
  '2019.01.25.': 1,
  '2019.01.28.': 1,
  '2019.01.29.': 1,
  '2019.01.30.': 2,
  '2019.01.31.': 2},
 '00008:58673': {'2019.01.02.': 2,
  '2019.01.03.': 1,
  '2019.01.04.': 3,
  '2019.01.07

In [None]:
a