In [165]:
import json
import xml.etree.ElementTree as ET
import csv

In [1]:
def read_file(file_path):
    try:
        res = []
        with open(file_path) as f:
            for line in f:
                res.append(line)
        return ''.join(res)
    except IOError:
        print(f'Unable to read file {file_path}')

In [2]:
movie_file = 'movies.txt'
movie = read_file(movie_file)
print(movie)

The Dark Knight;2008;Christopher Nolan
Heat;1995;Michael Mann
Inception;2010;Christopher Nolan
Kill Bill: Vol. 1;2003;Quentin Tarantino
Gladiator;2000;Ridley Scott
Saving Private Ryan;1998;Steven Spielberg
Terminator 2: Judgment Day;1991;James Cameron
The Bourne Ultimatum;2007;Paul Greengrass
The Dark Knight Rises;2012;Christopher Nolan
The Matrix;1999;Lana Wachowski, Lilly Wachowski



In [9]:
def find_oldest_movie(file_path):
    try:
        res = []
        with open(file_path) as f:
            for line in f:
                res.append(line.split(';')[0])
        return res
    except IOError:
        print(f'Unable to read file {file_path}')
    
file_path = 'movies.txt'
find_oldest_movie(file_path)

['The Dark Knight',
 'Heat',
 'Inception',
 'Kill Bill: Vol. 1',
 'Gladiator',
 'Saving Private Ryan',
 'Terminator 2: Judgment Day',
 'The Bourne Ultimatum',
 'The Dark Knight Rises',
 'The Matrix']

In [80]:
def remove_useless_data(file_path):
    try:
        res = []
        with open(file_path) as f:
            print(f'Reading file {file_path}')
            head = ['Name', 'Candidate', 'Time', 'State']
            for line in f:
                col_val = line.split(',')
                useless_data = False
                for i, val in enumerate(col_val):
                    if (head[i] != 'Time') and (val == ''):
                        print('Useless data' + '>' * 5 + line)
                        useless_data = True
                        break
                if useless_data:
                    continue
                res.append(line)
        return ''.join(res)
    except IOError:
        print(f'Unable to read file {file_path}')
        
file_path2 = 'election.txt'
print(remove_useless_data(file_path2))

Reading file election.txt
Useless data>>>>>Richard Gere,,2018-09-21 11:41:53,FL

Useless data>>>>>,John Doe,2018-09-21 12:50:29,WA

Tom Cruise,John Doe,2018-09-21 14:23:03,CA
Helen Hunt,Jane Doe,2018-09-21 17:02:12,AL
Barack Obama,Jane Doe,2018-09-21 10:00:37,IL
Michael Jackson,Jane Doe,2018-09-21 16:11:57
Bill Gates,John Doe,2018-09-21 13:25:18,NY



In [120]:
def find_largest_post(comment_like):
    largest_like_post = list(comment_like.keys())[0]
    max_likes = 0
    for post, like in comment_like.items():
        if like > max_likes:
            largest_like_post = post
    return largest_like_post
            
def count_comment_like(post):
    post_id = post['id']
    comment_like = 0
    if not post['comments']:
        print(f'In post id {post_id}, it gets {comment_like} comment likes')
        return (post_id, 0)
    for comment in post['comments']:
        # iterate each comment
        if 'like_count' in comment.keys():
            comment_like += comment['like_count']
    print(f'In post id {post_id}, it gets {comment_like} comment likes')
    return (post_id, comment_like)
        
def find_most_popular_comments(file_path):
    try:
        with open(file_path) as f:
            post_data = json.load(f) # load json file
            if not post_data:
                return 
            comment_like = {} # a dict to store result: post_id : comment_count 
            for post in post_data: # traversal json file
                post_id, likes = count_comment_like(post) # return post_id and comment like count
                if post['id'] in comment_like.keys():
                       comment_like[post_id] += likes
                else:
                    comment_like[post_id] = likes
            largest_like_post = find_largest_post(comment_like)
            print(f'The most popular post id is {largest_like_post}, it got {comment_like[largest_like_post]} comment likes')
            return {largest_like_post, comment_like[largest_like_post]}
    except IOError:
        print(f'Unable to read file {file_paht}')
        
find_most_popular_comments(file_path3)

In post id 56, it gets 123 comment likes
In post id 145, it gets 320 comment likes
In post id 73, it gets 65 comment likes
In post id 976, it gets 0 comment likes
In post id 38, it gets 107 comment likes
In post id 21, it gets 27 comment likes
In post id 80, it gets 94 comment likes
The most popular post id is 80, it got 94 comment likes


{80, 94}

In [163]:
def print_transaction(elems_USD):
    if not elems_USD:
        print('-' * 5 + 'No USD transaction!' + '-' * 5)
        return
    for elem in elems_USD.values():
        print('-' * 10 + 'Wow! USD transaction' + '-' *10)
        for sub_elem in elem:
            print(f'{sub_elem.tag}: {sub_elem.text}')
        print('-' * 40)
        
def remove_transaction(tree, elems_USD):
    tree_root = tree.getroot()
    for node in elems_USD.keys():
        print(f'node to remove {node}')
        tree_root.remove(tree_root[node])
    return tree

def print_xml(tree):
    for elem in tree.getroot():
        for sub_elem in elem:
            print(f'<tag>: {sub_elem.tag}, <attrib>: {sub_elem.attrib}, <text>: {sub_elem.text}')
    
def filter_transaction(file_path):
    try:
        with open(file_path) as f:
            tree = ET.parse(file_path4)
            elems_USD = {}
            tree_root = tree.getroot()
            for node, elem in enumerate(tree_root):
                for sub_elem in elem:
                    if sub_elem.tag == 'amount':
                        if sub_elem.attrib['currency'] == 'USD':
                            elems_USD[node] = elem  
                            print(f'node to remove {node}')
                            tree_root.remove(elem)
            print_transaction(elems_USD)
            print_xml(tree)
    except IOError:
        print(f'Unable to read file {file_path}')
        
file_path4 = 'transactions.txt'
filter_transaction(file_path4)

node to remove 0
node to remove 2
----------Wow! USD transaction----------
from: 465345
to: 46548743
amount: 2350
----------------------------------------
----------Wow! USD transaction----------
from: 38644
to: 8756113
amount: 8000
----------------------------------------
<tag>: from, <attrib>: {}, <text>: 12334545
<tag>: to, <attrib>: {}, <text>: 8676
<tag>: amount, <attrib>: {'currency': 'CAD'}, <text>: 3600
<tag>: from, <attrib>: {}, <text>: 8547467
<tag>: to, <attrib>: {}, <text>: 958687
<tag>: amount, <attrib>: {'currency': 'EUR'}, <text>: 1200
<tag>: from, <attrib>: {}, <text>: 167465
<tag>: to, <attrib>: {}, <text>: 9233425
<tag>: amount, <attrib>: {'currency': 'HUF'}, <text>: 9500000
<tag>: from, <attrib>: {}, <text>: 232635
<tag>: to, <attrib>: {}, <text>: 4864623
<tag>: amount, <attrib>: {'currency': 'RMB'}, <text>: 20000


In [164]:
# file_path4 = 'transactions.txt'
# tree = ET.ElementTree(ET.fromstring(read_file(file_path4)))
# for elem in tree.getroot():
#     for sub_elem in elem:
#         print(f'tag: {sub_elem.tag}, attrib: {sub_elem.attrib}, text: {sub_elem.text}')

In [182]:
int('.5')

ValueError: invalid literal for int() with base 10: '.5'

In [185]:
float('10')

10.0

In [184]:
float('.1')

0.1

'..........'

In [198]:
def time_convert(time_str):
    if (not time_str) or (time_str == '0'):
        return 0
    second = 0
    for ch in time_str:
        tmp_str = ''
        if ch == ':':
            return sum(x * int(t) for x, t in zip([1, 60, 3600], time_str[::-1].split(":")))
        elif ch == 'h':
            second += float(tmp_str) * 3600
        elif ch == 'm':
            second += float(tmp_str) * 60
        elif ch == 's':
            second += float(tmp_str)
        else:
            print(f'adding {ch}')
            tmp_str += ch
    return second

def find_best(file_path):
    try:
        with open(file_path) as tsv_file:
            reader = csv.DictReader(tsv_file, dialect= 'excel-tab')
            for row in reader:
                print(row['time'])
    except IOError:
        print(f'Unable to read file {file_path}')
        
file_path5 = 'exams.txt'
find_best(file_path5)

3600s
1h2m20s
600s
32m
.5h
1h12m38s
65m
98s
1:02:08
null


In [199]:
time_convert('1:02:08')

adding 1


4880

In [200]:
time_convert('.5h')

adding .
adding 5


ValueError: could not convert string to float: 

In [177]:
time_convert('65m')

ValueError: invalid literal for int() with base 10: 'm56'