In [None]:
##Imports needed to run scripts
import requests
import os
import csv
from io import StringIO
from datetime import datetime
import pandas as pd

# USGS earthquake event query passed to the import functions in this notebook:
# format=csv, starttime=2023-01-01, endtime=2023-12-31, minmagnitude=5.
url = "https://earthquake.usgs.gov/fdsnws/event/1/query?format=csv&starttime=2023-01-01&endtime=2023-12-31&minmagnitude=5"

## **Review for M1**

#### Testing M1 import function:

In [None]:
def import_data(url):
    """Download *url*, save the body to data/dataset_M1.txt and return its lines.

    Returns the saved file's lines as a list of strings (line endings kept)
    when the server answers with status 200. For any other status nothing is
    written and None is returned.
    """
    os.makedirs('data', exist_ok=True)  # no error if the folder already exists
    target_path = os.path.join('data', 'dataset_M1.txt')
    reply = requests.get(url)
    if reply.status_code != 200:  # request was not successful: nothing to save
        return None

    # Save the downloaded text, then read it back as a list of lines.
    with open(target_path, 'w', encoding='utf-8') as sink:
        sink.write(reply.text)
    with open(target_path, 'r', encoding='utf-8') as source:
        return source.readlines()

# Checks:
import sys
# Print the encoding used for stdout. The import function originally did not
# specify an encoding and raised an error because the system default was charmap.
# NOTE(review): open() takes its default from the locale's preferred encoding,
# which is not necessarily the same as sys.stdout.encoding - confirm on the
# machine that produced the error.
print(sys.stdout.encoding)

# Test that the import function runs once the encoding is specified (it does).
import_data(url)

UTF-8


['time,latitude,longitude,depth,mag,magType,nst,gap,dmin,rms,net,id,updated,place,type,horizontalError,depthError,magError,magNst,status,locationSource,magSource\n',
 '2023-12-30T17:16:23.833Z,-2.9934,139.372,33,6.3,mww,190,15,1.412,0.81,us,us6000m0n6,2024-05-23T16:43:01.175Z,"146 km WSW of Abepura, Indonesia",earthquake,6.56,1.732,0.073,18,reviewed,us,us\n',
 '2023-12-30T16:05:45.628Z,-14.753,-173.9714,10,5.2,mww,34,54,2.282,0.67,us,us6000m0mx,2024-03-02T21:39:34.040Z,"134 km N of Hihifo, Tonga",earthquake,9.71,1.868,0.093,11,reviewed,us,us\n',
 '2023-12-30T15:03:01.478Z,-6.4131,154.9575,73.469,5.3,mww,90,20,3.551,0.87,us,us6000m0mh,2024-03-02T21:39:33.040Z,"59 km W of Panguna, Papua New Guinea",earthquake,8.93,5.003,0.08,15,reviewed,us,us\n',
 '2023-12-30T15:00:11.442Z,-11.5203,166.415,155.602,5,mww,66,44,6.688,0.69,us,us6000m0mg,2024-03-02T21:39:33.040Z,"110 km SE of Lata, Solomon Islands",earthquake,10.76,5.451,0.098,10,reviewed,us,us\n',
 '2023-12-30T06:16:37.350Z,-54.5762,-23.271

#### Testing M1 cleaning function:

In [12]:

def clean_data(data_lines):
    """Reformat the timestamp of every data row and save the results.

    The first element of *data_lines* is treated as the header and skipped.
    For each remaining line, the text before the first comma is parsed as
    ``YYYY-MM-DDTHH:MM:SS.ffffffZ`` and written to
    output/cleaned_data_M1.txt as ``YYYY-MM-DD HH:MM:SS``, one per line.
    Lines whose timestamp does not match that pattern are reported and
    skipped.

    Returns None; the results are the output file and the printed log.
    """
    os.makedirs('output', exist_ok=True)  # no error if the folder already exists
    cleaned_file_path = os.path.join('output', 'cleaned_data_M1.txt')
    # Explicit encoding so the output does not depend on the platform default.
    with open(cleaned_file_path, 'w', encoding='utf-8') as cleaned_file:
        for line in data_lines[1:]:  # [1:] skips the header row
            # Only the first column (the timestamp) is needed.
            timestamp = line.strip().split(',', 1)[0]
            try:
                dt = datetime.strptime(timestamp, "%Y-%m-%dT%H:%M:%S.%fZ")
                formatted_time = dt.strftime("%Y-%m-%d %H:%M:%S")
            except ValueError:
                print(f"Skipping invalid timestamp: {timestamp}")
                continue
            else:
                print(f"Original: {timestamp} -> Formatted: {formatted_time}")
                cleaned_file.write(formatted_time + '\n')



# Download the dataset again; the returned lines are the input for cleaning.
data_lines = import_data(url)

# import_data returns None when the request is not successful, so only clean
# when some lines actually came back.
if data_lines:
    clean_data(data_lines)


Original: 2023-12-30T17:16:23.833Z -> Formatted: 2023-12-30 17:16:23
Original: 2023-12-30T16:05:45.628Z -> Formatted: 2023-12-30 16:05:45
Original: 2023-12-30T15:03:01.478Z -> Formatted: 2023-12-30 15:03:01
Original: 2023-12-30T15:00:11.442Z -> Formatted: 2023-12-30 15:00:11
Original: 2023-12-30T06:16:37.350Z -> Formatted: 2023-12-30 06:16:37
Original: 2023-12-30T05:19:28.158Z -> Formatted: 2023-12-30 05:19:28
Original: 2023-12-30T02:42:25.497Z -> Formatted: 2023-12-30 02:42:25
Original: 2023-12-29T19:03:54.159Z -> Formatted: 2023-12-29 19:03:54
Original: 2023-12-29T16:31:16.478Z -> Formatted: 2023-12-29 16:31:16
Original: 2023-12-29T08:42:05.822Z -> Formatted: 2023-12-29 08:42:05
Original: 2023-12-29T03:37:19.230Z -> Formatted: 2023-12-29 03:37:19
Original: 2023-12-29T02:38:10.299Z -> Formatted: 2023-12-29 02:38:10
Original: 2023-12-29T01:18:50.036Z -> Formatted: 2023-12-29 01:18:50
Original: 2023-12-29T00:38:04.868Z -> Formatted: 2023-12-29 00:38:04
Original: 2023-12-28T22:37:37.025Z

M1's functions both effectively import and clean the data. However, M1's import code originally produced an error when I tried to run it. This is because the encoding (utf-8) was not specified and the default encoding being used was charmap, which produced a UnicodeEncodeError. I resolved this error by specifying the encoding as utf-8 in the open() calls, which is something M1 could add to their code to prevent this error.   
The comments M1 included on each line meant the code was very clear throughout, making it easier to read and understand each code line's purpose in the functions.   
The code overall was very efficient. To improve efficiency even further, some aspects of the code could be condensed. For example, in the import function, a single open() call could be used instead of two by changing the mode to w+ (w+ both writes and reads the file, instead of using w and r separately).
If using the w+ mode, make sure to reset the file pointer back to the start of the file with file.seek(0) before reading.   
Also, M1 appears to have formatted the dates in the reverse of the order specified in the assignment brief. M1's data is in the format YYYY-MM-DD instead of DD-MM-YYYY. This can easily be resolved by changing the order of %Y, %m and %d in the strftime format string used to define formatted_time.  

## **Review for M2**

#### Testing M2 import function:

In [None]:

# Identifier inserted into the dataset and cleaned-data file names built by
# the M2 functions in this notebook.
team_member_id = "M2"

# New function to import data

def import_data(url):
    """Download *url*, save it under data/ and return the saved lines.

    The response text is written to ``data/dataset_<team_member_id>.txt``
    as UTF-8 and then read back, so the returned list holds the lines as
    they are stored on disk, each keeping its line ending.

    Raises:
        Exception: if the server does not answer with status 200.
    """
    response = requests.get(url)
    if response.status_code != 200:
        raise Exception("Failed to download data.")

    # exist_ok=True avoids the check-then-create race of testing
    # os.path.exists() before calling os.makedirs().
    os.makedirs("data", exist_ok=True)

    file_path = os.path.join("data", f"dataset_{team_member_id}.txt")

    with open(file_path, "w", encoding="utf-8") as f:
        f.write(response.text)
    with open(file_path, "r", encoding="utf-8") as f:
        return f.readlines()

# Running the following code downloads the data, saves it in the data folder
# and returns the saved lines.

import_data(url)


['time,latitude,longitude,depth,mag,magType,nst,gap,dmin,rms,net,id,updated,place,type,horizontalError,depthError,magError,magNst,status,locationSource,magSource\n',
 '2023-12-30T17:16:23.833Z,-2.9934,139.372,33,6.3,mww,190,15,1.412,0.81,us,us6000m0n6,2024-05-23T16:43:01.175Z,"146 km WSW of Abepura, Indonesia",earthquake,6.56,1.732,0.073,18,reviewed,us,us\n',
 '2023-12-30T16:05:45.628Z,-14.753,-173.9714,10,5.2,mww,34,54,2.282,0.67,us,us6000m0mx,2024-03-02T21:39:34.040Z,"134 km N of Hihifo, Tonga",earthquake,9.71,1.868,0.093,11,reviewed,us,us\n',
 '2023-12-30T15:03:01.478Z,-6.4131,154.9575,73.469,5.3,mww,90,20,3.551,0.87,us,us6000m0mh,2024-03-02T21:39:33.040Z,"59 km W of Panguna, Papua New Guinea",earthquake,8.93,5.003,0.08,15,reviewed,us,us\n',
 '2023-12-30T15:00:11.442Z,-11.5203,166.415,155.602,5,mww,66,44,6.688,0.69,us,us6000m0mg,2024-03-02T21:39:33.040Z,"110 km SE of Lata, Solomon Islands",earthquake,10.76,5.451,0.098,10,reviewed,us,us\n',
 '2023-12-30T06:16:37.350Z,-54.5762,-23.271

#### Testing M2 cleaning function:

In [None]:

# Download the dataset and keep its lines as the input for the cleaning function.
data_lines = import_data(url)

# New function to clean data

def clean_data(data_text_list):
    """Reformat the ``time`` column of CSV text and save the cleaned table.

    Args:
        data_text_list: CSV lines, header first, each with its line ending.

    Returns:
        The cleaned table as a list of CSV-formatted strings, one per
        record, each ending in a newline. Fields that contain commas or
        quotes are quoted, so every line parses back into the same columns
        as the file written to disk.

    Each ``time`` value has any trailing "Z" removed, is parsed with
    ``datetime.fromisoformat`` and rewritten as ``YYYY-MM-DD HH:MM:SS``.
    Rows whose value is missing or cannot be parsed are reported and kept
    unchanged. The table is also written to
    ``output/cleaned_data_<team_member_id>.txt``.
    """
    csv_reader = csv.reader(StringIO("".join(data_text_list)))
    header = next(csv_reader)
    date_index = header.index("time")

    cleaned_rows = [header]
    for row in csv_reader:
        try:
            # The trailing "Z" is stripped because fromisoformat() does not
            # accept it on Python versions before 3.11.
            dt = datetime.fromisoformat(row[date_index].rstrip("Z"))
        except (IndexError, ValueError) as e:
            # Short/blank rows (IndexError) and unparseable dates
            # (ValueError) are reported and kept as they are.
            print(f"Error proccessing row: {e}")
        else:
            row[date_index] = dt.strftime("%Y-%m-%d %H:%M:%S")
        cleaned_rows.append(row)

    # exist_ok=True replaces the separate os.path.exists() check.
    os.makedirs("output", exist_ok=True)

    output_file = os.path.join("output", f"cleaned_data_{team_member_id}.txt")

    with open(output_file, "w", newline="", encoding="utf-8") as f:
        csv.writer(f).writerows(cleaned_rows)

    # Serialise each row with csv.writer rather than ",".join(row): a plain
    # join drops the quoting around fields that contain commas (such as the
    # place names), which would split them into extra columns.
    cleaned_text_lines = []
    for row in cleaned_rows:
        line_buffer = StringIO()
        csv.writer(line_buffer, lineterminator="\n").writerow(row)
        cleaned_text_lines.append(line_buffer.getvalue())
    return cleaned_text_lines

# Running the following code cleans the data, writes it to the output folder
# and returns the cleaned lines.

clean_data(data_lines)


M2's functions both effectively import and clean the data. However, as in M1's cleaning function, M2 also formats the cleaned dates in the reverse of the order specified in the assignment brief. M2's cleaned data is in the format YYYY-MM-DD rather than DD-MM-YYYY. 
M2's use of whitespace makes their code clearer and easier to read. To improve clarity further, they could include more comments throughout their code. 
To improve efficiency and reduce unnecessary complexity, M2 could also remove the need for the variable of team_member_id and just use their id 'M2' in the strings when required.  
M2 could also remove the need for the if not statements (used when making the directories) by passing the parameter exist_ok=True to the os.makedirs() function.  
The import function could also use one open file function instead of two by changing the mode used to be w+ (instead of r and w individually, w+ reads and writes the file).  
If using the w+ mode, M2 should make sure to reset the file pointer back to the start of the file with f.seek(0) before reading.