In [14]:
# Package imports
import os
import pandas as pd
import csv
import tqdm
from sqlalchemy import create_engine

In [15]:
# Variable declaration
# Folder that holds the scraped CSV files, relative to the notebook location.
DATA_FOLDER = os.path.join("..", "data")
# Column names applied to the header-less CSV files, in file order.
DATA_HEADERS = ['title', 'subtitle', 'author',
                'publisheddate', 'article', 'link', 'tags']

# PostgreSQL connection URL.
# Credentials should not live in the notebook: set the DECODEMT_DB_URL
# environment variable to supply the real connection string. The fallback is
# the original local development URL, kept so existing setups keep working.
# NOTE(review): consider dropping the fallback once the variable is configured.
db_url = os.environ.get(
    "DECODEMT_DB_URL",
    "postgresql://postgres:Pa$$w0rd@localhost/decodeMT",
)

# Create the SQLAlchemy engine
engine = create_engine(db_url)

In [16]:
# strptime patterns used to parse the 'publisheddate' text, keyed by the
# source prefix taken from the CSV file name. A source without an entry here
# gets no parsed date.
DATE_FORMATS = dict(
    IN="%A, %d %B %Y, %H:%M",   # e.g. "Friday, 07 April 2023, 08:19"
    MT="%d %B %Y, %I:%M%p",     # e.g. "07 April 2023, 08:19AM"
    NB="%B %d, %Y %I:%M %p",    # e.g. "April 07, 2023 08:19 AM"
    TVM="%B %d, %Y",            # e.g. "April 07, 2023"
    ToM="%d %B %Y",             # e.g. "07 April 2023"
)

In [17]:
def processFile(file: str) -> None:
    """Load one articles CSV from DATA_FOLDER and append it to the database.

    The source is the part of the file name before the first underscore. The
    file is read as header-less, Unit-Separator-delimited text with the
    DATA_HEADERS columns, a 'source' column is added, and, when DATE_FORMATS
    has a pattern for the source, a 'parseddate' column is derived from
    'publisheddate'. The rows are then appended to localnews.articles.

    Args:
        file: File name (not a path) inside DATA_FOLDER. Names that do not
            end in '.csv' are ignored.

    Returns:
        None. Rows are appended (if_exists='append'), so processing the same
        file twice inserts its rows twice.

    Errors are not raised: any exception is printed together with the file
    name so that a batch run can continue with the next file. A date that
    does not match the source's pattern therefore skips the whole file.
    """
    try:
        if not file.endswith('.csv'):
            return

        # Determine source by taking the first part (before the underscore)
        source = file.split("_")[0]

        # Fields are separated by the Unit Separator character (ASCII 31).
        # dtype=str keeps every field exactly as written in the file; letting
        # pandas infer types and casting back to str can rewrite values
        # (e.g. "007" -> "7"). With keep_default_na=False and na_values=[''],
        # only empty fields are treated as missing, so they reach the DB as
        # NULL instead of the literal text "nan", and texts such as "NA" or
        # "null" are preserved.
        df = pd.read_csv(
            os.path.join(DATA_FOLDER, file),
            sep='\x1F',
            quoting=csv.QUOTE_MINIMAL,
            quotechar='"',
            header=None,
            names=DATA_HEADERS,
            dtype=str,
            keep_default_na=False,
            na_values=[''],
        )
        df['source'] = source
        df['link'] = df['link'].str.strip()

        # Parse the date string for supported sources; other sources get no
        # 'parseddate' column.
        date_format = DATE_FORMATS.get(source)
        if date_format is not None:
            # Literal (non-regex) removal of the prefix.
            published = df['publisheddate'].str.replace(
                'Last updated on ', '', regex=False)
            df['publisheddate'] = published

            # Only convert full dates, since some articles have just a time.
            # Missing values compare as False here and stay unparsed (NaT).
            is_full_date = published.str.len() > 6
            df['parseddate'] = pd.to_datetime(
                published.where(is_full_date), format=date_format)

        # Convert to DB types
        df = df.convert_dtypes()

        # Dump to DB
        df.to_sql('articles', engine, schema='localnews',
                  if_exists='append', index=False)
    except Exception as e:
        # Best effort by design: report and let the caller move on.
        print("Error occurred during processing file {}.".format(file))
        print(e)

In [30]:
# Test problematic/singular file
# NOTE: processFile appends to the 'articles' table (if_exists='append'), so
# running this cell and then the "Apply to all files" loop over DATA_FOLDER
# will load this file's rows twice.
processFile('MT_20230407_081900.csv')

In [19]:
# Apply to all files
# os.listdir returns names in arbitrary order; sorting makes the load order
# (and so the order rows are appended to the DB) reproducible. Filtering to
# '.csv' up front keeps the progress bar total equal to the files actually
# loaded, since processFile ignores anything else.
csv_files = sorted(name for name in os.listdir(DATA_FOLDER)
                   if name.endswith('.csv'))
for file in tqdm.tqdm(csv_files):
    processFile(file)

100%|██████████| 2/2 [00:00<00:00, 18.98it/s]
