In [4]:
#Library imports
import re
import os
import sys
import shutil
import time
import math
import gzip
import fnmatch
import random
import warnings
import numpy as np
import pandas as pd
import scipy.stats as scs
import urllib.request
import seaborn as sns
import matplotlib.pyplot as plt

from collections import OrderedDict

import scipy.stats as scs

import fiona
import matplotlib.pyplot as plt
import geopandas as gpd
from shapely.geometry import Point, Polygon, LineString
from pyproj import Proj
import geoplot as gplt
import geoplot.crs as gcrs

# Allows access to scripts and modules relative to the parent directory.
parent = os.getcwd()
sys.path.append(os.path.join(parent, "functions"))

# Project specific user driven functions
from cleaning_functions import *

# My open source reusable user driven function repository.
from random_lumberjacks.src.random_lumberjacks.cleaning.cleaning_functions import *
from random_lumberjacks.src.random_lumberjacks.model.model_classes import *
from random_lumberjacks.src.random_lumberjacks.visualization.visualization_functions import *

#Notebook arguments
%matplotlib inline

In [14]:
with gzip.open("010020-99999-2019.gz", "rb") as f:
    data = f.read().splitlines()

array([''], dtype='<U1')

In [80]:
u = np.empty(1, dtype=str)

In [82]:
u[0]

'0'

In [77]:
t.decode("utf-8") 

'01080'

In [81]:
u[0]= t.decode("utf-8") 

In [56]:
t = params[0][0:5]

In [233]:
def parse_fixed_noaa_data(bstring, params):
    
    return [bstring[param[1]:param[2]] for param in params]

def noa_df_convert_nans(df, column, nan_val):
    if nan_val:
        nan_val = bytes(nan_val, "utf-8")
        df[column] = np.where(df[column] == nan_val, np.nan, df[column])

def noa_df_convert_nums(df, column, dtype, scalar):
    df[column] = df[column].astype(dtype)
    if scalar:
        df[column] = df[column]/scalar

def noa_df_convert_strings(df, column, dtype):
    if dtype == "str" or dtype == "datetime":
        df[column] = df[column].str.decode("utf-8")

def noa_df_convert_datetime(df, column, dtype):
    if dtype == "datetime":
        df[column] = pd.to_datetime(df[column], format="%Y%m%d%H%M")

def fix_noaa_df_dtypes(df, params):
    for param in params:
        col, dtype, nan_val, scalar = param[0], param[3], param[4], param[5]
        
        noa_df_convert_nans(df, col, nan_val)
        
        if dtype != "str" and dtype != "datetime":
            noa_df_convert_nums(df, col, dtype, scalar)
        else:
            noa_df_convert_strings(df, col, dtype)
            noa_df_convert_datetime(df, col, dtype)

def raw_noaa_to_dataframe(data, fixed_locs):
    columns = [item[0] for item in [*fixed_locs]]
    noaa_list = []
    for line in data:
        var = line[108:].split(b" ")
        fixed_data = parse_fixed_noaa_data(line, fixed_locs)
        noaa_list.append(fixed_data)
        #print(var)
    df = pd.DataFrame(noaa_list, columns = columns)
    fix_noaa_df_dtypes(df, fixed_locs)
    return df

In [234]:
key = ["column_name", "start", "end", "dtype", "nan_value", "conversion factor"]
fixed_locs = [["USAF_ID", 4, 10, "str", None, None], ["NCEI_WBAN_ID", 10, 15, "str", None, None],
              ["Date", 15, 27, "datetime", None, None], ["Data Source", 27, 28, "str", "9", None],
              ["Latitude", 28, 34, "float64", "+99999", 1000], ["Longitude", 34, 41, "float64", "+99999", 1000],
              ["Code", 41, 46, "str", "99999", None], ["Elevation", 46, 51, "int64", "+9999", None],
              ["Call_Letter", 51, 56, "str", "99999", None], ["Quality_Control", 56, 60, "str", "99999", None],
              ["Wind_Dir", 60, 63, "float64", "999", None], ["Wind_Dir_Q", 63, 64, "str", None, None],
              ["Wind_Type", 64, 65, "str", "9", None], ["Wind_Speed", 65, 69, "float64", "9999", 10],
              ["Wind_Speed_Q", 69, 70, "str", None, None], ["Air Temperature", 87, 92, "float64", "+9999", 10],
              ["Air Temperature_Q", 92, 93, "str", None, None], ["Air_Pressure", 99, 104, "float64", "99999", 10],
              ["Air_Pressure_Q", 104, 105, "str", None, None]]

df = raw_noaa_to_dataframe(data, fixed_locs)
df

Unnamed: 0,USAF_ID,NCEI_WBAN_ID,Date,Data Source,Latitude,Longitude,Code,Elevation,Call_Letter,Quality_Control,Wind_Dir,Wind_Dir_Q,Wind_Type,Wind_Speed,Wind_Speed_Q,Air Temperature,Air Temperature_Q,Air_Pressure,Air_Pressure_Q
0,010020,99999,2019-01-01 00:00:00,4,80.050,16.25,FM-12,8,,V020,320.0,1,N,9.0,1,-19.0,1,1004.8,1
1,010020,99999,2019-01-01 01:00:00,4,80.050,16.25,FM-12,8,,V020,330.0,1,N,10.0,1,-19.1,1,1005.5,1
2,010020,99999,2019-01-01 02:00:00,4,80.050,16.25,FM-12,8,,V020,320.0,1,N,9.0,1,-19.2,1,1006.0,1
3,010020,99999,2019-01-01 03:00:00,4,80.050,16.25,FM-12,8,,V020,320.0,1,N,8.0,1,-19.6,1,1006.4,1
4,010020,99999,2019-01-01 04:00:00,4,80.059,16.25,FM-12,8,,V020,302.0,1,N,8.1,1,-19.4,1,1006.9,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8080,010020,99999,2019-12-31 19:00:00,4,80.050,16.25,FM-12,8,,V020,70.0,1,N,5.0,1,-13.2,1,992.6,1
8081,010020,99999,2019-12-31 20:00:00,4,80.050,16.25,FM-12,8,,V020,60.0,1,N,5.0,1,-13.3,1,992.9,1
8082,010020,99999,2019-12-31 21:00:00,4,80.050,16.25,FM-12,8,,V020,80.0,1,N,2.0,1,-13.1,1,993.0,1
8083,010020,99999,2019-12-31 22:00:00,4,80.050,16.25,FM-12,8,,V020,90.0,1,N,2.0,1,-12.8,1,993.1,1


In [228]:
df["USAF_ID"].dtype(

SyntaxError: unexpected EOF while parsing (<ipython-input-228-92d925f30553>, line 1)

In [2]:
test = pd.read_csv("010020-99999-2019.gz", compression='gzip', header=0, sep=' ', quotechar='"',) #error_bad_lines=False)
test

ParserError: Error tokenizing data. C error: Expected 10 fields in line 7, saw 11
