In [None]:
# !pip install pycountry

import os
import shutil
from zipfile import ZipFile

import pandas as pd
import pycountry

In [None]:
def merge_data(year, year_data, station_df, country_df,
               output_dir='./drive/MyDrive/STAT 480/Data'):
  """Attach station and country details to one year of weather rows and save them as CSV.

  The observations are inner-joined to ``station_df`` (``station_num`` ==
  ``USAF``), reduced to a fixed set of columns, then inner-joined to
  ``country_df`` (``ctry`` == ``FIPS``). All column names are lower-cased.
  The result is written to ``{year}weatherdata_updated.csv`` in the working
  directory and then moved into ``output_dir``.

  Parameters
  ----------
  year : int or str
      Label used in the output file name and in the progress messages.
  year_data : pandas.DataFrame
      Observations for the year. Must contain ``station_num`` and every
      weather column listed in ``kept_columns`` below. It is not modified.
  station_df : pandas.DataFrame
      Station table providing ``USAF``, ``CTRY``, ``ST``, ``LAT``, ``LON``
      and ``ELEV(M)``.
  country_df : pandas.DataFrame
      Country table providing ``FIPS`` and ``ID`` (both dropped after the
      join); its remaining columns are kept.
  output_dir : str, optional
      Directory that receives the CSV. It must already exist. Defaults to
      the Drive folder this notebook has always written to.

  Returns
  -------
  None
      The merged data is only written to disk.
  """
  print(f"Working on {year} -> ", end='')

  # The join key must have the same dtype on both sides, so the station id is
  # compared as text. The cast is done on a copy (``assign``) so the caller's
  # frame keeps its original dtype.
  # NOTE(review): if station_num was parsed as an integer, leading zeros are
  # lost here and would not match zero-padded USAF strings -- confirm.
  observations = year_data.assign(
    station_num=year_data['station_num'].astype('str'))

  kept_columns = [
    'station_num', 'temp_ft', 'dewpt_ft', 'slp_mb', 'visib_mi', 'wind_knt',
    'maxwind_knt', 'max_gust_knt', 'year', 'month', 'day', 'fog', 'rain',
    'snow', 'hail', 'thunder', 'tornado', 'precip_in', 'precip_flag',
    'max_temp_frnht', 'min_temp_frnht',
    'CTRY', 'ST', 'LAT', 'LON', 'ELEV(M)',
  ]

  # Inner join: observations whose station id has no match are dropped.
  # NOTE(review): if USAF is not unique in station_df, each matching
  # observation is repeated once per station row -- confirm this is intended.
  merged_df = pd.merge(
    observations,
    station_df,
    left_on='station_num',
    right_on='USAF'
  )[kept_columns]
  merged_df.columns = merged_df.columns.str.lower()

  # Second inner join resolves the country code to the country table's columns.
  fin_df = pd.merge(
    merged_df,
    country_df,
    left_on='ctry',
    right_on='FIPS'
  ).drop(columns=['FIPS', 'ID'])
  fin_df.columns = fin_df.columns.str.lower()

  # Persist to disk instead of returning the frame (keeps Colab RAM usage
  # down): write to the working directory first, then move into output_dir.
  file_name = f'{year}weatherdata_updated.csv'
  destination = os.path.join(output_dir, file_name)
  fin_df.to_csv(file_name, index=False)
  shutil.move(f"./{file_name}", destination)

  print(f"finished @ {destination}")


In [None]:
def extract_csv_from_zip(start_year, end_year,
                         zip_dir='./drive/MyDrive/STAT 480/Zip Files/',
                         station_file='./drive/MyDrive/STAT 480/isd-history.txt',
                         country_file='./drive/MyDrive/STAT 480/country-list.txt'):
  """Load each year's weather CSV and pass it to ``merge_data``.

  For every year from ``start_year`` to ``end_year`` (inclusive) the function
  looks in ``zip_dir`` for ``{year}_dir.zip`` first and for a plain
  ``{year}weatherdata.csv`` second. Years with neither file are reported and
  skipped.

  Parameters
  ----------
  start_year, end_year : int
      First and last year to process (both inclusive).
  zip_dir : str, optional
      Directory holding the yearly archives / CSV files.
  station_file : str, optional
      Fixed-width station identification file.
  country_file : str, optional
      Fixed-width country list file.

  Returns
  -------
  None
      Output is produced by ``merge_data``.
  """
  # The first 20 lines of the station file are skipped before parsing
  # (presumably a descriptive preamble ahead of the table -- confirm).
  station_df = pd.read_fwf(station_file, skiprows=range(20))
  country_df = pd.read_fwf(country_file)

  # List the directory once: nothing in this loop adds files to it, and
  # re-listing a mounted Drive folder for every year is needlessly slow.
  available = set(os.listdir(zip_dir))

  for year in range(start_year, end_year + 1):
    zip_name = f'{year}_dir.zip'
    csv_name = f'{year}weatherdata.csv'

    if zip_name in available:
      with ZipFile(os.path.join(zip_dir, zip_name), 'r') as archive:
        # Ignore members whose path starts with '_' (e.g. archive metadata
        # folders) and anything that is not a CSV.
        csv_members = [
          member for member in archive.namelist()
          if not member.startswith('_') and member.endswith('.csv')
        ]
        if not csv_members:
          print(f'{year} skipped: no CSV file inside {zip_name}')
          continue
        # Read straight from the archive so no extracted copy is left behind
        # in the working directory.
        with archive.open(csv_members[0]) as csv_stream:
          df = pd.read_csv(csv_stream).drop(columns='Unnamed: 0', errors='ignore')
      merge_data(year, df, station_df, country_df)

    elif csv_name in available:
      merge_data(year, pd.read_csv(os.path.join(zip_dir, csv_name)), station_df, country_df)

    else:
      print(f'{year} skipped: no data file found in {zip_dir}')


In [None]:
# Process every yearly file from 2000 through 2019 (both ends inclusive).
extract_csv_from_zip(start_year=2000, end_year=2019)

Working on 2000 -> finished @ ./drive/MyDrive/STAT 480/Data/2000weatherdata_updated.csv
Working on 2001 -> finished @ ./drive/MyDrive/STAT 480/Data/2001weatherdata_updated.csv
Working on 2002 -> finished @ ./drive/MyDrive/STAT 480/Data/2002weatherdata_updated.csv
Working on 2003 -> finished @ ./drive/MyDrive/STAT 480/Data/2003weatherdata_updated.csv
Working on 2004 -> finished @ ./drive/MyDrive/STAT 480/Data/2004weatherdata_updated.csv
Working on 2005 -> finished @ ./drive/MyDrive/STAT 480/Data/2005weatherdata_updated.csv
Working on 2006 -> finished @ ./drive/MyDrive/STAT 480/Data/2006weatherdata_updated.csv
2007 not in selected range
Working on 2008 -> finished @ ./drive/MyDrive/STAT 480/Data/2008weatherdata_updated.csv
Working on 2009 -> finished @ ./drive/MyDrive/STAT 480/Data/2009weatherdata_updated.csv
Working on 2010 -> finished @ ./drive/MyDrive/STAT 480/Data/2010weatherdata_updated.csv
Working on 2011 -> finished @ ./drive/MyDrive/STAT 480/Data/2011weatherdata_updated.csv
Worki