In [1]:
import pandas as pd
import numpy as np
import datetime as dt
import matplotlib.pyplot as plt
import seaborn as sns

import warnings
warnings.filterwarnings('ignore')

# To-Do:
- Drop severerisk
- Drop snow

# Thoughts, Questions, and Considerations:
- We don't necessarily want to include pandemic-era data, as that is known to be anomalous.
- Things like major storms (e.g. Harvey) may be worth removing as well

# Joining DataFrames and Creating Unified CSV:

## Houston:

In [2]:
# Read the five Houston CSV extracts (in filename order) and stack them
# row-wise into one frame.
houston_paths = [
    'houston_00_04.csv',
    'houston_05_09.csv',
    'houston_10_14.csv',
    'houston_15_19.csv',
    'houston_20_22.csv',
]
hstn1, hstn2, hstn3, hstn4, hstn5 = map(pd.read_csv, houston_paths)
houston = pd.concat([hstn1, hstn2, hstn3, hstn4, hstn5])

FileNotFoundError: [Errno 2] No such file or directory: 'houston_00_04.csv'

In [None]:
# Row count of the combined Houston frame.
len(houston)

In [None]:
# Preview the first five rows of the combined Houston frame.
houston.head(n = 5)

## Galveston:

In [None]:
# Read the five Galveston CSV extracts (in filename order) and stack them
# row-wise into one frame.
galveston_paths = [
    'galveston_00_04.csv',
    'galveston_05_09.csv',
    'galveston_10_14.csv',
    'galveston_15_19.csv',
    'galveston_20_22.csv',
]
gvsn1, gvsn2, gvsn3, gvsn4, gvsn5 = map(pd.read_csv, galveston_paths)
galveston = pd.concat([gvsn1, gvsn2, gvsn3, gvsn4, gvsn5])

In [None]:
# Row count of the combined Galveston frame.
len(galveston)

In [None]:
# Preview the first two rows of the combined Galveston frame.
galveston.head(n = 2)

## Port Lavaca:

In [None]:
# Read the five Port Lavaca CSV extracts (in filename order) and stack them
# row-wise into one frame.
port_lavaca_paths = [
    'portlavaca_00_04.csv',
    'portlavaca_05_09.csv',
    'portlavaca_10_14.csv',
    'portlavaca_15_19.csv',
    'portlavaca_20_22.csv',
]
ptlv1, ptlv2, ptlv3, ptlv4, ptlv5 = map(pd.read_csv, port_lavaca_paths)
port_lavaca = pd.concat([ptlv1, ptlv2, ptlv3, ptlv4, ptlv5])

In [None]:
# Row count of the combined Port Lavaca frame.
len(port_lavaca)

In [None]:
# Preview the first two rows of the combined Port Lavaca frame.
port_lavaca.head(n = 2)

## Victoria:

In [None]:
# Read the five Victoria CSV extracts (in filename order) and stack them
# row-wise into one frame.
victoria_paths = [
    'victoria_00_04.csv',
    'victoria_05_09.csv',
    'victoria_10_14.csv',
    'victoria_15_19.csv',
    'victoria_20_22.csv',
]
vctr1, vctr2, vctr3, vctr4, vctr5 = map(pd.read_csv, victoria_paths)
victoria = pd.concat([vctr1, vctr2, vctr3, vctr4, vctr5])

In [None]:
# Row count of the combined Victoria frame.
len(victoria)

In [None]:
# Preview the first two rows of the combined Victoria frame.
victoria.head(n = 2)

In [None]:
# Column dtypes of the combined Houston frame.
houston.dtypes

# Defining Functions to Clean DataFrames for Concatenation:
- Steps Needed:
    - Take in df
    - Dropping Columns function:
        - Drop non-weather columns, like address, lat/long, etc. 
        - Also drop snow, snowdepth, severerisk, source, preciptype, precipprob, and conditions (others, if needed)
    - Datetime function:
        - Convert 'datetime' column to datetime format
        - Set index to datetime
        - Keep only dates after 12-31-09 and before 7-1-17 (drop everything outside that window).
        - Resample by the hour to clean up the data a bit    
    - Rename all relevant columns to contain location name (separate function, most likely)

In [None]:
# Show the combined Houston frame.
houston

In [None]:
# Work on a deep copy so changes made to `df` do not touch `houston`.
df = houston.copy(deep = True)

## Dropping Columns:

In [None]:
# drp_cols = ['name', 'address', 'resolvedAddress', 'latitude', 'longitude', 'snow', 'snowdepth', 'severerisk','preciptype','precipprob','source','conditions']
# df.drop(columns = drp_cols, inplace = True)

In [None]:
#houston = drop_columns(houston)

In [None]:
#houston.head()

## Converting to Datetime and Setting Index:

In [None]:
def date_cleaner(df):
    '''
    Takes in a DataFrame, converts its 'datetime' column to DateTime
    format, sets the index to that column, keeps only observations from
    2010-01-01 00:00 up to (but not including) 2017-07-01 00:00, and
    resamples by the hour using the mean.

    The input DataFrame is left unmodified; a new DataFrame is returned.
    Only numeric columns are kept by the hourly mean.

    Returns: pandas.DataFrame
    Parameters:
        df: pandas.DataFrame with a 'datetime' column that
            pandas.to_datetime can parse
    '''
    #converting 'datetime' column to datetime format and setting it as the
    #index; assign() builds a new frame so the caller's frame is untouched:
    df = df.assign(datetime = pd.to_datetime(df['datetime'])).set_index('datetime')
    #filter dates: half-open window [2010-01-01, 2017-07-01). The lower bound
    #is inclusive so midnight on 1 Jan 2010 is kept and all of 31 Dec 2009
    #is excluded:
    df = df[(df.index >= '2010-01-01') & (df.index < '2017-07-01')]
    #resampling by hour; numeric_only skips text columns, which would
    #otherwise make mean() raise on pandas >= 2.0:
    df = df.resample('h').mean(numeric_only = True)
    return df

## Renaming Columns:

In [None]:
def rename_columns(df, city):
    '''
    Takes in a DataFrame and a string of the city name,
    returns a new pandas DataFrame with every column renamed
    to '<city>_<original name>'.

    All labels are prefixed in a single pass, so a column whose
    name already starts with the city prefix (e.g. 'hs_temp') is
    prefixed exactly once instead of being renamed twice.
    The input DataFrame is not modified.

    Returns: pandas.DataFrame
    Parameters:
        df: pandas.DataFrame
        city: str, prefix placed before each column name
    '''
    return df.add_prefix(city + '_')

In [None]:
# Preview the first row of `df`.
df.head(n = 1)

## Total Function, Step-Wise:

In [None]:
def weather_cleaner(df, city):
    '''
    Takes in a DataFrame and a city name, drops unneeded columns,
    cleans up the 'datetime' column and places it as the index,
    and renames all measurement columns to include their city of
    observation.

    The columns are dropped on a new frame rather than in place, so
    the frame passed in keeps all of its columns and this function
    can be called again on the same input.

    Returns: pandas.DataFrame
    Parameters:
        df: pandas.DataFrame containing every column listed in
            drp_cols below (a missing one raises KeyError)
        city: str, passed to rename_columns for the column names
    '''
    #Dropping unneeded columns:
    drp_cols = [
        'name', 'address', 'resolvedAddress', 'latitude', 'longitude',
        'snow', 'snowdepth', 'severerisk', 'preciptype', 'precipprob',
        'source', 'conditions',
    ]
    df = df.drop(columns = drp_cols)
    #Cleaning up dates:
    df = date_cleaner(df)
    #Renaming Columns:
    df = rename_columns(df, city)
    return df

# Loop:

In [None]:
df_list = [houston, galveston, port_lavaca, victoria]
city_list = ['hs', 'gv', 'pl', 'vc']

# Clean every city's frame first, then join them side by side with a single
# concat instead of growing a frame inside the loop. The comprehension
# variable is deliberately not named `df`, so the notebook-level `df` is not
# overwritten by this cell.
cleaned_dfs = [
    weather_cleaner(city_frame, city)
    for city_frame, city in zip(df_list, city_list)
]
total_df = pd.concat(cleaned_dfs, axis = 1)

In [None]:
# Write the combined frame (index included) to disk.
total_df.to_csv(path_or_buf = 'total_df.csv')

In [None]:
import pandas as pd
import numpy as np
import datetime as dt
import matplotlib.pyplot as plt
import seaborn as sns

import warnings
warnings.filterwarnings('ignore')

# To-Do:
- Drop severerisk
- Drop snow

# Thoughts, Questions, and Considerations:
- We don't necessarily want to include pandemic-era data, as that is known to be anomalous.
- Things like major storms (e.g. Harvey) may be worth removing as well

# Joining DataFrames and Creating Unified CSV:

## Houston:

# Read the five Houston CSV extracts (in filename order) and stack them
# row-wise into one frame.
houston_paths = [
    'houston_00_04.csv',
    'houston_05_09.csv',
    'houston_10_14.csv',
    'houston_15_19.csv',
    'houston_20_22.csv',
]
hstn1, hstn2, hstn3, hstn4, hstn5 = map(pd.read_csv, houston_paths)
houston = pd.concat([hstn1, hstn2, hstn3, hstn4, hstn5])

# Row count of the combined Houston frame.
len(houston)

# Preview the first five rows.
houston.head(n = 5)

## Galveston:

# Read the five Galveston CSV extracts (in filename order) and stack them
# row-wise into one frame.
galveston_paths = [
    'galveston_00_04.csv',
    'galveston_05_09.csv',
    'galveston_10_14.csv',
    'galveston_15_19.csv',
    'galveston_20_22.csv',
]
gvsn1, gvsn2, gvsn3, gvsn4, gvsn5 = map(pd.read_csv, galveston_paths)
galveston = pd.concat([gvsn1, gvsn2, gvsn3, gvsn4, gvsn5])

# Row count of the combined Galveston frame.
len(galveston)

# Preview the first two rows.
galveston.head(n = 2)

## Port Lavaca:

# Read the five Port Lavaca CSV extracts (in filename order) and stack them
# row-wise into one frame.
port_lavaca_paths = [
    'portlavaca_00_04.csv',
    'portlavaca_05_09.csv',
    'portlavaca_10_14.csv',
    'portlavaca_15_19.csv',
    'portlavaca_20_22.csv',
]
ptlv1, ptlv2, ptlv3, ptlv4, ptlv5 = map(pd.read_csv, port_lavaca_paths)
port_lavaca = pd.concat([ptlv1, ptlv2, ptlv3, ptlv4, ptlv5])

# Row count of the combined Port Lavaca frame.
len(port_lavaca)

# Preview the first two rows.
port_lavaca.head(n = 2)

## Victoria:

# Read the five Victoria CSV extracts (in filename order) and stack them
# row-wise into one frame.
victoria_paths = [
    'victoria_00_04.csv',
    'victoria_05_09.csv',
    'victoria_10_14.csv',
    'victoria_15_19.csv',
    'victoria_20_22.csv',
]
vctr1, vctr2, vctr3, vctr4, vctr5 = map(pd.read_csv, victoria_paths)
victoria = pd.concat([vctr1, vctr2, vctr3, vctr4, vctr5])

# Row count of the combined Victoria frame.
len(victoria)

# Preview the first two rows.
victoria.head(n = 2)

# Column dtypes of the combined Houston frame.
houston.dtypes

# Defining Functions to Clean DataFrames for Concatenation:
- Steps Needed:
    - Take in df
    - Dropping Columns function:
        - Drop non-weather columns, like address, lat/long, etc. 
        - Also drop snow, snowdepth, severerisk, source, preciptype, precipprob, and conditions (others, if needed)
    - Datetime function:
        - Convert 'datetime' column to datetime format
        - Set index to datetime
        - Keep only dates after 12-31-09 and before 7-1-17 (drop everything outside that window).
        - Resample by the hour to clean up the data a bit    
    - Rename all relevant columns to contain location name (separate function, most likely)

# Show the combined Houston frame.
houston

# Work on a deep copy so changes made to `df` do not touch `houston`.
df = houston.copy(deep = True)

## Dropping Columns:

# drp_cols = ['name', 'address', 'resolvedAddress', 'latitude', 'longitude', 'snow', 'snowdepth', 'severerisk','preciptype','precipprob','source','conditions']
# df.drop(columns = drp_cols, inplace = True)

#houston = drop_columns(houston)

#houston.head()

## Converting to Datetime and Setting Index:

def date_cleaner(df):
    '''
    Takes in a DataFrame, converts its 'datetime' column to DateTime
    format, sets the index to that column, keeps only observations from
    2010-01-01 00:00 up to (but not including) 2017-07-01 00:00, and
    resamples by the hour using the mean.

    The input DataFrame is left unmodified; a new DataFrame is returned.
    Only numeric columns are kept by the hourly mean.

    Returns: pandas.DataFrame
    Parameters:
        df: pandas.DataFrame with a 'datetime' column that
            pandas.to_datetime can parse
    '''
    #converting 'datetime' column to datetime format and setting it as the
    #index; assign() builds a new frame so the caller's frame is untouched:
    df = df.assign(datetime = pd.to_datetime(df['datetime'])).set_index('datetime')
    #filter dates: half-open window [2010-01-01, 2017-07-01). The lower bound
    #is inclusive so midnight on 1 Jan 2010 is kept and all of 31 Dec 2009
    #is excluded:
    df = df[(df.index >= '2010-01-01') & (df.index < '2017-07-01')]
    #resampling by hour; numeric_only skips text columns, which would
    #otherwise make mean() raise on pandas >= 2.0:
    df = df.resample('h').mean(numeric_only = True)
    return df

## Renaming Columns:

def rename_columns(df, city):
    '''
    Takes in a DataFrame and a string of the city name,
    returns a new pandas DataFrame with every column renamed
    to '<city>_<original name>'.

    All labels are prefixed in a single pass, so a column whose
    name already starts with the city prefix (e.g. 'hs_temp') is
    prefixed exactly once instead of being renamed twice.
    The input DataFrame is not modified.

    Returns: pandas.DataFrame
    Parameters:
        df: pandas.DataFrame
        city: str, prefix placed before each column name
    '''
    return df.add_prefix(city + '_')



# Preview the first row of `df`.
df.head(n = 1)

## Total Function, Step-Wise:

def weather_cleaner(df, city):
    '''
    Takes in a DataFrame and a city name, drops unneeded columns,
    cleans up the 'datetime' column and places it as the index,
    and renames all measurement columns to include their city of
    observation.

    The columns are dropped on a new frame rather than in place, so
    the frame passed in keeps all of its columns and this function
    can be called again on the same input.

    Returns: pandas.DataFrame
    Parameters:
        df: pandas.DataFrame containing every column listed in
            drp_cols below (a missing one raises KeyError)
        city: str, passed to rename_columns for the column names
    '''
    #Dropping unneeded columns:
    drp_cols = [
        'name', 'address', 'resolvedAddress', 'latitude', 'longitude',
        'snow', 'snowdepth', 'severerisk', 'preciptype', 'precipprob',
        'source', 'conditions',
    ]
    df = df.drop(columns = drp_cols)
    #Cleaning up dates:
    df = date_cleaner(df)
    #Renaming Columns:
    df = rename_columns(df, city)
    return df

# Loop:

df_list = [houston, galveston, port_lavaca, victoria]
city_list = ['hs', 'gv', 'pl', 'vc']

# Clean every city's frame first, then join them side by side with a single
# concat instead of growing a frame inside the loop. The comprehension
# variable is deliberately not named `df`, so the notebook-level `df` is not
# overwritten by this cell.
cleaned_dfs = [
    weather_cleaner(city_frame, city)
    for city_frame, city in zip(df_list, city_list)
]
total_df = pd.concat(cleaned_dfs, axis = 1)

# Write the combined frame (index included) to disk.
total_df.to_csv(path_or_buf = 'total_df.csv')