In [9]:
import os
import pandas as pd

# Function to parse data from a single file
def parse_file(file_path):
    """Parse the weather observations stored in a single text file.

    The observation date is taken from the file name: everything before the
    first '.' in the base name. Each line is split on whitespace and
    treated as one observation. Lines are skipped when they have fewer than
    10 tokens, or when the temperature, dew point or humidity token is not
    an integer (e.g. a column-header row), so one stray line no longer
    aborts the whole file with a ValueError.

    Args:
        file_path: Path of the text file to read.

    Returns:
        A list of rows of the form
        [date, time, am_pm, temp, dew_point, humidity, wind_speed].
        temp, dew_point and humidity are ints; date, time, am_pm and
        wind_speed are kept as the strings found in the file. The list is
        empty when no line qualifies.
    """
    data = []
    date = os.path.basename(file_path).split('.')[0]  # Extract date from filename
    with open(file_path, 'r') as file:
        for line in file:
            parts = line.split()
            if len(parts) < 10:
                continue
            try:
                # Tokens 3, 5, 7 and 8 are not used; presumably unit symbols
                # and the wind direction -- TODO confirm against the raw files.
                temp = int(parts[2])
                dew_point = int(parts[4])
                humidity = int(parts[6])
            except ValueError:
                # Long enough to look like data, but the numeric fields are
                # not numbers: treat it like any other non-data line.
                continue
            # Wind speed is intentionally left as a string, as before.
            data.append([date, parts[0], parts[1], temp, dew_point, humidity, parts[9]])
    return data

# One directory of daily weather text files per city
nyc_folder_path = '/Users/samdvorin/Desktop/code/542/weather-prediction/nyc_weather_data'
chi_folder_path = '/Users/samdvorin/Desktop/code/542/weather-prediction/chi_weather_data'
mia_folder_path = '/Users/samdvorin/Desktop/code/542/weather-prediction/miami_weather_data'
aus_folder_path = '/Users/samdvorin/Desktop/code/542/weather-prediction/austin_weather_data'


# Pool the parsed rows of every .txt file found in the NYC directory
nyc_all_data = []
for file_name in os.listdir(nyc_folder_path):
    if not file_name.endswith('.txt'):
        continue
    file_path = os.path.join(nyc_folder_path, file_name)
    nyc_all_data += parse_file(file_path)

# Build the frame in one chain: load the rows, fuse date/time/am-pm into a
# single 'datetime' column, discard the three source columns, then order the
# observations chronologically (the original row labels are kept).
nyc_df = (
    pd.DataFrame(
        nyc_all_data,
        columns=['date', 'time', 'am/pm', 'temp', 'dew point', 'humidity', 'wind speed'],
    )
    .assign(
        datetime=lambda frame: pd.to_datetime(
            frame['date'] + ' ' + frame['time'] + ' ' + frame['am/pm'],
            format='%Y-%m-%d %I:%M %p',
        )
    )
    .drop(columns=['date', 'time', 'am/pm'])
    .sort_values(by='datetime')
)

print(nyc_df.head())

     temp  dew point  humidity wind speed            datetime
228    46         32        58          5 2024-02-01 00:51:00
202    39         30        70          0 2024-02-01 01:46:00
203    40         30        68          0 2024-02-01 01:51:00
204    39         30        70          5 2024-02-01 02:46:00
205    40         29        65          6 2024-02-01 02:51:00


In [8]:
### testing to see if it works
# Work on an independent copy so this check never alters nyc_df
df_copy = nyc_df.copy()

# nyc_df, as built by the loading cell, already carries 'datetime' and no
# longer has the raw date/time/am-pm columns, so referencing them
# unconditionally raises KeyError on a fresh top-to-bottom run. Rebuild the
# datetime column only when the raw columns are actually still present.
raw_columns = ['date', 'time', 'am/pm']
if all(column in df_copy.columns for column in raw_columns):
    df_copy['datetime'] = pd.to_datetime(
        df_copy['date'] + ' ' + df_copy['time'] + ' ' + df_copy['am/pm'],
        format='%Y-%m-%d %I:%M %p',
    )
    df_copy = df_copy.drop(columns=raw_columns)

# Order the observations chronologically
df_copy = df_copy.sort_values(by='datetime')

print(df_copy)

# Persist the result for inspection outside the notebook
df_copy.to_csv('316test2.csv', index=False)

     temp  dew point  humidity wind speed            datetime
228    46         32        58          5 2024-02-01 00:51:00
202    39         30        70          0 2024-02-01 01:46:00
203    40         30        68          0 2024-02-01 01:51:00
204    39         30        70          5 2024-02-01 02:46:00
205    40         29        65          6 2024-02-01 02:51:00
..    ...        ...       ...        ...                 ...
512    68         50        52          9 2024-03-15 11:51:00
513    69         49        49         23 2024-03-15 12:51:00
514    68         48        49         15 2024-03-15 13:51:00
515    68         46        45         16 2024-03-15 14:51:00
516    69         41        36         21 2024-03-15 15:51:00

[1159 rows x 5 columns]
