In [1]:
import pandas as pd
import boto3

In [2]:
# Where the raw city-level temperature CSV lives in S3
bucket_name = 'awsbucket3323'
weather_file_path = 'capstone/raw-data/GlobalLandTemperaturesByCity.csv'

# Open a boto3 S3 client handle for this section
s3 = boto3.client('s3')

# Load the whole CSV into a DataFrame by handing pandas the s3:// URL
weather_url = f's3://{bucket_name}/{weather_file_path}'
df = pd.read_csv(weather_url)

In [3]:
# Preview the first five rows of the city-level data as plain text
# (to_string() shows the index and the header by default)
df_str = df.head(5).to_string()
print(df_str)

           dt  AverageTemperature  AverageTemperatureUncertainty   City  Country Latitude Longitude
0  1743-11-01               6.068                          1.737  Århus  Denmark   57.05N    10.33E
1  1743-12-01                 NaN                            NaN  Århus  Denmark   57.05N    10.33E
2  1744-01-01                 NaN                            NaN  Århus  Denmark   57.05N    10.33E
3  1744-02-01                 NaN                            NaN  Århus  Denmark   57.05N    10.33E
4  1744-03-01                 NaN                            NaN  Århus  Denmark   57.05N    10.33E


In [4]:
# Non-null count for every column; the table is large, so the rows get filtered down later
non_null_counts = df.count()
print(non_null_counts)

dt                               8599212
AverageTemperature               8235082
AverageTemperatureUncertainty    8235082
City                             8599212
Country                          8599212
Latitude                         8599212
Longitude                        8599212
dtype: int64


In [5]:
# Missing values per column -- only the two temperature columns contain any
null_counts = df.isna().sum()
print(null_counts)

dt                                    0
AverageTemperature               364130
AverageTemperatureUncertainty    364130
City                                  0
Country                               0
Latitude                              0
Longitude                             0
dtype: int64


In [6]:
# Convert the 'dt' column from the date strings read out of the CSV to datetimes,
# so the date-range filter and the .dt accessor used in later cells operate on real dates
df['dt'] = pd.to_datetime(df['dt'])

In [7]:
# Keep only the rows dated 2000-01-01 through 2013-12-31 (both bounds inclusive)
start_date = '2000-01-01'
end_date = '2013-12-31'
# .copy() makes filtered_df an independent frame instead of a slice of df, so the
# column assignments made on it in later cells do not raise SettingWithCopyWarning
filtered_df = df[df['dt'].between(start_date, end_date)].copy()

In [8]:
# The truncated printout shows the first and last rows, confirming that the data was
# successfully filtered to start in 2000 and end in 2013
print(filtered_df)

                dt  AverageTemperature  AverageTemperatureUncertainty    City  \
3074    2000-01-01               3.065                          0.372   Århus   
3075    2000-02-01               3.724                          0.241   Århus   
3076    2000-03-01               3.976                          0.296   Århus   
3077    2000-04-01               8.321                          0.221   Århus   
3078    2000-05-01              13.567                          0.253   Århus   
...            ...                 ...                            ...     ...   
8599207 2013-05-01              11.464                          0.236  Zwolle   
8599208 2013-06-01              15.043                          0.261  Zwolle   
8599209 2013-07-01              18.775                          0.193  Zwolle   
8599210 2013-08-01              18.025                          0.298  Zwolle   
8599211 2013-09-01                 NaN                            NaN  Zwolle   

             Country Latitu

In [10]:
# Add year, month and day columns derived from 'dt'.
# assign() returns a new frame rather than writing into the existing one through .loc,
# so this cell never modifies a frame that may be a slice of df (the usual cause of
# SettingWithCopyWarning) and it can be re-run safely.
filtered_df = filtered_df.assign(
    date_year=filtered_df['dt'].dt.year,
    date_month=filtered_df['dt'].dt.month,
    date_day=filtered_df['dt'].dt.day,
)

In [11]:
# Sanity-check the new date columns on the first few rows
head_rows = filtered_df.head()
print(head_rows)

             dt  AverageTemperature  AverageTemperatureUncertainty   City  \
3074 2000-01-01               3.065                          0.372  Århus   
3075 2000-02-01               3.724                          0.241  Århus   
3076 2000-03-01               3.976                          0.296  Århus   
3077 2000-04-01               8.321                          0.221  Århus   
3078 2000-05-01              13.567                          0.253  Århus   

      Country Latitude Longitude  date_year  date_month  date_day  
3074  Denmark   57.05N    10.33E       2000           1         1  
3075  Denmark   57.05N    10.33E       2000           2         1  
3076  Denmark   57.05N    10.33E       2000           3         1  
3077  Denmark   57.05N    10.33E       2000           4         1  
3078  Denmark   57.05N    10.33E       2000           5         1  


In [12]:
# Write the filtered city data to a local CSV file, leaving out the index column
filtered_file_path = 'filtered_weather_city_data.csv'
filtered_df.to_csv(path_or_buf=filtered_file_path, index=False)

In [21]:
# The saved CSV was then uploaded to the AWS S3 bucket by hand through the AWS
# management console; no cell shown here performs the upload.
# This variable only records the S3 key where the filtered weather data was placed.
# NOTE(review): the key ends in 'filtered_weather_data.csv' while the local file is
# saved as 'filtered_weather_city_data.csv' -- confirm the object was renamed on upload.
destination_file_path = 'capstone/filtered-data/filtered_weather_data.csv'

In [13]:
# Close the boto3 S3 client that was created when the city data was loaded
s3.close()

## Applying the same filters to the Country table

In [14]:
# Where the raw country-level temperature CSV lives in S3
bucket_name = 'awsbucket3323'
country_file_path = 'capstone/raw-data/GlobalLandTemperaturesByCountry.csv'

# Open a boto3 S3 client handle for this section
s3 = boto3.client('s3')

# Load the whole CSV into a DataFrame by handing pandas the s3:// URL
country_url = f's3://{bucket_name}/{country_file_path}'
df = pd.read_csv(country_url)

In [15]:
# Preview the country-level data; the records begin in 1743, which is earlier than needed
df_str = df.head(5).to_string()
print(df_str)

# Non-null count for every column
non_null_counts = df.count()
print(non_null_counts)

           dt  AverageTemperature  AverageTemperatureUncertainty Country
0  1743-11-01               4.384                          2.294   Åland
1  1743-12-01                 NaN                            NaN   Åland
2  1744-01-01                 NaN                            NaN   Åland
3  1744-02-01                 NaN                            NaN   Åland
4  1744-03-01                 NaN                            NaN   Åland
dt                               577462
AverageTemperature               544811
AverageTemperatureUncertainty    545550
Country                          577462
dtype: int64


In [16]:
# Convert the 'dt' column from the date strings read out of the CSV to datetimes,
# so the date-range filter and the .dt accessor used in later cells operate on real dates
df['dt'] = pd.to_datetime(df['dt'])

In [17]:
# Keep only the rows dated 2000-01-01 through 2013-12-31 (both bounds inclusive)
start_date = '2000-01-01'
end_date = '2013-12-31'
# .copy() makes filtered_df an independent frame instead of a slice of df, so the
# column assignments made on it in later cells do not raise SettingWithCopyWarning
filtered_df = df[df['dt'].between(start_date, end_date)].copy()
print(filtered_df)  # the first/last rows printed confirm the dt start/end points

               dt  AverageTemperature  AverageTemperatureUncertainty   Country
3074   2000-01-01               0.197                          0.407     Åland
3075   2000-02-01              -0.023                          0.399     Åland
3076   2000-03-01               0.615                          0.429     Åland
3077   2000-04-01               4.124                          0.348     Åland
3078   2000-05-01               8.557                          0.447     Åland
...           ...                 ...                            ...       ...
577457 2013-05-01              19.059                          1.022  Zimbabwe
577458 2013-06-01              17.613                          0.473  Zimbabwe
577459 2013-07-01              17.000                          0.453  Zimbabwe
577460 2013-08-01              19.759                          0.717  Zimbabwe
577461 2013-09-01                 NaN                            NaN  Zimbabwe

[40095 rows x 4 columns]


In [21]:
# Add year, month and day columns derived from 'dt'.
# assign() returns a new frame rather than writing into the existing one through .loc,
# so this cell never modifies a frame that may be a slice of df (the usual cause of
# SettingWithCopyWarning) and it can be re-run safely.
filtered_df = filtered_df.assign(
    date_year=filtered_df['dt'].dt.year,
    date_month=filtered_df['dt'].dt.month,
    date_day=filtered_df['dt'].dt.day,
)

In [19]:
# Sanity-check the new date columns on the first few rows
head_rows = filtered_df.head()
print(head_rows)

             dt  AverageTemperature  AverageTemperatureUncertainty Country  \
3074 2000-01-01               0.197                          0.407   Åland   
3075 2000-02-01              -0.023                          0.399   Åland   
3076 2000-03-01               0.615                          0.429   Åland   
3077 2000-04-01               4.124                          0.348   Åland   
3078 2000-05-01               8.557                          0.447   Åland   

      date_year  date_month  date_day  
3074       2000           1         1  
3075       2000           2         1  
3076       2000           3         1  
3077       2000           4         1  
3078       2000           5         1  


In [20]:
# Write the filtered country data to a local CSV file, leaving out the index column
filtered_file_path = 'filtered_weather_country_data.csv'
filtered_df.to_csv(path_or_buf=filtered_file_path, index=False)

In [None]:
# The saved CSV was then uploaded to the AWS S3 bucket by hand through the AWS
# management console; no cell shown here performs the upload.
# This variable only records the S3 key where the filtered country data was placed.
destination_file_path = 'capstone/filtered-data/filtered_weather_country_data.csv'

In [22]:
# Close the boto3 S3 client that was created when the country data was loaded
s3.close()