In [1]:
import pandas as pd

# Specify the file path to the CSV file
file_path = 'US-complete-data.csv'

# Define the column names for your data
column_names = ['region','country', 'state', 'city','month', 'day',  'year', 'avgtemperature']

# Define the data types for each column
data_types = {'region': str, 'country': str, 'state': str,  'city': str, 'month': int, 'day': int, 'year': int, 'avgtemperature': float}

# Read the CSV file into a DataFrame
df = pd.read_csv(file_path, names=column_names, skiprows=1, dtype=data_types)

# Perform data analysis here

# some cleaning
# df = df[(df['day'] != 0) & (df.avgtemperature != -99)]

In [2]:
# create datetime clumn
df["date"] = pd.to_datetime(df[["year", "month", "day"]])

In [3]:
df.head()

Unnamed: 0,region,country,state,city,month,day,year,avgtemperature,date
0,North America,US,Alabama,Birmingham,1,1,1995,50.7,1995-01-01
1,North America,US,Alabama,Birmingham,1,2,1995,37.2,1995-01-02
2,North America,US,Alabama,Birmingham,1,3,1995,33.2,1995-01-03
3,North America,US,Alabama,Birmingham,1,4,1995,33.3,1995-01-04
4,North America,US,Alabama,Birmingham,1,5,1995,26.4,1995-01-05


In [4]:
# get an average temperature for each month for each city
df_grouped = df.drop(columns=['month', 'day','year','state', 'region'])
df_grouped = df_grouped.groupby(['country', 'city', pd.Grouper(key="date", freq='M')]).mean()

In [5]:
df_grouped.head(20)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,avgtemperature
country,city,date,Unnamed: 3_level_1
US,Abilene,1995-01-31,45.503226
US,Abilene,1995-02-28,50.042857
US,Abilene,1995-03-31,53.096774
US,Abilene,1995-04-30,62.623333
US,Abilene,1995-05-31,69.76129
US,Abilene,1995-06-30,76.12
US,Abilene,1995-07-31,82.870968
US,Abilene,1995-08-31,80.609677
US,Abilene,1995-09-30,73.576667
US,Abilene,1995-10-31,66.103226


In [10]:
# Convert 'date' column to datetime format
df['date'] = pd.to_datetime(df['date'])

# Extract 'year' and 'season' from 'date' column
df['year'] = df['date'].dt.year
df['season'] = df['date'].dt.month % 12 // 3 + 1

# Drop all dates except for Season 3
df = df[df['season'] == 3]

# Define start and end years for the five-year ranges
start_years = [1995, 2014]
end_years = [2000, 2019]
result = []

# Calculate mean temperature change for each season within the five-year ranges
for start, end in zip(start_years, end_years):
    # Calculate average temperature for start and end years within the range
    start_avg = df[(df['year'] >= start) & (df['year'] <= start + 4)].groupby(['city', 'season'])['avgtemperature'].mean()
    end_avg = df[(df['year'] >= end - 4) & (df['year'] <= end)].groupby(['city', 'season'])['avgtemperature'].mean()
    
    # Calculate mean temperature change by subtracting start_avg from end_avg
    result.append(end_avg - start_avg)

# Concatenate the results into a single DataFrame
result = pd.concat(result)

# Calculate the total temperature change over 20 years for each city
total_temp_change = result.groupby('city').sum()

# Sort the cities based on the total temperature change over 20 years in descending order
sorted_cities = total_temp_change.sort_values(ascending=False)

# Print the cities with the largest change in seasonal temperature over 20 years
print("Cities with the largest change in seasonal temperature over 20 years:")
print(sorted_cities.head(10))

# Get the top 10 cities with the largest change in seasonal temperature
top_cities = sorted_cities.head(10)

# Filter the seasonal average temperature for the top cities in the final five years
first_years_avg_temp = df[df['city'].isin(top_cities.index) & (df['year'] <= df['year'].min() + 4)].groupby('city')['avgtemperature'].mean()

# Filter the seasonal average temperature for the top cities in the final five years
final_years_avg_temp = df[df['city'].isin(top_cities.index) & (df['year'] >= df['year'].max() - 4)].groupby('city')['avgtemperature'].mean()

# Print the seasonal average temperature for each city in the final five years
print("\nSeasonal average temperature for each city in the final five years (for cities with largest change in seasonal temperature):")
print("Average of First Five Years\n",first_years_avg_temp)
print("Average of Last Five Years\n",final_years_avg_temp)



Cities with the largest change in seasonal temperature over 20 years:
city
Salt Lake City    1.391522
Billings          1.237271
Grand Junction    1.219130
Pueblo            1.206304
Pocatello         1.088265
Denver            1.041087
Great Falls       0.927329
San Angelo        0.918337
Cheyenne          0.912430
Helena            0.911304
Name: avgtemperature, dtype: float64

Seasonal average temperature for each city in the final five years (for cities with largest change in seasonal temperature):
Average of First Five Years
 city
Billings          68.803111
Cheyenne          65.261656
Denver            69.556087
Grand Junction    75.018696
Great Falls       64.120713
Helena            64.241304
Pocatello         67.016812
Pueblo            71.961087
Salt Lake City    74.763696
San Angelo        82.020306
Name: avgtemperature, dtype: float64
Average of Last Five Years
 city
Billings          70.949783
Cheyenne          67.504348
Denver            72.093261
Grand Junction    77.448

In [7]:
clear





NameError: name 'clear' is not defined