<a href="https://colab.research.google.com/github/gabrielborja/python_data_analysis/blob/main/sustainability_analytics.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Global Climate Analysis

Global Climate Change Data from 1750－2015 can be found [here](https://data.world/data-society/global-climate-change-data)

## Uploading packages and data

In [None]:
#Importing necessary packages
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px

In [None]:
#Pip install plotly --upgrade
!pip install plotly --upgrade

In [None]:
#Remove previous versions of the uploaded file
!rm GlobalLandTemperaturesByCountry.csv

In [None]:
#Uploading file from local drive
from google.colab import files
uploaded1 = files.upload()

Saving GlobalLandTemperaturesByCountry.csv to GlobalLandTemperaturesByCountry.csv


In [None]:
#Storing dataset in a Pandas Dataframe
import io
df1_co = pd.read_csv(io.BytesIO(uploaded1['GlobalLandTemperaturesByCountry.csv']))

In [None]:
#Checking the dataframe information
df1_co.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 577462 entries, 0 to 577461
Data columns (total 4 columns):
 #   Column                         Non-Null Count   Dtype  
---  ------                         --------------   -----  
 0   dt                             577462 non-null  object 
 1   AverageTemperature             544811 non-null  float64
 2   AverageTemperatureUncertainty  545550 non-null  float64
 3   Country                        577462 non-null  object 
dtypes: float64(2), object(2)
memory usage: 17.6+ MB


##Data Cleaning

In [None]:
#Checking for missing values the in dataframe
df1_co.isna().sum()

dt                                   0
AverageTemperature               32651
AverageTemperatureUncertainty    31912
Country                              0
dtype: int64

In [None]:
#Removing missing values
df1_co.dropna(axis=0, how='any', subset=['AverageTemperature'], inplace=True, )
df1_co.isna().sum()

dt                               0
AverageTemperature               0
AverageTemperatureUncertainty    0
Country                          0
dtype: int64

In [None]:
#Parsing date column to datetime object and reset index
df1_co['dt'] = pd.to_datetime(df1_co['dt'], format='%Y-%m-%d %H:%M:%S', errors='coerce')
df1_co.reset_index(drop=True, inplace=True)
df1_co.head()

Unnamed: 0,dt,AverageTemperature,AverageTemperatureUncertainty,Country
0,1743-11-01,4.384,2.294,Åland
1,1744-04-01,1.53,4.68,Åland
2,1744-05-01,6.702,1.789,Åland
3,1744-06-01,11.609,1.577,Åland
4,1744-07-01,15.342,1.41,Åland


In [None]:
#Checking the number of unique countries
df1_co['Country'].nunique()

242

In [None]:
#Checking if there are duplicate values
df1_co['Country'].unique()

array(['Åland', 'Afghanistan', 'Africa', 'Albania', 'Algeria',
       'American Samoa', 'Andorra', 'Angola', 'Anguilla',
       'Antigua And Barbuda', 'Argentina', 'Armenia', 'Aruba', 'Asia',
       'Australia', 'Austria', 'Azerbaijan', 'Bahamas', 'Bahrain',
       'Baker Island', 'Bangladesh', 'Barbados', 'Belarus', 'Belgium',
       'Belize', 'Benin', 'Bhutan', 'Bolivia',
       'Bonaire, Saint Eustatius And Saba', 'Bosnia And Herzegovina',
       'Botswana', 'Brazil', 'British Virgin Islands', 'Bulgaria',
       'Burkina Faso', 'Burma', 'Burundi', "Côte D'Ivoire", 'Cambodia',
       'Cameroon', 'Canada', 'Cape Verde', 'Cayman Islands',
       'Central African Republic', 'Chad', 'Chile', 'China',
       'Christmas Island', 'Colombia', 'Comoros',
       'Congo (Democratic Republic Of The)', 'Congo', 'Costa Rica',
       'Croatia', 'Cuba', 'Curaçao', 'Cyprus', 'Czech Republic',
       'Denmark (Europe)', 'Denmark', 'Djibouti', 'Dominica',
       'Dominican Republic', 'Ecuador', 'Egypt'

In [None]:
#Replacing duplicated values in Country names
countries_dict = {'Congo (Democratic Republic Of The)': 'Congo', 'Denmark (Europe)': 'Denmark', 'France (Europe)': 'France',
                  'Netherlands (Europe)': 'Netherlands', 'United Kingdom (Europe)': 'United Kingdom'}

df1_co['Country'] = df1_co['Country'].replace(to_replace=countries_dict)

In [None]:
#Exporting to excel in local disk
from google.colab import files
df1_co.to_excel('global_land_temp_by_country.xlsx', index=False) #==> Excluding index from file
files.download('global_land_temp_by_country.xlsx')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

##Data Manipulation

In [None]:
#Remove previous versions of the uploaded file
!rm global_temp_by_country.xlsx

In [None]:
#Uploading file from local drive
from google.colab import files
uploaded1a = files.upload()

Saving global_temp_by_country.xlsx to global_temp_by_country (1).xlsx


In [None]:
#Storing dataset in a Pandas Dataframe
import io
df1_co = pd.read_excel(io.BytesIO(uploaded1a['global_temp_by_country.xlsx']))

In [None]:
#Extracting year from datetime object and renaming columns
df1_co = df1_co.assign(Year = df1_co['dt'].dt.year)
df1_co = df1_co.rename(columns={'AverageTemperature': 'Temp', 'AverageTemperatureUncertainty': 'Temp_uncer'})
df1_co.tail(2)

Unnamed: 0,dt,Temp,Temp_uncer,Country,Year
544809,2013-07-01,17.0,0.453,Zimbabwe,2013
544810,2013-08-01,19.759,0.717,Zimbabwe,2013


In [None]:
#Creating a datraframe with average temperatures for each year
df1_avgtemp = df1_co.groupby(['Year']).agg(Mean_temp=('Temp','mean')).reset_index()
df1_avgtemp.tail(2)

Unnamed: 0,Year,Mean_temp
265,2012,19.468744
266,2013,19.877007


In [None]:
#Creating a second dataframe with average temperatures by year and country
df1_avgcountry = df1_co.groupby(['Year', 'Country']).agg({'Temp': ['mean', 'min', 'max']}).reset_index()
df1_avgcountry.columns = ['_'.join(col) if 'Temp' in col[0] else col[0] for col in df1_avgcountry.columns]
df1_avgcountry.tail(2)

Unnamed: 0,Year,Country,Temp_mean,Temp_min,Temp_max
44775,2013,Zimbabwe,20.71075,17.0,24.075
44776,2013,Åland,6.22975,-4.779,16.447


##Global Data Visualization

In [None]:
#Visualizing Average Temperature from 1750 to 2013
fig_1 = px.line(data_frame=df1_avgtemp[6:], x='Year', y='Mean_temp', range_y=(0, 25), title='Average World Temperature °C', width=900, height=450)
fig_1.show()

In [None]:
#Visualizing average temperatures for top economies
countries_list = ['Australia', 'Brazil', 'Canada', 'China', 'Japan', 'Switzerland', 'Unites States', 'United Kingdom']
fig_2 = px.line(data_frame=df1_avgcountry[df1_avgcountry['Country'].isin(countries_list)][190:], x='Year', y='Temp_mean',
                color='Country', title=f'Average Temperature °C in Top Economies', width=900, height=450)
fig_2.show()

In [None]:
#Visualizing mean, min and max temperatures from 1750 to 2013 in Norway
my_country = 'Norway'
fig_2 = px.line(data_frame=df1_avgcountry[df1_avgcountry['Country']==my_country][6:], x='Year', y=['Temp_mean', 'Temp_min', 'Temp_max'],
                title=f'Average Temperature °C in {my_country}', width=900, height=450)
fig_2.show()

#Temperatures by City and Decade

Global Land Temperature Data by Major City from 1850－2015 can be found [here](https://data.world/data-society/global-climate-change-data)

## Uploading packages and data

In [None]:
#Importing primary packages
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import plotly.express as px
import random

In [None]:
#Add interactivity to graphs
import ipywidgets as widgets
from IPython import display
from ipywidgets import interact, interactive, fixed, interact_manual

In [None]:
#Pip install plotly --upgrade
!pip install plotly --upgrade

Collecting plotly
[?25l  Downloading https://files.pythonhosted.org/packages/95/8d/ac1560f7ccc2ace85cd1e9619bbec1975b5d2d92e6c6fdbbdaa994c6ab4d/plotly-5.1.0-py2.py3-none-any.whl (20.6MB)
[K     |████████████████████████████████| 20.6MB 39.8MB/s 
Collecting tenacity>=6.2.0
  Downloading https://files.pythonhosted.org/packages/41/ee/d6eddff86161c6a3a1753af4a66b06cbc508d3b77ca4698cd0374cd66531/tenacity-7.0.0-py2.py3-none-any.whl
Installing collected packages: tenacity, plotly
  Found existing installation: plotly 4.4.1
    Uninstalling plotly-4.4.1:
      Successfully uninstalled plotly-4.4.1
Successfully installed plotly-5.1.0 tenacity-7.0.0


In [None]:
#Remove previous versions of the uploaded file
!rm GlobalLandTemperaturesByMajorCity.csv

In [None]:
#Uploading file from local drive
from google.colab import files
uploaded2 = files.upload()

In [None]:
#Storing dataset in a Pandas Dataframe
import io
df2_ci = pd.read_csv(io.BytesIO(uploaded2['GlobalLandTemperaturesByMajorCity.csv']))

In [None]:
#Checking the dataframe information
df2_ci.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 239177 entries, 0 to 239176
Data columns (total 7 columns):
 #   Column                         Non-Null Count   Dtype  
---  ------                         --------------   -----  
 0   dt                             239177 non-null  object 
 1   AverageTemperature             228175 non-null  float64
 2   AverageTemperatureUncertainty  228175 non-null  float64
 3   City                           239177 non-null  object 
 4   Country                        239177 non-null  object 
 5   Latitude                       239177 non-null  object 
 6   Longitude                      239177 non-null  object 
dtypes: float64(2), object(5)
memory usage: 12.8+ MB


##Data Cleaning

In [None]:
#Checking for missing values the in dataframe
df2_ci.isna().sum()

dt                                   0
AverageTemperature               11002
AverageTemperatureUncertainty    11002
City                                 0
Country                              0
Latitude                             0
Longitude                            0
dtype: int64

In [None]:
#Removing missing values
df2_ci.dropna(axis=0, how='any', subset=['AverageTemperature'], inplace=True)
df2_ci.isna().sum()

dt                               0
AverageTemperature               0
AverageTemperatureUncertainty    0
City                             0
Country                          0
Latitude                         0
Longitude                        0
dtype: int64

In [None]:
#Parsing date column to datetime object and reset index
df2_ci['dt'] = pd.to_datetime(df2_ci['dt'], format='%Y-%m-%d %H:%M:%S', errors='coerce')
df2_ci.reset_index(drop=True, inplace=True)
df2_ci.head()

Unnamed: 0,dt,AverageTemperature,AverageTemperatureUncertainty,City,Country,Latitude,Longitude
0,1849-01-01,26.704,1.435,Abidjan,Côte D'Ivoire,5.63N,3.23W
1,1849-02-01,27.434,1.362,Abidjan,Côte D'Ivoire,5.63N,3.23W
2,1849-03-01,28.101,1.612,Abidjan,Côte D'Ivoire,5.63N,3.23W
3,1849-04-01,26.14,1.387,Abidjan,Côte D'Ivoire,5.63N,3.23W
4,1849-05-01,25.427,1.2,Abidjan,Côte D'Ivoire,5.63N,3.23W


In [None]:
#Checking if there are duplicate values
df2_ci['City'].unique()

array(['Abidjan', 'Addis Abeba', 'Ahmadabad', 'Aleppo', 'Alexandria',
       'Ankara', 'Baghdad', 'Bangalore', 'Bangkok', 'Belo Horizonte',
       'Berlin', 'Bogotá', 'Bombay', 'Brasília', 'Cairo', 'Calcutta',
       'Cali', 'Cape Town', 'Casablanca', 'Changchun', 'Chengdu',
       'Chicago', 'Chongqing', 'Dakar', 'Dalian', 'Dar Es Salaam',
       'Delhi', 'Dhaka', 'Durban', 'Faisalabad', 'Fortaleza', 'Gizeh',
       'Guangzhou', 'Harare', 'Harbin', 'Ho Chi Minh City', 'Hyderabad',
       'Ibadan', 'Istanbul', 'Izmir', 'Jaipur', 'Jakarta', 'Jiddah',
       'Jinan', 'Kabul', 'Kano', 'Kanpur', 'Karachi', 'Kiev', 'Kinshasa',
       'Lagos', 'Lahore', 'Lakhnau', 'Lima', 'London', 'Los Angeles',
       'Luanda', 'Madras', 'Madrid', 'Manila', 'Mashhad', 'Melbourne',
       'Mexico', 'Mogadishu', 'Montreal', 'Moscow', 'Nagoya', 'Nagpur',
       'Nairobi', 'Nanjing', 'New Delhi', 'New York', 'Paris', 'Peking',
       'Pune', 'Rangoon', 'Rio De Janeiro', 'Riyadh', 'Rome', 'São Paulo',
       'S

##Data Manipulation

In [None]:
#Extracting year from datetime object and renaming columns
df2_ci = df2_ci.assign(Year = df2_ci['dt'].dt.year, 
                       Month = df2_ci['dt'].dt.month)
df2_ci = df2_ci.rename(columns={'AverageTemperature': 'Temp', 'AverageTemperatureUncertainty': 'Temp_uncer'})
df2_ci.tail(2)

Unnamed: 0,dt,Temp,Temp_uncer,City,Country,Latitude,Longitude,Year,Month
228173,2013-07-01,25.251,1.042,Xian,China,34.56N,108.97E,2013,7
228174,2013-08-01,24.528,0.84,Xian,China,34.56N,108.97E,2013,8


In [None]:
#Creating categories for each decade
df2_ci = df2_ci.assign(Decade = df2_ci['Year'].astype(str).str[:3] + "0")
df2_ci.tail(2)

Unnamed: 0,dt,Temp,Temp_uncer,City,Country,Latitude,Longitude,Year,Month,Decade
228173,2013-07-01,25.251,1.042,Xian,China,34.56N,108.97E,2013,7,2010
228174,2013-08-01,24.528,0.84,Xian,China,34.56N,108.97E,2013,8,2010


In [None]:
df2_ci.Decade.unique()

array(['1840', '1850', '1860', '1870', '1880', '1890', '1900', '1910',
       '1920', '1930', '1940', '1950', '1960', '1970', '1980', '1990',
       '2000', '2010', '1790', '1800', '1810', '1820', '1830', '1750',
       '1760', '1770', '1780', '1740'], dtype=object)

In [None]:
#Slicing dataframe from 1900 to 2013
years = [i for i in range(1900, 2014)]
df2_yr = df2_ci[df2_ci['Year'].isin(years)].copy().reset_index(drop=True)
df2_yr = df2_yr[['dt', 'Year', 'Month', 'Decade', 'Temp', 'Country', 'City', 'Latitude', 'Longitude']]
df2_yr.tail(2)

Unnamed: 0,dt,Year,Month,Decade,Temp,Country,City,Latitude,Longitude
136405,2013-07-01,2013,7,2010,25.251,China,Xian,34.56N,108.97E
136406,2013-08-01,2013,8,2010,24.528,China,Xian,34.56N,108.97E


In [None]:
#Creating first dataframe grouped by decade and month
df2_dec = df2_yr.groupby(['Country', 'City', 'Decade', 'Month']).agg(Mean_temp = ('Temp', 'mean')).reset_index()
df2_dec.tail(2)

Unnamed: 0,Country,City,Decade,Month,Temp_mean
14398,Zimbabwe,Harare,2010,11,24.027333
14399,Zimbabwe,Harare,2010,12,22.614


In [None]:
#Exporting to excel in local disk
from google.colab import files
df2_dec.to_excel('temp_city_by_decade.xlsx', index=False) #==> Excluding index from file
files.download('temp_city_by_decade.xlsx')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
#Creating second dataframe grouped by decade, latitude and longitude
df2_loc = df2_yr.groupby(['Country', 'City', 'Decade', 'Latitude', 'Longitude']).agg(Mean_temp = ('Temp', 'mean')).reset_index()
df2_loc.tail(2)

Unnamed: 0,Country,City,Decade,Latitude,Longitude,Mean_temp
1198,Zimbabwe,Harare,2000,18.48S,30.42E,20.755858
1199,Zimbabwe,Harare,2010,18.48S,30.42E,20.629614


In [None]:
#Converting string latitude and longitude into float
df2_loc = df2_loc.assign(Latitude = df2_loc['Latitude'].str[:-1].astype(float), 
                         Longitude = df2_loc['Longitude'].str[:-1].astype(float))
df2_loc.tail(2)

Unnamed: 0,Country,City,Decade,Latitude,Longitude,Mean_temp
1198,Zimbabwe,Harare,2000,18.48,30.42,20.755858
1199,Zimbabwe,Harare,2010,18.48,30.42,20.629614


In [None]:
#Checking consistency of latitude and longitude values
df2_loc['Latitude'].unique(), df2_loc['Longitude'].unique()

(array([34.56,  8.84, 37.78, 23.31, 20.09, 15.27,  4.02, 13.66, 16.87,
        45.81, 44.2 , 32.95, 31.35, 29.74, 39.38, 36.17, 40.99,  5.63,
        18.48, 49.03, 52.24, 12.05, 28.13, 26.52, 21.7 ,  7.23, 42.59,
         0.8 , 24.92, 55.45, 60.27,  2.41, 50.63, 10.45]),
 array([ 70.05,  13.78, 144.41, 151.78,  90.  ,  44.36,  47.5 ,  40.98,
         42.82,  38.81,  46.31,  95.44,  72.69,  80.5 ,  69.89, 125.22,
        103.66, 107.08, 120.69, 112.72, 125.77, 117.35, 118.74, 116.53,
        120.63, 123.55, 111.86, 113.9 , 114.46, 108.97,  74.73,  76.34,
         15.27,   3.23,  69.3 ,  30.16,  31.38,  38.11,   2.45,  13.14,
         72.52,  77.26,  72.68,  88.25,  77.27,  78.7 ,  75.22,  80.6 ,
         80.09,  78.75,  74.37,  73.56, 106.55, 112.7 ,  59.67,  45.  ,
         13.09, 136.22, 139.23,  36.16,  98.96,   6.7 ,   4.05,   8.22,
         73.51,  67.39, 120.83,  36.85,  29.19,  38.94,  46.11,  17.5 ,
         45.8 ,  18.19, 126.1 ,   4.26,  32.5 ,  37.79, 122.36,  39.73,
        

In [None]:
#Exporting to excel in local disk
from google.colab import files
df2_loc.to_excel('temp_city_lat_and_lon.xlsx', index=False) #==> Excluding index from file
files.download('temp_city_lat_and_lon.xlsx')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

##Data Visualization

###Plotting Temperatures by City and Decade



In [None]:
#Uploading file from local drive
from google.colab import files
uploaded2a = files.upload()

Saving temp_city_by_decade.xlsx to temp_city_by_decade.xlsx


In [None]:
#Storing dataset in a Pandas Dataframe
import io
df2_dec = pd.read_excel(io.BytesIO(uploaded2a['temp_city_by_decade.xlsx']))

In [None]:
#Uploading file from local drive
from google.colab import files
uploaded2b = files.upload()

Saving temp_city_lat_and_lon.xlsx to temp_city_lat_and_lon.xlsx


In [None]:
#Storing dataset in a Pandas Dataframe
import io
df2_loc = pd.read_excel(io.BytesIO(uploaded2b['temp_city_lat_and_lon.xlsx']))

In [None]:
#Checking dataframe tail
df2_loc.tail(2)

Unnamed: 0,Country,City,Decade,Latitude,Longitude,Mean_temp
1198,Zimbabwe,Harare,2000,18.48,30.42,20.755858
1199,Zimbabwe,Harare,2010,18.48,30.42,20.629614


In [None]:
#Creating a function to plot interactive temperatures by decade and city
def temp_per_month(city):
  """Plot interactive chart of temperatures by decade for different cities"""
  df2_month = df2_dec[df2_dec['City']==city].copy()
  fig2_1 = px.bar(data_frame=df2_month, x='Month', y='Mean_temp', animation_frame='Decade',
                title=f'Average Temperature °C by month in {city}', width=900, height=450)
  fig2_1.show()

In [None]:
#Creating a list of sorted cities
cities_list = sorted(df2_dec['City'].unique())

#Plotting the temperatures by city and month with interactive widgets
interact(temp_per_month, city=widgets.Dropdown(options=cities_list, value=cities_list[0], description='City', disabled=False))
plt.show()

interactive(children=(Dropdown(description='City', options=('Abidjan', 'Addis Abeba', 'Ahmadabad', 'Aleppo', '…

In [None]:
#Creating a function to plot interactive temperatures by city
def temp_per_city(city):
  """Plot interactive chart of temperatures for different cities"""
  df2_city = df2_dec[df2_dec['City']==city].copy()
  df2_city = df2_city.groupby(['Country', 'City', 'Decade'])['Mean_temp'].mean().reset_index()
  fig2_2 = px.bar(data_frame=df2_city, x='Decade', y='Mean_temp',
                  title=f'Average Temperature °C by decade in {city}', width=900, height=450)
  fig2_2.show()

In [None]:
#Creating a list of sorted cities
cities_list = sorted(df2_dec['City'].unique())

#Plotting the temperatures by city and decade with interactive widgets
interact(temp_per_city, city=widgets.Dropdown(options=cities_list, value=cities_list[0], description='City', disabled=False))
plt.show()

interactive(children=(Dropdown(description='City', options=('Abidjan', 'Addis Abeba', 'Ahmadabad', 'Aleppo', '…

In [None]:
#Creating a function to plot interactive temperatures by country
def temp_per_country():
  """Plot interactive chart of temperatures for different countries"""
  df2_country = df2_loc.copy()
  df2_country = df2_country.groupby(['Country', 'Decade', 'Latitude', 'Longitude'])['Mean_temp'].mean().reset_index()
  fig2_3 = px.choropleth(data_frame=df2_country, locations='Country', locationmode='country names', color='Mean_temp', animation_frame='Decade', 
                  title='Average Temperature °C by Country', width=900, height=450)
  fig2_3.show()

temp_per_country()

#Weather in Norway

Historic data of the weather in Norway can be found [here](https://www.yr.no/nb/historikk/tabell/1-72837/Norge/Oslo/Oslo/Oslo?q=2021-01)

## Uploading packages and data

In [1]:
#Importing data manipulation libraries
import numpy as np
import pandas as pd
from time import sleep
from random import randint

##Retrieving data from URL

In [2]:
#Creating a function to retrieve table data from given URL
def get_table(url):
  """Retrieve table data from given URL and return a dataframe"""

  #Get table data and parse with pandas
  try:
    tb = pd.read_html(url, attrs = {'class': 'fluid-table__table'})
    return tb
  except:
    print('No Table found')

In [3]:
#Scraping weather data from multiple pages and saving the returned tables to a list
data = []
url = 'https://www.yr.no/nb/historikk/tabell/1-72837/Norge/Oslo/Oslo/Oslo?q=2021-0'

for i in range(1, 9):
  page = url + str(i)
  data.append(get_table(page))
  print(f'Table: {i}')
  sleep(randint(3,8))

Table: 1
Table: 2
Table: 3
Table: 4
Table: 5
Table: 6
Table: 7
Table: 8


In [4]:
#Checking the number of scraped tables
len(data)

8

In [5]:
#Checking one of the returned dataframes inside list
data[-1][0][-5:]

Unnamed: 0,Dato,Min. temp.,Maks temp.,Gjennomsnitt,Normal temp.,Nedbør mm (måles kl 07),Snødybde cm,Vind m/s,Kraftigste vind m/s
26,27.0,"13,7°","18,1°","15,3°","16,0°",0,–,63,80
27,28.0,"14,5°","21,5°","17,5°","15,8°",0,–,39,72
28,29.0,"12,4°","25,1°","18,7°","15,7°",0,–,30,51
29,30.0,"14,3°","25,0°","18,8°","15,6°",0,–,24,36
30,31.0,"13,8°","24,7°","18,8°","15,4°",0,–,37,64


In [6]:
#Concatenating dataframes from list
df3 = pd.concat(objs=[i[0] for i in data], ignore_index=True)
df3.shape #==> from page 1 to the end

(243, 9)

In [7]:
#Creating range of dates for given year
year_range = pd.date_range(start='2021-01-01', end='2021-08-31', freq='D')
year_range[-10:]

DatetimeIndex(['2021-08-22', '2021-08-23', '2021-08-24', '2021-08-25',
               '2021-08-26', '2021-08-27', '2021-08-28', '2021-08-29',
               '2021-08-30', '2021-08-31'],
              dtype='datetime64[ns]', freq='D')

In [8]:
#Assigning dates to concatenated dataframe
df3 = df3.assign(Dag = year_range)
df3.shape

(243, 10)

In [9]:
#Rearranging columns order
df3 = df3[['Dag', 'Min. temp.', 'Maks temp.', 'Gjennomsnitt',	'Normal temp.',	'Nedbør mm (måles kl 07)', 'Snødybde cm',	'Vind m/s',	'Kraftigste vind m/s', 'Dato']]
df3.tail()

Unnamed: 0,Dag,Min. temp.,Maks temp.,Gjennomsnitt,Normal temp.,Nedbør mm (måles kl 07),Snødybde cm,Vind m/s,Kraftigste vind m/s,Dato
238,2021-08-27,"13,7°","18,1°","15,3°","16,0°",0,–,63,80,27.0
239,2021-08-28,"14,5°","21,5°","17,5°","15,8°",0,–,39,72,28.0
240,2021-08-29,"12,4°","25,1°","18,7°","15,7°",0,–,30,51,29.0
241,2021-08-30,"14,3°","25,0°","18,8°","15,6°",0,–,24,36,30.0
242,2021-08-31,"13,8°","24,7°","18,8°","15,4°",0,–,37,64,31.0


In [None]:
#Exporting to excel into local disk
from google.colab import files
df3.to_excel('2021_oslo_weather.xlsx', index=False) #==> Excluding index from file
files.download('2021_oslo_weather.xlsx')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

##Data Cleaning

In [None]:
#Remove previous versions of the uploaded file
!rm 2021_oslo_weather.xlsx

In [None]:
#Uploading file from local drive
from google.colab import files
uploaded3 = files.upload()

Saving 2020_oslo_weather.xlsx to 2020_oslo_weather.xlsx


In [None]:
#Storing dataset in a Pandas Dataframe
import io
df3 = pd.read_excel(io.BytesIO(uploaded3['2020_oslo_weather.xlsx']))

In [10]:
#Renaming columns
df3.columns = ['Dato', 'Min_temp', 'Max_temp', 'Mean_temp', 'Norm_temp', 'Rainfall_mm', 'Snow_depth_cm', 'Wind_m_s', 'High_wind', 'Day']

In [11]:
#Checking the dataframe information
df3.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 243 entries, 0 to 242
Data columns (total 10 columns):
 #   Column         Non-Null Count  Dtype         
---  ------         --------------  -----         
 0   Dato           243 non-null    datetime64[ns]
 1   Min_temp       243 non-null    object        
 2   Max_temp       243 non-null    object        
 3   Mean_temp      243 non-null    object        
 4   Norm_temp      243 non-null    object        
 5   Rainfall_mm    243 non-null    object        
 6   Snow_depth_cm  243 non-null    object        
 7   Wind_m_s       243 non-null    object        
 8   High_wind      243 non-null    object        
 9   Day            243 non-null    float64       
dtypes: datetime64[ns](1), float64(1), object(8)
memory usage: 19.1+ KB


In [12]:
#Cleaning temperature values

def clean_temp(arr1):
  return arr1.str.replace(',', '.').str.replace('°', '').str.replace('–', '').str.strip()

df3['Min_temp'] = clean_temp(df3['Min_temp'])
df3['Max_temp'] = clean_temp(df3['Max_temp'])
df3['Mean_temp'] = clean_temp(df3['Mean_temp'])
df3['Norm_temp'] = clean_temp(df3['Norm_temp'])

In [13]:
#Cleaning missing values from the rest of the columns
def clean_values(arr2):
  return arr2.str.replace('–', '').str.replace('-', '').str.strip()

df3['Rainfall_mm'] = clean_values(df3['Rainfall_mm'])
df3['Snow_depth_cm'] = clean_values(df3['Snow_depth_cm'])
df3['Wind_m_s'] = clean_values(df3['Wind_m_s'])
df3['High_wind'] = clean_values(df3['High_wind'])

In [14]:
#Checking the dataframe tail
df3.tail()

Unnamed: 0,Dato,Min_temp,Max_temp,Mean_temp,Norm_temp,Rainfall_mm,Snow_depth_cm,Wind_m_s,High_wind,Day
238,2021-08-27,13.7,18.1,15.3,16.0,,,,,27.0
239,2021-08-28,14.5,21.5,17.5,15.8,,,,,28.0
240,2021-08-29,12.4,25.1,18.7,15.7,,,,,29.0
241,2021-08-30,14.3,25.0,18.8,15.6,,,,,30.0
242,2021-08-31,13.8,24.7,18.8,15.4,,,,,31.0


In [15]:
#Checking unique values from one column
df3['Rainfall_mm'].unique()

array([nan, '00', '05', '34', '17', '', '02', '01', '40', '87', '04',
       '66', '63', '44', '03'], dtype=object)

In [16]:
#Exporting to csv into local disk
from google.colab import files
df3.to_csv('2021_oslo_weather.csv', index=False) #==> Excluding index from file
files.download('2021_oslo_weather.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>