<a href="https://colab.research.google.com/github/collinscolour/GoMyCodeCheckPoint/blob/main/datavisualizationwithpython.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Checkpoint Objective
In this checkpoint, we are going to work on the 'Climate change in Africa' dataset provided by the U.S. Global Change Research Program.

Dataset description: This dataset contains historical data on daily minimum, maximum, and average temperature fluctuations in 5 African countries (Egypt, Tunisia, Cameroon, Senegal, Angola) between 1980 and 2023.

➡️ Dataset link


Instructions
Load the dataset into a data frame using Python.


Clean the data as needed.


Plot a line chart to show the average temperature fluctuations in Tunisia and Cameroon. Interpret the results.


Zoom in to only include data between 1980 and 2005, try to customize the axes labels.


Create Histograms to show temperature distribution in Senegal between [1980,2000] and [2000,2023] (in the same figure). Describe the obtained results.


Select the best chart to show the Average temperature per country.
Make your own questions about the dataset and try to answer them using the appropriate visuals.

In [1]:
!pip install plotly



In [2]:
!pip install ydata_profiling


Collecting ydata_profiling
  Downloading ydata_profiling-4.6.2-py2.py3-none-any.whl (357 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m357.5/357.5 kB[0m [31m6.4 MB/s[0m eta [36m0:00:00[0m
Collecting pydantic>=2 (from ydata_profiling)
  Downloading pydantic-2.5.2-py3-none-any.whl (381 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m381.9/381.9 kB[0m [31m23.3 MB/s[0m eta [36m0:00:00[0m
Collecting visions[type_image_path]==0.7.5 (from ydata_profiling)
  Downloading visions-0.7.5-py3-none-any.whl (102 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m102.7/102.7 kB[0m [31m12.9 MB/s[0m eta [36m0:00:00[0m
Collecting htmlmin==0.1.12 (from ydata_profiling)
  Downloading htmlmin-0.1.12.tar.gz (19 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting phik<0.13,>=0.11.1 (from ydata_profiling)
  Downloading phik-0.12.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (679 kB)
[2K     [90m━━━━━━━━━━

In [1]:
from typing_extensions import dataclass_transform
#first import pandas
import pandas as pd

# import ydata_profiling
import ydata_profiling

# import plotly
import plotly.express as px

# Load the raw CSV. The path is relative, so it is resolved against the
# kernel's working directory.
raw_data = pd.read_csv('Africa_climate_change.csv')

# Data-quality diagnostics. As bare expressions in the middle of a cell their
# results are computed and then thrown away, so print them explicitly to show
# what the cleaning step below is about to remove.
print("Missing values per column:")
print(raw_data.isnull().sum())
print("Fully duplicated rows:", raw_data.duplicated().sum())

# Clean without in-place mutation: the raw frame stays available and the cell
# gives the same result however many times it is run.
# NOTE(review): dropna() has no `subset`, so a row is dropped when ANY column
# is missing, not just TAVG. The printed result keeps 113515 rows while index
# labels run up to 464809 -- confirm this much loss is intended.
data = raw_data.dropna().drop_duplicates()

# Save the cleaned data to a new CSV file (row index omitted).
data.to_csv('cleaned_data.csv', index=False)

print(data)

                   DATE  PRCP  TAVG   TMAX  TMIN   COUNTRY
2       19800101 000000  0.00  72.0   86.0  59.0  Cameroon
6       19800101 000000  0.00  76.0   97.0  59.0   Senegal
7       19800101 000000  0.00  74.0   95.0  59.0   Senegal
8       19800101 000000  0.00  78.0   93.0  63.0   Senegal
9       19800101 000000  0.00  76.0   91.0  59.0   Senegal
...                 ...   ...   ...    ...   ...       ...
464778  20230822 000000  0.00  85.0   93.0  81.0   Senegal
464786  20230823 000000  0.00  87.0  101.0  71.0   Tunisia
464799  20230823 000000  0.00  90.0  102.0  80.0   Tunisia
464803  20230823 000000  1.22  83.0   90.0  76.0   Senegal
464809  20230823 000000  0.00  85.0   96.0  77.0   Tunisia

[113515 rows x 6 columns]


In [2]:
import datetime

In [4]:
# DATE is read as text such as "19800101 000000" (YYYYMMDD HHMMSS). State the
# layout explicitly instead of relying on format inference: parsing becomes
# deterministic, and a value that does not match raises instead of being
# guessed at.
data["DATE"] = pd.to_datetime(data["DATE"], format="%Y%m%d %H%M%S")


In [5]:
# Verify the conversion: DATE should now be reported with a datetime64 dtype.
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 113515 entries, 2 to 464809
Data columns (total 6 columns):
 #   Column   Non-Null Count   Dtype         
---  ------   --------------   -----         
 0   DATE     113515 non-null  datetime64[ns]
 1   PRCP     113515 non-null  float64       
 2   TAVG     113515 non-null  float64       
 3   TMAX     113515 non-null  float64       
 4   TMIN     113515 non-null  float64       
 5   COUNTRY  113515 non-null  object        
dtypes: datetime64[ns](1), float64(4), object(1)
memory usage: 6.1+ MB


In [6]:
# Derive calendar year and month columns from the parsed DATE column; they
# are the grouping keys for the yearly and monthly averages further down.
date_parts = data["DATE"].dt
data["YEAR"] = date_parts.year
data["MONTH"] = date_parts.month
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 113515 entries, 2 to 464809
Data columns (total 8 columns):
 #   Column   Non-Null Count   Dtype         
---  ------   --------------   -----         
 0   DATE     113515 non-null  datetime64[ns]
 1   PRCP     113515 non-null  float64       
 2   TAVG     113515 non-null  float64       
 3   TMAX     113515 non-null  float64       
 4   TMIN     113515 non-null  float64       
 5   COUNTRY  113515 non-null  object        
 6   YEAR     113515 non-null  int64         
 7   MONTH    113515 non-null  int64         
dtypes: datetime64[ns](1), float64(4), int64(2), object(1)
memory usage: 7.8+ MB


In [7]:
# Keep only the rows for the two countries compared in the line charts.
wanted_countries = ["Cameroon", "Tunisia"]
df = data[data["COUNTRY"].isin(wanted_countries)]
df

Unnamed: 0,DATE,PRCP,TAVG,TMAX,TMIN,COUNTRY,YEAR,MONTH
2,1980-01-01,0.0,72.0,86.0,59.0,Cameroon,1980,1
31,1980-01-02,0.0,75.0,91.0,61.0,Cameroon,1980,1
41,1980-01-02,0.0,81.0,90.0,75.0,Cameroon,1980,1
43,1980-01-02,0.0,76.0,95.0,59.0,Cameroon,1980,1
66,1980-01-03,0.0,80.0,91.0,73.0,Cameroon,1980,1
...,...,...,...,...,...,...,...,...
464752,2023-08-21,0.0,86.0,98.0,74.0,Tunisia,2023,8
464775,2023-08-22,0.0,88.0,99.0,79.0,Tunisia,2023,8
464786,2023-08-23,0.0,87.0,101.0,71.0,Tunisia,2023,8
464799,2023-08-23,0.0,90.0,102.0,80.0,Tunisia,2023,8


In [8]:
# Mean TAVG per (country, day). Several rows can share the same country and
# date, so this collapses them to a single value per day.
tavg_by_country_day = df.groupby(["COUNTRY", "DATE"])["TAVG"]
df1 = tavg_by_country_day.mean()

In [9]:
# Count missing values among the daily means (expected: none).
df1.isna().sum()

0

In [10]:
# Look at the last 50 daily means, i.e. the end of the Tunisia series.
df1.tail(50)

COUNTRY  DATE      
Tunisia  2023-07-03     84.000000
         2023-07-04     84.500000
         2023-07-05     87.666667
         2023-07-06     88.333333
         2023-07-07     93.500000
         2023-07-08     96.500000
         2023-07-09     92.000000
         2023-07-10     96.000000
         2023-07-11     91.000000
         2023-07-12     94.000000
         2023-07-13     90.000000
         2023-07-14     92.000000
         2023-07-15     94.000000
         2023-07-16     95.000000
         2023-07-17     93.500000
         2023-07-18     97.500000
         2023-07-19     99.000000
         2023-07-20     95.333333
         2023-07-21     95.000000
         2023-07-22     98.333333
         2023-07-23     99.333333
         2023-07-24    100.750000
         2023-07-25     97.000000
         2023-07-27     83.666667
         2023-07-28     92.000000
         2023-07-29     89.500000
         2023-07-30     92.000000
         2023-07-31     88.333333
         2023-08-01     89.5

In [11]:
# Produce a flat COUNTRY / DATE / TAVG frame for plotting.
# It is rebuilt from `df` instead of `df1 = df1.reset_index()`: that
# self-referential form rebinds df1 from a Series to a DataFrame, so running
# the cell a second time would insert a spurious "index" column.
df1 = df.groupby(["COUNTRY", "DATE"], as_index=False)["TAVG"].mean()
df1

Unnamed: 0,COUNTRY,DATE,TAVG
0,Cameroon,1980-01-01,72.000000
1,Cameroon,1980-01-02,77.333333
2,Cameroon,1980-01-03,80.000000
3,Cameroon,1980-01-04,75.333333
4,Cameroon,1980-01-05,74.333333
...,...,...,...
18106,Tunisia,2023-08-19,89.000000
18107,Tunisia,2023-08-20,85.666667
18108,Tunisia,2023-08-21,86.000000
18109,Tunisia,2023-08-22,88.000000


In [12]:
# Daily mean TAVG, one line per country.
# The title and axis labels are specific to this chart so it can be told
# apart from the yearly and monthly figures when skimming the notebook.
# Units are not stated anywhere in the notebook, so the y label names the
# column rather than guessing a temperature scale.
fig = px.line(
    df1,
    x="DATE",
    y="TAVG",
    color="COUNTRY",
    title="Daily average temperature (TAVG): Cameroon vs Tunisia",
    labels={
        "DATE": "Date",
        "TAVG": "Average temperature (TAVG)",
        "COUNTRY": "Country",
    },
)
fig.show()

In [13]:
# Mean TAVG per (country, year) across all rows that fall in that year.
tavg_by_country_year = df.groupby(["COUNTRY", "YEAR"])["TAVG"]
df2 = tavg_by_country_year.mean()

In [14]:
# Count missing values among the yearly means (expected: none).
df2.isna().sum()

0

In [15]:
# Look at the last 50 yearly means (end of Cameroon plus most of Tunisia).
df2.tail(50)

COUNTRY   YEAR
Cameroon  2016    78.629630
          2017    74.147287
          2018    75.974843
          2019    77.470085
          2020    76.580645
          2021    75.375000
          2022    73.734375
          2023    78.178571
Tunisia   1982    68.805381
          1983    66.845921
          1984    65.595611
          1985    67.015079
          1986    67.074163
          1987    67.812214
          1988    67.532764
          1989    66.672131
          1990    67.699857
          1991    65.451156
          1992    66.390023
          1993    67.476810
          1994    69.285714
          1995    67.810651
          1996    67.387695
          1997    68.434438
          1998    68.717485
          1999    66.308300
          2000    64.676349
          2001    63.268519
          2002    68.577731
          2003    71.700104
          2004    68.802700
          2005    68.483744
          2006    69.691089
          2007    69.442085
          2008    68.733978
     

In [16]:
# Produce a flat COUNTRY / YEAR / TAVG frame for plotting.
# It is rebuilt from `df` instead of `df2 = df2.reset_index()`: that
# self-referential form rebinds df2 from a Series to a DataFrame, so running
# the cell a second time would insert a spurious "index" column.
df2 = df.groupby(["COUNTRY", "YEAR"], as_index=False)["TAVG"].mean()
df2

Unnamed: 0,COUNTRY,YEAR,TAVG
0,Cameroon,1980,77.904000
1,Cameroon,1981,78.091408
2,Cameroon,1982,76.366864
3,Cameroon,1983,76.247748
4,Cameroon,1984,76.457364
...,...,...,...
81,Tunisia,2019,69.244666
82,Tunisia,2020,66.282776
83,Tunisia,2021,72.301397
84,Tunisia,2022,71.764624


In [17]:
# Yearly mean TAVG, one line per country.
# The title and axis labels are specific to this chart so it can be told
# apart from the daily and monthly figures.
# NOTE(review): the data shown earlier ends on 2023-08-23, so the 2023 point
# averages a partial year and is not directly comparable to full years.
fig = px.line(
    df2,
    x="YEAR",
    y="TAVG",
    color="COUNTRY",
    title="Yearly average temperature (TAVG): Cameroon vs Tunisia",
    labels={
        "YEAR": "Year",
        "TAVG": "Average temperature (TAVG)",
        "COUNTRY": "Country",
    },
)
fig.show()

In [18]:
# Mean TAVG per (country, calendar month), pooled over every year in the data.
tavg_by_country_month = df.groupby(["COUNTRY", "MONTH"])["TAVG"]
df3 = tavg_by_country_month.mean()

In [19]:
# Count missing values among the monthly means (expected: none).
df3.isna().sum()

0

In [20]:
# Summarise the monthly series: 2 countries x 12 months should give 24 entries.
df3.info()

<class 'pandas.core.series.Series'>
MultiIndex: 24 entries, ('Cameroon', 1) to ('Tunisia', 12)
Series name: TAVG
Non-Null Count  Dtype  
--------------  -----  
24 non-null     float64
dtypes: float64(1)
memory usage: 462.0+ bytes


In [21]:
# Produce a flat COUNTRY / MONTH / TAVG frame for plotting.
# It is rebuilt from `df` instead of `df3 = df3.reset_index()`: that
# self-referential form rebinds df3 from a Series to a DataFrame, so running
# the cell a second time would insert a spurious "index" column.
df3 = df.groupby(["COUNTRY", "MONTH"], as_index=False)["TAVG"].mean()
df3

Unnamed: 0,COUNTRY,MONTH,TAVG
0,Cameroon,1,78.603015
1,Cameroon,2,82.174721
2,Cameroon,3,82.375527
3,Cameroon,4,81.400651
4,Cameroon,5,79.513725
5,Cameroon,6,77.890013
6,Cameroon,7,76.334031
7,Cameroon,8,75.979853
8,Cameroon,9,76.233227
9,Cameroon,10,77.446839


In [22]:
# Mean TAVG by calendar month, one line per country.
# The title and axis labels are specific to this chart so it can be told
# apart from the daily and yearly figures.
fig = px.line(
    df3,
    x="MONTH",
    y="TAVG",
    color="COUNTRY",
    title="Average temperature (TAVG) by calendar month: Cameroon vs Tunisia",
    labels={
        "MONTH": "Month (1-12)",
        "TAVG": "Average temperature (TAVG)",
        "COUNTRY": "Country",
    },
)
fig.show()