# Project Nils : Data Loading

In [3]:
import pandas as pd

In [4]:
Raw_Dataset = pd.read_csv('data/fao_data_crops_data.csv')

The goal of this notebook is to download the crops data file which contains the Area Harvested in [Ha] for regions or countries as well as the yield [hg/Ha] for different years.
The steps are the following : 
- Load the dataset 
- Clean the dataset (Remove elements that are not countries or regions)
- Create two distinct dataset : one for the regions and one for the countries 

In [5]:
# Snapshot of the dataset :
Raw_Dataset

Unnamed: 0,country_or_area,element_code,element,year,unit,value,value_footnotes,category
0,Americas +,31,Area Harvested,2007.0,Ha,49404.0,A,agave_fibres_nes
1,Americas +,31,Area Harvested,2006.0,Ha,49404.0,A,agave_fibres_nes
2,Americas +,31,Area Harvested,2005.0,Ha,49404.0,A,agave_fibres_nes
3,Americas +,31,Area Harvested,2004.0,Ha,49113.0,A,agave_fibres_nes
4,Americas +,31,Area Harvested,2003.0,Ha,48559.0,A,agave_fibres_nes
...,...,...,...,...,...,...,...,...
2255344,Fc,Calculated Data,,,,,,yautia_cocoyam
2255345,A,"May include official, semi-official or estimat...",,,,,,yautia_cocoyam
2255346,NR,Not reported by country,,,,,,yautia_cocoyam
2255347,F,FAO Estimate,,,,,,yautia_cocoyam


In [6]:
# Snapshot of the missing values, note that the 'value_footnotes' columns is dropped because
# it contains a lot of data 
Trash_Data = Raw_Dataset[Raw_Dataset.drop(['value_footnotes'],axis=1).isnull().values]

# Snapshot of the data containing nan somewhere except from the 'value_footnotes' column :
Trash_Data

Unnamed: 0,country_or_area,element_code,element,year,unit,value,value_footnotes,category
2397,fnSeqID,Footnote,,,,,,agave_fibres_nes
2397,fnSeqID,Footnote,,,,,,agave_fibres_nes
2397,fnSeqID,Footnote,,,,,,agave_fibres_nes
2397,fnSeqID,Footnote,,,,,,agave_fibres_nes
2398,Fc,Calculated Data,,,,,,agave_fibres_nes
...,...,...,...,...,...,...,...,...
2255347,F,FAO Estimate,,,,,,yautia_cocoyam
2255348,*,Unofficial figure,,,,,,yautia_cocoyam
2255348,*,Unofficial figure,,,,,,yautia_cocoyam
2255348,*,Unofficial figure,,,,,,yautia_cocoyam


In [7]:
# By looking at the unique values of 'Trash_Tags' defined below, we notice that none of them contains the name of 
# a country or a region so we can remove them
Trash_Tags = Trash_Data.country_or_area.unique()
print(Trash_Tags)

['fnSeqID' 'Fc' 'A ' 'NR' 'F ' '* ']


In [8]:
Cleaned_Dataset = Raw_Dataset[~Raw_Dataset.country_or_area.isin(Trash_Tags)]
Cleaned_Dataset

Unnamed: 0,country_or_area,element_code,element,year,unit,value,value_footnotes,category
0,Americas +,31,Area Harvested,2007.0,Ha,49404.0,A,agave_fibres_nes
1,Americas +,31,Area Harvested,2006.0,Ha,49404.0,A,agave_fibres_nes
2,Americas +,31,Area Harvested,2005.0,Ha,49404.0,A,agave_fibres_nes
3,Americas +,31,Area Harvested,2004.0,Ha,49113.0,A,agave_fibres_nes
4,Americas +,31,Area Harvested,2003.0,Ha,48559.0,A,agave_fibres_nes
...,...,...,...,...,...,...,...,...
2255338,World +,51,Production Quantity,1965.0,tonnes,150123.0,A,yautia_cocoyam
2255339,World +,51,Production Quantity,1964.0,tonnes,143203.0,A,yautia_cocoyam
2255340,World +,51,Production Quantity,1963.0,tonnes,142094.0,A,yautia_cocoyam
2255341,World +,51,Production Quantity,1962.0,tonnes,123840.0,A,yautia_cocoyam


In [9]:
# Check if there is any nan value remaining in columns other than 'value_footnotes'
Cleaned_Dataset[Cleaned_Dataset.drop(['value_footnotes'],axis=1).isnull().values]

Unnamed: 0,country_or_area,element_code,element,year,unit,value,value_footnotes,category


In [32]:
Country_Or_Area = Cleaned_Dataset['country_or_area'].unique()

In [25]:
Dataset_Areas = Cleaned_Dataset[Cleaned_Dataset['country_or_area'].str.find('+') > -1]
Dataset_Areas

Unnamed: 0,country_or_area,element_code,element,year,unit,value,value_footnotes,category
0,Americas +,31,Area Harvested,2007.0,Ha,49404.0,A,agave_fibres_nes
1,Americas +,31,Area Harvested,2006.0,Ha,49404.0,A,agave_fibres_nes
2,Americas +,31,Area Harvested,2005.0,Ha,49404.0,A,agave_fibres_nes
3,Americas +,31,Area Harvested,2004.0,Ha,49113.0,A,agave_fibres_nes
4,Americas +,31,Area Harvested,2003.0,Ha,48559.0,A,agave_fibres_nes
...,...,...,...,...,...,...,...,...
2255338,World +,51,Production Quantity,1965.0,tonnes,150123.0,A,yautia_cocoyam
2255339,World +,51,Production Quantity,1964.0,tonnes,143203.0,A,yautia_cocoyam
2255340,World +,51,Production Quantity,1963.0,tonnes,142094.0,A,yautia_cocoyam
2255341,World +,51,Production Quantity,1962.0,tonnes,123840.0,A,yautia_cocoyam


In [26]:
Dataset_Areas.country_or_area.unique()

array(['Americas +', 'Asia +', 'Caribbean +', 'Central America +',
       'Low Income Food Deficit Countries +',
       'Net Food Importing Developing Countries +',
       'Small Island Developing States +', 'South America +',
       'South-Eastern Asia +', 'World +', 'Africa +',
       'Australia and New Zealand +', 'Central Asia +', 'Eastern Asia +',
       'Eastern Europe +', 'Europe +', 'European Union +',
       'LandLocked developing countries +', 'Least Developed Countries +',
       'Northern Africa +', 'Northern America +', 'Oceania +',
       'Southern Africa +', 'Southern Asia +', 'Southern Europe +',
       'Western Africa +', 'Western Asia +', 'Western Europe +',
       'Eastern Africa +', 'Northern Europe +', 'Middle Africa +',
       'Micronesia +', 'Polynesia +', 'Melanesia +'], dtype=object)

In [31]:
Dataset_Areas['year'].unique()

array([2007., 2006., 2005., 2004., 2003., 2002., 2001., 2000., 1999.,
       1998., 1997., 1996., 1995., 1994., 1993., 1992., 1991., 1990.,
       1989., 1988., 1987., 1986., 1985., 1984., 1983., 1982., 1981.,
       1980., 1979., 1978., 1977., 1976., 1975., 1974., 1973., 1972.,
       1971., 1970., 1969., 1968., 1967., 1966., 1965., 1964., 1963.,
       1962., 1961.])

In [28]:
Dataset_Countries = Cleaned_Dataset[Cleaned_Dataset['country_or_area'].str.find('+') == -1]
Dataset_Countries

Unnamed: 0,country_or_area,element_code,element,year,unit,value,value_footnotes,category
564,Colombia,31,Area Harvested,2007.0,Ha,17500.0,F,agave_fibres_nes
565,Colombia,31,Area Harvested,2006.0,Ha,17500.0,F,agave_fibres_nes
566,Colombia,31,Area Harvested,2005.0,Ha,17500.0,F,agave_fibres_nes
567,Colombia,31,Area Harvested,2004.0,Ha,17294.0,,agave_fibres_nes
568,Colombia,31,Area Harvested,2003.0,Ha,17094.0,,agave_fibres_nes
...,...,...,...,...,...,...,...,...
2255150,"Venezuela, Bolivarian Republic of",51,Production Quantity,1965.0,tonnes,61062.0,,yautia_cocoyam
2255151,"Venezuela, Bolivarian Republic of",51,Production Quantity,1964.0,tonnes,59225.0,,yautia_cocoyam
2255152,"Venezuela, Bolivarian Republic of",51,Production Quantity,1963.0,tonnes,57500.0,,yautia_cocoyam
2255153,"Venezuela, Bolivarian Republic of",51,Production Quantity,1962.0,tonnes,55825.0,,yautia_cocoyam


In [30]:
Dataset_Countries['year'].unique()

array([2007., 2006., 2005., 2004., 2003., 2002., 2001., 2000., 1999.,
       1998., 1997., 1996., 1995., 1994., 1993., 1992., 1991., 1990.,
       1989., 1988., 1987., 1986., 1985., 1984., 1983., 1982., 1981.,
       1980., 1979., 1978., 1977., 1976., 1975., 1974., 1973., 1972.,
       1971., 1970., 1969., 1968., 1967., 1966., 1965., 1964., 1963.,
       1962., 1961.])