In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LinearRegression
import plotly.offline as py
py.init_notebook_mode(connected=True)
%matplotlib inline

In [2]:
# Load the global monthly temperature records (one row per month).
global_temp = pd.read_csv('GlobalTemperatures.csv')
# Preview only the first rows instead of dumping the whole frame; the early
# years already show the missing values discussed below.
global_temp.head(10)

Unnamed: 0,dt,LandAverageTemperature,LandAverageTemperatureUncertainty,LandMaxTemperature,LandMaxTemperatureUncertainty,LandMinTemperature,LandMinTemperatureUncertainty,LandAndOceanAverageTemperature,LandAndOceanAverageTemperatureUncertainty
0,1750-01-01,3.034,3.574,,,,,,
1,1750-02-01,3.083,3.702,,,,,,
2,1750-03-01,5.626,3.076,,,,,,
3,1750-04-01,8.490,2.451,,,,,,
4,1750-05-01,11.573,2.072,,,,,,
5,1750-06-01,12.937,1.724,,,,,,
6,1750-07-01,15.868,1.911,,,,,,
7,1750-08-01,14.750,2.231,,,,,,
8,1750-09-01,11.413,2.637,,,,,,
9,1750-10-01,6.367,2.668,,,,,,


### The First data set (Global)
Notice that there are some missing values in the early years. It is not appropriate to set any of them to a specific value, such as 0, so I dropped all of those rows (and reset the index). Also, all I need at this stage is the land average temperature and its uncertainty, so I dropped the rest of the columns. I then used the uncertainty to calculate the range of the temperature, and finally dropped the uncertainty column.

In [3]:
# Keep the date, the land average and its uncertainty; discard months with no
# land average and renumber the remaining rows from 0.
global_temp_dt = (
    global_temp
    .loc[:, ['dt', 'LandAverageTemperature', 'LandAverageTemperatureUncertainty']]
    .dropna(subset=['LandAverageTemperature'])
    .reset_index(drop=True)
)
global_temp_dt.head()

Unnamed: 0,dt,LandAverageTemperature,LandAverageTemperatureUncertainty
0,1750-01-01,3.034,3.574
1,1750-02-01,3.083,3.702
2,1750-03-01,5.626,3.076
3,1750-04-01,8.49,2.451
4,1750-05-01,11.573,2.072


In [4]:
# Turn the reported uncertainty into an explicit band around the average
# (average + uncertainty / average - uncertainty), then drop the raw
# uncertainty column, which is no longer needed.
# `drop(columns=...)` replaces the positional `axis=1` argument, which
# pandas 2.0 no longer accepts.
global_temp_dt = (
    global_temp_dt
    .assign(
        LandUncerMax=lambda df: df['LandAverageTemperature'] + df['LandAverageTemperatureUncertainty'],
        LandUncerMin=lambda df: df['LandAverageTemperature'] - df['LandAverageTemperatureUncertainty'],
    )
    .drop(columns='LandAverageTemperatureUncertainty')
)
global_temp_dt.head()

Unnamed: 0,dt,LandAverageTemperature,LandUncerMax,LandUncerMin
0,1750-01-01,3.034,6.608,-0.54
1,1750-02-01,3.083,6.785,-0.619
2,1750-03-01,5.626,8.702,2.55
3,1750-04-01,8.49,10.941,6.039
4,1750-05-01,11.573,13.645,9.501


### The second data set (Major City)
This data set contains the temperatures of major cities, together with their location information. Besides dropping the unnecessary data, I have to convert the location info from strings to float values. I also set the city name as the index.

In [5]:
# Load the per-city records, keep only rows that have a measured average
# temperature, and renumber the remaining rows from 0.
MCity_temp = (
    pd.read_csv('GlobalLandTemperaturesByMajorCity.csv')
    .dropna(subset=['AverageTemperature'])
    .reset_index(drop=True)
)
MCity_temp.head()

Unnamed: 0,dt,AverageTemperature,AverageTemperatureUncertainty,City,Country,Latitude,Longitude
0,1849-01-01,26.704,1.435,Abidjan,Côte D'Ivoire,5.63N,3.23W
1,1849-02-01,27.434,1.362,Abidjan,Côte D'Ivoire,5.63N,3.23W
2,1849-03-01,28.101,1.612,Abidjan,Côte D'Ivoire,5.63N,3.23W
3,1849-04-01,26.14,1.387,Abidjan,Côte D'Ivoire,5.63N,3.23W
4,1849-05-01,25.427,1.2,Abidjan,Côte D'Ivoire,5.63N,3.23W


In [6]:
def convert(tude):
    """Convert a coordinate string such as '5.63N' or '3.23W' to a signed float.

    The last character is the hemisphere letter: 'N' and 'E' give a positive
    value, 'S' and 'W' a negative one. Surrounding whitespace is ignored and
    the hemisphere letter is matched case-insensitively.

    Parameters
    ----------
    tude : str
        Latitude or longitude written as ``<magnitude><hemisphere>``.

    Returns
    -------
    float
        The signed coordinate.

    Raises
    ------
    ValueError
        If the string is empty, does not end in one of N/S/E/W, or the
        magnitude part is not a valid number.
    """
    text = tude.strip()
    if not text:
        raise ValueError('empty coordinate string')

    hemisphere = text[-1].upper()
    if hemisphere in ('N', 'E'):
        sign = 1
    elif hemisphere in ('S', 'W'):
        sign = -1
    else:
        # Without this check a value with no hemisphere letter (e.g. '5.63')
        # would silently lose its last digit and come back negative.
        raise ValueError('unknown hemisphere in coordinate: %r' % (tude,))

    return sign * float(text[:-1])

In [7]:
# Summarise each city: mean temperature over all its records plus its
# latitude as a signed float.
MCity_dt = MCity_temp.groupby(['City'])
MCity_mean = MCity_dt.AverageTemperature.mean()
# The first latitude string found in each city's group (e.g. '5.63N').
MCity_Lat = MCity_dt.Latitude.first()
MCity_new_dt = pd.DataFrame(MCity_mean)
# Convert all latitude strings in one vectorised pass. This replaces the
# row-by-row loop over `.ix`, an indexer that pandas 1.0 removed, and leaves
# the column with a float dtype. Both Series share the groupby's City index,
# so the assignment aligns by city name.
MCity_new_dt['Latitude'] = MCity_Lat.map(convert)
MCity_new_dt.head()

Unnamed: 0_level_0,AverageTemperature,Latitude
City,Unnamed: 1_level_1,Unnamed: 2_level_1
Abidjan,26.163737,5.63
Addis Abeba,17.525073,8.84
Ahmadabad,26.529853,23.31
Aleppo,17.370587,36.17
Alexandria,20.312617,31.35


### The third data set (Country)
In this part, I compute the average temperature of each country and sort the countries by that value.
I also construct a pivot table with the countries as rows and the months as columns. It will be used as the feature table for the later clustering step.

In [11]:
# Load the per-country monthly records and peek at the first five rows;
# missing measurements show up as NaN.
Country_temp = pd.read_csv('GlobalLandTemperaturesByCountry.csv')
Country_temp.head(n=5)

Unnamed: 0,dt,AverageTemperature,AverageTemperatureUncertainty,Country
0,1743-11-01,4.384,2.294,Åland
1,1743-12-01,,,Åland
2,1744-01-01,,,Åland
3,1744-02-01,,,Åland
4,1744-03-01,,,Åland


In [12]:
# Drop months without a measurement and renumber the rows. The original cell
# called reset_index() but discarded the result, so the index was never
# actually reset; the result is now assigned, as in the other loading cells.
Country_temp = (
    Country_temp[Country_temp.AverageTemperature.notnull()]
    .reset_index(drop=True)
)
# Mean temperature per country over all its records, hottest first.
Country_dt = (
    Country_temp
    .groupby('Country')
    .AverageTemperature
    .mean()
    .sort_values(ascending=False)
)
Country_dt

Country
Djibouti                                     28.816603
Mali                                         28.441977
Burkina Faso                                 28.083507
Senegal                                      27.967375
Aruba                                        27.920390
United Arab Emirates                         27.693995
Mauritania                                   27.620256
Gambia                                       27.538552
Niger                                        27.458973
Curaçao                                      27.353415
Palau                                        27.216379
Bonaire, Saint Eustatius And Saba            27.173295
Benin                                        27.171999
Palmyra Atoll                                27.163456
Kingman Reef                                 27.133034
Chad                                         27.120466
Sudan                                        27.093359
Northern Mariana Islands                     26.996834
Gu

In [13]:
# Build the country x month feature table: one row per country, one column
# per calendar month (1-12), each cell the mean temperature of that month.
# The CSV is re-read so this cell does not depend on how earlier cells left
# `Country_temp`.
Country_temp = pd.read_csv('GlobalLandTemperaturesByCountry.csv')
Country_temp = (
    Country_temp[Country_temp.AverageTemperature.notnull()]
    # The original called reset_index() without keeping the result; assign it.
    .reset_index(drop=True)
    # `columns=` replaces the positional axis argument removed in pandas 2.0.
    .drop(columns='AverageTemperatureUncertainty')
)
Country_temp['dt'] = pd.to_datetime(Country_temp['dt'])
# Vectorised month extraction instead of a Python loop over every timestamp.
Country_temp_month = Country_temp.assign(Month=Country_temp['dt'].dt.month)
# pivot_table aggregates with the mean by default.
table = pd.pivot_table(
    Country_temp_month,
    index='Country',
    columns='Month',
    values='AverageTemperature',
)
table

Month,1,2,3,4,5,6,7,8,9,10,11,12
Country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
Afghanistan,0.469763,2.916785,8.416103,14.499520,20.288477,24.933201,26.932966,25.220417,20.318960,13.864779,7.718598,2.545520
Africa,20.942925,22.237892,23.901484,25.005126,25.645503,25.516013,25.211234,25.364570,25.677854,24.970178,23.182083,21.196174
Albania,2.860728,4.284409,7.354585,11.295094,16.038395,19.978852,22.440231,22.134456,18.409932,13.508053,8.635586,4.482817
Algeria,11.820880,14.428823,18.377369,22.498071,27.148478,31.475217,33.662695,32.918013,29.470013,23.772215,17.542138,12.611409
American Samoa,26.789485,26.934397,27.066556,26.951059,26.880854,26.544625,26.211162,26.187563,26.259409,26.278772,26.562711,26.680459
Andorra,3.424238,4.910091,7.143283,9.542763,13.338087,17.253670,19.955258,19.629966,16.380616,11.625863,7.201779,4.089574
Angola,22.572186,22.836319,22.966097,22.432434,20.997411,18.881944,18.648312,20.513902,22.746606,23.502785,23.192923,22.590590
Anguilla,24.850431,24.862206,25.264206,25.810389,26.878926,27.700918,28.056057,28.169914,27.981989,27.508468,26.773538,25.501277
Antigua And Barbuda,24.762489,24.766777,25.190794,25.799440,26.847486,27.541456,27.815834,27.902274,27.649494,27.132216,26.497335,25.376486
Argentina,21.246563,20.399114,18.195032,14.279722,10.835572,7.996956,7.746182,9.467031,12.133747,15.044646,18.081025,20.170759
