In [2]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [3]:
data=pd.read_csv('sudeste.csv')

In [4]:
data.head(5)

Unnamed: 0,wsid,wsnm,elvt,lat,lon,inme,city,prov,mdct,date,...,tmax,dmax,tmin,dmin,hmdy,hmax,hmin,wdsp,wdct,gust
0,178,SÃO GONÇALO,237.0,-6.835777,-38.311583,A333,São Gonçalo,RJ,2007-11-06 00:00:00,2007-11-06,...,29.7,16.8,25.5,10.8,35.0,58.0,32.0,3.2,101.0,6.5
1,178,SÃO GONÇALO,237.0,-6.835777,-38.311583,A333,São Gonçalo,RJ,2007-11-06 01:00:00,2007-11-06,...,29.9,13.6,29.0,12.2,39.0,39.0,35.0,3.6,94.0,6.4
2,178,SÃO GONÇALO,237.0,-6.835777,-38.311583,A333,São Gonçalo,RJ,2007-11-06 02:00:00,2007-11-06,...,29.0,14.0,27.4,13.6,44.0,44.0,39.0,2.5,93.0,6.9
3,178,SÃO GONÇALO,237.0,-6.835777,-38.311583,A333,São Gonçalo,RJ,2007-11-06 03:00:00,2007-11-06,...,27.4,16.9,25.8,14.1,58.0,58.0,44.0,1.7,96.0,5.8
4,178,SÃO GONÇALO,237.0,-6.835777,-38.311583,A333,São Gonçalo,RJ,2007-11-06 04:00:00,2007-11-06,...,26.3,17.0,25.3,16.4,57.0,58.0,56.0,3.1,110.0,7.5


In [5]:
df=data.copy()

## General Information
- wsid: weather station id
- wsnm: city location of station
- elvt: elevation
- lat: latitude
- lon: longtitude
- inme: station number
- city: city
- prov: province
- mcdt: observation datetime

## Technical Information
- prcp: amount of precipitation in millimeter (last hour)
- stp: air pressure for the hour in hPa to tenths
- smax: maximum air pressure
- smin: minimum air pressure
- gbrd: solar radiation KJ/m2
- temp: air temperature in celsius degrees
- dewp: dew point temperature in celsius degree
- tmax: maximum temperature for the last hour in celsius degrees
- dmax: maximum dew point temperature for the last hour in celsius degrees
- tmin: minimum temperature for the last hour in celsius degrees
- dmin: minimum dew point temperature for the last hour in celsius degrees
- hmdy: relative humid in %
- hmax: maximum relative humid temperature for the last hour in %
- hmin: minimum relative humid temperature for the last hour in %
- wdsp: wind speed in metres per second
- wdct: wind direction in radius degrees (0-360)
- gust: wind gust in metres per second

In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9779168 entries, 0 to 9779167
Data columns (total 31 columns):
 #   Column  Dtype  
---  ------  -----  
 0   wsid    int64  
 1   wsnm    object 
 2   elvt    float64
 3   lat     float64
 4   lon     float64
 5   inme    object 
 6   city    object 
 7   prov    object 
 8   mdct    object 
 9   date    object 
 10  yr      int64  
 11  mo      int64  
 12  da      int64  
 13  hr      int64  
 14  prcp    float64
 15  stp     float64
 16  smax    float64
 17  smin    float64
 18  gbrd    float64
 19  temp    float64
 20  dewp    float64
 21  tmax    float64
 22  dmax    float64
 23  tmin    float64
 24  dmin    float64
 25  hmdy    float64
 26  hmax    float64
 27  hmin    float64
 28  wdsp    float64
 29  wdct    float64
 30  gust    float64
dtypes: float64(20), int64(5), object(6)
memory usage: 2.3+ GB


In [7]:
df.isnull().sum()

wsid          0
wsnm          0
elvt          0
lat           0
lon           0
inme          0
city          0
prov          0
mdct          0
date          0
yr            0
mo            0
da            0
hr            0
prcp    8371184
stp           0
smax          0
smin          0
gbrd    4108820
temp         31
dewp        475
tmax         26
dmax        310
tmin         34
dmin        807
hmdy          0
hmax         12
hmin         44
wdsp     925561
wdct          0
gust     316474
dtype: int64

In [8]:
df.drop(['lat','lon','mdct','wsid','inme','date'],axis=1,inplace=True)

In [9]:
df.drop('wsnm',axis=1,inplace=True)

In [10]:
df.drop(['prcp','gbrd'],axis=1,inplace=True)

In [11]:
df.head(5)

Unnamed: 0,elvt,city,prov,yr,mo,da,hr,stp,smax,smin,...,tmax,dmax,tmin,dmin,hmdy,hmax,hmin,wdsp,wdct,gust
0,237.0,São Gonçalo,RJ,2007,11,6,0,982.5,982.5,981.3,...,29.7,16.8,25.5,10.8,35.0,58.0,32.0,3.2,101.0,6.5
1,237.0,São Gonçalo,RJ,2007,11,6,1,983.2,983.2,982.5,...,29.9,13.6,29.0,12.2,39.0,39.0,35.0,3.6,94.0,6.4
2,237.0,São Gonçalo,RJ,2007,11,6,2,983.5,983.5,983.2,...,29.0,14.0,27.4,13.6,44.0,44.0,39.0,2.5,93.0,6.9
3,237.0,São Gonçalo,RJ,2007,11,6,3,983.7,983.7,983.4,...,27.4,16.9,25.8,14.1,58.0,58.0,44.0,1.7,96.0,5.8
4,237.0,São Gonçalo,RJ,2007,11,6,4,983.7,983.8,983.6,...,26.3,17.0,25.3,16.4,57.0,58.0,56.0,3.1,110.0,7.5


In [12]:
df['date']=pd.to_datetime(df[['yr', 'mo', 'da','hr']].rename(columns={'yr': 'year','mo': 'month','da': 'day','hr':'hour'}))

In [13]:
df.drop(['yr','mo','da','hr'],axis=1,inplace=True)

In [14]:
df.head(5)

Unnamed: 0,elvt,city,prov,stp,smax,smin,temp,dewp,tmax,dmax,tmin,dmin,hmdy,hmax,hmin,wdsp,wdct,gust,date
0,237.0,São Gonçalo,RJ,982.5,982.5,981.3,29.3,12.1,29.7,16.8,25.5,10.8,35.0,58.0,32.0,3.2,101.0,6.5,2007-11-06 00:00:00
1,237.0,São Gonçalo,RJ,983.2,983.2,982.5,29.0,13.5,29.9,13.6,29.0,12.2,39.0,39.0,35.0,3.6,94.0,6.4,2007-11-06 01:00:00
2,237.0,São Gonçalo,RJ,983.5,983.5,983.2,27.4,14.0,29.0,14.0,27.4,13.6,44.0,44.0,39.0,2.5,93.0,6.9,2007-11-06 02:00:00
3,237.0,São Gonçalo,RJ,983.7,983.7,983.4,25.8,16.9,27.4,16.9,25.8,14.1,58.0,58.0,44.0,1.7,96.0,5.8,2007-11-06 03:00:00
4,237.0,São Gonçalo,RJ,983.7,983.8,983.6,25.4,16.4,26.3,17.0,25.3,16.4,57.0,58.0,56.0,3.1,110.0,7.5,2007-11-06 04:00:00


In [23]:
df.dtypes

elvt           float64
city            object
prov            object
stp            float64
smax           float64
smin           float64
temp           float64
dewp           float64
tmax           float64
dmax           float64
tmin           float64
dmin           float64
hmdy           float64
hmax           float64
hmin           float64
wdsp           float64
wdct           float64
gust           float64
date    datetime64[ns]
dtype: object

In [22]:
nan_val=df[(df['temp']==df['temp'].isnull()) & (df['tmin']==df['tmin'].isnull()) & (df['tmax']==df['tmax'].isnull())]

In [23]:
df.drop(nan_val.index,inplace=True)

In [26]:
nan_val2=df[(df['dewp']==df['dewp'].isnull()) & (df['dmin']==df['dmin'].isnull()) & (df['dmax']==df['dmax'].isnull())]

In [27]:
df.drop(nan_val2.index,inplace=True)

In [47]:
#nan_val3=df[(df['wdsp']==df['wdsp'].isnull()) & (df['gust']==df['gust'].isnull())]

In [49]:
#df.drop(nan_val3.index,inplace=True)

In [28]:
df.isnull().sum()

elvt         0
city         0
prov         0
stp          0
smax         0
smin         0
temp        31
dewp       475
tmax        26
dmax       310
tmin        34
dmin       807
hmdy         0
hmax         0
hmin        24
wdsp    917430
wdct         0
gust    313077
date         0
dtype: int64

In [64]:
df.head()

Unnamed: 0,elvt,city,prov,stp,smax,smin,temp,dewp,tmax,dmax,tmin,dmin,hmdy,hmax,hmin,wdsp,wdct,gust,date
0,237.0,São Gonçalo,RJ,982.5,982.5,981.3,29.3,12.1,29.7,16.8,25.5,10.8,35.0,58.0,32.0,3.2,101.0,6.5,2007-11-06 00:00:00
1,237.0,São Gonçalo,RJ,983.2,983.2,982.5,29.0,13.5,29.9,13.6,29.0,12.2,39.0,39.0,35.0,3.6,94.0,6.4,2007-11-06 01:00:00
2,237.0,São Gonçalo,RJ,983.5,983.5,983.2,27.4,14.0,29.0,14.0,27.4,13.6,44.0,44.0,39.0,2.5,93.0,6.9,2007-11-06 02:00:00
3,237.0,São Gonçalo,RJ,983.7,983.7,983.4,25.8,16.9,27.4,16.9,25.8,14.1,58.0,58.0,44.0,1.7,96.0,5.8,2007-11-06 03:00:00
4,237.0,São Gonçalo,RJ,983.7,983.8,983.6,25.4,16.4,26.3,17.0,25.3,16.4,57.0,58.0,56.0,3.1,110.0,7.5,2007-11-06 04:00:00


### Air pressure analysis

In [30]:
air=df[['date','city','elvt','prov','stp','smax','smin']]

In [31]:
air.head()

Unnamed: 0,date,city,elvt,prov,stp,smax,smin
0,2007-11-06 00:00:00,São Gonçalo,237.0,RJ,982.5,982.5,981.3
1,2007-11-06 01:00:00,São Gonçalo,237.0,RJ,983.2,983.2,982.5
2,2007-11-06 02:00:00,São Gonçalo,237.0,RJ,983.5,983.5,983.2
3,2007-11-06 03:00:00,São Gonçalo,237.0,RJ,983.7,983.7,983.4
4,2007-11-06 04:00:00,São Gonçalo,237.0,RJ,983.7,983.8,983.6


In [32]:
air.set_index('date',inplace=True)

In [33]:
air.head()

Unnamed: 0_level_0,city,elvt,prov,stp,smax,smin
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2007-11-06 00:00:00,São Gonçalo,237.0,RJ,982.5,982.5,981.3
2007-11-06 01:00:00,São Gonçalo,237.0,RJ,983.2,983.2,982.5
2007-11-06 02:00:00,São Gonçalo,237.0,RJ,983.5,983.5,983.2
2007-11-06 03:00:00,São Gonçalo,237.0,RJ,983.7,983.7,983.4
2007-11-06 04:00:00,São Gonçalo,237.0,RJ,983.7,983.8,983.6


In [34]:
air.isnull().sum()

city    0
elvt    0
prov    0
stp     0
smax    0
smin    0
dtype: int64

In [35]:
air.index.year

Int64Index([2007, 2007, 2007, 2007, 2007, 2007, 2007, 2007, 2007, 2007,
            ...
            2016, 2016, 2016, 2016, 2016, 2016, 2016, 2016, 2016, 2016],
           dtype='int64', name='date', length=9077472)

In [44]:
air.groupby('city')
for i,g in air.groupby('city'):
    if i == 'Aimorés':
        print(g)
        print()

                        city   elvt prov    stp   smax   smin
date                                                         
2007-08-06 00:00:00  Aimorés  288.0   MG  998.0  998.0  997.5
2007-08-06 01:00:00  Aimorés  288.0   MG  998.6  998.6  998.0
2007-08-06 02:00:00  Aimorés  288.0   MG  999.0  999.1  998.6
2007-08-06 04:00:00  Aimorés  288.0   MG  998.6  998.9  998.5
2007-08-06 05:00:00  Aimorés  288.0   MG  998.5  998.6  998.4
...                      ...    ...  ...    ...    ...    ...
2016-09-30 19:00:00  Aimorés  288.0   MG  977.2  977.4  977.1
2016-09-30 20:00:00  Aimorés  288.0   MG  977.7  977.7  977.2
2016-09-30 21:00:00  Aimorés  288.0   MG  978.1  978.2  977.6
2016-09-30 22:00:00  Aimorés  288.0   MG  979.4  979.6  978.1
2016-09-30 23:00:00  Aimorés  288.0   MG  980.6  980.7  979.3

[74305 rows x 6 columns]

