In [1]:
import io
import urllib.error
import urllib.request

import numpy as np
import pandas as pd

url = 'https://raw.githubusercontent.com/guipsamora/pandas_exercises/master/06_Stats/Wind_Stats/wind.data'
# Open the remote data file. Only the failures urlopen is expected to raise
# are caught, so programming errors and Ctrl-C are no longer swallowed.
try:
    fhand = urllib.request.urlopen(url)
except (urllib.error.URLError, ValueError) as err:
    # URLError (and its subclass HTTPError) covers network/HTTP failures;
    # ValueError is raised by urlopen for a malformed URL string.
    print('URL invalid')
    # Exit with a non-zero status so callers can tell the download failed.
    raise SystemExit(1) from err
    
def create_datetime(df):
    """Add a ``datetime`` column built from the ``Yr``, ``Mo`` and ``Dy`` columns.

    ``Yr`` is interpreted as an offset from 1900 and is overwritten with the
    resulting four-digit year. The frame passed in is modified in place and
    is also returned.

    Args:
        df: DataFrame with numeric ``Yr``, ``Mo`` and ``Dy`` columns.

    Returns:
        The same DataFrame object, with ``Yr`` increased by 1900 and a new
        ``datetime`` column holding the assembled dates.

    Raises:
        ValueError: If a row does not form a valid calendar date.
    """
    df['Yr'] = 1900 + df['Yr']
    # Let pandas assemble the dates from the numeric components directly,
    # instead of formatting every row as a string and parsing it back.
    df['datetime'] = pd.to_datetime(
        {'year': df['Yr'], 'month': df['Mo'], 'day': df['Dy']}
    )
    return df

#Steps 3, 4 and 5
# Read the whitespace-delimited file into a DataFrame. The response is used
# as a context manager so the HTTP connection is closed once it has been read.
with fhand:
    # Raw string: '\s' is a regex escape, not a valid Python string escape.
    data = pd.read_csv(io.StringIO(fhand.read().decode('utf-8')), sep=r'\s+')
# Build the 'datetime' column from Yr/Mo/Dy, then use it as the index.
data = create_datetime(data)
print(f"Data types from datetime: {data['datetime'].dtypes}\n\n")
data = data.set_index('datetime')

#Step 6
# The first three columns are skipped; the remaining ones are the
# per-location readings.
readings = data.iloc[:, 3:]
missing_per_loc = readings.isna().sum()
print(f"Total of \'NaN\' per location\n{missing_per_loc}\n\n")

#Step 7
# count() gives the non-missing entries per column; sum them for the total.
present_total = readings.count().sum()
print(f"Total of non-missing values is {present_total}\n\n")

#Step 8
# Mean over every individual reading: total of all values divided by the
# number of non-missing values. Averaging the per-location means instead
# would weight each location equally even though their NaN counts differ.
wind = data.iloc[:, 3:]
overall_mean = wind.sum().sum() / wind.notna().sum().sum()
print(f"Average of windspeeds is {overall_mean:.2f}\n\n")

#Step 9
# Column-wise summary statistics for the per-location readings
# (everything after the first three columns).
loc_winds = data.iloc[:, 3:]
min_loc = loc_winds.min()
max_loc = loc_winds.max()
mean_loc = loc_winds.mean().round(2)
std_loc = loc_winds.std().round(2)
# One row per location, one column per statistic.
loc_stats = pd.DataFrame(
    {
        'min': min_loc,
        'max': max_loc,
        'mean': mean_loc,
        'std': std_loc,
    }
)
print(f"Min, max, mean and std of windspeeds at each location\n{loc_stats}\n\n")

#Step 10
# There is one record per day, so the statistics are computed across the
# columns of each row rather than with groupby (which would also introduce a
# second 'datetime' column).
day_winds = data.iloc[:, 3:]
# Vectorised axis=1 reductions skip NaN consistently; applying the built-in
# min/max row by row does not reliably ignore missing readings.
min_day = day_winds.min(axis=1)
max_day = day_winds.max(axis=1)
mean_day = day_winds.mean(axis=1).round(2)
# ddof=0 (population std) keeps the values this step produced with np.std.
# NOTE(review): steps 9 and 15 use the pandas default (sample std, ddof=1);
# confirm which definition is wanted here.
std_day = day_winds.std(axis=1, ddof=0).round(2)
day_stats = pd.DataFrame({'min': min_day, 'max': max_day, 'mean': mean_day, 'std': std_day})
print(f"Stats per day\n{day_stats}\n\n")

#Step 11
# Keep the month number plus the location columns, then average per month.
aux = data[['Mo','RPT', 'VAL', 'ROS', 'KIL', 'SHA', 'BIR', 'DUB', 'CLA', 'MUL', 'CLO', 'BEL', 'MAL']]
# Select January by its label (Mo == 1) instead of by position, so a dataset
# without January raises KeyError rather than silently reporting another month.
avg_jan = aux.groupby('Mo').mean().loc[1].round(2)
print(f"Average windspeed in January for each location\n{avg_jan}\n\n")

#Step 12
# Resampler.pad() was deprecated and later removed from pandas; ffill() is the
# equivalent forward-fill and is available in old and new versions alike.
# NOTE(review): recent pandas releases spell the year-end / month-end aliases
# 'YE' / 'ME' -- confirm against the pandas version this runs on.
yearly = data.resample('Y').ffill()
print(f"Downsample yearly\n{yearly}\n\n")

#Step 13
monthly = data.resample('M').ffill()
print(f"Downsample monthly\n{monthly}\n\n")

#Step 14
#https://pandas.pydata.org/pandas-docs/stable/user_guide/timeseries.html#timeseries-offset-aliases
# 'W-MON' is the weekly frequency anchored on Mondays.
weekly = data.resample('W-MON').ffill()
print(f"Downsample weekly\n{weekly}\n\n")

#Step 15
# Rows 1..364 of the location columns, grouped into Monday-anchored weekly
# bins that are closed and labelled on their left edge. The slice and the
# resampler are built once and shared by all four statistics.
weekly61 = data.iloc[1:365, 3:].resample('W-MON', label='left', closed='left')
weekly_min = weekly61.min()
weekly_max = weekly61.max()
weekly_std = weekly61.std().round(2)
weekly_avg = weekly61.mean().round(2)
# The dict keys become the outer level of the resulting column MultiIndex.
weekly_stats61 = pd.concat(
    {'min': weekly_min, 'max': weekly_max, 'avg': weekly_avg, 'std': weekly_std},
    axis=1,
)
print(f"Weekly stats from 1961\n{weekly_stats61}")

Data types from datetime: datetime64[ns]


Total of 'NaN' per location
RPT    6
VAL    3
ROS    2
KIL    5
SHA    2
BIR    0
DUB    3
CLA    2
MUL    3
CLO    1
BEL    0
MAL    4
dtype: int64


Total of non-missing values is 78857


Average of windspeeds is 10.23


Min, max, mean and std of windspeeds at each location
      min    max   mean   std
RPT  0.67  35.80  12.36  5.62
VAL  0.21  33.37  10.64  5.27
ROS  1.50  33.84  11.66  5.01
KIL  0.00  28.46   6.31  3.61
SHA  0.13  37.54  10.46  4.94
BIR  0.00  26.16   7.09  3.97
DUB  0.00  30.37   9.80  4.98
CLA  0.00  31.08   8.50  4.50
MUL  0.00  25.88   8.49  4.17
CLO  0.04  28.21   8.71  4.50
BEL  0.13  42.38  13.12  5.84
MAL  0.67  42.54  15.60  6.70


Stats per day
              min    max   mean   std
datetime                             
1961-01-01   9.29  18.50  13.02  2.68
1961-01-02   6.50  17.54  11.34  3.04
1961-01-03   6.17  18.50  11.64  3.51
1961-01-04   1.79  11.75   6.62  3.06
1961-01-05   6.17  13.33  10.63  2.34
1961-01-


Weekly stats from 1961
              avg                                                          \
              RPT    VAL    ROS    KIL    SHA    BIR    DUB    CLA    MUL   
datetime                                                                    
1961-01-02  13.54  11.49  10.49   6.42   9.47   6.44  11.06   6.62   8.43   
1961-01-09  12.47   8.97  11.96   4.63   7.35   5.07   7.54   6.82   5.71   
1961-01-16  13.20   9.86  12.98   6.33   8.97   7.42   9.26   7.88   7.15   
1961-01-23  19.88  16.14  18.23  12.72  17.43  14.83  15.53  15.16  14.48   
1961-01-30  16.83  15.46  12.62   8.25  13.36   9.11  12.20   8.55   9.82   
1961-02-06  19.68  16.42  17.30  10.77  14.72  12.52  14.93  14.85  14.06   
1961-02-13  15.13  15.09  13.80  10.08  13.41  11.87   9.54  12.13  12.38   
1961-02-20  15.22  13.63  14.33   8.52  13.66  10.11  11.15  10.88  10.39   
1961-02-27  12.10  12.95  11.06   7.83  12.10   9.24  10.23  11.13  10.38   
1961-03-06   9.38  11.58  10.85   7.14  10.94   9.49