# Data analysis of Meteorological Data (Weather datasets)

In [1]:
# name - Jay Prakash Bind
# email - jaypr202@gmail.com
# mob no - 8887624847

#### Importing Libraries

In [2]:
import warnings
warnings.filterwarnings('ignore')
import os
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
%matplotlib inline

#### Read dataset

In [3]:
df=pd.read_csv('weatherHistory.csv')
df.head()

Unnamed: 0,Formatted Date,Summary,Precip Type,Temperature (C),Apparent Temperature (C),Humidity,Wind Speed (km/h),Wind Bearing (degrees),Visibility (km),Pressure (millibars),Daily Summary
0,2006-04-01 00:00:00.000 +0200,Partly Cloudy,rain,9.472222,7.388889,0.89,14.1197,251,15.8263,1015.13,Partly cloudy throughout the day.
1,2006-04-01 01:00:00.000 +0200,Partly Cloudy,rain,9.355556,7.227778,0.86,14.2646,259,15.8263,1015.63,Partly cloudy throughout the day.
2,2006-04-01 02:00:00.000 +0200,Mostly Cloudy,rain,9.377778,9.377778,0.89,3.9284,204,14.9569,1015.94,Partly cloudy throughout the day.
3,2006-04-01 03:00:00.000 +0200,Partly Cloudy,rain,8.288889,5.944444,0.83,14.1036,269,15.8263,1016.41,Partly cloudy throughout the day.
4,2006-04-01 04:00:00.000 +0200,Mostly Cloudy,rain,8.755556,6.977778,0.83,11.0446,259,15.8263,1016.51,Partly cloudy throughout the day.


In [4]:
df.columns

Index(['Formatted Date', 'Summary', 'Precip Type', 'Temperature (C)',
       'Apparent Temperature (C)', 'Humidity', 'Wind Speed (km/h)',
       'Wind Bearing (degrees)', 'Visibility (km)', 'Pressure (millibars)',
       'Daily Summary'],
      dtype='object')

In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 96453 entries, 0 to 96452
Data columns (total 11 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Formatted Date            96453 non-null  object 
 1   Summary                   96453 non-null  object 
 2   Precip Type               95936 non-null  object 
 3   Temperature (C)           96453 non-null  float64
 4   Apparent Temperature (C)  96453 non-null  float64
 5   Humidity                  96453 non-null  float64
 6   Wind Speed (km/h)         96453 non-null  float64
 7   Wind Bearing (degrees)    96453 non-null  int64  
 8   Visibility (km)           96453 non-null  float64
 9   Pressure (millibars)      96453 non-null  float64
 10  Daily Summary             96453 non-null  object 
dtypes: float64(6), int64(1), object(4)
memory usage: 8.1+ MB


In [6]:
df.shape

(96453, 11)

In [7]:
df.describe()

Unnamed: 0,Temperature (C),Apparent Temperature (C),Humidity,Wind Speed (km/h),Wind Bearing (degrees),Visibility (km),Pressure (millibars)
count,96453.0,96453.0,96453.0,96453.0,96453.0,96453.0,96453.0
mean,11.932678,10.855029,0.734899,10.81064,187.509232,10.347325,1003.235956
std,9.551546,10.696847,0.195473,6.913571,107.383428,4.192123,116.969906
min,-21.822222,-27.716667,0.0,0.0,0.0,0.0,0.0
25%,4.688889,2.311111,0.6,5.8282,116.0,8.3398,1011.9
50%,12.0,12.0,0.78,9.9659,180.0,10.0464,1016.45
75%,18.838889,18.838889,0.89,14.1358,290.0,14.812,1021.09
max,39.905556,39.344444,1.0,63.8526,359.0,16.1,1046.38


In [8]:
df.isnull().any()

Formatted Date              False
Summary                     False
Precip Type                  True
Temperature (C)             False
Apparent Temperature (C)    False
Humidity                    False
Wind Speed (km/h)           False
Wind Bearing (degrees)      False
Visibility (km)             False
Pressure (millibars)        False
Daily Summary               False
dtype: bool

In [9]:
df.isnull().all()

Formatted Date              False
Summary                     False
Precip Type                 False
Temperature (C)             False
Apparent Temperature (C)    False
Humidity                    False
Wind Speed (km/h)           False
Wind Bearing (degrees)      False
Visibility (km)             False
Pressure (millibars)        False
Daily Summary               False
dtype: bool

In [10]:
# percentage of null value
round(100*(df.isnull().sum()/len(df.index)),2)

Formatted Date              0.00
Summary                     0.00
Precip Type                 0.54
Temperature (C)             0.00
Apparent Temperature (C)    0.00
Humidity                    0.00
Wind Speed (km/h)           0.00
Wind Bearing (degrees)      0.00
Visibility (km)             0.00
Pressure (millibars)        0.00
Daily Summary               0.00
dtype: float64

In [11]:
df.isnull().sum()

Formatted Date                0
Summary                       0
Precip Type                 517
Temperature (C)               0
Apparent Temperature (C)      0
Humidity                      0
Wind Speed (km/h)             0
Wind Bearing (degrees)        0
Visibility (km)               0
Pressure (millibars)          0
Daily Summary                 0
dtype: int64

In [12]:
df['Precip Type'].value_counts()

rain    85224
snow    10712
Name: Precip Type, dtype: int64

In [13]:
# fill null value by rain because rain have much greate value
df.loc[df['Precip Type'].isnull(), 'Precip Type']='rain'

In [14]:
# percentage of null value
round(100*(df.isnull().sum()/len(df.index)),2)

# no null value

Formatted Date              0.0
Summary                     0.0
Precip Type                 0.0
Temperature (C)             0.0
Apparent Temperature (C)    0.0
Humidity                    0.0
Wind Speed (km/h)           0.0
Wind Bearing (degrees)      0.0
Visibility (km)             0.0
Pressure (millibars)        0.0
Daily Summary               0.0
dtype: float64

## Exploratory data analysis

In [15]:
df.loc[df['Precip Type']=='rain', 'Precip Type']=1
df.loc[df['Precip Type']=='snow', 'Precip Type']=0

In [16]:
df.dtypes

Formatted Date               object
Summary                      object
Precip Type                  object
Temperature (C)             float64
Apparent Temperature (C)    float64
Humidity                    float64
Wind Speed (km/h)           float64
Wind Bearing (degrees)        int64
Visibility (km)             float64
Pressure (millibars)        float64
Daily Summary                object
dtype: object

In [17]:
df.tail()

Unnamed: 0,Formatted Date,Summary,Precip Type,Temperature (C),Apparent Temperature (C),Humidity,Wind Speed (km/h),Wind Bearing (degrees),Visibility (km),Pressure (millibars),Daily Summary
96448,2016-09-09 19:00:00.000 +0200,Partly Cloudy,1,26.016667,26.016667,0.43,10.9963,31,16.1,1014.36,Partly cloudy starting in the morning.
96449,2016-09-09 20:00:00.000 +0200,Partly Cloudy,1,24.583333,24.583333,0.48,10.0947,20,15.5526,1015.16,Partly cloudy starting in the morning.
96450,2016-09-09 21:00:00.000 +0200,Partly Cloudy,1,22.038889,22.038889,0.56,8.9838,30,16.1,1015.66,Partly cloudy starting in the morning.
96451,2016-09-09 22:00:00.000 +0200,Partly Cloudy,1,21.522222,21.522222,0.6,10.5294,20,16.1,1015.95,Partly cloudy starting in the morning.
96452,2016-09-09 23:00:00.000 +0200,Partly Cloudy,1,20.438889,20.438889,0.61,5.8765,39,15.5204,1016.16,Partly cloudy starting in the morning.


In [18]:
# summary and daily summary is not necessary
df_num=df.drop(['Summary', 'Daily Summary','Formatted Date'], axis=1)
df_num.head()

Unnamed: 0,Precip Type,Temperature (C),Apparent Temperature (C),Humidity,Wind Speed (km/h),Wind Bearing (degrees),Visibility (km),Pressure (millibars)
0,1,9.472222,7.388889,0.89,14.1197,251,15.8263,1015.13
1,1,9.355556,7.227778,0.86,14.2646,259,15.8263,1015.63
2,1,9.377778,9.377778,0.89,3.9284,204,14.9569,1015.94
3,1,8.288889,5.944444,0.83,14.1036,269,15.8263,1016.41
4,1,8.755556,6.977778,0.83,11.0446,259,15.8263,1016.51


In [19]:
df_num.head()

Unnamed: 0,Precip Type,Temperature (C),Apparent Temperature (C),Humidity,Wind Speed (km/h),Wind Bearing (degrees),Visibility (km),Pressure (millibars)
0,1,9.472222,7.388889,0.89,14.1197,251,15.8263,1015.13
1,1,9.355556,7.227778,0.86,14.2646,259,15.8263,1015.63
2,1,9.377778,9.377778,0.89,3.9284,204,14.9569,1015.94
3,1,8.288889,5.944444,0.83,14.1036,269,15.8263,1016.41
4,1,8.755556,6.977778,0.83,11.0446,259,15.8263,1016.51


In [20]:
# split the datasets for x and y
y=df_num.pop('Temperature (C)')

In [21]:
x=df_num

In [22]:
x.head()

Unnamed: 0,Precip Type,Apparent Temperature (C),Humidity,Wind Speed (km/h),Wind Bearing (degrees),Visibility (km),Pressure (millibars)
0,1,7.388889,0.89,14.1197,251,15.8263,1015.13
1,1,7.227778,0.86,14.2646,259,15.8263,1015.63
2,1,9.377778,0.89,3.9284,204,14.9569,1015.94
3,1,5.944444,0.83,14.1036,269,15.8263,1016.41
4,1,6.977778,0.83,11.0446,259,15.8263,1016.51


In [23]:
y.head()

0    9.472222
1    9.355556
2    9.377778
3    8.288889
4    8.755556
Name: Temperature (C), dtype: float64

In [24]:
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x,y, test_size=0.2, random_state=4)

In [25]:
x_train

Unnamed: 0,Precip Type,Apparent Temperature (C),Humidity,Wind Speed (km/h),Wind Bearing (degrees),Visibility (km),Pressure (millibars)
70626,1,21.061111,0.31,12.5580,110,16.1000,1005.87
52457,1,25.016667,0.36,18.4989,352,10.3523,1025.36
90690,1,0.738889,0.89,17.1304,270,15.8263,1014.75
69528,1,13.772222,0.78,14.4900,300,15.8263,1014.56
92419,1,23.288889,0.82,6.3917,357,16.1000,1022.05
...,...,...,...,...,...,...,...
23346,1,6.783333,0.64,4.9427,316,9.9820,1012.35
11863,1,-1.600000,0.91,9.3541,181,4.8783,1027.02
92599,1,22.127778,0.59,3.1717,210,9.9820,1010.22
73902,1,22.666667,0.63,3.1878,240,16.1000,1013.33


#### model trainng

In [26]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import accuracy_score
from sklearn import preprocessing

In [27]:
model=RandomForestRegressor(max_depth=50, random_state=0, n_estimators=100)
model.fit(x_train, y_train)

RandomForestRegressor(max_depth=50, random_state=0)

In [29]:
# prediction
prediction=model.predict(x_test)
prediction

array([-2.27711111,  8.85983333,  9.82611111, ..., 12.77777778,
       15.27761111,  2.2       ])

In [30]:
np.mean((prediction - y_test)**2)

0.001877068885918295

In [32]:
asd=pd.DataFrame({'Actual': y_test,
             'Prediction': prediction,
              'difference': y_test - prediction
             })

In [33]:
asd.head()

Unnamed: 0,Actual,Prediction,difference
37443,-2.288889,-2.277111,-0.01177778
86534,8.861111,8.859833,0.001277778
2082,9.805556,9.826111,-0.02055555
53130,27.222222,27.218667,0.003555555
45196,17.705556,17.705556,3.552714e-15
