# Storm Data Machine Learning
## Big Data Applications
### Class: E534 | Group: fa18-523-57, fa18-523-58
### Fall 2018
### Indiana University

In [1]:
import warnings
warnings.filterwarnings("ignore")

In [2]:
import pymongo
import pandas as pd
import numpy as np

### Importing Data stored in Azure Cosmos DB

In [3]:
# Reading uri path from a file
config_file = open('cosmos_db.config','r')
uri_path = config_file.read()

In [4]:
client = pymongo.MongoClient(uri_path)

In [5]:
# Database Name and Collection Name
db = client['test']
collection = db['storm_data']

In [6]:
# Reading data from Cosmos DB as a Pandas Dataframe
from time import time
t0 = time()
mongo_data = pd.DataFrame(list(collection.find()))
print('Completed in ', np.round(time() - t0,3) , ' seconds')

Completed in  74.09  seconds


In [7]:
print('The number of documents in the collection is: ', len(mongo_data))

The number of documents in the collection is:  173263


In [8]:
mongo_data.shape 
# 173263 Rows, 13 Features/Columns

(173263, 13)

In [9]:
mongo_data.head()

Unnamed: 0,ADVISORY,ADV_DATE,ADV_HOUR,LAT,LEN,LONG_,NAME,PRESSURE,REGION,SPEED,STORM_NO,TYPE,_id
0,0,02/11/1979,12,-16.4,120480.811401,0.0,1979-02-11,0,I,-999,0,D,5bf4b61f44925f2a18e00ba8
1,0,10/02/1992,0,38.1,36249.206243,-21.8,BONNIE,1012,A,25,3,D,5bf4b61f44925f2a18e00ba9
2,0,06/21/1993,6,27.1,22263.9067,-98.3,ARLENE,1006,A,25,1,D,5bf4b61f44925f2a18e00baa
3,0,01/20/1975,6,-23.4,89055.6265,0.0,1975-01-19,0,S,-999,0,D,5bf4b61f44925f2a18e00bab
4,0,09/27/1988,18,30.9,141631.071665,-51.9,HELENE,979,A,77,9,H,5bf4b61f44925f2a18e00bac


### Pre-processing Data

In [10]:
mongo_data = mongo_data.drop_duplicates()
mongo_data.shape

(173263, 13)

In [11]:
# Dropping columns/features not needed
data = mongo_data.drop(columns=['_id','ADV_DATE','ADV_HOUR','NAME','LEN','LAT','LONG_'])

In [12]:
data.shape 

(173263, 6)

In [13]:
data.count()

ADVISORY    173263
PRESSURE    173263
REGION      173263
SPEED       173263
STORM_NO    173263
TYPE        173263
dtype: int64

In [14]:
data.isna().sum()

ADVISORY    0
PRESSURE    0
REGION      0
SPEED       0
STORM_NO    0
TYPE        0
dtype: int64

In [15]:
data.isnull().sum()

ADVISORY    0
PRESSURE    0
REGION      0
SPEED       0
STORM_NO    0
TYPE        0
dtype: int64

In [16]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 173263 entries, 0 to 173262
Data columns (total 6 columns):
ADVISORY    173263 non-null int64
PRESSURE    173263 non-null int64
REGION      173263 non-null object
SPEED       173263 non-null int64
STORM_NO    173263 non-null int64
TYPE        173263 non-null object
dtypes: int64(4), object(2)
memory usage: 9.3+ MB


In [17]:
data.TYPE.drop_duplicates()

0         D
4         H
6         S
19        U
164795     
Name: TYPE, dtype: object

In [18]:
data.REGION.drop_duplicates()

0     I
1     A
3     S
14    W
26    E
Name: REGION, dtype: object

In [19]:
data.STORM_NO.drop_duplicates().sort_values().shape

(607,)

In [20]:
data.drop_duplicates().shape

(12699, 6)

In [21]:
len(data[data.REGION.isna() == True]),len(data[data.STORM_NO.isna() == True]),len(data[data.SPEED.isna() == True])

(0, 0, 0)

In [22]:
len(data[data.PRESSURE.isna() == True]), len(data[data.TYPE.isna() == True])

(0, 0)

In [23]:
data = data[data.TYPE.isna() == False]
data = data[(data.TYPE == 'D') | (data.TYPE == 'H') | (data.TYPE == 'S') | (data.TYPE == 'U')]

In [24]:
data.REGION.unique()

array(['I', 'A', 'S', 'W', 'E'], dtype=object)

In [25]:
data.TYPE.unique()

array(['D', 'H', 'S', 'U'], dtype=object)

In [26]:
data.head(5)

Unnamed: 0,ADVISORY,PRESSURE,REGION,SPEED,STORM_NO,TYPE
0,0,0,I,-999,0,D
1,0,1012,A,25,3,D
2,0,1006,A,25,1,D
3,0,0,S,-999,0,D
4,0,979,A,77,9,H


In [27]:
# Removing rows with invalid speed
data = data[(data.SPEED != 999) & (data.SPEED != -999)]
data.shape

(156365, 6)

In [28]:
data.describe()

Unnamed: 0,ADVISORY,PRESSURE,SPEED,STORM_NO
count,156365.0,156365.0,156365.0,156365.0
mean,4.099453,362.68743,52.266728,4.94351
std,10.27725,478.827623,28.076052,22.123711
min,0.0,0.0,0.0,0.0
25%,0.0,0.0,30.0,0.0
50%,0.0,0.0,45.0,1.0
75%,0.0,987.0,65.0,7.0
max,104.0,9830.0,185.0,606.0


In [29]:
# Mapping Categorical Data before we train our model
data['REGION_LBL'] = data.REGION.map({'A':0,'E':1,'I':2,'S':3,'W':4})
data['TYPE_LABEL'] = data.TYPE.map({'D':0,'H':1,'S':2,'U':3,'':4})

In [30]:
data.columns

Index(['ADVISORY', 'PRESSURE', 'REGION', 'SPEED', 'STORM_NO', 'TYPE',
       'REGION_LBL', 'TYPE_LABEL'],
      dtype='object')

In [31]:
# Splitting our Data into Features and Label
y = data.TYPE_LABEL
data = data.drop(columns = ['REGION','TYPE','TYPE_LABEL'])
X = data

In [32]:
from sklearn.model_selection import train_test_split

In [33]:
# Splitting our data into Train and Test data
t0 = time()
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.30,random_state = 2)
print('Number of examples in our data is', len(X))
print('Number of examples in Training data is', len(X_train))
print('Number of examples in Testing data is', len(X_test))
print("Completed in %0.3fs" % (time() - t0))

Number of examples in our data is 156365
Number of examples in Training data is 109455
Number of examples in Testing data is 46910
Completed in 0.022s


In [34]:
# Naive Bayes without Cross-Validation
from sklearn.naive_bayes import MultinomialNB
mnb = MultinomialNB();
t0 = time()
mnb.fit(X_train,y_train);
print("Completed in %0.3f seconds" % (time() - t0))

Completed in 0.032 seconds


In [35]:
y_pred = mnb.predict(X_test)
# Checking for the Accuracy
from sklearn.metrics import accuracy_score
accuracy = accuracy_score(y_test,y_pred)
print("Accuracy: %0.3f %%" % (accuracy * 100.0))

Accuracy: 13.123 %


In [36]:
# Naive Bayes without Cross-Validation (5 folds)
from sklearn.model_selection import cross_val_score
cv_scores = cross_val_score(mnb, X, y, cv=5, scoring="accuracy")
print("Mean Accuracy: %0.3f %%" % (np.round(np.mean(cv_scores),3) * 100.0))

Mean Accuracy: 13.300 %


In [37]:
# Naive Bayes without Cross-Validation (10 folds)
cv_scores = cross_val_score(mnb, X, y, cv=10, scoring="accuracy")
print("Mean Accuracy: %0.3f %%" % (np.round(np.mean(cv_scores),3) * 100.0))

Mean Accuracy: 13.300 %


In [38]:
from sklearn.linear_model import SGDClassifier
sgd_clf = SGDClassifier(penalty='l2',max_iter=100, tol=None)

In [39]:
# SVM with SGD Without Cross-Validation
t0 = time()
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.30,random_state = 2)
sgd_clf.fit(X_train,y_train);
y_pred = sgd_clf.predict(X_test)
accuracy = accuracy_score(y_test,y_pred)
print('Number of examples in our data is', len(X))
print('Number of examples in Training data is', len(X_train))
print('Number of examples in Testing data is', len(X_test))
print("Accuracy: %0.3f %%" % (accuracy * 100.0))
print("Completed in %0.3f seconds" % (time() - t0))

Number of examples in our data is 156365
Number of examples in Training data is 109455
Number of examples in Testing data is 46910
Accuracy: 96.080 %
Completed in 3.504 seconds


In [40]:
# SVM with SGD With Cross-Validation (5 folds)
t0 = time()
cv_scores = cross_val_score(sgd_clf, X, y, cv=5, scoring="accuracy")
print("Mean Accuracy: %0.3f %%" % (np.round(np.mean(cv_scores),3) * 100.0))
print("Completed in %0.3f seconds" % (time() - t0))

Mean Accuracy: 91.300 %
Completed in 21.241 seconds


In [41]:
# SVM with SGD With Cross-Validation (10 folds)
t0 = time()
cv_scores = cross_val_score(sgd_clf, X, y, cv=10, scoring="accuracy")
print("Mean Accuracy: %0.3f %%" % (np.round(np.mean(cv_scores),3) * 100.0))
print("Completed in %0.3f seconds" % (time() - t0))

Mean Accuracy: 94.100 %
Completed in 50.809 seconds


In [42]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC

In [43]:
# Logistic Regression without Cross-Validation
t0 = time()
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.30,random_state = 2)
log_clf = LogisticRegression()
log_clf.fit(X_train,y_train);
y_pred = log_clf.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print('Number of examples in our data is', len(X))
print('Number of examples in Training data is', len(X_train))
print('Number of examples in Testing data is', len(X_test))
print("Accuracy: %0.3f %%" % (accuracy * 100.0))
print("Completed in %0.3f seconds" % (time() - t0))

Number of examples in our data is 156365
Number of examples in Training data is 109455
Number of examples in Testing data is 46910
Accuracy: 98.363 %
Completed in 2.512 seconds


In [44]:
# Logistic Regression with Cross-Validation (5 folds)
t0 = time()
cv_scores = cross_val_score(log_clf, X, y, cv = 5, scoring = "accuracy")
print("Mean Accuracy: %0.3f %%" % (np.round(np.mean(cv_scores),3) * 100.0))
print("Completed in %0.3f seconds" % (time() - t0))

Mean Accuracy: 98.000 %
Completed in 12.359 seconds


In [45]:
# Logistic Regression with Cross-Validation (10 folds)
t0 = time()
cv_scores = cross_val_score(log_clf, X, y, cv = 10, scoring = "accuracy")
print("Mean Accuracy: %0.3f %%" % (np.round(np.mean(cv_scores),3) * 100.0))
print("Completed in %0.3f seconds" % (time() - t0))

Mean Accuracy: 97.900 %
Completed in 28.953 seconds


In [46]:
# Random Forest Classifier without Cross-Validation
t0 = time()
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.40,random_state = 2)
rnd_clf = RandomForestClassifier(n_estimators=10)
rnd_clf.fit(X_train,y_train);
y_pred = rnd_clf.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print('Number of examples in our data is', len(X))
print('Number of examples in Training data is', len(X_train))
print('Number of examples in Testing data is', len(X_test))
print("Accuracy: %0.3f %%" % (accuracy * 100.0))
print("Completed in %0.3f seconds" % (time() - t0))

Number of examples in our data is 156365
Number of examples in Training data is 93819
Number of examples in Testing data is 62546
Accuracy: 98.983 %
Completed in 0.282 seconds


In [47]:
# Random Forest Classifier with Cross-Validation (5 folds)
t0 = time()
cv_scores = cross_val_score(rnd_clf,X= X,y= y, cv=5, scoring="accuracy")
print("Mean Accuracy: %0.3f %%" % (np.round(np.mean(cv_scores),3) * 100.0))
print("Completed in %0.3f seconds" % (time() - t0))

Mean Accuracy: 98.800 %
Completed in 1.570 seconds


In [48]:
# Random Forest Classifier with Cross-Validation (10 folds)
t0 = time()
cv_scores = cross_val_score(rnd_clf, X, y, cv=10, scoring="accuracy")
print("Mean Accuracy: %0.3f %%" % (np.round(np.mean(cv_scores),3) * 100.0))
print("Completed in %0.3f seconds" % (time() - t0))

Mean Accuracy: 98.900 %
Completed in 3.772 seconds


In [49]:
# Support Vector Machine without Cross-Validation
t0 = time()
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.35,random_state = 2)
svm_clf = SVC()
svm_clf.fit(X_train,y_train);
y_pred = svm_clf.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print('Number of examples in our data is', len(X))
print('Number of examples in Training data is', len(X_train))
print('Number of examples in Testing data is', len(X_test))
print("Accuracy: %0.3f %%" % (accuracy * 100.0))
print("Completed in %0.3f seconds" % (time() - t0))

Number of examples in our data is 156365
Number of examples in Training data is 101637
Number of examples in Testing data is 54728
Accuracy: 98.664 %
Completed in 391.230 seconds


In [50]:
# Gradient Boosting (XGBoost) without Cross-Validation
from xgboost import XGBClassifier
xgb_clf = XGBClassifier()
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.35,random_state = 2)
xgb_clf.fit(X_train,y_train);
y_pred = xgb_clf.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print('Number of examples in our data is', len(X))
print('Number of examples in Training data is', len(X_train))
print('Number of examples in Testing data is', len(X_test))
print("Accuracy: %0.3f %%" % (accuracy * 100.0))
print("Completed in %0.3f seconds" % (time() - t0))

Number of examples in our data is 156365
Number of examples in Training data is 101637
Number of examples in Testing data is 54728
Accuracy: 98.968 %
Completed in 402.412 seconds


In [51]:
# Gradient Boosting (XGBoost) with Cross-Validation (5 folds)
t0 = time()
cv_scores = cross_val_score(xgb_clf, X, y, cv=5, scoring="accuracy")
print("Mean Accuracy: %0.3f %%" % (np.round(np.mean(cv_scores),3) * 100.0))
print("Completed in %0.3f seconds" % (time() - t0))

Mean Accuracy: 98.900 %
Completed in 73.581 seconds


In [52]:
# Gradient Boosting (XGBoost) with Cross-Validation (10 folds)
t0 = time()
cv_scores = cross_val_score(xgb_clf, X, y, cv=10, scoring="accuracy")
print("Mean Accuracy: %0.3f %%" % (np.round(np.mean(cv_scores),3) * 100.0))
print("Completed in %0.3f seconds" % (time() - t0))

Mean Accuracy: 98.900 %
Completed in 166.827 seconds
