# Storm Data Machine Learning
## Big Data Applications
### Class: E534 | Group: fa18-523-57, fa18-523-58
### Fall 2018
### Indiana University

In [1]:
import warnings
warnings.filterwarnings("ignore")

In [2]:
import pymongo
import pandas as pd
import numpy as np

### Importing Data stored in Azure Cosmos DB

In [3]:
# Reading uri path from a file
config_file = open('cosmos_db.config','r')
uri_path = config_file.read()

In [4]:
client = pymongo.MongoClient(uri_path)

In [5]:
# Database Name and Collection Name
db = client['test']
collection = db['storm_data']

In [None]:
# Reading data from Cosmos DB as a Pandas Dataframe
from time import time
t0 = time()
mongo_data = pd.DataFrame(list(collection.find()))
print('Completed in ', np.round(time() - t0,3) , ' seconds')

### Importing Data stored in Github Repository

In [6]:
#mongo_data = pd.read_csv('./data/storm_data.csv')

In [None]:
print('The number of documents in the collection is: ', len(mongo_data))

In [21]:
mongo_data.shape 
# 173263 Rows, 13 Features/Columns

(191716, 13)

In [22]:
mongo_data.head()

Unnamed: 0,_id,REGION,STORM_NO,NAME,ADVISORY,ADV_DATE,ADV_HOUR,SPEED,PRESSURE,TYPE,LAT,LONG_,LEN
0,1,S,0,12/2/97,0,12/3/97,18,25,0,D,-8.6,0.0,24916.15849
1,2,I,0,2/11/79,0,2/11/79,12,-999,0,D,-16.4,0.0,120480.8114
2,3,A,6,FIFI,0,9/6/58,6,65,0,S,15.6,-55.7,218179.689
3,4,A,6,FRANCES,0,9/2/04,0,120,939,H,22.2,-71.4,121127.7297
4,5,A,9,HELENE,0,9/27/88,18,77,979,H,30.9,-51.9,141631.0717


### Pre-processing Data

In [23]:
mongo_data = mongo_data.drop_duplicates()
mongo_data.shape

(191716, 13)

In [24]:
# Dropping columns/features not needed
data = mongo_data.drop(columns=['_id','ADV_DATE','ADV_HOUR','NAME','LEN','LAT','LONG_'])

In [25]:
data.shape 

(191716, 6)

In [26]:
data.count()

REGION      191716
STORM_NO    191716
ADVISORY    191716
SPEED       191716
PRESSURE    191716
TYPE        191705
dtype: int64

In [27]:
data.isna().sum()

REGION       0
STORM_NO     0
ADVISORY     0
SPEED        0
PRESSURE     0
TYPE        11
dtype: int64

In [28]:
data.isnull().sum()

REGION       0
STORM_NO     0
ADVISORY     0
SPEED        0
PRESSURE     0
TYPE        11
dtype: int64

In [29]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 191716 entries, 0 to 191715
Data columns (total 6 columns):
REGION      191716 non-null object
STORM_NO    191716 non-null int64
ADVISORY    191716 non-null int64
SPEED       191716 non-null int64
PRESSURE    191716 non-null int64
TYPE        191705 non-null object
dtypes: int64(4), object(2)
memory usage: 10.2+ MB


In [30]:
data.TYPE.drop_duplicates()

0           D
2           S
3           H
61          U
183200    NaN
Name: TYPE, dtype: object

In [31]:
data.REGION.drop_duplicates()

0     S
1     I
2     A
10    W
18    E
Name: REGION, dtype: object

In [32]:
data.STORM_NO.drop_duplicates().sort_values().shape

(607,)

In [33]:
data.drop_duplicates().shape

(12928, 6)

In [34]:
len(data[data.REGION.isna() == True]),len(data[data.STORM_NO.isna() == True]),len(data[data.SPEED.isna() == True])

(0, 0, 0)

In [35]:
len(data[data.PRESSURE.isna() == True]), len(data[data.TYPE.isna() == True])

(0, 11)

In [36]:
data = data[data.TYPE.isna() == False]
data = data[(data.TYPE == 'D') | (data.TYPE == 'H') | (data.TYPE == 'S') | (data.TYPE == 'U')]

In [37]:
data.REGION.unique()

array(['S', 'I', 'A', 'W', 'E'], dtype=object)

In [38]:
data.TYPE.unique()

array(['D', 'S', 'H', 'U'], dtype=object)

In [39]:
data.head(5)

Unnamed: 0,REGION,STORM_NO,ADVISORY,SPEED,PRESSURE,TYPE
0,S,0,0,25,0,D
1,I,0,0,-999,0,D
2,A,6,0,65,0,S
3,A,6,0,120,939,H
4,A,9,0,77,979,H


In [40]:
# Removing rows with invalid speed
data = data[(data.SPEED != 999) & (data.SPEED != -999)]
data.shape

(172962, 6)

In [41]:
data.describe()

Unnamed: 0,STORM_NO,ADVISORY,SPEED,PRESSURE
count,172962.0,172962.0,172962.0,172962.0
mean,4.840474,4.093038,52.23175,364.613129
std,21.088996,10.275283,28.048877,479.268446
min,0.0,0.0,0.0,0.0
25%,0.0,0.0,30.0,0.0
50%,1.0,0.0,45.0,0.0
75%,7.0,0.0,65.0,988.0
max,606.0,104.0,185.0,9830.0


In [42]:
# Mapping Categorical Data before we train our model
data['REGION_LBL'] = data.REGION.map({'A':0,'E':1,'I':2,'S':3,'W':4})
data['TYPE_LABEL'] = data.TYPE.map({'D':0,'H':1,'S':2,'U':3,'':4})

In [43]:
data.columns

Index(['REGION', 'STORM_NO', 'ADVISORY', 'SPEED', 'PRESSURE', 'TYPE',
       'REGION_LBL', 'TYPE_LABEL'],
      dtype='object')

In [44]:
# Splitting our Data into Features and Label
y = data.TYPE_LABEL
data = data.drop(columns = ['REGION','TYPE','TYPE_LABEL'])
X = data

In [45]:
from sklearn.model_selection import train_test_split

In [46]:
# Splitting our data into Train and Test data
t0 = time()
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.30,random_state = 2)
print('Number of examples in our data is', len(X))
print('Number of examples in Training data is', len(X_train))
print('Number of examples in Testing data is', len(X_test))
print("Completed in %0.3fs" % (time() - t0))

Number of examples in our data is 172962
Number of examples in Training data is 121073
Number of examples in Testing data is 51889
Completed in 0.020s


In [47]:
# Naive Bayes without Cross-Validation
from sklearn.naive_bayes import MultinomialNB
mnb = MultinomialNB();
t0 = time()
mnb.fit(X_train,y_train);
print("Completed in %0.3f seconds" % (time() - t0))

Completed in 0.026 seconds


In [48]:
y_pred = mnb.predict(X_test)
# Checking for the Accuracy
from sklearn.metrics import accuracy_score
accuracy = accuracy_score(y_test,y_pred)
print("Accuracy: %0.3f %%" % (accuracy * 100.0))

Accuracy: 13.409 %


In [49]:
# Naive Bayes without Cross-Validation (5 folds)
from sklearn.model_selection import cross_val_score
cv_scores = cross_val_score(mnb, X, y, cv=5, scoring="accuracy")
print("Mean Accuracy: %0.3f %%" % (np.round(np.mean(cv_scores),3) * 100.0))

Mean Accuracy: 13.400 %


In [50]:
# Naive Bayes without Cross-Validation (10 folds)
cv_scores = cross_val_score(mnb, X, y, cv=10, scoring="accuracy")
print("Mean Accuracy: %0.3f %%" % (np.round(np.mean(cv_scores),3) * 100.0))

Mean Accuracy: 13.400 %


In [51]:
from sklearn.linear_model import SGDClassifier
sgd_clf = SGDClassifier(penalty='l2',max_iter=100, tol=None)

In [39]:
# SVM with SGD Without Cross-Validation
t0 = time()
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.30,random_state = 2)
sgd_clf.fit(X_train,y_train);
y_pred = sgd_clf.predict(X_test)
accuracy = accuracy_score(y_test,y_pred)
print('Number of examples in our data is', len(X))
print('Number of examples in Training data is', len(X_train))
print('Number of examples in Testing data is', len(X_test))
print("Accuracy: %0.3f %%" % (accuracy * 100.0))
print("Completed in %0.3f seconds" % (time() - t0))

Number of examples in our data is 156365
Number of examples in Training data is 109455
Number of examples in Testing data is 46910
Accuracy: 96.080 %
Completed in 3.504 seconds


In [40]:
# SVM with SGD With Cross-Validation (5 folds)
t0 = time()
cv_scores = cross_val_score(sgd_clf, X, y, cv=5, scoring="accuracy")
print("Mean Accuracy: %0.3f %%" % (np.round(np.mean(cv_scores),3) * 100.0))
print("Completed in %0.3f seconds" % (time() - t0))

Mean Accuracy: 91.300 %
Completed in 21.241 seconds


In [41]:
# SVM with SGD With Cross-Validation (10 folds)
t0 = time()
cv_scores = cross_val_score(sgd_clf, X, y, cv=10, scoring="accuracy")
print("Mean Accuracy: %0.3f %%" % (np.round(np.mean(cv_scores),3) * 100.0))
print("Completed in %0.3f seconds" % (time() - t0))

Mean Accuracy: 94.100 %
Completed in 50.809 seconds


In [42]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC

In [43]:
# Logistic Regression without Cross-Validation
t0 = time()
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.30,random_state = 2)
log_clf = LogisticRegression()
log_clf.fit(X_train,y_train);
y_pred = log_clf.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print('Number of examples in our data is', len(X))
print('Number of examples in Training data is', len(X_train))
print('Number of examples in Testing data is', len(X_test))
print("Accuracy: %0.3f %%" % (accuracy * 100.0))
print("Completed in %0.3f seconds" % (time() - t0))

Number of examples in our data is 156365
Number of examples in Training data is 109455
Number of examples in Testing data is 46910
Accuracy: 98.363 %
Completed in 2.512 seconds


In [44]:
# Logistic Regression with Cross-Validation (5 folds)
t0 = time()
cv_scores = cross_val_score(log_clf, X, y, cv = 5, scoring = "accuracy")
print("Mean Accuracy: %0.3f %%" % (np.round(np.mean(cv_scores),3) * 100.0))
print("Completed in %0.3f seconds" % (time() - t0))

Mean Accuracy: 98.000 %
Completed in 12.359 seconds


In [45]:
# Logistic Regression with Cross-Validation (10 folds)
t0 = time()
cv_scores = cross_val_score(log_clf, X, y, cv = 10, scoring = "accuracy")
print("Mean Accuracy: %0.3f %%" % (np.round(np.mean(cv_scores),3) * 100.0))
print("Completed in %0.3f seconds" % (time() - t0))

Mean Accuracy: 97.900 %
Completed in 28.953 seconds


In [46]:
# Random Forest Classifier without Cross-Validation
t0 = time()
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.40,random_state = 2)
rnd_clf = RandomForestClassifier(n_estimators=10)
rnd_clf.fit(X_train,y_train);
y_pred = rnd_clf.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print('Number of examples in our data is', len(X))
print('Number of examples in Training data is', len(X_train))
print('Number of examples in Testing data is', len(X_test))
print("Accuracy: %0.3f %%" % (accuracy * 100.0))
print("Completed in %0.3f seconds" % (time() - t0))

Number of examples in our data is 156365
Number of examples in Training data is 93819
Number of examples in Testing data is 62546
Accuracy: 98.983 %
Completed in 0.282 seconds


In [47]:
# Random Forest Classifier with Cross-Validation (5 folds)
t0 = time()
cv_scores = cross_val_score(rnd_clf,X= X,y= y, cv=5, scoring="accuracy")
print("Mean Accuracy: %0.3f %%" % (np.round(np.mean(cv_scores),3) * 100.0))
print("Completed in %0.3f seconds" % (time() - t0))

Mean Accuracy: 98.800 %
Completed in 1.570 seconds


In [48]:
# Random Forest Classifier with Cross-Validation (10 folds)
t0 = time()
cv_scores = cross_val_score(rnd_clf, X, y, cv=10, scoring="accuracy")
print("Mean Accuracy: %0.3f %%" % (np.round(np.mean(cv_scores),3) * 100.0))
print("Completed in %0.3f seconds" % (time() - t0))

Mean Accuracy: 98.900 %
Completed in 3.772 seconds


In [49]:
# Support Vector Machine without Cross-Validation
t0 = time()
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.35,random_state = 2)
svm_clf = SVC()
svm_clf.fit(X_train,y_train);
y_pred = svm_clf.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print('Number of examples in our data is', len(X))
print('Number of examples in Training data is', len(X_train))
print('Number of examples in Testing data is', len(X_test))
print("Accuracy: %0.3f %%" % (accuracy * 100.0))
print("Completed in %0.3f seconds" % (time() - t0))

Number of examples in our data is 156365
Number of examples in Training data is 101637
Number of examples in Testing data is 54728
Accuracy: 98.664 %
Completed in 391.230 seconds


In [50]:
# Gradient Boosting (XGBoost) without Cross-Validation
from xgboost import XGBClassifier
xgb_clf = XGBClassifier()
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.35,random_state = 2)
xgb_clf.fit(X_train,y_train);
y_pred = xgb_clf.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print('Number of examples in our data is', len(X))
print('Number of examples in Training data is', len(X_train))
print('Number of examples in Testing data is', len(X_test))
print("Accuracy: %0.3f %%" % (accuracy * 100.0))
print("Completed in %0.3f seconds" % (time() - t0))

Number of examples in our data is 156365
Number of examples in Training data is 101637
Number of examples in Testing data is 54728
Accuracy: 98.968 %
Completed in 402.412 seconds


In [51]:
# Gradient Boosting (XGBoost) with Cross-Validation (5 folds)
t0 = time()
cv_scores = cross_val_score(xgb_clf, X, y, cv=5, scoring="accuracy")
print("Mean Accuracy: %0.3f %%" % (np.round(np.mean(cv_scores),3) * 100.0))
print("Completed in %0.3f seconds" % (time() - t0))

Mean Accuracy: 98.900 %
Completed in 73.581 seconds


In [52]:
# Gradient Boosting (XGBoost) with Cross-Validation (10 folds)
t0 = time()
cv_scores = cross_val_score(xgb_clf, X, y, cv=10, scoring="accuracy")
print("Mean Accuracy: %0.3f %%" % (np.round(np.mean(cv_scores),3) * 100.0))
print("Completed in %0.3f seconds" % (time() - t0))

Mean Accuracy: 98.900 %
Completed in 166.827 seconds
