In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

## Read in the Data

In [2]:
df = pd.read_csv(model_weather_data.csv)

## Balance the amount of weather events

- __Earthquake has 1733 entries__
- __Fire has 1486 entries__
- __Hurricane has 4021 entries__

__Because of this we will leave all the entries that exist for Earthquake and Fire and cut the Hurricane category down to about 2k__

### Can just do it by indexes
__indexed values are from 3219 up to 7239__

In [3]:
#Just dropping off the last 2k which were all hurricane entries
df = df[0:5220]

In [4]:
len(df[df.type == 'Earthquake']),len(df[df.type == 'Fire']),len(df[df.type == 'Hurricane'])

(1733, 1486, 2001)

## Create Test and Train Splits

In [23]:
df.head()

Unnamed: 0,text,type,follower_count,verified,0,1,2,3,4,5,...,758,759,760,761,762,763,764,765,766,767
0,RT @PressTV: UPDATE:\nDeath toll from Iran’s q...,Earthquake,45,False,-0.180702,-0.557254,-0.257066,0.247704,-0.116079,-1.246596,...,-1.880393,0.574295,-1.08228,-1.194331,-0.716398,-1.528029,-1.393145,1.733719,0.871081,3.009707
1,RT @CAFOD: We pray for all those affected by t...,Earthquake,129,False,0.332884,-1.345755,-0.410285,0.575408,-0.032268,-1.802706,...,-1.687065,0.718519,-1.029724,-0.767176,-0.032971,-1.298846,-1.810065,1.686207,0.579394,2.899038
2,RT @ReutersWorld: JUST IN: Death toll reaches ...,Earthquake,256,False,-0.339567,-1.182502,-0.533709,0.414308,-0.11326,-2.112401,...,-1.734978,0.265315,-0.387211,-1.87277,-0.245966,-2.04415,-1.692681,1.304106,1.5586,3.291314
3,RT @LaylaAlhussein: A magnitude 7.2 earthquake...,Earthquake,85,False,1.150446,-0.266334,-0.942941,0.596258,-0.945368,-1.232079,...,-1.013363,0.537306,-0.347581,-0.786124,-0.903818,-0.866392,-1.320476,1.485271,0.290309,1.867713
4,Video: 7.2-magnitude earthquake jolts #Iran-Ir...,Earthquake,6192,False,0.868155,-1.257778,0.021792,0.602206,-1.028045,-1.127659,...,-1.190339,0.817445,-1.245844,-1.677243,-0.206908,-1.712536,-1.538209,0.461534,0.934861,2.520552


In [6]:
from sklearn.model_selection import train_test_split

In [7]:
X = df.loc[:, df.columns != 'type']
y = df.type

In [8]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=69)

In [37]:
len(X_test)

1305

## Begin Modeling

__Support Vector Machine__

In [9]:
from sklearn.svm import SVC

In [10]:
mod_svm = SVC()
mod_svm.fit(X_train.iloc[:,3:], y_train)



SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
    kernel='rbf', max_iter=-1, probability=False, random_state=None,
    shrinking=True, tol=0.001, verbose=False)

In [11]:
svm_pred = mod_svm.predict(X_test.iloc[:,3:])

In [12]:
from sklearn.metrics import accuracy_score
accuracy_score(y_test,svm_pred)

0.9670498084291188

- __Precision is the percent of predictions that were correct TP/(TP+FP)__
- __Recall is the ability to identify the prediction power within the actual class TP/(TP+FN)__
- __F1 is a weighted average of Precision and Recall__


In [13]:
from sklearn.metrics import classification_report
report = classification_report(y_test,svm_pred)
print(report)

              precision    recall  f1-score   support

  Earthquake       0.97      0.97      0.97       440
        Fire       0.97      0.96      0.97       375
   Hurricane       0.96      0.97      0.96       490

    accuracy                           0.97      1305
   macro avg       0.97      0.97      0.97      1305
weighted avg       0.97      0.97      0.97      1305



__Logistic Regression__

In [14]:
from sklearn.linear_model import LogisticRegression

In [15]:
mod_lr = LogisticRegression(max_iter = 200, multi_class = 'auto', solver = 'saga')
mod_lr.fit(X_train.iloc[:,3:], y_train)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=200,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='saga', tol=0.0001, verbose=0,
                   warm_start=False)

In [16]:
lr_pred = mod_lr.predict(X_test.iloc[:,3:])

In [17]:
from sklearn.metrics import accuracy_score
accuracy_score(y_test,lr_pred)

0.9662835249042145

In [18]:
from sklearn.metrics import classification_report
report = classification_report(y_test,lr_pred)
print(report)

              precision    recall  f1-score   support

  Earthquake       0.97      0.97      0.97       440
        Fire       0.96      0.97      0.96       375
   Hurricane       0.96      0.97      0.97       490

    accuracy                           0.97      1305
   macro avg       0.97      0.97      0.97      1305
weighted avg       0.97      0.97      0.97      1305



## Random Test

In [19]:
from transformers import BertTokenizer, BertModel
import torch

In [20]:
# Let's encode some text in a sequence of hidden-states using each model:

# Load pre-trained model tokenizer (vocabulary)
tokenizer = BertTokenizer.from_pretrained('bert-base-cased')

model = BertModel.from_pretrained('bert-base-cased', output_hidden_states = True)
#We aren't training so this will improve performance
model.eval()

BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(28996, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0): BertLayer(
        (attention): BertAttention(
          (self): BertSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False)
          

In [None]:
'All of the California vineyards are burning up!!'
'How do the American and European track predictions compare?'

In [34]:
tphrase = ['How do the American and European track predictions compare?']

sent_list = []

for sent in tphrase:
    sent_list.append(tokenizer.encode_plus(sent, add_special_tokens = True))
    out = []

# No grad call stops gradient calculation since we are not training but just passing information
with torch.no_grad():
    for item in sent_list:
        out.append(model(torch.tensor([item.input_ids]),torch.tensor([item.attention_mask])))
        
        
layers = []

for sentence in out:
    a = sentence
    layers.append(a[2])
    
sl4 = []

for sent in layers:
    temp = []
    a = torch.stack(sent[-4:]).sum(0)[0]
    
    for i in range(1,len(a)-1):
        temp.append(a[i])
    
    temp = torch.stack(temp)
    average = torch.mean(temp, dim = 0)
    sl4.append(average)
    
tst = pd.DataFrame(sl4).astype(float)

In [35]:
mod_svm.predict(tst)

array(['Hurricane'], dtype=object)