In [237]:
import pandas as pd
import numpy as np
from sklearn import feature_extraction, linear_model, model_selection, preprocessing

In [238]:
train = pd.read_csv('train.csv')

# only text and target
train = (train
        .drop(['keyword', 'location'], 1)
        .set_index('id')
)

# clean data 
## remove @name
## remove non-characters: 
## all lowercase: 
## remove urls: 
## remove short words (3 characters and below)
train['text'] = (train['text']
                 .str.replace('(http://.+)', '')
                 .str.replace('(^\S{1,3}[\s])', '')
                 .str.lower()
                 .str.replace('(@[A-Za-z0-9]+)','')
                 .str.replace('([^a-zA-Z\d\s:])', '')
                 .str.replace('([^A-Za-z\s])', '')
)

train.head()

Unnamed: 0_level_0,text,target
id,Unnamed: 1_level_1,Unnamed: 2_level_1
1,deeds are the reason of this earthquake may al...,1
4,forest fire near la ronge sask canada,1
5,residents asked to shelter in place are being ...,1
6,people receive wildfires evacuation orders in...,1
7,just got sent this photo from ruby alaska as s...,1


In [239]:
train_y = train.target.copy()
train_x = train.drop('target', 1)

In [168]:
from sklearn.feature_extraction.text import CountVectorizer

count_vect = CountVectorizer()
train_x_counts = count_vect.fit_transform(train_x['text'])
train_x_counts.shape

(7613, 14743)

In [170]:
from sklearn.feature_extraction.text import TfidfTransformer
tfidf_transformer = TfidfTransformer()
train_x_tfidf = tfidf_transformer.fit_transform(train_x_counts)
train_x_tfidf.shape

(7613, 14743)

In [213]:
from sklearn.naive_bayes import MultinomialNB
clf = MultinomialNB()

from sklearn.model_selection import cross_val_score
scores = cross_val_score(
    clf, X_train, y_train, cv=5, scoring='f1_macro')

print("Accuracy: {0} (+/- {1})".format(scores.mean(), scores.std()))

Accuracy: 0.7760349351065885 (+/- 0.011132279930972573)


In [214]:
from sklearn.linear_model import SGDClassifier
clf = SGDClassifier(loss='hinge', penalty='l2', alpha=1e-3, random_state=33,
                   max_iter=5, tol=None)



Accuracy: 0.7478848633662902 (+/- 0.008791509051019126)


In [262]:
from sklearn.pipeline import Pipeline

text_clf = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', MultinomialNB()),
])

In [263]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(train['text'], train_y,
                                                   test_size=0.2, random_state=33)

from sklearn.model_selection import cross_val_score
scores = cross_val_score(
    text_clf, X_train, y_train, cv=20, scoring='f1_macro')

print("Accuracy: {0} (+/- {1})".format(scores.mean(), scores.std()))

Accuracy: 0.7842246503571518 (+/- 0.017316580029984072)


In [265]:
text_clf.fit(train['text'], train_y)

Pipeline(steps=[('vect', CountVectorizer()), ('tfidf', TfidfTransformer()),
                ('clf', MultinomialNB())])

In [266]:
predicted = text_clf.predict(['fire tornado hurricane', 'chop'])

In [267]:
predicted

array([1, 0])

In [2]:
import joblib

In [270]:
joblib.dump(text_clf, "model/model.pkl")

['model/model.pkl']

In [3]:
import_model = joblib.load('app/model/model.pkl')

In [4]:
import_model.predict(['halls', 'balls'])

array([0, 1])

In [8]:
import json

In [22]:
output = import_model.predict(['tornado'])

In [11]:
json.loads('hallo')

TypeError: the JSON object must be str, bytes or bytearray, not list

In [24]:
output

array([1])

In [37]:
test_data = {"data":"tornado"}

In [39]:
{test_data['data']}

{'tornado'}

In [48]:
test_data = {"tornado", "hallo", "tornado hurricane flood flash", "balls", "halls", "hurricane"}

for item in test_data:
    print(item, import_model.predict({item})[0])

tornado hurricane flood flash 1
hurricane 1
balls 1
hallo 0
tornado 1
halls 0
