In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import SGDClassifier, LogisticRegression
from sklearn.externals import joblib

# Text Analytics

In [51]:
train_file = "imdb_train.csv"

In [52]:
def data_preprocess(train_texts):
    vectorizer = TfidfVectorizer(stop_words='english', ngram_range=(1, 2), sublinear_tf=True, max_features=40000)
    features_train = vectorizer.fit_transform(train_texts)
    return features_train, vectorizer

In [53]:
train_data = pd.read_csv(train_file)
training_texts, transformer = data_preprocess(train_data.text)
training_labels = train_data.polarity

In [54]:
model = SGDClassifier(penalty="l1", loss="log", max_iter=200)
model.fit(training_texts, training_labels)

SGDClassifier(alpha=0.0001, average=False, class_weight=None, epsilon=0.1,
       eta0=0.0, fit_intercept=True, l1_ratio=0.15,
       learning_rate='optimal', loss='log', max_iter=200, n_iter=None,
       n_jobs=1, penalty='l1', power_t=0.5, random_state=None,
       shuffle=True, tol=None, verbose=0, warm_start=False)

In [55]:
model.score(training_texts, training_labels)

0.8592

### Serialization

In [56]:
joblib.dump(model, 'sentiment_model.pkl')
joblib.dump(transformer, 'sentiment_transformer.pkl')

['transformer.pkl']

**Uso:**

In [57]:
import pandas as pd
import numpy as np
from sklearn.externals import joblib

model2 = joblib.load('sentiment_model.pkl')
transformer2 = joblib.load('sentiment_transformer.pkl')

In [60]:
def text_transformer(text, vectorizer):
    text = pd.Series([text], name='text')
    features_test = vectorizer.transform(text)
    return features_test

In [86]:
def scorer(text, classifier, vectorizer):
    test_data = text_transformer(text=text, vectorizer=vectorizer)
    pred = classifier.predict(test_data)[0]
    ppred = np.max(classifier.predict_proba(test_data))
    return {'sentiment': int(pred), 'probability': float(ppred)}

In [87]:
test_text = train_data.loc[5, ['text']][0]
test_text

'"It appears that many critics find the idea of a Woody Allen drama unpalatable." And for good reason: they are unbearably wooden and pretentious imitations of Bergman. And let\'s not kid ourselves: critics were mostly supportive of Allen\'s Bergman pretensions, Allen\'s whining accusations to the contrary notwithstanding. What I don\'t get is this: why was Allen generally applauded for his originality in imitating Bergman, but the contemporaneous Brian DePalma was excoriated for "ripping off" Hitchcock in his suspense/horror films? In Robin Wood\'s view, it\'s a strange form of cultural snobbery. I would have to agree with that.'

In [88]:
result = scorer(text=test_text, classifier=model2, vectorizer=transformer2)
result

{'sentiment': 0, 'probability': 0.7106227422767973}

# Classification

In [2]:
file = "iris_data.csv"

In [3]:
iris_data = pd.read_csv(file)
iris_data.head()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,type
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa


In [28]:
iris_data[iris_data['type'] == 'virginica'].head()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,type
100,6.3,3.3,6.0,2.5,virginica
101,5.8,2.7,5.1,1.9,virginica
102,7.1,3.0,5.9,2.1,virginica
103,6.3,2.9,5.6,1.8,virginica
104,6.5,3.0,5.8,2.2,virginica


In [4]:
iris_model = LogisticRegression()
iris_model.fit(iris_data[['sepal_length', 'sepal_width', 'petal_length', 'petal_width']], iris_data.type)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [5]:
iris_model.score(iris_data[['sepal_length', 'sepal_width', 'petal_length', 'petal_width']], iris_data.type)

0.96

## Serialization

In [6]:
joblib.dump(iris_model, 'iris_model.pkl')

['iris_model.pkl']

**Uso:**

In [8]:
iris_model = joblib.load('iris_model.pkl')

In [25]:
def scorer(data, model):
    sepal_length, sepal_width = float(data['sepal_length']), float(data['sepal_width'])
    petal_length, petal_width = float(data['petal_length']), float(data['petal_width'])
    # deben ir en el mismo orden en que se estimó el modelo
    data_p = [[sepal_length, sepal_width, petal_length, petal_width]]
    pred = model.predict(data_p)[0]
    ppred = round(np.max(model.predict_proba(data_p)), 4)
    return {'predicted': pred, 'probability': float(ppred), 'message': 'success'}

In [26]:
test_data = {'sepal_length': 4.9, 'sepal_width': 3, 'petal_length': 1.4, 'petal_width': 0.2}
print(scorer(test_data, iris_model))

{'predicted': 'setosa', 'probability': 0.7997, 'message': 'success'}
