In [39]:
# Standard libs
import pandas as pd
import numpy as np
import seaborn as sns
%matplotlib inline
import matplotlib.pyplot as plt

# DataPrep
from sklearn.feature_extraction.text import CountVectorizer
import nltk
nltk.download('stopwords')
nltk.download('rslp')
from nltk.corpus import stopwords
from nltk.stem import RSLPStemmer
import re
import spacy
nlp = spacy.load('pt_core_news_lg')

# Modelling
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.preprocessing import FunctionTransformer
from sklearn import metrics
from sklearn.model_selection import GridSearchCV
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.naive_bayes import MultinomialNB


# Pipeline
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.base import BaseEstimator, TransformerMixin
import joblib

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/gabriel/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package rslp to /home/gabriel/nltk_data...
[nltk_data]   Package rslp is already up-to-date!


## Importing Dataset 

In [23]:
df = pd.read_csv('../data/raw/B2W-Reviews01.csv')
df.head(5)

  df = pd.read_csv('../data/raw/B2W-Reviews01.csv')


Unnamed: 0,submission_date,reviewer_id,product_id,product_name,product_brand,site_category_lv1,site_category_lv2,review_title,overall_rating,recommend_to_a_friend,review_text,reviewer_birth_year,reviewer_gender,reviewer_state
0,2018-01-01 00:11:28,d0fb1ca69422530334178f5c8624aa7a99da47907c44de...,132532965,Notebook Asus Vivobook Max X541NA-GO472T Intel...,,Informática,Notebook,Bom,4,Yes,Estou contente com a compra entrega rápida o ú...,1958.0,F,RJ
1,2018-01-01 00:13:48,014d6dc5a10aed1ff1e6f349fb2b059a2d3de511c7538a...,22562178,Copo Acrílico Com Canudo 500ml Rocie,,Utilidades Domésticas,"Copos, Taças e Canecas","Preço imbatível, ótima qualidade",4,Yes,"Por apenas R$1994.20,eu consegui comprar esse ...",1996.0,M,SC
2,2018-01-01 00:26:02,44f2c8edd93471926fff601274b8b2b5c4824e386ae4f2...,113022329,Panela de Pressão Elétrica Philips Walita Dail...,philips walita,Eletroportáteis,Panela Elétrica,ATENDE TODAS AS EXPECTATIVA.,4,Yes,SUPERA EM AGILIDADE E PRATICIDADE OUTRAS PANEL...,1984.0,M,SP
3,2018-01-01 00:35:54,ce741665c1764ab2d77539e18d0e4f66dde6213c9f0863...,113851581,Betoneira Columbus - Roma Brinquedos,roma jensen,Brinquedos,Veículos de Brinquedo,presente mais que desejado,4,Yes,MEU FILHO AMOU! PARECE DE VERDADE COM TANTOS D...,1985.0,F,SP
4,2018-01-01 01:00:28,7d7b6b18dda804a897359276cef0ca252f9932bf4b5c8e...,131788803,"Smart TV LED 43"" LG 43UJ6525 Ultra HD 4K com C...",lg,TV e Home Theater,TV,"Sem duvidas, excelente",5,Yes,"A entrega foi no prazo, as americanas estão de...",1994.0,M,MG


## Pre process

In [24]:
# Pre processing
# Apply  regex

def apply_regex(corpus, regex):
    corpus = [re.sub(regex, ' ', x) for x in corpus]
    return corpus

def multiple_regex(corpus, regex_list):
    # Lowcase
    corpus = corpus.apply(lambda x: x.lower())
    # Negation
    corpus = [re.sub('([nN][ãÃaA][oO]|[ñÑ]| [nN] )', ' não ', r) for r in corpus]
    # Basix regex
    for regex in regex_list:
        corpus = apply_regex(corpus, regex)
    return corpus

def stemmer(corpus):
    stemmer = RSLPStemmer()
    stemmetized = []
    for sentence in corpus:
        words = [stemmer.stem(word) for word in sentence]
        sentence = ' '.join(words)
        stemmetized.append(sentence)
    return corpus

def lemmatizer(corpus):
    lemmatized = []
    for sentence in corpus:
        doc = nlp(sentence)
        lemmat = ' '.join([token.lemma_ for token in doc])
        lemmatized.append(lemmat)
    return lemmatized

def data_cleaning(df):
    df = df.copy()
    df.drop_duplicates(inplace=True)
    # Feature engineering
    df['review_text'] = df['review_title'] + ' ' + df['review_text']
    cols=['submission_date', 'reviewer_id', 'product_id', 'product_name', 'product_brand',
          'site_category_lv1', 'site_category_lv2', 'overall_rating', 'reviewer_birth_year',
          'reviewer_gender', 'reviewer_state', 'review_title']
    df.drop(columns=cols, inplace=True)
    df.dropna(inplace=True)
    df.reset_index(drop=True, inplace=True)
    # Noise removal
    regex_list = [r'www\S+', r'http\S+', r'@\S+', r'#\S+', r'[0-9]+', r'\W', r'\s+', r'[ \t]+$']
    df['review_text'] = multiple_regex(df['review_text'], regex_list)
    df['review_text'] = lemmatizer(df['review_text'])

    return df

## ML classification models
### Base Model

In [25]:
%%time

# Create Feature and Label sets
df_ml = data_cleaning(df.copy())
X = df_ml['review_text']
y = df_ml['recommend_to_a_friend']

print(len(df_ml))

127982
CPU times: user 14min 6s, sys: 623 ms, total: 14min 6s
Wall time: 14min 6s


Stemming avg time: ~ 7 min

Lemma avg time: ~ 14 min

In [26]:
# df_ml.to_csv('../data/clean/lemma.csv') # type: ignore
df_ml.head(5)

Unnamed: 0,recommend_to_a_friend,review_text
0,Yes,bom estar contente com o compra entregar rápid...
1,Yes,preço imbatível ótimo qualidade por apenas r e...
2,Yes,atender todo o expectativa superar em agilidad...
3,Yes,presente mais que desejado meu filho amar pare...
4,Yes,sem duvida excelente o entrega ser em o prazo ...


In [27]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42) # type: ignore
print('Training Data Size:', len(X_train))
print('Testing Data Size: ', len(X_test))

Training Data Size: 102385
Testing Data Size:  25597


In [28]:
pt_stopwords = stopwords.words('portuguese')

count_vect = CountVectorizer(lowercase=True, min_df=2, max_df=0.95, ngram_range=(1,3), stop_words=pt_stopwords)
tfidf_vect = TfidfVectorizer(lowercase=True, min_df=2, max_df=0.95, ngram_range=(1,3), stop_words=pt_stopwords)

X1 = X_train.copy()
X2 = X_train.copy()

X_train_tfidf = tfidf_vect.fit_transform(X1)
X_train_count= count_vect.fit_transform(X2)

In [29]:
%%time

clf = LinearSVC()
clf.fit(X_train_tfidf,y_train)
# Form a prediction set
X_test_tfidf = tfidf_vect.transform(X_test)
predictions = clf.predict(X_test_tfidf)

CPU times: user 2.39 s, sys: 28 ms, total: 2.42 s
Wall time: 2.42 s


In [30]:
def evaluate(y_test, predictions):
    print('f1 score: ',metrics.f1_score(y_test, predictions, pos_label="Yes"))
    print('\n', '------------------', '\n')
    print(metrics.classification_report(y_test, predictions))
    print('\n', '------------------', '\n')
    print(metrics.confusion_matrix(y_test, predictions))

In [31]:
evaluate(y_test, predictions) # type: ignore

f1 score:  0.9475507954008505

 ------------------ 

              precision    recall  f1-score   support

          No       0.85      0.84      0.85      6599
         Yes       0.95      0.95      0.95     18998

    accuracy                           0.92     25597
   macro avg       0.90      0.90      0.90     25597
weighted avg       0.92      0.92      0.92     25597


 ------------------ 

[[ 5551  1048]
 [  950 18048]]


#### Features investigation

In [32]:
# Number of features in bag of words = 232110

print(f'Number of data rows = {X2.shape[0]}') # type: ignore
print(f'Number of features in bag of words = {X_train_count.shape[1]}')

Number of data rows = 102385
Number of features in bag of words = 237116


The ngram_range=(1,3) despite adding important data also makes the matrix really sparse which can lead to failed convergence specially when using CV.

### Grid Search Models

**goal:** minimize false positives (due to imbalanced dataset) and to target negative reviews from users

**metric:** precision

**models:** since most sentiment trained models are in english this option will not be considered a first go. Also due to the limited ammount of data deep learning probably won't fit very well

In [33]:
text_lin_svc = Pipeline([('tfidf', TfidfVectorizer(lowercase=True, min_df=5, max_df=0.85, stop_words=pt_stopwords, max_features=5000)),
                     ('lin_svc', LinearSVC())])

text_rf = Pipeline([('tfidf', TfidfVectorizer(lowercase=True, min_df=5, max_df=0.85, stop_words=pt_stopwords, max_features=5000)),
                     ('rf', RandomForestClassifier())])

text_nb = Pipeline([('tfidf', TfidfVectorizer(lowercase=True, min_df=5, max_df=0.85, stop_words=pt_stopwords, max_features=5000)),
                     ('nb', MultinomialNB())])

text_lr = Pipeline([('tfidf', TfidfVectorizer(lowercase=True, min_df=5, max_df=0.85, stop_words=pt_stopwords, max_features=5000)),
                     ('lr', LogisticRegression())])


parameters = {'tfidf__ngram_range': [(1, 1), (1, 2), (1, 3), (2, 3)],
              'tfidf__use_idf': (True, False),
}

pipelines = [text_rf, text_nb, text_lr, text_lin_svc]

In [34]:
for pipe in pipelines:

    precision = metrics.make_scorer(metrics.precision_score, pos_label="Yes")
    grid = GridSearchCV(pipe, parameters, cv=2, verbose=2, scoring=precision, n_jobs=-1)
    grid.fit(X_train, y_train)

    best_precision = grid.best_score_
    best_parameters = grid.best_params_
  
    print('\n', "Current Pipeline: ", pipe[1], '\n')
    print("Best precision: {}".format(best_precision))
    print("Best Parameters:", best_parameters)

    # Scoring Training data 
    print('\n', 'Precision on training data: ', round(grid.score(X_train, y_train), 4))        # type: ignore
    # Scoring Test data 
    print('Precision on test data: ', round(grid.score(X_test, y_test), 4), '\n')      # type: ignore

    print('Confusion Matrix : \n' + str(metrics.confusion_matrix(y_test, grid.best_estimator_.predict(X_test))))

    print(metrics.classification_report(y_test, grid.best_estimator_.predict(X_test))) # type: ignore

Fitting 2 folds for each of 8 candidates, totalling 16 fits
[CV] END .....tfidf__ngram_range=(1, 1), tfidf__use_idf=True; total time= 1.9min
[CV] END ....tfidf__ngram_range=(1, 1), tfidf__use_idf=False; total time= 1.9min
[CV] END .....tfidf__ngram_range=(1, 1), tfidf__use_idf=True; total time= 2.0min
[CV] END ....tfidf__ngram_range=(1, 1), tfidf__use_idf=False; total time= 2.0min
[CV] END ....tfidf__ngram_range=(1, 2), tfidf__use_idf=False; total time= 2.1min
[CV] END ....tfidf__ngram_range=(1, 2), tfidf__use_idf=False; total time= 2.1min
[CV] END .....tfidf__ngram_range=(1, 2), tfidf__use_idf=True; total time= 2.1min
[CV] END .....tfidf__ngram_range=(1, 2), tfidf__use_idf=True; total time= 2.2min
[CV] END .....tfidf__ngram_range=(1, 3), tfidf__use_idf=True; total time= 2.0min
[CV] END .....tfidf__ngram_range=(1, 3), tfidf__use_idf=True; total time= 2.1min
[CV] END ....tfidf__ngram_range=(1, 3), tfidf__use_idf=False; total time= 2.0min
[CV] END ....tfidf__ngram_range=(1, 3), tfidf__us

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


[CV] END .....tfidf__ngram_range=(1, 1), tfidf__use_idf=True; total time=   4.5s
[CV] END .....tfidf__ngram_range=(1, 1), tfidf__use_idf=True; total time=   4.6s
[CV] END ....tfidf__ngram_range=(1, 1), tfidf__use_idf=False; total time=   5.1s
[CV] END ....tfidf__ngram_range=(1, 1), tfidf__use_idf=False; total time=   5.3s


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


[CV] END .....tfidf__ngram_range=(1, 2), tfidf__use_idf=True; total time=  10.6s
[CV] END ....tfidf__ngram_range=(1, 2), tfidf__use_idf=False; total time=  10.6s
[CV] END ....tfidf__ngram_range=(1, 2), tfidf__use_idf=False; total time=  11.0s
[CV] END .....tfidf__ngram_range=(1, 2), tfidf__use_idf=True; total time=  11.2s
[CV] END .....tfidf__ngram_range=(1, 3), tfidf__use_idf=True; total time=  15.6s


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


[CV] END .....tfidf__ngram_range=(1, 3), tfidf__use_idf=True; total time=  16.3s
[CV] END ....tfidf__ngram_range=(1, 3), tfidf__use_idf=False; total time=  15.9s
[CV] END ....tfidf__ngram_range=(1, 3), tfidf__use_idf=False; total time=  16.3s
[CV] END .....tfidf__ngram_range=(2, 3), tfidf__use_idf=True; total time=  12.1s
[CV] END ....tfidf__ngram_range=(2, 3), tfidf__use_idf=False; total time=  11.7s
[CV] END .....tfidf__ngram_range=(2, 3), tfidf__use_idf=True; total time=  12.4s
[CV] END ....tfidf__ngram_range=(2, 3), tfidf__use_idf=False; total time=  11.8s


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(



 Current Pipeline:  LogisticRegression() 

Best precision: 0.9423498510824917
Best Parameters: {'tfidf__ngram_range': (1, 2), 'tfidf__use_idf': False}

 Precision on training data:  0.9491
Precision on training data:  0.9452 

Confusion Matrix : 
[[ 5551  1048]
 [  912 18086]]
              precision    recall  f1-score   support

          No       0.86      0.84      0.85      6599
         Yes       0.95      0.95      0.95     18998

    accuracy                           0.92     25597
   macro avg       0.90      0.90      0.90     25597
weighted avg       0.92      0.92      0.92     25597

Fitting 2 folds for each of 8 candidates, totalling 16 fits
[CV] END .....tfidf__ngram_range=(1, 1), tfidf__use_idf=True; total time=   4.6s
[CV] END ....tfidf__ngram_range=(1, 1), tfidf__use_idf=False; total time=   4.4s
[CV] END ....tfidf__ngram_range=(1, 1), tfidf__use_idf=False; total time=   4.6s
[CV] END .....tfidf__ngram_range=(1, 1), tfidf__use_idf=True; total time=   5.4s
[CV] END .

### Grid Search Hyperarameters

In [35]:
opt_svc = Pipeline([('tfidf', TfidfVectorizer(lowercase=True, min_df=5, max_df=0.85, stop_words=pt_stopwords, max_features=5000, ngram_range=(1, 3), use_idf=False)),
                     ('svc', LinearSVC(class_weight='balanced'))])

parameters_svc = {'svc__C':[0.1,1,10,100,1000],
                'svc__loss':['hinge', 'squared_hinge'], 
}

In [36]:
svc_grid = GridSearchCV(opt_svc, parameters_svc, cv=2, verbose=2, scoring=precision, n_jobs=-1, refit=True)
svc_grid.fit(X_train, y_train)

best_precision = svc_grid.best_score_
best_parameters = svc_grid.best_params_
# ADD PIPELINE NAME
print('\n', "Current Pipeline: ", opt_svc[1], '\n')
print("Best precision: {}".format(best_precision))
print("Best Parameters:", best_parameters)

# Scoring Training data
print('\n', 'Precision on training data: ', round(svc_grid.score(X_train, y_train), 4))        # type: ignore
# Scoring Test data
print('Precision on test data: ', round(svc_grid.score(X_test, y_test), 4), '\n')      # type: ignore

print('Confusion Matrix : \n' + str(metrics.confusion_matrix(y_test, svc_grid.best_estimator_.predict(X_test))))

print(metrics.classification_report(y_test, svc_grid.best_estimator_.predict(X_test))) # type: ignore

Fitting 2 folds for each of 10 candidates, totalling 20 fits
[CV] END ........................svc__C=0.1, svc__loss=hinge; total time=  13.2s
[CV] END ........................svc__C=0.1, svc__loss=hinge; total time=  13.4s
[CV] END ................svc__C=0.1, svc__loss=squared_hinge; total time=  13.9s
[CV] END ..........................svc__C=1, svc__loss=hinge; total time=  14.1s
[CV] END ..........................svc__C=1, svc__loss=hinge; total time=  14.1s
[CV] END ................svc__C=0.1, svc__loss=squared_hinge; total time=  14.8s
[CV] END ..................svc__C=1, svc__loss=squared_hinge; total time=  14.8s
[CV] END ..................svc__C=1, svc__loss=squared_hinge; total time=  15.7s




[CV] END .........................svc__C=10, svc__loss=hinge; total time=  18.7s
[CV] END .........................svc__C=10, svc__loss=hinge; total time=  18.9s




[CV] END ........................svc__C=100, svc__loss=hinge; total time=  22.2s
[CV] END .................svc__C=10, svc__loss=squared_hinge; total time=  23.6s
[CV] END .................svc__C=10, svc__loss=squared_hinge; total time=  23.5s




[CV] END ........................svc__C=100, svc__loss=hinge; total time=  23.6s




[CV] END ................svc__C=100, svc__loss=squared_hinge; total time=  25.2s
[CV] END ................svc__C=100, svc__loss=squared_hinge; total time=  26.5s




[CV] END .......................svc__C=1000, svc__loss=hinge; total time=  23.6s




[CV] END .......................svc__C=1000, svc__loss=hinge; total time=  24.2s




[CV] END ...............svc__C=1000, svc__loss=squared_hinge; total time=  21.4s
[CV] END ...............svc__C=1000, svc__loss=squared_hinge; total time=  21.4s

 Current Pipeline:  LinearSVC(class_weight='balanced') 

Best precision: 0.9698867112982654
Best Parameters: {'svc__C': 0.1, 'svc__loss': 'hinge'}

 Precision on training data:  0.9726
Precision on training data:  0.971 

Confusion Matrix : 
[[ 6091   508]
 [ 1962 17036]]
              precision    recall  f1-score   support

          No       0.76      0.92      0.83      6599
         Yes       0.97      0.90      0.93     18998

    accuracy                           0.90     25597
   macro avg       0.86      0.91      0.88     25597
weighted avg       0.92      0.90      0.91     25597



In [41]:
joblib.dump(svc_grid.best_estimator_, '../../models/svc_pipeline_no_cleaning.pkl', compress = 1)

['../../models/svc_pipeline_no_cleaning.pkl']

### Model understanding

In [37]:
# Try to get importance of features
# something like:
svc_grid.best_estimator_.named_steps["svc"].feature_importances_  # type: ignore

AttributeError: 'LinearSVC' object has no attribute 'feature_importances_'

In [None]:
# ver score por categorias ou análise assim
# métricas de classificação, 
# mudar threshold de classificação (0.8)

In [71]:
# pre trained models (encoding)
# pouco dados 

## (TO-DO) Create Pipelines

In [None]:
class DataCleaning(BaseEstimator, TransformerMixin):
    
    def __init__(self):
        pass
    
    def fit(self, X, y=None):
        return self
    
    def transform(self, X):
        # Make a new variable that is rating divided by number of reviews
        X['review_text'] = X['review_title'] + ' ' + X['review_text']
        X.drop_duplicates(inplace=True)
        X.drop(columns=['review_title'], inplace=True)
        X.reset_index(drop=True, inplace=True)
        X['review_text'] = X['review_text'].str.lower().copy()
        X['review_text'] = X['review_text'].str.replace('([nN][ãÃaA][oO]|[ñÑ]| [nN] )', 'negação', regex=True).copy()
        regex_list = [r'www\S+', r'http\S+', r'@\S+', r'#\S+', r'[0-9]+', r'\W', r'\s+', r'[ \t]+$']
        for regex in regex_list:
            X['review_text'] = X['review_text'].str.replace(regex, ' ', regex=True).copy()
        return X

In [None]:
text_vars = ['review_text', 'review_title']

text_pipeline = Pipeline([('data_cleaning', DataCleaning())])

data_pipeline = ColumnTransformer(
    [
    ('text', text_pipeline, text_vars),
    ],
    remainder='drop'
)

num_transformed = data_pipeline.fit_transform(X) # type: ignore

In [None]:
# Linear SVC:

pt_stopwords = stopwords.words('portuguese')

text_clf_lsvc = Pipeline([('tfidf', TfidfVectorizer(min_df=2, max_df=0.75, ngram_range=(1,3), stop_words=pt_stopwords)),
                         ('clf', LinearSVC()),
])

text_clf_lsvc.fit(X_train, y_train)

In [None]:
parameters = {'clf_C': [0.25, 0.5, 0.75, 1],
                'clf_kernel': ['linear', 'rbf'],
                'clf_gamma': [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]}
grid_search = GridSearchCV(estimator = text_clf_lsvc,
                           param_grid = parameters,
                           scoring = 'accuracy',
                           cv = 3,
                           n_jobs = 2)
grid_search.fit(X_train, y_train)
best_accuracy = grid_search.best_score_
best_parameters = grid_search.best_params_
print("Best Accuracy: {:.2f} %".format(best_accuracy*100))
print("Best Parameters:", best_parameters)

In [None]:
# Form a prediction set
predictions = text_clf_lsvc.predict(X_test)