**Install Dependencies**

This notebook uses an instance of Google Cloud AI Platform Notebooks (Intel-optimized Base).

In [None]:
!pip3 install xgboost

In [None]:
import sys
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from xgboost import XGBClassifier
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.datasets import load_files
from sklearn.model_selection import train_test_split
from sklearn import metrics
import pandas as pd

**Query IMDB Data from BigQuery**

In [None]:
%%bigquery df
-- Load labelled IMDB reviews into the DataFrame `df`, keeping only rows that
-- have review text and a 'pos' / 'neg' sentiment label.
SELECT
    string_field_2 AS reviewtext,
    string_field_3 AS sentiment
FROM `gmikels-devenv.imdb.reviews`
WHERE string_field_2 IS NOT NULL AND string_field_3 IS NOT NULL
AND string_field_3 IN ('pos', 'neg')
-- Deterministic ~10% sample: hash the review text and keep one bucket out of
-- ten. Unlike RAND(), this returns the same rows on every run, so results are
-- reproducible. (Identical review texts always fall in the same bucket.)
AND MOD(FARM_FINGERPRINT(string_field_2), 10) = 0

In [None]:
# Preview the first five rows returned by the query
df.head(n=5)

**Format data, split into test and train, and build classifier pipeline**

In [None]:
# Fixed seed so that, for the same input DataFrame, the train/test split and
# the classifier fit are repeatable across "Restart Kernel -> Run All".
RANDOM_SEED = 42

# Convert the DataFrame columns to plain Python lists
data = df['reviewtext'].tolist()
target = df['sentiment'].tolist()
print("n_samples: %d" % len(data))

# Hold out 25% of the reviews as the test set; the rest is used for training
docs_train, docs_test, y_train, y_test = train_test_split(
    data, target, test_size=0.25, random_state=RANDOM_SEED)

# Vectorizer / classifier pipeline that filters out tokens that are too rare
# (min_df) or too frequent (max_df) before fitting a linear SVM
pipeline = Pipeline([
    ('vect', TfidfVectorizer(min_df=3, max_df=0.95)),
    ('clf', LinearSVC(C=1000, random_state=RANDOM_SEED)),
])

**Fit the pipeline on the training set with a grid search**

In [None]:
# Grid search over the vectorizer's n-gram range to find out whether
# unigrams alone, or unigrams plus bigrams, are more useful.
parameters = dict(vect__ngram_range=[(1, 1), (1, 2)])

# Fit the pipeline on the training set once per candidate setting
grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=parameters,
    n_jobs=-1,
)
grid_search.fit(docs_train, y_train)

**Display cross-validation results and accuracy against the test set**

In [None]:
# Print the mean and std of the test score for every candidate explored by
# the grid search, together with that candidate's parameter settings.
cv_results = grid_search.cv_results_
candidates = zip(cv_results['params'],
                 cv_results['mean_test_score'],
                 cv_results['std_test_score'])
for i, (params, mean_score, std_score) in enumerate(candidates):
    print(i, 'params - %s; mean - %0.2f; std - %0.2f'
             % (params, mean_score, std_score))

# Predict the outcome on the testing set and keep it in y_predicted
y_predicted = grid_search.predict(docs_test)

# Print the classification report followed by the overall accuracy
report = metrics.classification_report(y_test, y_predicted)
print(report)
accuracy = metrics.accuracy_score(y_test, y_predicted)
print(accuracy)

**Simple confusion matrix**

In [None]:
# Compute the confusion matrix of true vs. predicted labels and print it
cm = metrics.confusion_matrix(y_true=y_test, y_pred=y_predicted)
print(cm)