In [1]:
import sys
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.datasets import load_files
from sklearn.model_selection import train_test_split
from sklearn import metrics

In [2]:
movie_reviews_data_folder = '../txt_sentoken/'
dataset = load_files(movie_reviews_data_folder, shuffle=False)
print("n_samples: %d" % len(dataset.data))


n_samples: 2000


In [3]:
dataset.target_names

['neg', 'pos']

In [4]:
# split the dataset in training and test set:
docs_train, docs_test, y_train, y_test = train_test_split(
    dataset.data, dataset.target, test_size=0.25, random_state=None)


In [22]:
clf = Pipeline([
    ('vec', TfidfVectorizer()),
    ('clf', LinearSVC()),
])

In [23]:
clf.fit(docs_train, y_train)



In [24]:
y_predicted = clf.predict(docs_test)

In [25]:
#evaluates the performance of the classifier
print(metrics.classification_report(y_test, y_predicted,
                                    target_names=dataset.target_names))

              precision    recall  f1-score   support

         neg       0.83      0.86      0.85       240
         pos       0.87      0.84      0.86       260

    accuracy                           0.85       500
   macro avg       0.85      0.85      0.85       500
weighted avg       0.85      0.85      0.85       500



In [26]:
cm = metrics.confusion_matrix(y_test, y_predicted)
print(cm)

[[207  33]
 [ 41 219]]


In [14]:
# TASK: Build a grid search to find out whether unigrams or bigrams are
# more useful.
parameters = {
    'vec__ngram_range': [(1, 1), (1, 2)]
}

gs_clf = GridSearchCV(clf, parameters, cv=5, n_jobs=-1)


In [19]:
gs_clf.param_grid

{'vec__ngram_range': [(1, 2)]}

In [21]:
clf.get_params

<bound method Pipeline.get_params of Pipeline(steps=[('vec', TfidfVectorizer()), ('clf', LinearSVC())])>

In [15]:
gs_clf.fit(docs_train, y_train)
gs_y_predicted = gs_clf.predict(docs_test)



In [16]:
cm = metrics.confusion_matrix(y_test, y_predicted)
print(cm)

[[207  33]
 [ 41 219]]


In [None]:
# Fit the pipeline on the training set using grid search for the parameters

# TASK: print the cross-validated scores for the each parameters set
# explored by the grid search

# TASK: Predict the outcome on the testing set and store it in a variable
# named y_predicted

# Print the classification report
print(metrics.classification_report(y_test, y_predicted,
                                    target_names=dataset.target_names))

# Print and plot the confusion matrix
cm = metrics.confusion_matrix(y_test, y_predicted)
print(cm)

# import matplotlib.pyplot as plt
# plt.matshow(cm)
# plt.show()
