In [7]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.svm import LinearSVC
from sklearn.pipeline import Pipeline

# Let displayed frames show up to 120 characters per cell before truncating,
# so long review texts remain readable.
pd.options.display.max_colwidth = 120

# Read the review records into a DataFrame; the path is relative to the
# notebook's working directory. Later cells read its `text` and `sentiment`
# columns.
df = pd.read_json(path_or_buf='reviews.json')

In [10]:
# Expand the short sentiment codes into display labels. Only the `sentiment`
# column is searched; values not listed in the mapping are left as they are.
sentiment_labels = {
    'pos': "Positive",
    'neg': "Negative",
    'neutral': "Neutral",
}
df = df.replace(to_replace={'sentiment': sentiment_labels})

In [2]:
# Hold out 20% of the rows for testing. The fixed random_state makes the
# shuffle, and therefore the split, repeatable between runs.
x_train, x_test, y_train, y_test = train_test_split(
    df['text'],
    df['sentiment'],
    test_size=0.2,
    random_state=0,
)

In [3]:
# TfidfVectorizer: converts a collection of raw documents to a matrix of
#   TF-IDF features. Equivalent to CountVectorizer (text documents -> matrix
#   of token counts) followed by TfidfTransformer (count matrix -> normalized
#   tf or tf-idf representation).
# LinearSVC: similar to SVC with kernel='linear', but implemented in terms of
#   liblinear rather than libsvm, so it has more flexibility in the choice of
#   penalties and loss functions and should scale better to large numbers of
#   samples.

# Both steps use their default settings. The step names are the keys used for
# `pipe.named_steps[...]` and for `<name>__<param>` parameter prefixes, so the
# existing "tdidf" spelling is kept exactly as it is.
pipeline_steps = [
    ("tdidf", TfidfVectorizer()),
    ("clf", LinearSVC()),
]
pipe = Pipeline(steps=pipeline_steps)

In [4]:
# Fit the vectorizer and the classifier on the training split only. The call
# is left as the cell's last expression so the fitted pipeline is displayed.
pipe.fit(X=x_train, y=y_train)

Pipeline(steps=[('tdidf', TfidfVectorizer()), ('clf', LinearSVC())])

In [5]:
# Predict a sentiment label for every held-out text, using the fitted pipeline.
y_pred = pipe.predict(X=x_test)

In [7]:
# Wrap the predictions in a single-column frame so they can be exported.
# NOTE(review): the column label 'wtf' looks like a placeholder. It becomes
# the CSV header, so confirm with any consumers of the file before renaming.
data_pred = pd.Series(y_pred, name='wtf').to_frame()

In [8]:
# Write the predictions to CSV in the working directory; index=False keeps
# the row index out of the file.
data_pred.to_csv(path_or_buf='data-model-pipeline222.csv', index=False)