In [1]:
import sklearn
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.svm import LinearSVC
from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
scikit_version = sklearn.__version__
scikit_version

'0.24.1'

In [3]:
sentiment_data = pd.read_csv('datasets/sentimental_analysis_data.csv',
                            header=None,
                            names=['Label', 'Text'],
                            sep='\t')

sentiment_data.sample(10)

Unnamed: 0,Label,Text
1989,1,"For example, I love the Harry Potter books as ..."
5851,0,I think I hate Harry Potter because it outshin...
2317,1,I love Harry Potter..
5985,0,"I have to say, I hated Brokeback Mountain, tho..."
5519,0,"Is it just me, or does Harry Potter suck?..."
3525,1,"He's like,'YEAH I GOT ACNE AND I LOVE BROKEBAC..."
1868,1,we're gonna like watch Mission Impossible or H...
4731,0,"Da Vinci Code = Up, Up, Down, Down, Left, Righ..."
3431,1,"Anyway, thats why I love "" Brokeback Mountain."
1327,1,mission impossible 2 rocks!!....


In [4]:
sentiment_data.shape

(6918, 2)

In [5]:
X = sentiment_data['Text']
y = sentiment_data['Label']

In [6]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [7]:
X_train.shape, X_test.shape

((5534,), (1384,))

In [8]:
y_train.shape, y_test.shape

((5534,), (1384,))

In [9]:
tfidf_vect = TfidfVectorizer(max_features=15)
x_trans = tfidf_vect.fit_transform(X_train)

In [10]:
print(x_trans[0:3])

  (0, 0)	0.458901690695104
  (0, 6)	0.5354838109758301
  (0, 9)	0.5354838109758301
  (0, 8)	0.46467560151440906
  (1, 1)	0.48601927428689595
  (1, 7)	0.4366945193762843
  (1, 3)	0.3893823977339896
  (1, 13)	0.3893823977339896
  (1, 4)	0.3893823977339896
  (1, 12)	0.3438418339486082
  (2, 3)	0.5773502691896257
  (2, 13)	0.5773502691896257
  (2, 4)	0.5773502691896257


In [11]:
x_trans.shape

(5534, 15)

In [12]:
classifier = LinearSVC(C=1.0, max_iter=1000, tol=1e-3)
linear_svc_model = classifier.fit(x_trans, y_train)
linear_svc_model

LinearSVC(tol=0.001)

In [13]:
x_test_trans = tfidf_vect.fit_transform(X_test)

In [14]:
x_test_trans.shape

(1384, 15)

In [15]:
y_pred = linear_svc_model.predict(x_test_trans)
y_pred

array([0, 1, 0, ..., 1, 0, 0], dtype=int64)

In [16]:
pred_results = pd.DataFrame({'y_test': y_test,
                            'y_pred': y_pred})
pred_results.sample(5)

Unnamed: 0,y_test,y_pred
272,1,1
5602,0,0
1782,1,0
5525,0,0
1876,1,1


In [17]:
from sklearn.metrics import accuracy_score

In [18]:
accuracy = accuracy_score(y_test, y_pred)
accuracy

0.786849710982659

In [19]:
text_clf_param = {}

text_clf_param['preprocessing'] = tfidf_vect
text_clf_param['model'] = linear_svc_model
text_clf_param['sklearn_version'] = scikit_version
text_clf_param['accuracy'] = accuracy

In [20]:
text_clf_param

{'preprocessing': TfidfVectorizer(max_features=15),
 'model': LinearSVC(tol=0.001),
 'sklearn_version': '0.24.1',
 'accuracy': 0.786849710982659}

In [21]:
import joblib

In [22]:
filename = 'models/text_clf_checkpoint.joblib'

In [23]:
joblib.dump(text_clf_param, filename)

['models/text_clf_checkpoint.joblib']

In [24]:
clf_checkpoint = joblib.load(filename)

In [25]:
reloaded_vect = clf_checkpoint['preprocessing']
reloaded_vect

TfidfVectorizer(max_features=15)

In [26]:
clf_model = clf_checkpoint['model']
clf_model

LinearSVC(tol=0.001)

In [27]:
x_test_trans_new = reloaded_vect.fit_transform(X_test)

In [28]:
y_pred = clf_model.predict(x_test_trans_new)
y_pred

array([0, 1, 0, ..., 1, 0, 0], dtype=int64)

In [29]:
accuracy_score(y_test, y_pred)

0.786849710982659

In [30]:
clf_checkpoint['accuracy']

0.786849710982659

In [31]:
from sklearn.pipeline import Pipeline

In [32]:
clf_pipeline = Pipeline(steps =[('tfidf_vect', tfidf_vect), ('classifier', classifier)])
pipeline_model = clf_pipeline.fit(X_train, y_train)

In [33]:
y_pred = pipeline_model.predict(X_test)

In [34]:
accuracy = accuracy_score(y_test, y_pred)
accuracy

0.8916184971098265

In [35]:
pipe_clf_param = {}
pipe_clf_param['pipeline_clf'] = pipeline_model
pipe_clf_param['sklearn_version'] = scikit_version
pipe_clf_param['accuracy'] = accuracy

In [36]:
filename = 'models/pipe_clf_checkpoint.joblib'

In [37]:
joblib.dump(pipe_clf_param, filename)

['models/pipe_clf_checkpoint.joblib']

In [38]:
pipe_clf_checkpoint = joblib.load(filename)

In [39]:
reloaded_pipeline = pipe_clf_checkpoint['pipeline_clf']
reloaded_pipeline

Pipeline(steps=[('tfidf_vect', TfidfVectorizer(max_features=15)),
                ('classifier', LinearSVC(tol=0.001))])

In [40]:
y_pred = reloaded_pipeline.predict(X_test)

In [41]:
accuracy_score(y_test, y_pred)

0.8916184971098265

In [42]:
pipe_clf_checkpoint['accuracy']

0.8916184971098265