In [1]:
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

# Load the sentiment corpus and keep only the two columns used downstream:
# the raw text and its sentiment label.
df = pd.read_csv('data/sentiment_set.csv').loc[:, ['text', 'label']]
# Preview the first rows as a quick sanity check on the load.
df.head()

Unnamed: 0,text,label
0,bought album loved title song 's great song ba...,neg
1,misled thought buying entire cd contains one song,neg
2,introduced many ell high school students lois ...,neg
3,anything purchase left behind series excellent...,pos
4,loved movies cant wiat third one funny not sui...,pos


In [2]:
# Hold out 10% of the rows as the test set; the fixed random_state keeps the
# split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    df['text'],
    df['label'],
    test_size=0.1,
    random_state=37,
)
print('Train data samples:', len(X_train))
print('Test data samples:', len(X_test))

Train data samples: 10722
Test data samples: 1192


In [3]:
# Carve a validation set out of the training portion (10% of it); what remains
# in X_train_rest / y_train_rest is the data actually used for fitting.
X_train_rest, X_val, y_train_rest, y_val = train_test_split(X_train, y_train, test_size=0.1, random_state=37)
print('Train data samples:', X_train_rest.shape[0])
# X_val is the validation split, not the held-out test set (X_test), so the
# message names it accordingly.
print('Validation data samples:', X_val.shape[0])

Train data samples: 9649
Test data samples: 1073


In [4]:
# Build TF-IDF feature vectors. The vocabulary and IDF weights are learned from
# the training portion only (X_train_rest); sublinear_tf=True switches on
# sublinear term-frequency scaling.
vectorizer = TfidfVectorizer(use_idf=True, sublinear_tf=True)
train_vectors = vectorizer.fit_transform(X_train_rest)


In [5]:
import time
from sklearn import svm
from sklearn.metrics import classification_report

# Train a linear-kernel support vector classifier on the TF-IDF training matrix.
# fit() returns the estimator itself, so the fitted model is captured directly.
classifier_linear = svm.SVC(kernel='linear').fit(train_vectors, y_train_rest)
# Bare last expression so the notebook still displays the fitted estimator.
classifier_linear

SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='linear',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)

In [6]:
# Evaluate the fitted classifier on the held-out test set.
# The already-fitted vectorizer is reused with transform() only (no refit), so
# test features are expressed in the vocabulary learned from the training data.
test_vectors = vectorizer.transform(X_test)
prediction_linear = classifier_linear.predict(test_vectors)
# output_dict=True yields a dict; the entries read below are the per-class
# metrics under 'pos' / 'neg' and the overall score under 'accuracy'.
report = classification_report(y_test, prediction_linear, output_dict=True)
print('positive: ', report['pos'])
print('negative: ', report['neg'])
# The label names the split being scored: the test set.
print("Accuracy test:",report['accuracy'])

positive:  {'precision': 0.8409475465313029, 'recall': 0.8283333333333334, 'f1-score': 0.834592779177162, 'support': 600}
negative:  {'precision': 0.8286189683860233, 'recall': 0.8412162162162162, 'f1-score': 0.8348700754400671, 'support': 592}
Accuracy text: 0.834731543624161


In [7]:
# Score the fitted classifier on the validation split, vectorizing it with the
# vectorizer that was fitted on the training portion.
# NOTE: this rebinds prediction_linear and report, replacing whatever values
# those names held before this cell ran.
val_vectors = vectorizer.transform(X_val)
prediction_linear = classifier_linear.predict(val_vectors)
report = classification_report(y_val, prediction_linear, output_dict=True)
# Per-class metrics first, then the overall accuracy.
for heading, class_key in (('positive: ', 'pos'), ('negative: ', 'neg')):
    print(heading, report[class_key])
print("Accuracy val: ", report['accuracy'])


positive:  {'precision': 0.849290780141844, 'recall': 0.8853974121996303, 'f1-score': 0.8669683257918552, 'support': 541}
negative:  {'precision': 0.8781925343811395, 'recall': 0.8402255639097744, 'f1-score': 0.8587896253602306, 'support': 532}
Accuracy val:  0.8630009319664492
