In [1]:
from pathlib import Path
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
# Load the dataset and keep a reproducible 10% random sample to work with.
# NOTE: despite its name, `test` is the working sample; it is divided into
# train and test portions further down.
data = Path("data") / "vader_emolex.csv"
df = pd.read_csv(data)
test = df.sample(frac=0.1, random_state=1)

# Plain Python lists: `text` feeds the vectoriser, `stars` holds the labels.
text = test["text"].tolist()
stars = test["stars"].tolist()

In [5]:
# Turn the sampled text into a TF-IDF matrix. ngram_range=(1, 3) means the
# vocabulary is built from single words, bi-grams and tri-grams.
# NOTE(review): the vectoriser is fitted on every sampled document before the
# train/test split, so its vocabulary and IDF weights also see the held-out
# rows — confirm this is acceptable for the reported scores.
vectoriser = TfidfVectorizer(ngram_range=(1, 3))
vectors = vectoriser.fit_transform(raw_documents=text)

In [6]:
# Hold out 25% of the rows for evaluation (train_test_split's default ratio,
# spelled out here); the fixed seed keeps the partition reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    vectors,
    stars,
    test_size=0.25,
    random_state=42,
)

In [None]:
# Train a linear SVM on the TF-IDF features, then predict the held-out rows.
# random_state is pinned because LinearSVC uses a pseudo-random generator when
# solving the dual problem; without a seed the fitted model (and the metrics
# computed from `predictions`) can differ from one run to the next, unlike the
# other seeded steps in this notebook.
classifier = LinearSVC(random_state=42)
classifier.fit(X_train, y_train)
predictions = classifier.predict(X_test)

In [9]:
# A second train_test_split(vectors, stars, random_state=42) would reproduce
# exactly the partition already made earlier in the notebook (same inputs, same
# seed), so the existing hold-out is reused instead of building duplicate
# copies of the matrices.
# NOTE(review): the "full"/"all" names suggest a larger evaluation set was
# intended; as written this is the same hold-out as X_test / y_test — confirm.
X_null, y_null = X_train, y_train
X_full_test, y_full_test = X_test, y_test
predict_all = classifier.predict(X_full_test)

In [11]:
# Per-class precision, recall and F1 of the classifier on the held-out split.
print(classification_report(y_true=y_test, y_pred=predictions))

              precision    recall  f1-score   support

           1       0.68      0.86      0.76      3217
           2       0.53      0.29      0.38      2355
           3       0.55      0.32      0.41      2814
           4       0.54      0.50      0.52      5721
           5       0.70      0.85      0.77      8109

    accuracy                           0.64     22216
   macro avg       0.60      0.57      0.57     22216
weighted avg       0.62      0.64      0.62     22216



In [None]:
# Serialise the trained classifier to disk with joblib.
# NOTE(review): only the classifier is written here; the fitted `vectoriser`
# is also needed to turn new text into matching features — confirm it is
# persisted somewhere.
from joblib import dump

dump(value=classifier, filename='model_svm.joblib')