# Sentiment Analysis using Support Vector Machine (SVM)

We train a sentiment-analysis model using a Support Vector Machine (SVM) classifier.

In [15]:
# Install the packages

!pip install pandas scikit-learn > /dev/null 2>&1

In [16]:
import pandas as pd
import re
import sklearn.model_selection as ms
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import classification_report, accuracy_score
from sklearn.svm import LinearSVC


In [17]:
# Read the Twitter dataset from CSV into a DataFrame. No exploratory analysis
# is done here; later cells use its 'text' and 'category' columns.

df = pd.read_csv(
    filepath_or_buffer='../../data/twitter_data.csv',
)

In [18]:
# Clean up the data.
def cleanse(text):
    """Normalize one raw text value into a tidy string.

    Missing values (anything ``pd.isna`` reports as null) become ``''``;
    every other value is converted with ``str``. Each run of whitespace is
    collapsed to a single space, and leading/trailing whitespace is removed.
    """
    # Nulls normalize to the empty string, so there is nothing left to tidy.
    if pd.isna(text):
        return ''

    # Collapse every whitespace run first, then trim both ends.
    collapsed = re.sub(r'\s+', ' ', str(text))
    return collapsed.strip()

# Normalize every row's text with `cleanse`, then discard the rows whose
# category is missing (NaN/None).
df = (
    df
    .assign(text=df['text'].apply(cleanse))
    .dropna(subset=['category'])
)

In [19]:
# Hold out 25% of the rows for testing and train on the rest; the fixed seed
# keeps the split identical between runs.

X_train, X_test, y_train, y_test = ms.train_test_split(
    df['text'],
    df['category'],
    test_size=0.25,
    random_state=42,
)

In [20]:
# Turn the text into word-count vectors. The vocabulary is learned from the
# training text only; the test text is encoded with that same vocabulary.
vec = CountVectorizer()
X_train_vector = vec.fit_transform(raw_documents=X_train)
X_test_vector = vec.transform(raw_documents=X_test)

Here are the inputs to our classification model:

| Parameter       | Description                     | Example                                   |
|:----------------|:--------------------------------|:------------------------------------------|
| Feature name    | Word name                       | ['abuses' 'again' 'from' 'this',...]      |
| Feature measure | Word count                      | [0, 0, 1, 1,...]                          |  
| Label           | The sentiment grade (-1, 0, 1)  |                                           |
| Data (X_train)  | The twitter text                | ['this comes from cabinet which...',...]  |
| Data (y_train)  | The sentiment grade (-1, 0, 1)  | -1                                        |

In [21]:
# Fit a linear SVM on the word-count vectors, then score its predictions on
# the held-out test split.

# LinearSVC's solver uses a pseudo-random number generator, so pin the seed
# (same value as the train/test split) to make the reported scores repeatable.
svc = LinearSVC(random_state=42)
svc.fit(X_train_vector, y_train)
y_pred = svc.predict(X_test_vector)

# See results.
print("Classification Report (ML-based):")
print(classification_report(y_test, y_pred))
print("Accuracy Score (ML-based):")
print(accuracy_score(y_test, y_pred))


Classification Report (ML-based):
              precision    recall  f1-score   support

        -1.0       0.91      0.89      0.90      9019
         0.0       0.97      0.97      0.97     13689
         1.0       0.95      0.95      0.95     18036

    accuracy                           0.95     40744
   macro avg       0.94      0.94      0.94     40744
weighted avg       0.95      0.95      0.95     40744

Accuracy Score (ML-based):
0.9455625368152366
