# Contents

In [14]:
# data: raw data set 
# Vectorizations using bag of words technique 
# vectorizer from sklearn CountVectorizer
# Model Classifier: Support Vector Classifier 

# Accuracy ~70%? Surprisingly high -- check whether it is real or just chance.

# Metrics: accuracy, classification report and confusion matrix from sklearn

# Setup

In [15]:
import pandas as pd
import sklearn
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import cross_val_score

# Raw dataset

In [16]:
# Load the raw tweets training set; the path is relative to the notebook's
# working directory.
df = pd.read_csv(filepath_or_buffer='../data/raw/tweets-train.csv')

In [17]:
# Display the frame exactly as loaded (columns: textID, text, selected_text,
# sentiment) to eyeball the raw data before any cleaning.
df

Unnamed: 0,textID,text,selected_text,sentiment
0,cb774db0d1,"I`d have responded, if I were going","I`d have responded, if I were going",neutral
1,549e992a42,Sooo SAD I will miss you here in San Diego!!!,Sooo SAD,negative
2,088c60f138,my boss is bullying me...,bullying me,negative
3,9642c003ef,what interview! leave me alone,leave me alone,negative
4,358bd9e861,"Sons of ****, why couldn`t they put them on t...","Sons of ****,",negative
...,...,...,...,...
27476,4eac33d1c0,wish we could come see u on Denver husband l...,d lost,negative
27477,4f4c4fc327,I`ve wondered about rake to. The client has ...,", don`t force",negative
27478,f67aae2310,Yay good for both of you. Enjoy the break - y...,Yay good for both of you.,positive
27479,ed167662a5,But it was worth it ****.,But it was worth it ****.,positive


# Removing nan cells

In [18]:
# Discard every row that has a missing value in any column. `axis=0` and
# `how='any'` are dropna's defaults, spelled out here for the reader.
df = df.dropna(axis=0, how='any')

# Vectorizing text

* Convert a collection of text documents to a matrix of token counts
* The full vocabulary has 25,913 features; I'll keep only the 23,000 most frequent terms (`max_features=23000`)

In [19]:
# Bag-of-words vectorizer: lower-cases the text, strips accents to ASCII,
# removes English stop words and caps the vocabulary at 23,000 terms.
vectorizer = CountVectorizer(
    max_features=23000,
    lowercase=True,
    strip_accents='ascii',
    stop_words='english',
)

In [20]:
# Learn the vocabulary and build the document-term count matrix in one pass.
# NOTE(review): the vocabulary is fitted on every row, before the train/test
# split further down, so held-out tweets influence which terms are kept.
# Consider fitting on the training portion only.
tweets = df['text']
X = vectorizer.fit_transform(tweets)

In [21]:
# Number of features kept by the fitted vectorizer. Read it from the
# `vocabulary_` mapping (term -> column index) rather than
# `get_feature_names()`, which was deprecated in scikit-learn 1.0 and removed
# in 1.2; `vocabulary_` is available on every version.
print(len(vectorizer.vocabulary_))

23000


In [22]:
# Preview the count matrix. Only the first few rows are densified: calling
# `X.toarray()` on the whole sparse matrix would allocate a dense
# (n_tweets x 23,000) array -- several gigabytes -- just to print a
# truncated view.
print(X[:5].toarray())

[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]


In [23]:
# Target vector: the sentiment label of every tweet, taken from the same
# (NaN-free) frame the features were built from.
Y = df.loc[:, 'sentiment']

# Split train Test

* Test size left as default 0.25 -> train 0.75 
* shuffle rows left to true

In [24]:
# Hold out 25% of the rows for testing (the library default, written out
# explicitly), shuffling with a fixed seed so the split is reproducible.
X_train, x_test, Y_train, y_test = train_test_split(
    X,
    Y,
    test_size=0.25,
    shuffle=True,
    random_state=42,
)

# SVC Model

In [25]:
# Support Vector Classifier with its default hyper-parameters made explicit
# (they match the estimator repr shown after fitting: C=1.0, rbf kernel,
# gamma='scale').
clf = SVC(C=1.0, kernel='rbf', gamma='scale')

# Cross Validation

In [26]:
# 5-fold cross-validated accuracy on the training split, using all CPU cores.
# The scores are printed so the single hold-out accuracy reported later can
# be compared against the fold-to-fold spread (i.e. is it real or chance?).
cv_results = cross_val_score(clf, X_train, Y_train, scoring='accuracy', cv=5, n_jobs=-1)
print("CV accuracy per fold: {0}".format(cv_results.round(3)))
print("CV accuracy mean: {0:.3f} (std {1:.3f})".format(cv_results.mean(), cv_results.std()))

In [27]:
# Train the classifier on the training split. `fit` returns the estimator,
# so the notebook displays its parameters as the cell output.
clf.fit(X=X_train, y=Y_train)

SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)

In [28]:
# Predict a sentiment label for every held-out tweet.
predictions = clf.predict(X=x_test)

In [29]:
# Side-by-side table of true vs. predicted labels. The frame keeps y_test's
# original row index; the predictions are attached positionally.
real_pred = pd.DataFrame({'y_true': y_test}).assign(y_pred=predictions)

In [30]:
# Display the true/predicted label pairs for a quick visual sanity check;
# the row labels are the original indices of the held-out tweets.
real_pred

Unnamed: 0,y_true,y_pred
1589,positive,positive
10414,negative,neutral
6562,neutral,neutral
2603,positive,positive
4004,neutral,neutral
...,...,...
25223,positive,positive
1040,neutral,neutral
2134,negative,neutral
12418,neutral,neutral


# Metrics and model evaluation

In [31]:
# Overall accuracy on the hold-out set. `accuracy_score` returns a fraction
# in [0, 1]; the `%` format spec multiplies by 100 so the printed number
# actually matches the percent sign (previously "0.69 %" was printed).
accuracy = accuracy_score(real_pred['y_true'], real_pred['y_pred'])
print("Accuracy achieved: {0:.0%}".format(accuracy))

Accuracy archived: 0.69 %


In [32]:
# Per-class precision, recall, F1 and support on the hold-out set.
print(
    classification_report(
        y_true=real_pred['y_true'],
        y_pred=real_pred['y_pred'],
    )
)

              precision    recall  f1-score   support

    negative       0.76      0.50      0.60      1956
     neutral       0.61      0.80      0.70      2787
    positive       0.78      0.71      0.75      2127

    accuracy                           0.69      6870
   macro avg       0.72      0.67      0.68      6870
weighted avg       0.71      0.69      0.68      6870



In [33]:
# Confusion matrix as a labelled table (rows = actual class, columns =
# predicted class). The class order is passed to `confusion_matrix`
# explicitly and the row/column captions are derived from that same list, so
# the captions cannot silently drift from the matrix layout (previously they
# relied on the library's implicit sorted-label order).
sentiment_labels = ['negative', 'neutral', 'positive']
pd.DataFrame(
    confusion_matrix(real_pred['y_true'], real_pred['y_pred'], labels=sentiment_labels),
    index=[('Actuals', label.capitalize()) for label in sentiment_labels],
    columns=[('predicted', label.capitalize()) for label in sentiment_labels],
)

Unnamed: 0,"(predicted, Negative)","(predicted, Neutral)","(predicted, Positive)"
"(Actuals, Negative)",973,858,125
"(Actuals, Neutral)",245,2241,301
"(Actuals, Positive)",54,556,1517
