In [9]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import SGDClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.neural_network import MLPClassifier
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import plot_confusion_matrix
import pandas as pd
import math
import pickle
import random
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn import preprocessing
import numpy as np
from collections import Counter

### load & inspect data

In [3]:
# read the data from the excel file
data = pd.read_excel('Sentiment_Labelling.xlsx')
data = data[["Youtube_Text_Corrected", "Wav2Vec","Coin"]]
# remove all records that are not labeled
data = data.dropna(subset=["Youtube_Text_Corrected", "Coin"])[["Youtube_Text_Corrected", "Wav2Vec", "Coin"]]
data = data.dropna(subset=["Youtube_Text_Corrected", "Coin"])
# check if the Wav2Vec not empty and replaced it with Youtube_Text_Corrected if it is empty
corpus = data.apply(lambda x: x["Wav2Vec"] if x["Wav2Vec"] != " " else x["Youtube_Text_Corrected"], axis=1)
corpus = corpus.values.astype('U')
coins = list(data['Coin'].values)
print(data['Coin'].value_counts())

BTC     475
ETH     430
DOGE    313
None    154
Name: Coin, dtype: int64


### build coin model

In [4]:
# sample 30% class as validation set

In [5]:
# split the data in train and test using train_test_split_method
X_train, X_test, y_train, y_test = train_test_split(corpus, coins, test_size=0.3, stratify=coins, random_state=2)

In [6]:
Counter(y_test)

Counter({'BTC': 143, 'None': 46, 'ETH': 129, 'DOGE': 94})

In [10]:
# create a Pipeline to train the data 
text_clf = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', SGDClassifier(loss='hinge', penalty='l2',
                          alpha=1e-3, random_state=42,
                          max_iter=5, tol=None)),
])
# train the data using fit_method
text_clf.fit(X_train, y_train)
# use pickle to load the model 
pickle_load = pickle.dumps(text_clf)
coin_model = pickle.loads(pickle_load)
with open("coin_model.pkl", 'wb') as pickle_file:
    pickle.dump(text_clf, pickle_file)
predicted = text_clf.predict(X_test)
"evaluation Acc.:{:.3f}".format(np.mean(predicted == y_test))

'evaluation Acc.:0.803'

In [11]:
print(classification_report(y_test, predicted))

              precision    recall  f1-score   support

         BTC       0.73      0.94      0.82       143
        DOGE       0.89      0.85      0.87        94
         ETH       0.86      0.79      0.82       129
        None       0.82      0.30      0.44        46

    accuracy                           0.80       412
   macro avg       0.82      0.72      0.74       412
weighted avg       0.82      0.80      0.79       412

