Let's model tweets with Naive Bayes and SVM algorithms.

In [1]:
import pickle
import numpy as np
import pandas as pd

In [2]:
# Seed NumPy's global random number generator so that code drawing from it
# produces the same values on every run of the notebook.
np.random.seed(123)

In [3]:
# Load the pickled tweets object from disk.
# NOTE(review): pickle can execute arbitrary code while loading, so only open
# files that come from a trusted source.
# The context manager closes the file handle as soon as loading finishes.
with open('./data/tweets_cleaned.pkl', 'rb') as pickle_file:
    tweets_df = pickle.load(pickle_file)

In [4]:
# Report how many distinct values the 'tweets' column holds
unique_tweets = tweets_df['tweets'].unique()
print("Working with {} tweets".format(len(unique_tweets)))

Working with 4585 tweets


In [176]:
from sklearn.model_selection import train_test_split
# ENGLISH_STOP_WORDS is taken from its public location in
# sklearn.feature_extraction.text rather than from the private stop_words module.
from sklearn.feature_extraction.text import TfidfVectorizer, ENGLISH_STOP_WORDS
from sklearn import model_selection, naive_bayes, svm
from sklearn.metrics import accuracy_score

In [9]:
# Features: the values of the 'tweets' column; target: the 'emoji' column.
X = tweets_df['tweets'].values
y = tweets_df['emoji']
# The modelling cells read X_train / X_test / y_train / y_test, so the split is
# created here. random_state reuses the notebook's seed value (123) so that
# re-running this cell always yields the same partition.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=123)

In [178]:
# Extra tokens to ignore on top of scikit-learn's built-in English stop words.
extra_stopwords = ['rt', 'follow', 'dm', 'https', 'ur', 'll', 'amp', 'subscribe',
                   'don', 've', 'retweet', 'im', 'http', 'lt']
# De-duplicate, then hand the vectorizer a list: stop_words is documented as
# 'english', a list, or None. Sorting keeps the order stable between runs.
stopwords = sorted(set(ENGLISH_STOP_WORDS).union(extra_stopwords))
tfidf = TfidfVectorizer(max_features=10000, max_df=.8, min_df=.001,
                        stop_words=stopwords, ngram_range=(1, 2))
# Fit the vocabulary and IDF weights on the training texts only, then reuse
# the fitted vectorizer to transform the test texts.
X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)

In [184]:
# Multinomial Naive Bayes
# fit() hands back the estimator itself, so build and train in one expression
nb = naive_bayes.MultinomialNB().fit(X_train_tfidf, y_train)
# Predicted labels for the test matrix
predictions_nb = nb.predict(X_test_tfidf)
# Accuracy expressed as a percentage
nb_score = 100 * accuracy_score(predictions_nb, y_test)
print("Naive Bayes Accuracy Score -> ", nb_score)

Naive Bayes Accuracy Score ->  8.511870295309786


In [179]:
# Gaussian Naive Bayes
# GaussianNB is fed dense input. toarray() returns a plain ndarray, whereas
# todense() returns the legacy np.matrix type.
gnb = naive_bayes.GaussianNB()
gnb.fit(X_train_tfidf.toarray(), y_train)
predictions_gnb = gnb.predict(X_test_tfidf.toarray())

# accuracy_score is documented as (y_true, y_pred); the value is scaled to a percentage
gnb_score = accuracy_score(y_test, predictions_gnb)*100
print("Gaussian Naive Bayes Accuracy Score -> ", gnb_score)

Gaussian Naive Bayes Accuracy Score ->  3.1847133757961785


In [182]:
# SVM
# Import the class directly: this cell binds the name `svm` to the fitted
# model, which hides the `sklearn.svm` module of the same name. Going through
# `svm.SVC` would therefore fail the second time this cell is executed.
from sklearn.svm import SVC
svm = SVC(C=1.0, kernel='linear', degree=3, gamma='auto', probability=True)
svm.fit(X_train_tfidf, y_train)
# Predicted labels for the test matrix
predictions_svm = svm.predict(X_test_tfidf)
# accuracy_score is documented as (y_true, y_pred); the value is scaled to a percentage
score = accuracy_score(y_test, predictions_svm)*100
print("SVM Accuracy Score -> ",score)

SVM Accuracy Score ->  9.843659525188189


In [243]:
# Inputs for the ranking cells: how many classes to show and the text to score
top_n = 5
text = 'climate change'
# transform() is given a one-element list holding the query text
test_tfidf = tfidf.transform([text])

In [244]:
# Multinomial NB prediction
probs = nb.predict_proba(test_tfidf)
# Pair each probability with the class labels of the model that produced it
# (nb.classes_), not with those of a different estimator.
predict_rank = pd.DataFrame({'classes': nb.classes_, 'probs': probs[0]})
predict_rank = predict_rank.sort_values(by = 'probs', ascending = False)

print('top {} predictions for {} is:'.format(top_n, text))

predict_rank[:top_n]

top 5 predictions for climate change is:


Unnamed: 0,classes,probs
475,😂,0.136599
436,🔥,0.074257
516,😭,0.038399
536,🙄,0.032417
587,🤣,0.0272


In [247]:
# Gaussian NB prediction
# The model was trained on dense input, so densify the query as well;
# toarray() returns a plain ndarray rather than np.matrix.
probs = gnb.predict_proba(test_tfidf.toarray())
# Pair each probability with the class labels of the model that produced it
# (gnb.classes_), not with those of a different estimator.
predict_rank = pd.DataFrame({'classes': gnb.classes_, 'probs': probs[0]})
predict_rank = predict_rank.sort_values(by = 'probs', ascending = False)

print('top {} predictions for {} is:'.format(top_n, text))

predict_rank[:top_n]

top 5 predictions for climate change is:


Unnamed: 0,classes,probs
22,☠,1.0
11,⏳,6.828435000000001e-23
164,🌲,4.232595e-23
314,👎,4.326672e-37
162,🌩,4.546423e-38


In [246]:
# SVM prediction: rank every class by the probability given to the query text
probs = svm.predict_proba(test_tfidf)
predict_rank = pd.DataFrame(
    {'classes': svm.classes_, 'probs': probs.flatten()}
)
predict_rank = predict_rank.sort_values(by='probs', ascending=False)

print('top {} predictions for {} is:'.format(top_n, text))

predict_rank.head(top_n)

top 5 predictions for climate change is:


Unnamed: 0,classes,probs
475,😂,0.094941
436,🔥,0.035626
516,😭,0.033471
587,🤣,0.019915
536,🙄,0.011545


Accuracy is higher when only emojis with "enough" data are kept.

In [19]:
# Keep only emojis that have at least MIN_TWEETS_PER_EMOJI tweets
MIN_TWEETS_PER_EMOJI = 30
# Count the rows per emoji once and reuse the result for the filter
emoji_counts = tweets_df.groupby('emoji').count()
frequent_emoji = emoji_counts[emoji_counts['tweets'] >= MIN_TWEETS_PER_EMOJI]
# Left-join the retained emojis back onto the tweets to recover their rows
enough_emoji = pd.merge(frequent_emoji.reset_index()[['emoji']], tweets_df, on='emoji', how='left')

In [60]:
class classify_emoji(object):
    """Train TF-IDF based classifiers on (text, label) data and rank labels for new text.

    Every method derives its train/test partition from the same fixed
    ``RANDOM_STATE``, so the fitted vocabulary, the vectorised rows and the
    labels always refer to the same samples.
    """

    # Seed passed to every train/test split.
    RANDOM_STATE = 1

    def __init__(self):
        # Seeds NumPy's global generator, as the class has always done.
        np.random.seed(self.RANDOM_STATE)

    def _split(self, X, y):
        """Return ``(X_train, X_test, y_train, y_test)`` for a reproducible split."""
        return train_test_split(X, y, random_state=self.RANDOM_STATE)

    def _fit_and_score(self, model, label, X, y, max_features, ngram_range, dense=False):
        """Fit ``model`` on the training split, print its test accuracy (%), return it.

        ``dense=True`` converts the TF-IDF matrices to ndarrays before they are
        passed to the model.
        """
        _, _, y_train, y_test = self._split(X, y)
        X_train_tfidf, X_test_tfidf = self.vectorize(
            X, y, max_features=max_features, ngram=ngram_range)
        if dense:
            # toarray() gives a plain ndarray (todense() would give np.matrix)
            X_train_tfidf = X_train_tfidf.toarray()
            X_test_tfidf = X_test_tfidf.toarray()
        model.fit(X_train_tfidf, y_train)
        predictions = model.predict(X_test_tfidf)
        # accuracy_score is documented as (y_true, y_pred)
        score = accuracy_score(y_test, predictions) * 100
        print(label, score)
        return model

    def tfidf(self, X, y, max_features=10000, ngram_range=(1, 2)):
        """Return a ``TfidfVectorizer`` fitted on the training part of ``X``."""
        X_train, _, _, _ = self._split(X, y)
        tfidf = TfidfVectorizer(max_features=max_features, ngram_range=ngram_range)
        tfidf.fit(X_train)
        return tfidf

    def vectorize(self, X, y, max_features=10000, ngram=(1, 2)):
        """Return ``(X_train_tfidf, X_test_tfidf)`` for the shared split.

        ``ngram`` is forwarded to :meth:`tfidf` as its ``ngram_range``.
        """
        X_train, X_test, _, _ = self._split(X, y)
        tfidf = self.tfidf(X, y, max_features=max_features, ngram_range=ngram)
        return (tfidf.transform(X_train), tfidf.transform(X_test))

    def nb_model(self, X, y, max_features=10000, ngram_range=(1, 2)):
        """Fit a multinomial Naive Bayes model, print its accuracy and return it."""
        return self._fit_and_score(
            naive_bayes.MultinomialNB(), "Naive Bayes Accuracy Score -> ",
            X, y, max_features, ngram_range)

    def gnb_model(self, X, y, max_features=10000, ngram_range=(1, 2)):
        """Fit a Gaussian Naive Bayes model on dense features, print its accuracy and return it."""
        return self._fit_and_score(
            naive_bayes.GaussianNB(), "Gaussian Naive Bayes Accuracy Score -> ",
            X, y, max_features, ngram_range, dense=True)

    def svm_model(self, X, y, max_features=10000, ngram_range=(1, 2)):
        """Fit a linear-kernel SVC with probability estimates, print its accuracy and return it."""
        # Imported here so the method does not depend on what the module-level
        # name `svm` is currently bound to.
        from sklearn.svm import SVC
        classifier = SVC(C=1.0, kernel='linear', degree=3, gamma='auto', probability=True)
        return self._fit_and_score(
            classifier, "SVM Accuracy Score -> ", X, y, max_features, ngram_range)

    def predict(self, text, X, y, top_n = 3, max_features=10000, ngram_range = (1,2)):
        """Print a header and rank the classes for ``text`` with a Gaussian NB model.

        Returns a 2-tuple ``(None, DataFrame)``. The first element is kept only
        so the return shape stays the same as before; the DataFrame holds the
        ``top_n`` rows with columns ``classes`` and ``probs``, highest first.
        """
        tfidf = self.tfidf(X, y, max_features=max_features, ngram_range=ngram_range)
        test_tfidf = tfidf.transform([text])
        # Train with the same vectoriser settings that are used for the query
        gnb = self.gnb_model(X, y, max_features=max_features, ngram_range=ngram_range)
        # Densify the query like the training data; predict_proba returns one
        # row per sample, so take the single row.
        probs = gnb.predict_proba(test_tfidf.toarray())[0]
        predict_rank = pd.DataFrame({'classes': gnb.classes_, 'probs': probs})
        predict_rank = predict_rank.sort_values(by='probs', ascending=False)

        print('top {} predictions for {} is:'.format(top_n, text))
        return (None, predict_rank[:top_n])

Next steps:
    1. Run a grid search
    2. Try more models
    3. Collect more data