In [16]:
#https://towardsdatascience.com/multi-class-text-classification-with-scikit-learn-12f1e60e0a9f

#Let's load the data set
import numpy as np
import pandas as pd

# Path of the tweet-emotion CSV, relative to the current working directory.
DATA_FILE = 'tweet_emotions.csv'
# Read the whole file into a DataFrame.
messages = pd.read_csv(DATA_FILE)


In [17]:
# Preview the first five rows to check that the columns loaded as expected.
messages.head(n=5)


Unnamed: 0,tweet_id,sentiment,content
0,1956967341,empty,@tiffanylue i know i was listenin to bad habi...
1,1956967666,sadness,Layin n bed with a headache ughhhh...waitin o...
2,1956967696,sadness,Funeral ceremony...gloomy friday...
3,1956967789,enthusiasm,wants to hang out with friends SOON!
4,1956968416,neutral,@dannycastillo We want to trade with someone w...


In [18]:
# Report the table size as (rows, columns).
row_count, column_count = messages.shape
(row_count, column_count)

(40000, 3)

In [19]:
# List the distinct emotion labels found in the `sentiment` column.
messages['sentiment'].unique()

array(['empty', 'sadness', 'enthusiasm', 'neutral', 'worry', 'surprise',
       'love', 'fun', 'hate', 'happiness', 'boredom', 'relief', 'anger'],
      dtype=object)

In [20]:
# Count how many distinct emotion labels the data set contains.
len(pd.unique(messages['sentiment']))


13

In [21]:
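#Now we want to turn each message into a vector of per-word counts (bag of words).
#For example, a row such as
#Label          Message      
#Spam           this is a cat it is this cat
#should become
#               this        is          a           cat         it 
#  Spam         2           2           1           2           1 
#("Spam" is only an illustrative label; in this data set the label is the tweet's sentiment.)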


In [22]:

from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF settings: sub-linear (log-scaled) term frequency, ignore terms that
# occur in fewer than five tweets, L2-normalise each row, extract unigrams and
# bigrams, and filter scikit-learn's built-in English stop-word list.
tfidf_options = dict(
    sublinear_tf=True,
    min_df=5,
    norm='l2',
    encoding='latin-1',
    ngram_range=(1, 2),
    stop_words='english',
)
vectorizer = TfidfVectorizer(**tfidf_options)


In [23]:

# Fit the vectorizer on the tweet texts and encode them in one step.
# NOTE: despite its name, `counts` holds the TfidfVectorizer output (TF-IDF
# weights), not raw term counts.
tweet_texts = messages['content'].values
counts = vectorizer.fit_transform(tweet_texts)


In [24]:
# Convert the vectorizer output into a dense NumPy array and display it.
# NOTE(review): a dense copy of the full document-term matrix may be very
# large -- confirm it fits in memory, or inspect a slice of `counts` instead.
features = counts.toarray()
features

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [25]:
# List the vocabulary (unigrams and bigrams) learned by the fitted vectorizer.
# `get_feature_names()` was deprecated in scikit-learn 1.0 and removed in 1.2,
# so prefer `get_feature_names_out()` and fall back only on old installations.
# `.tolist()` turns the returned ndarray into a plain list of Python strings,
# matching what the legacy method displayed.
if hasattr(vectorizer, 'get_feature_names_out'):
    feature_names = vectorizer.get_feature_names_out().tolist()
else:
    feature_names = vectorizer.get_feature_names()
feature_names




['00',
 '000',
 '04',
 '05',
 '06',
 '07',
 '08',
 '09',
 '10',
 '10 30',
 '10 days',
 '10 hours',
 '10 min',
 '10 mins',
 '10 minutes',
 '10 years',
 '100',
 '100 followers',
 '1000',
 '100th',
 '10am',
 '10pm',
 '10th',
 '11',
 '11 11',
 '11 hours',
 '12',
 '12 days',
 '12 hours',
 '12am',
 '12th',
 '13',
 '14',
 '14 hours',
 '140',
 '140 characters',
 '15',
 '15 mins',
 '15 minutes',
 '150',
 '15th',
 '16',
 '16th',
 '17',
 '18',
 '18th',
 '19',
 '1995',
 '1am',
 '1st',
 '1st time',
 '20',
 '20 minutes',
 '200',
 '2000',
 '2008',
 '2009',
 '2010',
 '20th',
 '21',
 '22',
 '22nd',
 '23',
 '24',
 '24 hours',
 '25',
 '25 minutes',
 '26',
 '27',
 '27th',
 '28',
 '29',
 '2am',
 '2b',
 '2day',
 '2moro',
 '2morrow',
 '2nd',
 '2night',
 '2nite',
 '2pm',
 '2uhs',
 '2w5v',
 '30',
 '30 min',
 '30 minutes',
 '300',
 '3000',
 '30am',
 '30pm',
 '31',
 '32',
 '33',
 '333',
 '3333',
 '35',
 '36',
 '360',
 '38',
 '3am',
 '3d',
 '3d movie',
 '3g',
 '3rd',
 '3wordsaftersex',
 '40',
 '400',
 '44',
 '45'

In [26]:
from sklearn.naive_bayes import MultinomialNB



In [27]:

# Pull the emotion label of every tweet out as the array of training targets.
sentiment_column = messages['sentiment']
targets = sentiment_column.values


In [28]:

# Train a multinomial Naive Bayes model on the vectorized tweets and labels.
nb_model = MultinomialNB()
# fit() hands back the fitted estimator, which is what `classifier` refers to.
classifier = nb_model.fit(counts, targets)


In [29]:
# Sample tweets for a quick sanity check; each variable is named after the
# emotion the tweet is expected to express.
enthusiasm_message = "bed...sorta. today was good, sara has strep thought Angelina does to; i shared a water with her B4 they told me, i will prob get it to"
nuteral_message = "Chocolate milk is so much better through a straw. I lack said straw"
sadness_message = "I'm having a problem with my photo here in twitter amf!!!...can't see my face!"
worry_message = "@onscrn Ahh.  ... Well, I was hoping that I could learn some stuff on the way. ... Why not you and I work on separate things but also"

# Keep the order worry / sadness / neutral / enthusiasm so that each prediction
# lines up with its expected label.
examples = [
    worry_message,
    sadness_message,
    nuteral_message,
    enthusiasm_message,
]

# Encode with the already-fitted vectorizer (transform only, no refit), then
# ask the classifier for one label per example.
example_counts = vectorizer.transform(examples)
predictions = classifier.predict(example_counts)
predictions


array(['worry', 'worry', 'neutral', 'worry'], dtype='<U10')