## Processamento de texto a partir do dataset:

## > School of AI Algiers Challenge - Twitter Sentiment Analysis

fonte: https://www.kaggle.com/youben/twitter-sentiment-analysis

## Load the data

In [1]:
import numpy as np
import pandas as pd

# This is for making some large tweets to be displayed
pd.options.display.max_colwidth = 100

# I got some encoding issue, I didn't knew which one to use !
# This post suggested an encoding that worked!
# https://stackoverflow.com/questions/19699367/unicodedecodeerror-utf-8-codec-cant-decode-byte
train_data = pd.read_csv("train.csv", encoding='ISO-8859-1')

## Visualize the tweets

In [2]:
# We will now take a look at random tweets
# to gain more insights

rand_indexs = np.random.randint(1,len(train_data),50).tolist()
train_data["SentimentText"][rand_indexs]

56280                                          @BaltarStar Wow! No cliche left unturned (and fluffy hair!). 
88117            @aydeejay awwwwww.... YOU CAN CLEAN THE GLASS SLiPPERS CiNDERELLA, BUT YOU CANT HAVE THEM. 
99986                                                                         @CuPcAkE_2120 ya i thought so 
557                 - thelovelybones: I plan on owning this â¦donât judge me. http://tumblr.com/xr0238tmq
65412                          @BlackBoxBelfast it's not Sunday - at least you can wangle a cup of Clements 
7762                                                                          #myweakness F0OD iN GENERAL.. 
48084    @arnoldwender Cool! And I hope you won't get any. I lost my HD but luckily I saved my profile in...
58733                  @Becky_x_x_ I don't have a fave three, just diversity  but sean smith is gorgeous too
43998    @Anne_Frasier ha! have some of our rain. been raining so much can't drive my new jeep  i just wa...
38734      @amurode

#### Emoticons

In [3]:
# We are gonna find what emoticons are used in our dataset
import re
tweets_text = train_data.SentimentText.str.cat()
emos = set(re.findall(r" ([xX:;][-']?.) ",tweets_text))
emos_count = []
for emo in emos:
    emos_count.append((tweets_text.count(emo), emo))
sorted(emos_count,reverse=True)[:11]

[(3281, ':/'),
 (2874, 'x '),
 (2626, ': '),
 (1339, 'x@'),
 (1214, 'xx'),
 (1162, 'xa'),
 (984, ';3'),
 (887, 'xp'),
 (842, 'xo'),
 (713, ';)'),
 (483, 'xe')]

In [4]:
HAPPY_EMO = r" ([xX;:]-?[dD)]|:-?[\)]|[;:][pP]) "
SAD_EMO = r" (:'?[/|\(]) "
print("Happy emoticons:", set(re.findall(HAPPY_EMO, tweets_text)))
print("Sad emoticons:", set(re.findall(SAD_EMO, tweets_text)))

Happy emoticons: {':D', ':p', 'x)', ';-D', ';-)', ':-D', ';P', 'XD', 'xD', ':d', ';)', 'xd', ';d', ';p', ';D'}
Sad emoticons: {':|', ":'(", ':(', ':/'}


### Most used words

In [5]:
import nltk
from nltk.tokenize import word_tokenize

# Uncomment this line if you haven't downloaded punkt before
# or just run it as it is and uncomment it if you got an error.
#nltk.download('punkt')
def most_used_words(text):
    tokens = word_tokenize(text)
    frequency_dist = nltk.FreqDist(tokens)
    print("There is %d different words" % len(set(tokens)))
    return sorted(frequency_dist,key=frequency_dist.__getitem__, reverse=True)

In [7]:
most_used_words(train_data.SentimentText.str.cat())[:25]

There is 133899 different words


['@',
 '!',
 '.',
 'I',
 ',',
 'to',
 'the',
 'you',
 '?',
 'a',
 'it',
 'i',
 '...',
 ';',
 'and',
 '&',
 'my',
 'for',
 'is',
 'that',
 "'s",
 "n't",
 'in',
 'of',
 'me']

### Stop words

In [8]:
from nltk.corpus import stopwords

#nltk.download("stopwords")

mw = most_used_words(train_data.SentimentText.str.cat())
most_words = []
for w in mw:
    if len(most_words) == 1000:
        break
    if w in stopwords.words("english"):
        continue
    else:
        most_words.append(w)

There is 133899 different words


In [9]:
most_words[:25]

['@',
 '!',
 '.',
 'I',
 ',',
 '?',
 '...',
 ';',
 '&',
 "'s",
 "n't",
 'quot',
 "'m",
 ':',
 '#',
 'like',
 '-',
 'get',
 'good',
 'u',
 'know',
 ')',
 'love',
 '(',
 'one']

## Stemming

In [10]:
# I'm defining this function to use it in the 
# Data Preparation Phase
from nltk.stem.snowball import SnowballStemmer
from nltk.stem import WordNetLemmatizer

#nltk.download('wordnet')
def stem_tokenize(text):
    stemmer = SnowballStemmer("english")
    stemmer = WordNetLemmatizer()
    return [stemmer.lemmatize(token) for token in word_tokenize(text)]

def lemmatize_tokenize(text):
    lemmatizer = WordNetLemmatizer()
    return [lemmatizer.lemmatize(token) for token in word_tokenize(text)]

## Prepare the data

In [11]:
from sklearn.feature_extraction.text import TfidfVectorizer

### Building the pipeline

In [12]:
from sklearn.base import TransformerMixin, BaseEstimator
from sklearn.pipeline import Pipeline

In [13]:
# We need to do some preprocessing of the tweets.
# We will delete useless strings (like @, # ...)
# because we think that they will not help
# in determining if the person is Happy/Sad

class TextPreProc(BaseEstimator,TransformerMixin):
    def __init__(self, use_mention=False):
        self.use_mention = use_mention
    
    def fit(self, X, y=None):
        return self
    
    def transform(self, X, y=None):
        # We can choose between keeping the mentions
        # or deleting them
        if self.use_mention:
            X = X.str.replace(r"@[a-zA-Z0-9_]* ", " @tags ")
        else:
            X = X.str.replace(r"@[a-zA-Z0-9_]* ", "")
            
        # Keeping only the word after the #
        X = X.str.replace("#", "")
        X = X.str.replace(r"[-\.\n]", "")
        # Removing HTML garbage
        X = X.str.replace(r"&\w+;", "")
        # Removing links
        X = X.str.replace(r"https?://\S*", "")
        # replace repeated letters with only two occurences
        # heeeelllloooo => heelloo
        X = X.str.replace(r"(.)\1+", r"\1\1")
        # mark emoticons as happy or sad
        X = X.str.replace(HAPPY_EMO, " happyemoticons ")
        X = X.str.replace(SAD_EMO, " sademoticons ")
        X = X.str.lower()
        return X

In [14]:
# This is the pipeline that will transform our tweets to something eatable.
# You can see that we are using our previously defined stemmer, it will
# take care of the stemming process.
# For stop words, we let the inverse document frequency do the job
from sklearn.model_selection import train_test_split

sentiments = train_data['Sentiment']
tweets = train_data['SentimentText']

# I get those parameters from the 'Fine tune the model' part
vectorizer = TfidfVectorizer(tokenizer=lemmatize_tokenize, ngram_range=(1,2))
pipeline = Pipeline([
    ('text_pre_processing', TextPreProc(use_mention=True)),
    ('vectorizer', vectorizer),
])

# Let's split our data into learning set and testing set
# This process is done to test the efficency of our model at the end.
# You shouldn't look at the test data only after choosing the final model
learn_data, test_data, sentiments_learning, sentiments_test = train_test_split(tweets, sentiments, test_size=0.3)

# This will tranform our learning data from simple text to vector
# by going through the preprocessing tranformer.
learning_data = pipeline.fit_transform(learn_data)

## Select a model

In [15]:
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import BernoulliNB, MultinomialNB

lr = LogisticRegression()
bnb = BernoulliNB()
mnb = MultinomialNB()

models = {
    'logistic regression': lr,
    'bernoulliNB': bnb,
    'multinomialNB': mnb,
}

for model in models.keys():
    scores = cross_val_score(models[model], learning_data, sentiments_learning, scoring="f1", cv=10)
    print("===", model, "===")
    print("scores = ", scores)
    print("mean = ", scores.mean())
    print("variance = ", scores.var())
    models[model].fit(learning_data, sentiments_learning)
    print("score on the learning data (accuracy) = ", accuracy_score(models[model].predict(learning_data), sentiments_learning))
    print("")



=== logitic regression ===
scores =  [0.81203189 0.81031756 0.8144938  0.81841247 0.80708897 0.81094767
 0.80923451 0.81672103 0.80807348 0.81284848]
mean =  0.8120169871871848
variance =  1.2146670158113766e-05
score on the learning data (accuracy) =  0.8717281975082867

=== bernoulliNB ===
scores =  [0.7898764  0.77943071 0.79187106 0.78949212 0.78558937 0.78451493
 0.78253707 0.78888759 0.79379262 0.79175934]
mean =  0.7877751186883548
variance =  1.8993468686875977e-05
score on the learning data (accuracy) =  0.9033175220025146

=== multinomialNB ===
scores =  [0.81170059 0.80886303 0.80847288 0.81018826 0.80185656 0.80566395
 0.80834915 0.81199153 0.80414439 0.81140548]
mean =  0.8082635813201623
variance =  1.0447278063494593e-05
score on the learning data (accuracy) =  0.8983740998971311



## Fine tune the model

In [16]:
from sklearn.model_selection import GridSearchCV

grid_search_pipeline = Pipeline([
    ('text_pre_processing', TextPreProc()),
    ('vectorizer', TfidfVectorizer()),
    ('model', MultinomialNB()),
])

params = [
    {
        'text_pre_processing__use_mention': [True, False],
        'vectorizer__max_features': [1000, 2000, 5000, 10000, 20000, None],
        'vectorizer__ngram_range': [(1,1), (1,2)],
    },
]
grid_search = GridSearchCV(grid_search_pipeline, params, cv=5, scoring='f1')
grid_search.fit(learn_data, sentiments_learning)
print(grid_search.best_params_)

{'text_pre_processing__use_mention': True, 'vectorizer__max_features': None, 'vectorizer__ngram_range': (1, 2)}


In [17]:
grid_search

GridSearchCV(cv=5, error_score='raise-deprecating',
             estimator=Pipeline(memory=None,
                                steps=[('text_pre_processing',
                                        TextPreProc(use_mention=False)),
                                       ('vectorizer',
                                        TfidfVectorizer(analyzer='word',
                                                        binary=False,
                                                        decode_error='strict',
                                                        dtype=<class 'numpy.float64'>,
                                                        encoding='utf-8',
                                                        input='content',
                                                        lowercase=True,
                                                        max_df=1.0,
                                                        max_features=None,
                                          

## Test

In [18]:
mnb.fit(learning_data, sentiments_learning)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [19]:
testing_data = pipeline.transform(test_data)
mnb.score(testing_data, sentiments_test)

0.7551755175517552

In [20]:
# Predecting on the test.csv
sub_data = pd.read_csv("test.csv", encoding='ISO-8859-1')
sub_learning = pipeline.transform(sub_data.SentimentText)
sub = pd.DataFrame(sub_data.ItemID, columns=("ItemID", "Sentiment"))
sub["Sentiment"] = mnb.predict(sub_learning)
print(sub)

        ItemID  Sentiment
0            1          0
1            2          0
2            3          1
3            4          0
4            5          0
...        ...        ...
299984  299996          1
299985  299997          1
299986  299998          1
299987  299999          1
299988  300000          1

[299989 rows x 2 columns]


### Test your tweet

In [21]:
# Just run it
model = MultinomialNB()
model.fit(learning_data, sentiments_learning)
tweet = pd.Series([input(),])
tweet = pipeline.transform(tweet)
proba = model.predict_proba(tweet)[0]
print("The probability that this tweet is sad is:", proba[0])
print("The probability that this tweet is happy is:", proba[1])

I'm so glad to meet you
The probability that this tweet is sad is: 0.059035898277383286
The probability that this tweet is happy is: 0.940964101722616
