# Modeling and Analysis: Implementation of NLP and ML Approach for Sentiment Detection on Twitter Tweets

### FETCH DATA FROM TWITTER USING TWITTER API

In [1]:
import os
import tweepy as tw

# Credentials are read from the environment so that no secret is stored in the
# notebook (or in its version history). The key/secret pair that used to be
# hardcoded here must be considered leaked and should be revoked.
my_api_key = os.environ.get("TWITTER_API_KEY")
my_api_secret = os.environ.get("TWITTER_API_SECRET")
if not my_api_key or not my_api_secret:
    raise RuntimeError(
        "Set the TWITTER_API_KEY and TWITTER_API_SECRET environment variables "
        "before running this notebook."
    )
auth = tw.OAuthHandler(my_api_key, my_api_secret)
# wait_on_rate_limit=True makes tweepy sleep instead of failing when the rate limit is hit.
api = tw.API(auth, wait_on_rate_limit=True)

### SEARCH TOPIC FOR THE SENTIMENT EXPERIMENT

In [2]:
# Query string passed to the Twitter search cursor below.
search_query = "@bongbongmarcos"

### SET HOW MANY TWEETS WILL BE COLLECTED

In [None]:
# The cursor gets its own name: the original `for tweet in tweet` rebound the
# cursor variable to each result while iterating over it.
tweet_cursor = tw.Cursor(api.search, q=search_query, lang="en", since="2021-01-01").items(5000)
# Materialise the (at most 5000) results once so later cells can re-iterate them.
tweet_copy = list(tweet_cursor)
print("Total Tweets fetched:", len(tweet_copy))

### DATAFRAME

In [None]:
%%time
import pandas as pd
# Build one record per tweet and create the frame once at the end.
# Appending to a DataFrame inside the loop copies the whole frame on every
# iteration and relies on DataFrame.append, which pandas 2.0 removed.
records = []
failed_lookups = 0
for tweet in tweet_copy:
    hashtags = [tag["text"] for tag in tweet.entities.get("hashtags", [])]
    try:
        # The search result is truncated; fetch the extended status for the full text.
        text = api.get_status(id=tweet.id, tweet_mode='extended').full_text
    except Exception:
        # Previously a bare `except: pass` left `text` undefined (first tweet) or
        # silently reused the previous tweet's text. Fall back to this tweet's own
        # (possibly truncated) text instead.
        failed_lookups += 1
        text = getattr(tweet, "text", "")
    records.append({
        'user_name': tweet.user.name,
        'date': tweet.created_at,
        'text': text,
        'hashtags': hashtags or None,
        'source': tweet.source,
    })
if failed_lookups:
    print("Extended-text lookups that failed and used the fallback text:", failed_lookups)
data = pd.DataFrame(records, columns=['user_name', 'date', 'text', 'hashtags', 'source'])
data.head(7)

### DESCRIBING DATA

DATA TYPES

In [None]:
# Show the dtype of every column of the collected tweets frame.
data.dtypes

COLUMNS

In [None]:
# Show the column labels of the collected tweets frame.
data.columns

INDEX

In [None]:
# Show the row index of the collected tweets frame.
data.index

SHAPE

In [None]:
# (rows, columns) of the collected tweets frame.
data.shape

SUMMARY

In [None]:
# Print the pandas summary (columns, non-null counts, dtypes, memory usage).
data.info()

DROP COLUMN HASHTAGS

In [None]:
# errors='ignore' keeps the cell idempotent: re-running it after the column is
# already gone no longer raises KeyError.
data = data.drop(columns=['hashtags'], errors='ignore')

CHECK SUMMARY

In [None]:
# Print the summary again to confirm the remaining columns.
data.info()

DATAFRAME

In [None]:
# Preview the first 7 rows.
data.head(7)

### BASIC TEXT PREPROCESSING

SET TEXT LOWERCASE

In [None]:
# Let pandas display up to 200 characters per cell so tweet text is readable.
pd.set_option('display.max_colwidth', 200)
# 'lc' holds the lower-cased copy of the raw tweet text.
data['lc'] = data['text'].str.lower()
# Compare the original and lower-cased text for the first 7 rows.
data[['text', 'lc']].head(7)

REMOVE UNWANTED SPACES

In [None]:
import re
def _collapse_whitespace(text):
    """Replace every run of whitespace with one space and trim both ends."""
    squeezed = re.sub(r'\s+', ' ', text)
    return squeezed.strip()

# 'us' = text with unwanted spaces removed.
data['us'] = data['lc'].apply(_collapse_whitespace)
data[['lc', 'us']].head(7)

REMOVE URL

In [None]:
def _strip_urls(text):
    """Delete every token that starts with "http" (up to the next whitespace)."""
    return re.sub(r"http\S+|https\S+", "", text, flags = re.MULTILINE)

# 'url' = text with links removed.
data['url'] = data['us'].apply(_strip_urls)
data[['us', 'url']].head(7)

REMOVE HTML ENCODINGS

In [None]:
def _strip_tags(text):
    """Delete every `<...>` span (shortest match that contains no other '<')."""
    return re.sub('<[^<]+?>', '', text)

# 'html' = text with tag-like spans removed.
data['html'] = data['url'].apply(_strip_tags)
data[['url', 'html']].head(7)

REMOVE @USERNAMES

In [None]:
# Twitter handles may contain underscores. The previous pattern
# '@[A-Za-z0-9]+' stopped at the first '_', so '@juan_delacruz' left
# '_delacruz' behind and it ended up as a word in the cleaned text.
data['un'] = data['html'].apply(lambda x: re.sub(r'@[A-Za-z0-9_]+', '', x))
data[['html', 'un']].head(7)

REMOVE PUNCTUATIONS, NUMBERS, EMOJIS, ETC.

In [None]:
def _letters_only(text):
    """Replace every character that is not an ASCII letter with a space."""
    return re.sub('[^A-Za-z]', ' ', text)

# 'pne' = text without punctuation, numbers, emojis, etc.
data['pne'] = data['un'].apply(_letters_only)
data[['un', 'pne']].head(7)

REMOVE WORDS WITH 3 OR FEWER CHARACTERS

In [None]:
def _keep_long_words(text):
    """Keep only the whitespace-separated words longer than 3 characters."""
    return ' '.join(word for word in text.split() if len(word) > 3)

# 'two' = text with the short words dropped.
data['two'] = data['pne'].apply(_keep_long_words)
data[['pne', 'two']].head(7)

### INTERMEDIATE TEXT PREPROCESSING

REMOVE STOPWORDS

In [None]:
from nltk.corpus import stopwords
# A set gives O(1) membership tests; the stop-word filter checks every token of
# every tweet against this collection.
stop = set(stopwords.words('english'))
# Sorted only so the printed list is stable and easy to scan.
print(sorted(stop))

In [None]:
from nltk.tokenize import word_tokenize
def def_sw(texts):
    """Tokenize `texts` and return it without the tokens found in `stop`.

    The surviving tokens are joined back together with single spaces.
    """
    kept_tokens = []
    for token in word_tokenize(texts):
        if token in stop:
            continue
        kept_tokens.append(token)
    return " ".join(kept_tokens)

In [None]:
# 'sw' = text after stop-word removal (def_sw is applied to each row directly).
data['sw'] = data['two'].apply(def_sw)
data[['two', 'sw']].head(7)

LEMMATIZING

In [None]:
from nltk.stem import WordNetLemmatizer
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
def def_lt(tweets):
    """Tokenize `tweets` and lemmatize every token with WordNet.

    Each token is lemmatized four times in sequence, as an adjective ('a'),
    noun ('n'), adverb ('r') and verb ('v'); the output of one pass feeds the
    next. The result is the space-joined lemmas.
    """
    lemmatizer = WordNetLemmatizer()
    lemma_words = word_tokenize(tweets)
    for part_of_speech in ('a', 'n', 'r', 'v'):
        lemma_words = [lemmatizer.lemmatize(word, pos=part_of_speech) for word in lemma_words]
    return " ".join(lemma_words)

In [None]:
# 'lt' = lemmatized text (def_lt is applied to each row directly).
data['lt'] = data['sw'].apply(def_lt)
data[['sw', 'lt']].head(7)

REMOVE DUPLICATE WORDS

In [None]:
from collections import OrderedDict
def _unique_words(text):
    """Drop repeated words from `text`, keeping the first occurrence of each in order."""
    return ' '.join(OrderedDict.fromkeys(text.split()))

# 'dc' = lemmatized text with duplicate words inside the same tweet removed.
data['dc'] = data['lt'].apply(_unique_words)
data[['lt', 'dc']].head(7)

CLEANED TEXT

In [None]:
# The de-duplicated column is the final cleaned text used by the later cells.
data['cleaned_text'] =  data['dc']

### SENTIMENT ANALYSIS

VADER

In [None]:
from nltk.sentiment.vader import SentimentIntensityAnalyzer
analyzer = SentimentIntensityAnalyzer()
# Score each tweet exactly once. The previous version called polarity_scores
# four times per row (once per output column).
score_columns = ['compound', 'neg', 'neu', 'pos']
vader_scores = pd.DataFrame(
    [analyzer.polarity_scores(v) for v in data['lt']],
    index=data.index,
    columns=score_columns,
)
# Scores are stored multiplied by 100, as before.
for score_column in score_columns:
    data[score_column] = vader_scores[score_column] * 100
data[['cleaned_text', 'neg', 'neu', 'pos', 'compound']].round(2).head(7)

SENTIMENT LABELED

In [None]:
def Analysis(score, threshold=5.0):
    """Map a compound sentiment score to a class label.

    The notebook stores VADER's compound score multiplied by 100, so the
    conventional +/-0.05 cut-off corresponds to +/-5 on that scale. The former
    hard-coded 0.05 was applied to the scaled value, which labelled almost
    every non-zero score as positive or negative.

    Parameters
    ----------
    score : float
        Compound score on the same scale as `threshold`.
    threshold : float, default 5.0
        Absolute cut-off. Pass 0.05 for unscaled VADER scores.

    Returns
    -------
    int
        1 if score >= threshold (positive), 0 if score <= -threshold
        (negative), otherwise 2 (neutral).
    """
    if score >= threshold:
        return 1
    elif score <= -threshold:
        return 0
    else:
        return 2
# 'st' = label returned by Analysis for each tweet's stored compound score.
data["st"] = data["compound"].apply(Analysis)
# Preview the cleaned text next to its label.
data[['cleaned_text', 'st']].head(7)

SENTIMENT ANALYSIS

In [None]:
import seaborn as sns
import warnings
# NOTE(review): this silences every warning for the rest of the session,
# including deprecation notices from later cells. Consider narrowing it.
warnings.filterwarnings('ignore')
# Pass the series by keyword: newer seaborn releases deprecate, then reinterpret,
# a positional first argument.
sns.countplot(x=data['st'])

In [None]:
# Number of tweets per label value.
data['st'].value_counts()

MOST FREQUENT WORDS

In [None]:
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import CountVectorizer
# Bar colour for the plot.
cbs = '#75f0bd'
# Count every vocabulary term over all cleaned tweets (English stop words excluded).
cv = CountVectorizer(stop_words = 'english')
words = cv.fit_transform(data.cleaned_text)
# Column sums give the total number of occurrences of each term.
sum_words = words.sum(axis=0)
words_freq = sorted(
    ((term, sum_words[0, column]) for term, column in cv.vocabulary_.items()),
    key = lambda pair: pair[1],
    reverse = True,
)
frequency = pd.DataFrame(words_freq, columns=['word', 'freq'])
# Plot the 30 most frequent terms.
top_30 = frequency.head(30)
top_30.plot(x='word', y='freq', kind='bar', figsize=(15, 7), color = cbs)
plt.title("Most Frequently Occuring Words - Top 30")
plt.show()

### BY WORDS

In [None]:
# Take an explicit copy: pd.DataFrame(frequency) does not guarantee independent
# data, and the following cells add columns to words_df that must not leak into
# `frequency`.
words_df = frequency.copy()

VADER

In [None]:
from nltk.sentiment.vader import SentimentIntensityAnalyzer
analyzer = SentimentIntensityAnalyzer()
# Score each word exactly once. The previous version called polarity_scores
# four times per word (once per output column).
score_columns = ['compound', 'neg', 'neu', 'pos']
word_scores = pd.DataFrame(
    [analyzer.polarity_scores(v) for v in words_df['word']],
    index=words_df.index,
    columns=score_columns,
)
# Scores are stored multiplied by 100, as before.
for score_column in score_columns:
    words_df[score_column] = word_scores[score_column] * 100
words_df[['word', 'neg', 'neu', 'pos', 'compound']].round(2).head(7)

SENTIMENT LABELED

In [None]:
def tweet_analysis(score, threshold=5.0):
    """Map a compound sentiment score to a class label.

    The notebook stores VADER's compound score multiplied by 100, so the
    conventional +/-0.05 cut-off corresponds to +/-5 on that scale. The former
    hard-coded 0.05 was applied to the scaled value.

    Parameters
    ----------
    score : float
        Compound score on the same scale as `threshold`.
    threshold : float, default 5.0
        Absolute cut-off. Pass 0.05 for unscaled VADER scores.

    Returns
    -------
    int
        1 if score >= threshold (positive), 0 if score <= -threshold
        (negative), otherwise 2 (neutral).
    """
    if score >= threshold:
        return 1
    elif score <= -threshold:
        return 0
    else:
        return 2
# 'labeled' = label returned by tweet_analysis for each word's stored compound score.
words_df["labeled"] = words_df["compound"].apply(tweet_analysis)
# Preview each word next to its label.
words_df[['word', 'labeled']].head(7)

SENTIMENT ANALYSIS

In [None]:
import seaborn as sns
import warnings
# NOTE(review): this silences every warning for the rest of the session,
# including deprecation notices from later cells. Consider narrowing it.
warnings.filterwarnings('ignore')
# Pass the series by keyword: newer seaborn releases deprecate, then reinterpret,
# a positional first argument.
sns.countplot(x=words_df['labeled'])

In [None]:
# Number of words per label value.
words_df['labeled'].value_counts()

FIRST 10 LARGEST POSITIVE VALUES

In [None]:
# The ten rows with the highest 'pos' score, renumbered from 0.
top_positive = words_df.nlargest(10, 'pos')
top_positive[['word', 'pos', 'compound']].reset_index(drop=True)

FIRST 10 LARGEST NEGATIVE VALUES

In [None]:
# The ten rows with the highest 'neg' score, renumbered from 0.
top_negative = words_df.nlargest(10, 'neg')
top_negative[['word', 'neg', 'compound']].reset_index(drop=True)

WORDCLOUD POSITIVE

In [None]:
import matplotlib.pyplot as plt
from wordcloud import WordCloud 
from matplotlib import colors
# Three-colour palette used to draw the words.
color_list = ['#262D35', '#ffcc00', '#000002']
colormap = colors.ListedColormap(color_list)
# All words labelled 1 (positive), joined into the single string WordCloud expects.
words = ' '.join(words_df.loc[words_df['labeled'] == 1, 'word'])
if words.strip():
    wordCloud = WordCloud(background_color='white', colormap=colormap, mode="RGB", width=2000, height=1000).generate(words)
    plt.figure(figsize=(20, 10))
    plt.imshow(wordCloud)
    plt.title("Wordcloud Positive")
    plt.show()
else:
    # WordCloud.generate raises ValueError when it is given no words at all.
    print("No positive words to plot.")

### WORDCLOUD NEGATIVE

In [None]:
import matplotlib.pyplot as plt
from wordcloud import WordCloud 
from matplotlib import colors
from nltk.corpus import stopwords
# Three-colour palette used to draw the words.
color_list = ['#262D35', '#ffcc00', '#000002']
colormap = colors.ListedColormap(color_list)
# All words labelled 0 (negative), joined into the single string WordCloud expects.
words = ' '.join(words_df.loc[words_df['labeled'] == 0, 'word'])
if words.strip():
    wordCloud = WordCloud(background_color='white', colormap=colormap, mode="RGB", width=2000, height=1000).generate(words)
    plt.figure(figsize=(20, 10))
    plt.imshow(wordCloud)
    plt.title("Wordcloud Negative")
    plt.show()
else:
    # WordCloud.generate raises ValueError when it is given no words at all.
    print("No negative words to plot.")

TEXTBLOB

In [None]:
from textblob import TextBlob
def get_sub(Tweets):
    """Return the TextBlob subjectivity score of `Tweets`."""
    sentiment = TextBlob(Tweets).sentiment
    return sentiment.subjectivity
def get_pol(Tweets):
    """Return the TextBlob polarity score of `Tweets`."""
    sentiment = TextBlob(Tweets).sentiment
    return sentiment.polarity
# Score every word with TextBlob: one column for subjectivity, one for polarity.
words_df['subjectivity'] = words_df['word'].apply(get_sub)
words_df['polarity'] = words_df['word'].apply(get_pol)
# Preview the TextBlob scores next to the existing 'labeled' column.
words_df[['word', 'subjectivity', 'polarity','labeled']].head(7)#.reset_index(drop=True)

POLARITY VS SUBJECTIVITY

In [None]:
# Marker colour for the plot.
cbs = '#75f0bd'
import numpy
plt.figure(figsize=(8,6))
# Draw all points in a single call. The previous loop issued one plt.scatter per
# row (one artist per word) and indexed rows by label with `[i]`, which only
# works while the index is 0..n-1.
plt.scatter(words_df['polarity'], words_df['subjectivity'], color=cbs)
plt.title('Scatter plot')
plt.xlabel('Polarity')
plt.ylabel('Subjectivity')
plt.show()

### MACHINE LEARNING APPROACH

In [None]:
import pandas as pd
# Let pandas display up to 200 characters per cell so tweet text is readable.
pd.set_option('display.max_colwidth', 200)
# Load the dataset from 'dataset.csv' in the working directory.
# NOTE(review): later cells read its 'tweet' and 'labeled' columns.
dataset = pd.read_csv('dataset.csv')
# Preview rows 5050 to 5056.
dataset.iloc[5050:5057]

In [None]:
# (rows, columns) of the loaded dataset.
dataset.shape

In [None]:
import re
from bs4 import BeautifulSoup
from nltk.corpus import stopwords
# A set gives O(1) membership tests; preprocess() checks every token of every
# tweet against this collection.
stop_words = set(stopwords.words('english'))
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
def preprocess(tweet):
    """Clean one raw tweet and return its lemmatized tokens joined by spaces.

    Steps, in order: lower-case, collapse whitespace, drop links, strip HTML
    with BeautifulSoup, drop @usernames, replace every non-letter with a space,
    tokenize, remove tokens found in `stop_words`, then lemmatize each token as
    adjective, noun, adverb and verb in sequence.

    Parameters
    ----------
    tweet : str
        Raw tweet text. Any non-string value (for example the NaN pandas uses
        for an empty CSV cell) yields an empty string instead of raising.

    Returns
    -------
    str
        The cleaned text.
    """
    if not isinstance(tweet, str):
        return ""
    tweet = tweet.lower()
    tweet = re.sub(r'\s+', ' ', tweet).strip()
    tweet = re.sub(r"http\S+|https\S+", "", tweet, flags = re.MULTILINE)
    tweet = BeautifulSoup(tweet, 'lxml').get_text()
    # Handles may contain '_'. The old pattern '@[A-Za-z0-9]+' stopped at the
    # first underscore and left the rest of the handle in the text.
    tweet = re.sub(r'@[A-Za-z0-9_]+', '', tweet)
    tweet = re.sub("[^a-zA-Z]", ' ', tweet)
    lemma_words = [word for word in word_tokenize(tweet) if word not in stop_words]
    lemmatizer = WordNetLemmatizer()
    # Same pass order as before: adjective, noun, adverb, verb.
    for part_of_speech in ('a', 'n', 'r', 'v'):
        lemma_words = [lemmatizer.lemmatize(w, pos=part_of_speech) for w in lemma_words]
    return " ".join(lemma_words)

In [None]:
from collections import OrderedDict
def _drop_short_words(text):
    """Keep only the whitespace-separated words longer than 2 characters."""
    return ' '.join(word for word in text.split() if len(word) > 2)

def _unique_words_in_order(text):
    """Drop repeated words, keeping the first occurrence of each in order."""
    return ' '.join(OrderedDict.fromkeys(text.split()))

# 'pr' = preprocessed tweet, 'two' = short words removed, 'dc' = duplicate words removed.
dataset['pr'] = dataset['tweet'].apply(preprocess)
dataset['two'] = dataset['pr'].apply(_drop_short_words)
dataset['dc'] = dataset['two'].apply(_unique_words_in_order)
dataset[['tweet', 'dc']].iloc[5050:5057]

In [None]:
# 'ct' (cleaned text) is the column the vectorizer cell reads.
dataset['ct'] = dataset['dc']

In [None]:
import numpy as np 
import pandas as pd 
import os
import nltk
import re
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
def _normalize(document):
    """Keep letters only, lower-case, and collapse whitespace to single spaces."""
    letters_only = re.sub('[^a-zA-Z]', ' ', document)
    return ' '.join(letters_only.lower().split())

# One normalized document per dataset row.
corpus = [_normalize(document) for document in dataset['ct']]
# Bag-of-words counts (English stop words excluded).
bow_transformer = CountVectorizer(stop_words='english')
messages_bow = bow_transformer.fit_transform(corpus)
print('Length of the Vocabulary: ',len(bow_transformer.vocabulary_))
# Re-weight the counts with TF-IDF; X is the feature matrix.
tfidf_transformer = TfidfTransformer()
X = tfidf_transformer.fit_transform(messages_bow)
# Target labels as a plain list.
y = list(dataset["labeled"])

In [None]:
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# get_feature_names_out() replaces it. Fall back only on older installations.
if hasattr(bow_transformer, "get_feature_names_out"):
    feature_names = bow_transformer.get_feature_names_out()
else:
    feature_names = bow_transformer.get_feature_names()
# One row per vocabulary term with its learned inverse-document-frequency weight.
df_idf = pd.DataFrame(tfidf_transformer.idf_, index=feature_names, columns=["idf_weights"])
df_idf.head(7)

TRAIN AND TEST

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 25% of the rows for testing; random_state=42 fixes the shuffle.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, random_state=42)

IMPLEMENT MULTINOMIAL NAIVE BAYES MODEL

In [None]:
%%time
from sklearn.naive_bayes  import MultinomialNB
from sklearn.metrics import classification_report, confusion_matrix
from sklearn import metrics
# plot_confusion_matrix was removed in scikit-learn 1.2; ConfusionMatrixDisplay
# draws the same plot and is also available in older releases.
from sklearn.metrics import ConfusionMatrixDisplay
classifier = MultinomialNB()
classifier.fit(X_train, y_train)
y_pred_classifier = classifier.predict(X_test)
print('Naive Bayes Results:')
print(classification_report(y_test, y_pred_classifier))
# Store the matrix under its own name. Assigning it to `confusion_matrix`
# replaced the imported function, so re-running the cell raised
# "'numpy.ndarray' object is not callable".
cm = confusion_matrix(y_test, y_pred_classifier, labels=classifier.classes_)
print("\nConfusion Matrix\n", cm)
ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=classifier.classes_).plot(cmap="GnBu");
# Built-in round() works whether accuracy_score returns a NumPy scalar or a
# plain float. The value is now shown with two decimals.
accuracy = metrics.accuracy_score(y_test, y_pred_classifier)
print("Multinomial Naive Bayes model accuracy(in %):", round(accuracy * 100, 2))

TESTING

In [None]:
test_set = ['i love computer science']
# Put the input through the same stages as the training data: preprocess,
# bag-of-words counts, then TF-IDF. The classifier was fitted on TF-IDF
# features; the previous cell fed it raw counts.
new_test = tfidf_transformer.transform(
    bow_transformer.transform([preprocess(text) for text in test_set])
)
classifier.predict(new_test)
# array([0]) = Negative
# array([1]) = Positive

### DETECT FROM TWITTER

In [None]:
# Show the cleaned text of the collected tweet at position 15.
data['cleaned_text'].iloc[15]

In [None]:
# The tweet at position 15, already cleaned by the Twitter preprocessing cells.
test_set = [data['cleaned_text'].iloc[15]]
# Apply the TF-IDF weighting after the bag-of-words counts: the classifier was
# fitted on TF-IDF features; the previous cell fed it raw counts.
new_test = tfidf_transformer.transform(bow_transformer.transform(test_set))
classifier.predict(new_test)