<a href="https://colab.research.google.com/github/elsyanares/Deep-Learning-Computational-Intelligence/blob/main/TaskWeek6.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Nama : Elsya Nareswari

NPM : 2106702535

Original Code : https://www.kaggle.com/code/abdallahwagih/twitter-sentiment-analysis

In [1]:
# Download the Kaggle dataset 'jp797498e/twitter-entity-sentiment-analysis' with
# kagglehub and keep the value returned by dataset_download in a variable
# (presumably the local directory holding the extracted files — TODO confirm).
# NOTE: this notebook environment differs from Kaggle's Python environment, so
# libraries that are available on Kaggle may be missing here.
import kagglehub
jp797498e_twitter_entity_sentiment_analysis_path = kagglehub.dataset_download('jp797498e/twitter-entity-sentiment-analysis')

print('Data source import complete.')


Downloading from https://www.kaggle.com/api/v1/datasets/download/jp797498e/twitter-entity-sentiment-analysis?dataset_version_number=2...


100%|██████████| 1.99M/1.99M [00:00<00:00, 30.1MB/s]

Extracting files...





Data source import complete.


# Import Needed Modules

In [2]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

import spacy

# EDA

In [3]:
# Read "twitter_training.csv" from the dataset directory that kagglehub downloaded
# in the first cell and store it in df. The previous hard-coded
# "/content/twitter_training.csv" only existed after a manual upload, so the
# notebook could not be re-run from a fresh kernel.
# Column names are passed explicitly via names=, so the first CSV row is read as data.
# NOTE(review): the 'country' column shows values such as "Borderlands" in the
# preview, so it looks like a topic/entity rather than a country; the name is kept
# because later cells display it.
columns = ['id','country','Label','Text']
df = pd.read_csv(
    f"{jp797498e_twitter_entity_sentiment_analysis_path}/twitter_training.csv",
    names=columns,
)

# Print the shape of dataframe
print(df.shape)

# Print top 5 rows
df.head(5)

(46295, 4)


Unnamed: 0,id,country,Label,Text
0,2401,Borderlands,Positive,im getting on borderlands and i will murder yo...
1,2401,Borderlands,Positive,I am coming to the borders and I will kill you...
2,2401,Borderlands,Positive,im getting on borderlands and i will kill you ...
3,2401,Borderlands,Positive,im coming on borderlands and i will murder you...
4,2401,Borderlands,Positive,im getting on borderlands 2 and i will murder ...


In [4]:
# Summarise column dtypes and non-null counts to spot missing values
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 46295 entries, 0 to 46294
Data columns (total 4 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   id       46295 non-null  int64 
 1   country  46295 non-null  object
 2   Label    46295 non-null  object
 3   Text     45850 non-null  object
dtypes: int64(1), object(3)
memory usage: 1.4+ MB


In [5]:
# Check how many rows fall into each sentiment class of the 'Label' column
df['Label'].value_counts()

Unnamed: 0_level_0,count
Label,Unnamed: 1_level_1
Positive,13710
Negative,12300
Neutral,11483
Irrelevant,8802


In [6]:
# Preview the rows labelled 0..4: tweet text followed by its sentiment label
for row_label in range(5):
    tweet = df.loc[row_label, 'Text']
    sentiment = df.loc[row_label, 'Label']
    print(f"{row_label + 1}: {tweet} -> {sentiment}")

1: im getting on borderlands and i will murder you all , -> Positive
2: I am coming to the borders and I will kill you all, -> Positive
3: im getting on borderlands and i will kill you all, -> Positive
4: im coming on borderlands and i will murder you all, -> Positive
5: im getting on borderlands 2 and i will murder you me all, -> Positive


# Preprocessing

### Drop nan values

In [7]:
# Remove every row that contains a missing value; inplace=True mutates df itself,
# so the original row labels are kept (the index is not renumbered).
df.dropna(inplace=True)

### Preprocess Function

In [8]:
# Load the small English spaCy model and create the nlp object from it.
# preprocess() only reads token.is_stop, token.is_punct and token.lemma_, so the
# dependency parser and the named-entity recognizer are switched off: they are
# not needed for lemmatization and skipping them makes processing every tweet faster.
nlp = spacy.load("en_core_web_sm", disable=["parser", "ner"])

In [9]:
# utility used to turn a raw tweet into its cleaned, lemmatized form
def preprocess(text):
    """Return `text` reduced to the lemmas of its non-stop-word, non-punctuation tokens.

    The text is run through the notebook-level spaCy pipeline `nlp`. Tokens flagged
    as stop words or as punctuation are discarded; the lemmas of the remaining
    tokens are joined with single spaces, in their original order.
    """
    kept_lemmas = [
        tok.lemma_
        for tok in nlp(text)
        if not (tok.is_stop or tok.is_punct)
    ]
    return " ".join(kept_lemmas)

Apply preprocess function on dataframe

In [11]:
# Clean every tweet one by one and keep the result next to the raw text
df['Preprocessed Text'] = df['Text'].map(preprocess)

In [12]:
# Display the frame to compare the raw 'Text' with the new 'Preprocessed Text' column
df

Unnamed: 0,id,country,Label,Text,Preprocessed Text
0,2401,Borderlands,Positive,im getting on borderlands and i will murder yo...,m get borderland murder
1,2401,Borderlands,Positive,I am coming to the borders and I will kill you...,come border kill
2,2401,Borderlands,Positive,im getting on borderlands and i will kill you ...,m get borderland kill
3,2401,Borderlands,Positive,im coming on borderlands and i will murder you...,m come borderland murder
4,2401,Borderlands,Positive,im getting on borderlands 2 and i will murder ...,m get borderland 2 murder
...,...,...,...,...,...
46290,11944,Verizon,Neutral,The last 3 August’s I have broken my phone. Th...,3 August break phone year different Joseph Ver...
46291,11944,Verizon,Neutral,The last 3 August's I've broken my phone. This...,3 August break phone year different Joseph Ver...
46292,11944,Verizon,Neutral,The last time I broke my phone was on August 3...,time break phone August 3 year different time ...
46293,11944,Verizon,Neutral,The last 3 August’s I have broken my phone. Th...,3 August break phone year different Joseph Ver...


Encoding target column

In [13]:
# Learn the label -> integer mapping from the string labels, then overwrite the
# 'Label' column with the integer codes (le_model keeps the mapping for decoding).
le_model = LabelEncoder().fit(df['Label'])
df['Label'] = le_model.transform(df['Label'])

In [14]:
# Check the first rows again: 'Label' now holds the integer codes
df.head(5)

Unnamed: 0,id,country,Label,Text,Preprocessed Text
0,2401,Borderlands,3,im getting on borderlands and i will murder yo...,m get borderland murder
1,2401,Borderlands,3,I am coming to the borders and I will kill you...,come border kill
2,2401,Borderlands,3,im getting on borderlands and i will kill you ...,m get borderland kill
3,2401,Borderlands,3,im coming on borderlands and i will murder you...,m come borderland murder
4,2401,Borderlands,3,im getting on borderlands 2 and i will murder ...,m get borderland 2 murder


Split data into train and test

In [15]:
# Hold out 20% of the rows for testing; stratify keeps the class proportions equal
# in both parts and random_state makes the split repeatable.
# NOTE(review): rows sharing an 'id' look like near-duplicate variants of one tweet
# (see the first five rows); a purely random split may put variants of the same
# tweet on both sides — consider a group-aware split and confirm.
X_train, X_test, y_train, y_test = train_test_split(
    df['Preprocessed Text'],
    df['Label'],
    test_size=0.2,
    random_state=42,
    stratify=df['Label'],
)

In [16]:
# Report how many samples ended up in each part of the split
for split_name, split in (("X_train", X_train), ("X_test", X_test)):
    print(f"Shape of {split_name}: ", split.shape)

Shape of X_train:  (36680,)
Shape of X_test:  (9170,)


# Machine Learning Model

### Naive Bayes Model

In [17]:
# Create classifier: TF-IDF features fed into a multinomial Naive Bayes model.
# The vectorizer step used to be called 'vectorizer_tri_grams', but TfidfVectorizer()
# is created with its default settings (no ngram_range is passed), so the step is
# named after what it actually is.
clf = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('naive_bayes', MultinomialNB()),
])

In [18]:
# Model training: fit the whole pipeline on the training texts and labels
clf.fit(X_train, y_train)

In [19]:
# Get predictions for the held-out test texts and store them in y_pred
y_pred = clf.predict(X_test)

In [21]:
# Print the accuracy score of the predictions against the true test labels
print(accuracy_score(y_test, y_pred))

0.7583424209378408


In [20]:
# Print the classification report (precision, recall, f1-score and support per
# class); classes are shown as the integer codes produced by the label encoder.
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.95      0.54      0.69      1742
           1       0.73      0.85      0.79      2438
           2       0.86      0.67      0.75      2268
           3       0.68      0.89      0.77      2722

    accuracy                           0.76      9170
   macro avg       0.80      0.74      0.75      9170
weighted avg       0.79      0.76      0.75      9170



Random Forest

In [22]:
# Create classifier: TF-IDF features fed into a random forest.
# - The steps are named after what they contain (the classifier step was previously
#   called 'naive_bayes', left over from the Naive Bayes cell).
# - random_state is fixed (same seed as the train/test split) so the forest, and
#   therefore the reported scores, are reproducible from run to run.
# Note: this rebinds clf, replacing the Naive Bayes pipeline.
clf = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('random_forest', RandomForestClassifier(random_state=42)),
])

In [23]:
# Model training: fit the pipeline currently bound to clf on the training data
clf.fit(X_train, y_train)

In [24]:
# Get the predictions for X_test and store them in y_pred
# (this overwrites the predictions stored by the earlier prediction cell)
y_pred = clf.predict(X_test)

In [25]:
# Print the accuracy score of the predictions against the true test labels
print(accuracy_score(y_test, y_pred))

0.9146128680479826


In [26]:
# Print the classification report (precision, recall, f1-score and support per class)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.97      0.87      0.92      1742
           1       0.94      0.92      0.93      2438
           2       0.93      0.90      0.92      2268
           3       0.85      0.96      0.90      2722

    accuracy                           0.91      9170
   macro avg       0.92      0.91      0.92      9170
weighted avg       0.92      0.91      0.91      9170



# Test Model

Get text

In [28]:
# Load the raw (un-encoded) tweets again so the true label can be shown as text.
# The file is taken from the directory kagglehub downloaded in the first cell; the
# previous hard-coded "/content/twitter_training.csv" only existed after a manual
# upload, so the cell failed on a fresh kernel.
# NOTE(review): this is still the training file, so the sample picked below may have
# been seen during training; if the dataset ships a separate validation CSV,
# prefer it here — confirm the file name before switching.
test_df = pd.read_csv(
    f"{jp797498e_twitter_entity_sentiment_analysis_path}/twitter_training.csv",
    names=columns,
)
test_df.head()

Unnamed: 0,id,country,Label,Text
0,2401,Borderlands,Positive,im getting on borderlands and i will murder yo...
1,2401,Borderlands,Positive,I am coming to the borders and I will kill you...
2,2401,Borderlands,Positive,im getting on borderlands and i will kill you ...
3,2401,Borderlands,Positive,im coming on borderlands and i will murder you...
4,2401,Borderlands,Positive,im getting on borderlands 2 and i will murder ...


In [29]:
# Take the row labelled 10 as the demo sample and show it with its true label
test_text = test_df.loc[10, 'Text']
print(f"{test_text} ===> {test_df.loc[10, 'Label']}")

2010 So I spent a few hours making something for fun. . . If you don't know I am a HUGE RhandlerR fan and Maya is one of my favorite characters. So I decided to make myself a wallpaper for my PC. . Here is the original image versus the creation I made :) Enjoy! pic.twitter.com/mLsI5wf9Jg ===> Positive


Apply preprocess

In [30]:
# Clean the sample with the same preprocess function used for the training data and
# wrap it in a one-element list, which is what gets passed to clf.predict afterwards
test_text_processed = [preprocess(test_text)]
test_text_processed

['2010 spend hour make fun know huge RhandlerR fan Maya favorite character decide wallpaper pc original image versus creation enjoy pic.twitter.com/mlsi5wf9jg']

Get Prediction

In [31]:
# Predict the encoded label for the single preprocessed sample.
# NOTE(review): this rebinds test_text from the raw tweet string to the prediction
# result, so re-running the preprocessing cell after this one would fail.
test_text = clf.predict(test_text_processed)

Output

In [32]:
# Decode the predicted integer with the classes learned by the fitted LabelEncoder
# instead of a hand-written list. LabelEncoder stores its classes in sorted order
# (Irrelevant, Negative, Neutral, Positive), so the former hard-coded list
# ['Irrelevant', 'Natural', 'Negative', 'Positive'] mapped codes 1 and 2 to the
# wrong labels and misspelled "Neutral".
classes = list(le_model.classes_)

print(f"True Label: {test_df['Label'][10]}")
print(f'Predict Label: {classes[test_text[0]]}')

True Label: Positive
Predict Label: Positive


Irrelevant : 0
Negative : 1
Neutral : 2
Positive : 3