# import needed libraries

In [3]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

import spacy

# explore the data

In [14]:
columns = ['id', 'company', 'label', 'text']
df_train=pd.read_csv("/kaggle/input/twitter-entity-sentiment-analysis/twitter_training.csv",names=columns)
df_train.head()

Unnamed: 0,id,company,label,text
0,2401,Borderlands,Positive,im getting on borderlands and i will murder yo...
1,2401,Borderlands,Positive,I am coming to the borders and I will kill you...
2,2401,Borderlands,Positive,im getting on borderlands and i will kill you ...
3,2401,Borderlands,Positive,im coming on borderlands and i will murder you...
4,2401,Borderlands,Positive,im getting on borderlands 2 and i will murder ...


# extract meta data that is useful for our project

In [15]:
df_train.drop(["company","id"],axis=1,inplace=True)


In [16]:
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 74682 entries, 0 to 74681
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   label   74682 non-null  object
 1   text    73996 non-null  object
dtypes: object(2)
memory usage: 1.1+ MB


# preprocessing

In [18]:
df_train.isna().sum()

label      0
text     686
dtype: int64

delete na values

In [19]:
df_train.dropna(inplace=True)

In [20]:
df_train.isna().sum()

label    0
text     0
dtype: int64

In [21]:
nlp=spacy.load("en_core_web_sm")

# initialize preprocess function

In [22]:
def preprocess(text):
    doc=nlp(text)
    filtered_token=list()
    for token in doc:
        if token.is_stop or token.is_punct:
            continue
        filtered_token.append(token.lemma_)
    return " ".join(filtered_token)    

# showing sample after preprocessing

In [26]:
txt = df_train['text'][0]
print(txt)
process = preprocess(txt)
print(process)

im getting on borderlands and i will murder you all ,
m get borderland murder


# apply function to all dataframe

In [28]:
df_train['preprocessed_text'] = df_train['text'].apply(preprocess)
df_train

Unnamed: 0,label,text,preprocessed_text
0,Positive,im getting on borderlands and i will murder yo...,m get borderland murder
1,Positive,I am coming to the borders and I will kill you...,come border kill
2,Positive,im getting on borderlands and i will kill you ...,m get borderland kill
3,Positive,im coming on borderlands and i will murder you...,m come borderland murder
4,Positive,im getting on borderlands 2 and i will murder ...,m get borderland 2 murder
...,...,...,...
74677,Positive,Just realized that the Windows partition of my...,realize Windows partition Mac like 6 year Nvid...
74678,Positive,Just realized that my Mac window partition is ...,realize Mac window partition 6 year Nvidia dri...
74679,Positive,Just realized the windows partition of my Mac ...,realize window partition Mac 6 year Nvidia dri...
74680,Positive,Just realized between the windows partition of...,realize window partition Mac like 6 year Nvidi...


# encoding label column

In [30]:
le_model = LabelEncoder()
df_train['label'] = le_model.fit_transform(df_train['label'])
df_train.head()

Unnamed: 0,label,text,preprocessed_text
0,3,im getting on borderlands and i will murder yo...,m get borderland murder
1,3,I am coming to the borders and I will kill you...,come border kill
2,3,im getting on borderlands and i will kill you ...,m get borderland kill
3,3,im coming on borderlands and i will murder you...,m come borderland murder
4,3,im getting on borderlands 2 and i will murder ...,m get borderland 2 murder


# split the data into train and test

In [33]:
x_train, x_test, y_train, y_test = train_test_split(df_train['preprocessed_text'], df_train['label'], 
                                                    test_size=0.2, random_state=42, stratify=df_train['label'])

In [34]:
print("Shape of X_train: ", x_train.shape)
print("Shape of X_test: ", x_test.shape)
print("Shape of Y_train: ", y_train.shape)
print("Shape of Y_test: ", y_test.shape)

Shape of X_train:  (59196,)
Shape of X_test:  (14800,)
Shape of Y_train:  (59196,)
Shape of Y_test:  (14800,)


# machine learing model

In [36]:
clf = Pipeline([
    ('vectorizer_tri_grams', TfidfVectorizer()),
    ('random_forest', (RandomForestClassifier()))         
])

In [37]:
clf.fit(x_train, y_train)


# evaluate the model

In [38]:
y_pred = clf.predict(x_test)

In [39]:
print(accuracy_score(y_test, y_pred))

0.910472972972973


In [40]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.96      0.86      0.91      2575
           1       0.92      0.93      0.93      4472
           2       0.93      0.89      0.91      3622
           3       0.85      0.94      0.89      4131

    accuracy                           0.91     14800
   macro avg       0.92      0.90      0.91     14800
weighted avg       0.91      0.91      0.91     14800



# test the model

In [41]:
test_df = pd.read_csv('/kaggle/input/twitter-entity-sentiment-analysis/twitter_validation.csv', names= columns)
test_df.head()

Unnamed: 0,id,company,label,text
0,3364,Facebook,Irrelevant,I mentioned on Facebook that I was struggling ...
1,352,Amazon,Neutral,BBC News - Amazon boss Jeff Bezos rejects clai...
2,8312,Microsoft,Negative,@Microsoft Why do I pay for WORD when it funct...
3,4371,CS-GO,Negative,"CSGO matchmaking is so full of closet hacking,..."
4,4433,Google,Neutral,Now the President is slapping Americans in the...


In [57]:
test_text = test_df['text'][15]
print(f"{test_text} ===> {test_df['label'][15]}")

Congrats to the NVIDIA NeMo team for the 1.0.0 release candidate!
Really excited to see NeMo embracing Hydra as the way to take control over the configuration madness that is machine learning! :) ===> Positive


In [58]:
test_text_processed = [preprocess(test_text)]
test_text_processed

['congrat NVIDIA NeMo team 1.0.0 release candidate \n excited NeMo embrace Hydra way control configuration madness machine learning']

In [59]:
test_text = clf.predict(test_text_processed)

In [60]:
classes = ['Irrelevant', 'Natural', 'Negative', 'Positive']

print(f"True Label: {test_df['label'][15]}")
print(f'Predict Label: {classes[test_text[0]]}')

True Label: Positive
Predict Label: Positive
