# Sentiment Analysis using Python
#### [Tutorials Link]("https://techvidvan.com/tutorials/python-sentiment-analysis/#:~:text=Python%20sentiment%20analysis%20is%20a,in%20a%20piece%20of%20text.")

In [1]:
import pandas as pd

In [2]:
df = pd.read_csv("Tweets.csv") # Load datas


In [3]:
review_df = df[['text','airline_sentiment']] # Get our Neaded Colums
print(review_df.shape)
review_df.head(5)

(14640, 2)


Unnamed: 0,text,airline_sentiment
0,@VirginAmerica What @dhepburn said.,neutral
1,@VirginAmerica plus you've added commercials t...,positive
2,@VirginAmerica I didn't today... Must mean I n...,neutral
3,@VirginAmerica it's really aggressive to blast...,negative
4,@VirginAmerica and it's a really big bad thing...,negative


In [4]:
df.columns # Show all Colums names

Index(['tweet_id', 'airline_sentiment', 'airline_sentiment_confidence',
       'negativereason', 'negativereason_confidence', 'airline',
       'airline_sentiment_gold', 'name', 'negativereason_gold',
       'retweet_count', 'text', 'tweet_coord', 'tweet_created',
       'tweet_location', 'user_timezone'],
      dtype='object')

In [5]:
review_df

Unnamed: 0,text,airline_sentiment
0,@VirginAmerica What @dhepburn said.,neutral
1,@VirginAmerica plus you've added commercials t...,positive
2,@VirginAmerica I didn't today... Must mean I n...,neutral
3,@VirginAmerica it's really aggressive to blast...,negative
4,@VirginAmerica and it's a really big bad thing...,negative
...,...,...
14635,@AmericanAir thank you we got on a different f...,positive
14636,@AmericanAir leaving over 20 minutes Late Flig...,negative
14637,@AmericanAir Please bring American Airlines to...,neutral
14638,"@AmericanAir you have my money, you change my ...",negative


In [6]:
review_df = review_df[review_df['airline_sentiment'] != 'neutral'] # Remove all Neutral Sentiment Rows
print(review_df.shape)
review_df.head(5)

(11541, 2)


Unnamed: 0,text,airline_sentiment
1,@VirginAmerica plus you've added commercials t...,positive
3,@VirginAmerica it's really aggressive to blast...,negative
4,@VirginAmerica and it's a really big bad thing...,negative
5,@VirginAmerica seriously would pay $30 a fligh...,negative
6,"@VirginAmerica yes, nearly every time I fly VX...",positive


In [7]:
review_df["airline_sentiment"].value_counts() 

negative    9178
positive    2363
Name: airline_sentiment, dtype: int64

### factorize Function 
 ```
 factorize() function can be used to encode strings as numeric values.
 Ex:
     >>> pd.factorize(['b', 'b', 'a', 'c', 'b'])
     >>> (array([0, 0, 1, 2, 0]), array(['b', 'a', 'c'], dtype=object))
 ```

In [8]:
sentiment_label = review_df.airline_sentiment.factorize()
sentiment_label

(array([0, 1, 1, ..., 0, 1, 1]),
 Index(['positive', 'negative'], dtype='object'))

In [9]:
tweet = review_df.text.values
tweet

array(["@VirginAmerica plus you've added commercials to the experience... tacky.",
       '@VirginAmerica it\'s really aggressive to blast obnoxious "entertainment" in your guests\' faces &amp; they have little recourse',
       "@VirginAmerica and it's a really big bad thing about it", ...,
       '@AmericanAir thank you we got on a different flight to Chicago.',
       "@AmericanAir you have my money, you change my flight, and don't answer your phones! Any other suggestions so I can make my commitment??"],
      dtype=object)

##### Tokenizer
```
Tokenization is the process of tokenizing or splitting a string, text into a list of tokens. One can think of token as parts like a word is a token in a sentence, and a sentence is a token in a paragraph.
```
```
Ex:
    >>> text = ["leaving over over leaving"]
    >>> test_doc = tokenizer.texts_to_sequences(text)
    >>> print(test_doc)
    >>> [[491, 99, 99, 491]]
```

In [10]:
from tensorflow.keras.preprocessing.text import Tokenizer

2022-05-25 11:21:28.051544: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2022-05-25 11:21:28.051563: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.


In [11]:
tokenizer = Tokenizer(num_words=5000) # Create a tokenizer Object

In [12]:
tokenizer.fit_on_texts(tweet) # Train the tokenizer Object

In [13]:
encoded_docs = tokenizer.texts_to_sequences(tweet) 

In [14]:
# text = ["leaving over over leaving"]
# test_doc = tokenizer.texts_to_sequences(text)
# print(test_doc)

# from tensorflow.keras.preprocessing.sequence import pad_sequences
# padded_sequence = pad_sequences(test_doc, maxlen=len(max(tweet)))
# print(padded_sequence)
# type(len(max(tweet)))

###### pad_sequences
```pad_sequences is used to ensure that all sequences in a list have the same length. ```
```
Ex:
    >>> text = ["Hello, I am Rafiq"]
    >>> test_doc = tokenizer.texts_to_sequences(text)
    >>> pad_sequences(test_doc, maxlen=10)
    >>> array([[   0,    0,    0,    0,    0,    0,    0, 1100,    3,  106]],dtype=int32)
    
```

In [15]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [16]:
padded_sequence = pad_sequences(encoded_docs, maxlen=len(max(tweet)))
padded_sequence

array([[   0,    0,    0, ...,    1,    2,  177],
       [   0,    0,    0, ...,   25,  482, 2417],
       [   0,    0,    0, ...,  483,   86,   21],
       ...,
       [   0,    0,    0, ...,    8,    1,  480],
       [   0,    0,    0, ..., 1682,   45,  945],
       [   0,    0,    0, ...,  144,   11, 2325]], dtype=int32)

In [17]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM,Dense, Dropout, SpatialDropout1D
from tensorflow.keras.layers import Embedding

In [18]:
vocab_size = len(tokenizer.word_index) + 1

embedding_vector_length = 32
model = Sequential() # Create model instance
model.add(Embedding(vocab_size, embedding_vector_length, input_length=200))
model.add(SpatialDropout1D(0.25))
model.add(LSTM(100, dropout=0.5, recurrent_dropout=0.5))
model.add(Dropout(0.2))
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy',optimizer='adam', metrics=['accuracy'])
print(model.summary())

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 200, 32)           423488    
                                                                 
 spatial_dropout1d (SpatialD  (None, 200, 32)          0         
 ropout1D)                                                       
                                                                 
 lstm (LSTM)                 (None, 100)               53200     
                                                                 
 dropout (Dropout)           (None, 100)               0         
                                                                 
 dense (Dense)               (None, 1)                 101       
                                                                 
Total params: 476,789
Trainable params: 476,789
Non-trainable params: 0
__________________________________________________

2022-05-25 11:21:29.176805: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:936] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-05-25 11:21:29.177592: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2022-05-25 11:21:29.177676: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcublas.so.11'; dlerror: libcublas.so.11: cannot open shared object file: No such file or directory
2022-05-25 11:21:29.177760: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcublasLt.so.11'; dlerror: libcublasLt.so.11: cannot open shared object file: No such file or directory
2022-05-25 11:21:29.177827: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Co

### Train Models

In [19]:
history = model.fit(padded_sequence,sentiment_label[0],validation_split=0.2, epochs=10, batch_size=32)

Epoch 1/10


ValueError: in user code:

    File "/home/nyaz/miniconda3/envs/ml/lib/python3.8/site-packages/keras/engine/training.py", line 1021, in train_function  *
        return step_function(self, iterator)
    File "/home/nyaz/miniconda3/envs/ml/lib/python3.8/site-packages/keras/engine/training.py", line 1010, in step_function  **
        outputs = model.distribute_strategy.run(run_step, args=(data,))
    File "/home/nyaz/miniconda3/envs/ml/lib/python3.8/site-packages/keras/engine/training.py", line 1000, in run_step  **
        outputs = model.train_step(data)
    File "/home/nyaz/miniconda3/envs/ml/lib/python3.8/site-packages/keras/engine/training.py", line 859, in train_step
        y_pred = self(x, training=True)
    File "/home/nyaz/miniconda3/envs/ml/lib/python3.8/site-packages/keras/utils/traceback_utils.py", line 67, in error_handler
        raise e.with_traceback(filtered_tb) from None
    File "/home/nyaz/miniconda3/envs/ml/lib/python3.8/site-packages/keras/engine/input_spec.py", line 264, in assert_input_compatibility
        raise ValueError(f'Input {input_index} of layer "{layer_name}" is '

    ValueError: Input 0 of layer "sequential" is incompatible with the layer: expected shape=(None, 200), found shape=(None, 139)


In [None]:
import matplotlib.pyplot as plt

In [None]:
plt.plot(history.history['accuracy'], label='acc')
plt.plot(history.history['val_accuracy'], label='val_acc')
plt.legend()
plt.show()
plt.savefig("Accuracy plot.jpg")

In [None]:
plt.plot(history.history['loss'], label='loss')
plt.plot(history.history['val_loss'], label='val_loss')
plt.legend()
plt.show()
plt.savefig("Loss plt.jpg")

In [None]:
def predict_sentiment(text):
    tw = tokenizer.texts_to_sequences([text])
    tw = pad_sequences(tw,maxlen=200)
    prediction = int(model.predict(tw).round().item())
    print(model.predict(tw).round().item())
    print(type(model.predict(tw).round()), model.predict(tw))
    print("Predicted label: ", sentiment_label[1][prediction])

In [None]:
test_sentence1 = "I enjoyed my journey on this flight."
predict_sentiment(test_sentence1)

In [None]:
test_sentence2 = "This is the worst flight experience of my life!"
predict_sentiment(test_sentence2)

In [None]:
predict_sentiment("This is not a Bad news")

> ### Store train model and tokenizers data.

In [None]:
import pickle

# save the model to disk
filename = 'finalized_model.sav'
pickle.dump(model, open(filename, 'wb'))

tokenizer_filename = 'finalized_tokenizer.sav'
pickle.dump(tokenizer, open(tokenizer_filename, 'wb'))

In [None]:
!ls

In [None]:
loaded_model = pickle.load(open('finalized_model.sav', 'rb'))
loaded_tokenizer = pickle.load(open('finalized_tokenizer.sav', 'rb'))

text = ["This is a Good news"]

tw = loaded_tokenizer.texts_to_sequences(text)
tw = pad_sequences(tw,maxlen=200)
prediction = int(loaded_model.predict(tw).round().item())
print("Predicted label: ", sentiment_label[1][prediction])