In [138]:
import pandas as pd
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize

import nltk
nltk.download('stopwords')



[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

DATA PROCESSING


In [121]:
tweets= pd.read_csv('/content/Tweets.csv')

In [122]:
#NUMBER OF ROWS AND COLUMNS
#ROWS= TOTAL NUMBER OF TWEETS
tweets.shape

(14640, 15)

In [123]:
tweets.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14640 entries, 0 to 14639
Data columns (total 15 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   tweet_id                      14640 non-null  int64  
 1   airline_sentiment             14640 non-null  object 
 2   airline_sentiment_confidence  14640 non-null  float64
 3   negativereason                9178 non-null   object 
 4   negativereason_confidence     10522 non-null  float64
 5   airline                       14640 non-null  object 
 6   airline_sentiment_gold        40 non-null     object 
 7   name                          14640 non-null  object 
 8   negativereason_gold           32 non-null     object 
 9   retweet_count                 14640 non-null  int64  
 10  text                          14640 non-null  object 
 11  tweet_coord                   1019 non-null   object 
 12  tweet_created                 14640 non-null  object 
 13  t

In [124]:
tweets.describe()

Unnamed: 0,tweet_id,airline_sentiment_confidence,negativereason_confidence,retweet_count
count,14640.0,14640.0,10522.0,14640.0
mean,5.692184e+17,0.900169,0.638298,0.08265
std,779111200000000.0,0.16283,0.33044,0.745778
min,5.675883e+17,0.335,0.0,0.0
25%,5.685592e+17,0.6923,0.3606,0.0
50%,5.694779e+17,1.0,0.6706,0.0
75%,5.698905e+17,1.0,1.0,0.0
max,5.703106e+17,1.0,1.0,44.0


In [134]:
 import nltk
 nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [125]:
tweets.dtypes

tweet_id                          int64
airline_sentiment                object
airline_sentiment_confidence    float64
negativereason                   object
negativereason_confidence       float64
airline                          object
airline_sentiment_gold           object
name                             object
negativereason_gold              object
retweet_count                     int64
text                             object
tweet_coord                      object
tweet_created                    object
tweet_location                   object
user_timezone                    object
dtype: object

In [126]:
#PRINTING FIRST FIVE ROWS FROM DATASET
tweets.head()

Unnamed: 0,tweet_id,airline_sentiment,airline_sentiment_confidence,negativereason,negativereason_confidence,airline,airline_sentiment_gold,name,negativereason_gold,retweet_count,text,tweet_coord,tweet_created,tweet_location,user_timezone
0,570306133677760513,neutral,1.0,,,Virgin America,,cairdin,,0,@VirginAmerica What @dhepburn said.,,2015-02-24 11:35:52 -0800,,Eastern Time (US & Canada)
1,570301130888122368,positive,0.3486,,0.0,Virgin America,,jnardino,,0,@VirginAmerica plus you've added commercials t...,,2015-02-24 11:15:59 -0800,,Pacific Time (US & Canada)
2,570301083672813571,neutral,0.6837,,,Virgin America,,yvonnalynn,,0,@VirginAmerica I didn't today... Must mean I n...,,2015-02-24 11:15:48 -0800,Lets Play,Central Time (US & Canada)
3,570301031407624196,negative,1.0,Bad Flight,0.7033,Virgin America,,jnardino,,0,@VirginAmerica it's really aggressive to blast...,,2015-02-24 11:15:36 -0800,,Pacific Time (US & Canada)
4,570300817074462722,negative,1.0,Can't Tell,1.0,Virgin America,,jnardino,,0,@VirginAmerica and it's a really big bad thing...,,2015-02-24 11:14:45 -0800,,Pacific Time (US & Canada)


In [128]:
stop_words = set(stopwords.words('english'))
ps = PorterStemmer()


In [131]:
def preprocess_text(text):
    words = word_tokenize(text)
    words = [ps.stem(word) for word in words if word.isalpha() and word not in stop_words]
    return ' '.join(words)


In [132]:
 import nltk
 nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [137]:
tweets['processed_text'] = tweets['text'].apply(preprocess_text)

SPLITTING THE DATA

In [140]:
X_train, X_test, y_train, y_test = train_test_split(tweets['processed_text'], tweets['airline_sentiment'], test_size=0.2, random_state=42)

In [141]:
model = make_pipeline(
    CountVectorizer(),
    MultinomialNB())

In [142]:
model.fit(X_train, y_train)

In [143]:
#testing the model
y_pred = model.predict(X_test)

In [144]:
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))
print("Classification Report:")
print(classification_report(y_test, y_pred))

Accuracy: 0.7913251366120219
Confusion Matrix:
[[1784   74   31]
 [ 297  249   34]
 [ 135   40  284]]
Classification Report:
              precision    recall  f1-score   support

    negative       0.81      0.94      0.87      1889
     neutral       0.69      0.43      0.53       580
    positive       0.81      0.62      0.70       459

    accuracy                           0.79      2928
   macro avg       0.77      0.66      0.70      2928
weighted avg       0.78      0.79      0.78      2928



In [146]:
examples = ["The product works as expected and is of high quality"]
example_predictions = model.predict(examples)
print("Example Predictions:", example_predictions)

Example Predictions: ['positive']


In [147]:

examples = ["I love this airline! It's the best.", "Thanks for giving the best experience!!."]
example_predictions = model.predict(examples)
print("Example Predictions:", example_predictions)

Example Predictions: ['positive' 'positive']


In [148]:
examples = ["The staff was friendly, but i didnt like the food!!."]
example_predictions = model.predict(examples)
print("Example Predictions:", example_predictions)

Example Predictions: ['negative']


In [149]:
examples = ["I love this airline! It's the best!!."]
example_predictions = model.predict(examples)
print("Example Predictions:", example_predictions)

Example Predictions: ['positive']


In [150]:
examples = ["The cabin crew provided excellent service throughout the flight!!."]
example_predictions = model.predict(examples)
print("Example Predictions:", example_predictions)

Example Predictions: ['positive']
