<a href="https://colab.research.google.com/github/chathuri2020/AI-Ml-projects/blob/main/NLP_tweet_.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Import the libraries

In [None]:
import pandas as pd

In [None]:
# Location of the tweets CSV (a Colab-style /content path)
tweets_csv_path = '/content/Tweets.csv'
data = pd.read_csv(tweets_csv_path)

# Peek at the leading rows to see which columns the file provides
print(data.head())

             tweet_id airline_sentiment  airline_sentiment_confidence  \
0  570306133677760513           neutral                        1.0000   
1  570301130888122368          positive                        0.3486   
2  570301083672813571           neutral                        0.6837   
3  570301031407624196          negative                        1.0000   
4  570300817074462722          negative                        1.0000   

  negativereason  negativereason_confidence         airline  \
0            NaN                        NaN  Virgin America   
1            NaN                     0.0000  Virgin America   
2            NaN                        NaN  Virgin America   
3     Bad Flight                     0.7033  Virgin America   
4     Can't Tell                     1.0000  Virgin America   

  airline_sentiment_gold        name negativereason_gold  retweet_count  \
0                    NaN     cairdin                 NaN              0   
1                    NaN    jnar

In [None]:
# Keep only the label ("airline_sentiment") and the raw tweet ("text").
# .copy() makes df an independent frame rather than a slice of `data`, so
# adding columns to it later does not raise SettingWithCopyWarning.
df = data[["airline_sentiment", "text"]].copy()

# Display the first few rows of the dataframe after selecting relevant columns
print(df.head())

  airline_sentiment                                               text
0           neutral                @VirginAmerica What @dhepburn said.
1          positive  @VirginAmerica plus you've added commercials t...
2           neutral  @VirginAmerica I didn't today... Must mean I n...
3          negative  @VirginAmerica it's really aggressive to blast...
4          negative  @VirginAmerica and it's a really big bad thing...


In [None]:
import nltk
import string
import re
from nltk.stem.porter import PorterStemmer
from nltk.corpus import stopwords

# Fetch the NLTK resources used below: the stop-word corpus and the
# 'punkt' tokenizer models (nltk.download skips packages already present)
for nltk_package in ('stopwords', 'punkt'):
    nltk.download(nltk_package)

# Single shared Porter stemmer instance used by clean_text
ps = PorterStemmer()

# Lazily-filled cache so the stop-word corpus is read once, not once per token.
_STOPWORD_CACHE = {}


def _english_stopwords():
    """Return NLTK's English stop words as a frozenset, loading them only once."""
    if 'english' not in _STOPWORD_CACHE:
        _STOPWORD_CACHE['english'] = frozenset(stopwords.words('english'))
    return _STOPWORD_CACHE['english']


def clean_text(text):
    """Normalise a raw tweet into a space-joined string of stemmed tokens.

    Processing steps, in order:
      1. lowercase the text;
      2. strip URLs;
      3. tokenize with ``nltk.word_tokenize``;
      4. drop tokens that are NLTK English stop words;
      5. Porter-stem every remaining token.

    Punctuation tokens produced by the tokenizer (e.g. ``@`` and ``.``) are
    not removed.

    Parameters
    ----------
    text : str
        Raw tweet text.

    Returns
    -------
    str
        The surviving stemmed tokens joined by single spaces.
    """
    text = text.lower()
    text = re.sub(r'http.?://[^\s]+[\s]?', '', text)  # Remove URLs
    # Set membership is O(1); the previous version rebuilt the stop-word
    # list for every token and scanned it linearly.
    stop_words = _english_stopwords()
    kept_tokens = [tok for tok in nltk.word_tokenize(text) if tok not in stop_words]
    return " ".join(ps.stem(tok) for tok in kept_tokens)

# Run every raw tweet through clean_text and store the result in a new
# 'text_cleaned' column
cleaned_tweets = df['text'].apply(clean_text)
df['text_cleaned'] = cleaned_tweets

# Show the first rows, now including the cleaned column
print(df.head())


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


  airline_sentiment                                               text  \
0           neutral                @VirginAmerica What @dhepburn said.   
1          positive  @VirginAmerica plus you've added commercials t...   
2           neutral  @VirginAmerica I didn't today... Must mean I n...   
3          negative  @VirginAmerica it's really aggressive to blast...   
4          negative  @VirginAmerica and it's a really big bad thing...   

                                        text_cleaned  
0                  @ virginamerica @ dhepburn said .  
1  @ virginamerica plu 've ad commerci experi ......  
2  @ virginamerica n't today ... must mean need t...  
3  @ virginamerica 's realli aggress blast obnoxi...  
4            @ virginamerica 's realli big bad thing  


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['text_cleaned'] = df['text'].apply(clean_text)


In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF vectorizer restricted to 3000 features (max_features=3000)
tfidf = TfidfVectorizer(max_features=3000)

# Learn the vocabulary from the cleaned tweets and vectorize them, then
# densify the result to get the feature matrix X.
# NOTE(review): the vectorizer is fit on every row before any train/test
# split, so test rows influence the learned vocabulary/IDF — confirm this
# is acceptable for the reported accuracies.
tfidf_matrix = tfidf.fit_transform(df['text_cleaned'])
X = tfidf_matrix.toarray()

# Labels Y: the sentiment column as an array
Y = df['airline_sentiment'].values


In [None]:
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# Hold out 20% of the rows for testing; the seed fixes the split
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=2)

# Train a Multinomial Naive Bayes classifier
nb_classifier = MultinomialNB()
nb_classifier.fit(X_train, y_train)
y_pred_nb = nb_classifier.predict(X_test)

# Calculate accuracy of Multinomial Naive Bayes classifier
accuracy_nb = accuracy_score(y_test, y_pred_nb)
print(f"Accuracy of Multinomial Naive Bayes Classifier: {accuracy_nb}")

# Train a Random Forest classifier.
# random_state is fixed (same seed as the split) so the forest, and the
# accuracy printed below, is reproducible across reruns. The forest was
# previously unseeded, so the printed accuracy may differ slightly from
# earlier runs.
rf_classifier = RandomForestClassifier(random_state=2)
rf_classifier.fit(X_train, y_train)
y_pred_rf = rf_classifier.predict(X_test)

# Calculate accuracy of Random Forest classifier
accuracy_rf = accuracy_score(y_test, y_pred_rf)
print(f"Accuracy of Random Forest Classifier: {accuracy_rf}")


Accuracy of Multinomial Naive Bayes Classifier: 0.7219945355191257
Accuracy of Random Forest Classifier: 0.7537568306010929


In [None]:
# Per-sentiment summary (count / unique / top / freq) of each text column
by_sentiment = df.groupby('airline_sentiment')
sentiment_counts = by_sentiment.describe()

# Show the summary table
print(sentiment_counts)


                   text         \
                  count unique   
airline_sentiment                
negative           9178   9087   
neutral            3099   3067   
positive           2363   2298   

                                                                           \
                                                                 top freq   
airline_sentiment                                                           
negative           @AmericanAir that's 16+ extra hours of travel ...    2   
neutral                                           @SouthwestAir sent    5   
positive                                            @JetBlue thanks!    5   

                  text_cleaned         \
                         count unique   
airline_sentiment                       
negative                  9178   9083   
neutral                   3099   3025   
positive                  2363   2262   

                                                                           
         

In [None]:
# Build the per-sentiment describe() table for the text columns
grouped_by_sentiment = df.groupby('airline_sentiment')
sentiment_counts = grouped_by_sentiment.describe()

# Show the full summary table
print(sentiment_counts)

# Pull out a single cell: the row for 'negative', under the two-level
# column key ('text', 'count')
text_count_column = ('text', 'count')
count_negative = sentiment_counts.loc['negative', text_count_column]
print("Number of instances with 'negative' sentiment:", count_negative)


                   text         \
                  count unique   
airline_sentiment                
negative           9178   9087   
neutral            3099   3067   
positive           2363   2298   

                                                                           \
                                                                 top freq   
airline_sentiment                                                           
negative           @AmericanAir that's 16+ extra hours of travel ...    2   
neutral                                           @SouthwestAir sent    5   
positive                                            @JetBlue thanks!    5   

                  text_cleaned         \
                         count unique   
airline_sentiment                       
negative                  9178   9083   
neutral                   3099   3025   
positive                  2363   2262   

                                                                           
         

In [None]:
# Boolean mask selecting the rows labelled 'neutral'
is_neutral = df['airline_sentiment'] == 'neutral'
neutral_tweets = df[is_neutral]

# Number of distinct raw tweet texts among the neutral rows
neutral_texts = neutral_tweets['text']
unique_neutral_tweets = neutral_texts.nunique()

print("Number of unique tweets with 'neutral' sentiment:", unique_neutral_tweets)


Number of unique tweets with 'neutral' sentiment: 3067


In [None]:
# Print the shape of the raw table first, then of the TF-IDF feature matrix
for table in (data, X):
    print(table.shape)

(14640, 15)
(14640, 3000)


In [None]:
# Spot-check clean_text on a single raw tweet
text = "@VirginAmerica What @dhepburn said."
cleaned_text = clean_text(text)

# Show the cleaned form of the sample tweet
print("Cleaned text:", cleaned_text)

Cleaned text: @ virginamerica @ dhepburn said .


In [None]:
# Rows whose sentiment label equals 'neutral'
neutral_mask = df['airline_sentiment'].eq('neutral')
neutral_tweets = df.loc[neutral_mask]

# Count the distinct raw tweet texts within that subset
unique_neutral_tweets = neutral_tweets['text'].nunique()

print("Number of unique tweets with 'neutral' sentiment:", unique_neutral_tweets)


Number of unique tweets with 'neutral' sentiment: 3067
