In [1]:

from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.preprocessing import FunctionTransformer
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.semi_supervised import SelfTrainingClassifier
from sklearn.semi_supervised import LabelSpreading
import demoji
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
import joblib
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns
import re, string
import emoji
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB

from sklearn.ensemble import GradientBoostingClassifier

from sklearn.ensemble import RandomForestClassifier

from sklearn.linear_model import LogisticRegression

data = pd.read_csv('Airline-Sentiment-2-w-AA.csv', encoding='ISO-8859-1')

data.head()

Unnamed: 0,_unit_id,_golden,_unit_state,_trusted_judgments,_last_judgment_at,airline_sentiment,airline_sentiment:confidence,negativereason,negativereason:confidence,airline,airline_sentiment_gold,name,negativereason_gold,retweet_count,text,tweet_coord,tweet_created,tweet_id,tweet_location,user_timezone
0,681448150,False,finalized,3,2/25/15 5:24,neutral,1.0,,,Virgin America,,cairdin,,0,@VirginAmerica What @dhepburn said.,,2/24/15 11:35,5.70306e+17,,Eastern Time (US & Canada)
1,681448153,False,finalized,3,2/25/15 1:53,positive,0.3486,,0.0,Virgin America,,jnardino,,0,@VirginAmerica plus you've added commercials t...,,2/24/15 11:15,5.70301e+17,,Pacific Time (US & Canada)
2,681448156,False,finalized,3,2/25/15 10:01,neutral,0.6837,,,Virgin America,,yvonnalynn,,0,@VirginAmerica I didn't today... Must mean I n...,,2/24/15 11:15,5.70301e+17,Lets Play,Central Time (US & Canada)
3,681448158,False,finalized,3,2/25/15 3:05,negative,1.0,Bad Flight,0.7033,Virgin America,,jnardino,,0,@VirginAmerica it's really aggressive to blast...,,2/24/15 11:15,5.70301e+17,,Pacific Time (US & Canada)
4,681448159,False,finalized,3,2/25/15 5:50,negative,1.0,Can't Tell,1.0,Virgin America,,jnardino,,0,@VirginAmerica and it's a really big bad thing...,,2/24/15 11:14,5.70301e+17,,Pacific Time (US & Canada)


In [2]:
data.airline_sentiment.unique()
data.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14640 entries, 0 to 14639
Data columns (total 20 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   _unit_id                      14640 non-null  int64  
 1   _golden                       14640 non-null  bool   
 2   _unit_state                   14640 non-null  object 
 3   _trusted_judgments            14640 non-null  int64  
 4   _last_judgment_at             14584 non-null  object 
 5   airline_sentiment             14640 non-null  object 
 6   airline_sentiment:confidence  14640 non-null  float64
 7   negativereason                9178 non-null   object 
 8   negativereason:confidence     10522 non-null  float64
 9   airline                       14640 non-null  object 
 10  airline_sentiment_gold        40 non-null     object 
 11  name                          14640 non-null  object 
 12  negativereason_gold           32 non-null     object 
 13  r

In [3]:
df = data[['text','airline_sentiment']]
df.head()
df.isnull().sum()

text                 0
airline_sentiment    0
dtype: int64

In [4]:
# Clean  text
def clean_text_from_emojis(text):
    return demoji.replace(text, '')

#Remove punctuations, links, mentions and \r\n new line characters
def strip_all_entities(text):
    text = text.replace('\r', '').replace('\n', ' ').replace('\n', ' ').lower() #remove \n and \r and lowercase
    text = re.sub(r"(?:\@|https?\://)\S+", "", text) #remove links and mentions
    text = re.sub(r'[^\x00-\x7f]',r'', text) #remove non utf8/ascii characters such as '\x9a\x91\x97\x9a\x97'
    banned_list= string.punctuation + 'Ã'+'±'+'ã'+'¼'+'â'+'»'+'§'
    table = str.maketrans('', '', banned_list)
    text = text.translate(table)
    return text

# Clean hastags '#' Symbol
def clean_hashtags(tweet):
    new_tweet = " ".join(word.strip() for word in re.split('#(?!(?:hashtag)\b)[\w-]+(?=(?:\s+#[\w-]+)*\s*$)', tweet)) #remove last hashtags
    new_tweet2 = " ".join(word.strip() for word in re.split('#|_', new_tweet)) #remove hashtags symbol from words in the middle of the sentence
    return new_tweet2

# Remove multiple spaces
def remove_mult_spaces(text):
    return re.sub("\s\s+" , " ", text)



texts_new = []
for t in df.text:
    texts_new.append(remove_mult_spaces(clean_hashtags(strip_all_entities(clean_text_from_emojis(t)))))
    

In [5]:
df['cleaned_text'] = texts_new

# rearrange the column order
df = df.reindex(columns=['text','cleaned_text', 'airline_sentiment'])
df.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['cleaned_text'] = texts_new


Unnamed: 0,text,cleaned_text,airline_sentiment
0,@VirginAmerica What @dhepburn said.,what said,neutral
1,@VirginAmerica plus you've added commercials t...,plus youve added commercials to the experience...,positive
2,@VirginAmerica I didn't today... Must mean I n...,i didnt today must mean i need to take another...,neutral
3,@VirginAmerica it's really aggressive to blast...,its really aggressive to blast obnoxious enter...,negative
4,@VirginAmerica and it's a really big bad thing...,and its a really big bad thing about it,negative


In [6]:
# Randomly select a portion of the data to turn into unlabeled
percentage_to_turn_unlabeled = 0.4  # Adjust as needed
num_samples_to_turn_unlabeled = int(len(df) * percentage_to_turn_unlabeled)
samples_to_turn_unlabeled = df.sample(num_samples_to_turn_unlabeled)

# Set the 'label' column for the selected samples to NaN
df.loc[samples_to_turn_unlabeled.index, 'airline_sentiment'] = None  # or df.loc[samples_to_turn_unlabeled.index, 'label'] = np.nan

# Display the modified DataFrame
print(df)
labeled_data = df[df['airline_sentiment'].notnull()]
unlabeled_data = df[df['airline_sentiment'].isnull()]
# Instantiate the vectorizer
vectorizer = TfidfVectorizer()

                                                    text  \
0                    @VirginAmerica What @dhepburn said.   
1      @VirginAmerica plus you've added commercials t...   
2      @VirginAmerica I didn't today... Must mean I n...   
3      @VirginAmerica it's really aggressive to blast...   
4      @VirginAmerica and it's a really big bad thing...   
...                                                  ...   
14635  @AmericanAir thank you we got on a different f...   
14636  @AmericanAir leaving over 20 minutes Late Flig...   
14637  @AmericanAir Please bring American Airlines to...   
14638  @AmericanAir you have my money, you change my ...   
14639  @AmericanAir we have 8 ppl so we need 2 know h...   

                                            cleaned_text airline_sentiment  
0                                              what said           neutral  
1      plus youve added commercials to the experience...              None  
2      i didnt today must mean i need to take an

In [7]:

# Clean and prepare the DataFrame
df['cleaned_text'] = df['text'].apply(lambda x: remove_mult_spaces(clean_hashtags(strip_all_entities(clean_text_from_emojis(x)))))

# Separate labeled and unlabeled data
labeled_data = df[df['airline_sentiment'].notnull()]
unlabeled_data = df[df['airline_sentiment'].isnull()]

# Separate the features (X) and labels (y)
X_labeled = vectorizer.fit_transform(labeled_data['cleaned_text'])
y_labeled = labeled_data['airline_sentiment']

# Filter out rows with None values in the target variable
mask_labeled = y_labeled.notnull()
X_labeled = X_labeled[mask_labeled]
y_labeled = y_labeled[mask_labeled]

# Split the labeled data into a training set and a test set
X_train, X_test, y_train, y_test = train_test_split(X_labeled, y_labeled, test_size=0.2, random_state=42)

# Train the initial classifier on the labeled training data
classifier1 = SVC(kernel='linear')
classifier1.fit(X_train, y_train)

# Predict the labels on the test data
predicted_labels = classifier1.predict(X_test)

# Evaluate the accuracy, confusion matrix, and classification report on the test set
accuracy = accuracy_score(y_test, predicted_labels)
conf_matrix = confusion_matrix(y_test, predicted_labels)
class_report = classification_report(y_test, predicted_labels)

print(f"Accuracy on the test set: {accuracy:.2%}")
print("\nConfusion Matrix:")
print(conf_matrix)
print("\nClassification Report:")
print(class_report)

# Use TF-IDF to convert text to numerical features for the unlabeled data
X_unlabeled = vectorizer.transform(unlabeled_data['cleaned_text'])

# Predict sentiment on the unlabeled data
predicted_sentiments = classifier1.predict(X_unlabeled)

# Add predicted sentiments to the unlabeled dataset
unlabeled_data['predicted_sentiment'] = predicted_sentiments

# Combine labeled and unlabeled data
combined_data = pd.concat([labeled_data, unlabeled_data])

# Separate features and labels for the combined data
X_combined = vectorizer.transform(combined_data['cleaned_text'])
y_combined = combined_data['airline_sentiment']

# Filter out rows with None values in the target variable for combined data
mask_combined = y_combined.notnull()
X_combined = X_combined[mask_combined]
y_combined = y_combined[mask_combined]

# Retrain the classifier on the combined data
classifier1.fit(X_combined, y_combined)

# Predict the labels on the test data again (for evaluation)
predicted_labels_combined = classifier1.predict(X_test)
# Evaluate the accuracy, confusion matrix, and classification report on the retrained model
accuracy_combined = accuracy_score(y_test, predicted_labels_combined)
conf_matrix_combined = confusion_matrix(y_test, predicted_labels_combined)
class_report_combined = classification_report(y_test, predicted_labels_combined)

print(f"Accuracy on the test set after retraining: {accuracy_combined:.2%}")
print("\nConfusion Matrix after retraining:")
print(conf_matrix_combined)
print("\nClassification Report after retraining:")
print(class_report_combined)

# Display the result for the unlabeled data
print(unlabeled_data[['text', 'predicted_sentiment']])

Accuracy on the test set: 79.34%

Confusion Matrix:
[[1047   46   17]
 [ 153  207   17]
 [  81   49  140]]

Classification Report:
              precision    recall  f1-score   support

    negative       0.82      0.94      0.88      1110
     neutral       0.69      0.55      0.61       377
    positive       0.80      0.52      0.63       270

    accuracy                           0.79      1757
   macro avg       0.77      0.67      0.71      1757
weighted avg       0.79      0.79      0.78      1757



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  unlabeled_data['predicted_sentiment'] = predicted_sentiments


Accuracy on the test set after retraining: 91.41%

Confusion Matrix after retraining:
[[1094   10    6]
 [  73  297    7]
 [  28   27  215]]

Classification Report after retraining:
              precision    recall  f1-score   support

    negative       0.92      0.99      0.95      1110
     neutral       0.89      0.79      0.84       377
    positive       0.94      0.80      0.86       270

    accuracy                           0.91      1757
   macro avg       0.92      0.86      0.88      1757
weighted avg       0.91      0.91      0.91      1757

                                                    text predicted_sentiment
1      @VirginAmerica plus you've added commercials t...            negative
5      @VirginAmerica seriously would pay $30 a fligh...            negative
10     @VirginAmerica did you know that suicide is th...             neutral
15         @VirginAmerica SFO-PDX schedule is still MIA.            negative
16     @VirginAmerica So excited for my first cross 

In [8]:
# Example of new text to test
new_text = "I am not happy with the service"

# Clean and preprocess the new text
cleaned_new_text = remove_mult_spaces(clean_hashtags(strip_all_entities(clean_text_from_emojis(new_text))))
\
# Transform the preprocessed text using the TF-IDF vectorizer
X_new = vectorizer.transform([cleaned_new_text])

# Use the trained model to predict the sentiment
predicted_sentiment = classifier1.predict(X_new)[0]

# Display the result
print(f"Predicted sentiment for the new text: {predicted_sentiment}")


Predicted sentiment for the new text: negative


In [9]:
# Clean and prepare the DataFrame
df['cleaned_text'] = df['text'].apply(lambda x: remove_mult_spaces(clean_hashtags(strip_all_entities(clean_text_from_emojis(x)))))

# Separate labeled and unlabeled data
labeled_data = df[df['airline_sentiment'].notnull()]
unlabeled_data = df[df['airline_sentiment'].isnull()]

# Separate the features (X) and labels (y)
X_labeled = vectorizer.fit_transform(labeled_data['cleaned_text'])
y_labeled = labeled_data['airline_sentiment']

# Filter out rows with None values in the target variable
mask_labeled = y_labeled.notnull()
X_labeled = X_labeled[mask_labeled]
y_labeled = y_labeled[mask_labeled]

# Split the labeled data into a training set and a test set
X_train, X_test, y_train, y_test = train_test_split(X_labeled, y_labeled, test_size=0.2, random_state=42)

# Train the initial classifier on the labeled training data
classifier2 = LogisticRegression()
classifier2.fit(X_train, y_train)

# Predict the labels on the test data
predicted_labels = classifier2.predict(X_test)

# Evaluate the accuracy, confusion matrix, and classification report on the test set
accuracy = accuracy_score(y_test, predicted_labels)
conf_matrix = confusion_matrix(y_test, predicted_labels)
class_report = classification_report(y_test, predicted_labels)

print(f"Accuracy on the test set: {accuracy:.2%}")
print("\nConfusion Matrix:")
print(conf_matrix)
print("\nClassification Report:")
print(class_report)

# Use TF-IDF to convert text to numerical features for the unlabeled data
X_unlabeled = vectorizer.transform(unlabeled_data['cleaned_text'])

# Predict sentiment on the unlabeled data
predicted_sentiments = classifier2.predict(X_unlabeled)

# Add predicted sentiments to the unlabeled dataset
unlabeled_data['predicted_sentiment'] = predicted_sentiments

# Combine labeled and unlabeled data
combined_data = pd.concat([labeled_data, unlabeled_data])

# Separate features and labels for the combined data
X_combined = vectorizer.transform(combined_data['cleaned_text'])
y_combined = combined_data['airline_sentiment']

# Filter out rows with None values in the target variable for combined data
mask_combined = y_combined.notnull()
X_combined = X_combined[mask_combined]
y_combined = y_combined[mask_combined]

# Retrain the classifier on the combined data
classifier2.fit(X_combined, y_combined)

# Predict the labels on the test data again (for evaluation)
predicted_labels_combined = classifier2.predict(X_test)

# Evaluate the accuracy, confusion matrix, and classification report on the retrained model
accuracy_combined = accuracy_score(y_test, predicted_labels_combined)
conf_matrix_combined = confusion_matrix(y_test, predicted_labels_combined)
class_report_combined = classification_report(y_test, predicted_labels_combined)

print(f"Accuracy on the test set after retraining: {accuracy_combined:.2%}")
print("\nConfusion Matrix after retraining:")
print(conf_matrix_combined)
print("\nClassification Report after retraining:")
print(class_report_combined)

# Display the result for the unlabeled data
print(unlabeled_data[['text', 'predicted_sentiment']])

Accuracy on the test set: 77.97%

Confusion Matrix:
[[1058   38   14]
 [ 175  188   14]
 [  97   49  124]]

Classification Report:
              precision    recall  f1-score   support

    negative       0.80      0.95      0.87      1110
     neutral       0.68      0.50      0.58       377
    positive       0.82      0.46      0.59       270

    accuracy                           0.78      1757
   macro avg       0.76      0.64      0.68      1757
weighted avg       0.77      0.78      0.76      1757



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  unlabeled_data['predicted_sentiment'] = predicted_sentiments


Accuracy on the test set after retraining: 88.56%

Confusion Matrix after retraining:
[[1089   14    7]
 [  85  287    5]
 [  55   35  180]]

Classification Report after retraining:
              precision    recall  f1-score   support

    negative       0.89      0.98      0.93      1110
     neutral       0.85      0.76      0.81       377
    positive       0.94      0.67      0.78       270

    accuracy                           0.89      1757
   macro avg       0.89      0.80      0.84      1757
weighted avg       0.89      0.89      0.88      1757

                                                    text predicted_sentiment
1      @VirginAmerica plus you've added commercials t...            negative
5      @VirginAmerica seriously would pay $30 a fligh...            negative
10     @VirginAmerica did you know that suicide is th...             neutral
15         @VirginAmerica SFO-PDX schedule is still MIA.            negative
16     @VirginAmerica So excited for my first cross 

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [10]:
# Clean and prepare the DataFrame
df['cleaned_text'] = df['text'].apply(lambda x: remove_mult_spaces(clean_hashtags(strip_all_entities(clean_text_from_emojis(x)))))

# Separate labeled and unlabeled data
labeled_data = df[df['airline_sentiment'].notnull()]
unlabeled_data = df[df['airline_sentiment'].isnull()]

# Separate the features (X) and labels (y)
X_labeled = vectorizer.fit_transform(labeled_data['cleaned_text'])
y_labeled = labeled_data['airline_sentiment']

# Filter out rows with None values in the target variable
mask_labeled = y_labeled.notnull()
X_labeled = X_labeled[mask_labeled]
y_labeled = y_labeled[mask_labeled]

# Split the labeled data into a training set and a test set
X_train, X_test, y_train, y_test = train_test_split(X_labeled, y_labeled, test_size=0.2, random_state=42)

# Train the initial classifier on the labeled training data
classifier3 = RandomForestClassifier()
classifier3.fit(X_train, y_train)

# Predict the labels on the test data
predicted_labels = classifier3.predict(X_test)

# Evaluate the accuracy, confusion matrix, and classification report on the test set
accuracy = accuracy_score(y_test, predicted_labels)
conf_matrix = confusion_matrix(y_test, predicted_labels)
class_report = classification_report(y_test, predicted_labels)

print(f"Accuracy on the test set: {accuracy:.2%}")
print("\nConfusion Matrix:")
print(conf_matrix)
print("\nClassification Report:")
print(class_report)

# Use TF-IDF to convert text to numerical features for the unlabeled data
X_unlabeled = vectorizer.transform(unlabeled_data['cleaned_text'])

# Predict sentiment on the unlabeled data
predicted_sentiments = classifier3.predict(X_unlabeled)

# Add predicted sentiments to the unlabeled dataset
unlabeled_data['predicted_sentiment'] = predicted_sentiments

# Combine labeled and unlabeled data
combined_data = pd.concat([labeled_data, unlabeled_data])

# Separate features and labels for the combined data
X_combined = vectorizer.transform(combined_data['cleaned_text'])
y_combined = combined_data['airline_sentiment']

# Filter out rows with None values in the target variable for combined data
mask_combined = y_combined.notnull()
X_combined = X_combined[mask_combined]
y_combined = y_combined[mask_combined]

# Retrain the classifier on the combined data
classifier3.fit(X_combined, y_combined)

# Predict the labels on the test data again (for evaluation)
predicted_labels_combined = classifier3.predict(X_test)

# Evaluate the accuracy, confusion matrix, and classification report on the retrained model
accuracy_combined = accuracy_score(y_test, predicted_labels_combined)
conf_matrix_combined = confusion_matrix(y_test, predicted_labels_combined)
class_report_combined = classification_report(y_test, predicted_labels_combined)

print(f"Accuracy on the test set after retraining: {accuracy_combined:.2%}")
print("\nConfusion Matrix after retraining:")
print(conf_matrix_combined)
print("\nClassification Report after retraining:")
print(class_report_combined)

# Display the result for the unlabeled data
print(unlabeled_data[['text', 'predicted_sentiment']])

Accuracy on the test set: 74.39%

Confusion Matrix:
[[1075   25   10]
 [ 230  137   10]
 [ 141   34   95]]

Classification Report:
              precision    recall  f1-score   support

    negative       0.74      0.97      0.84      1110
     neutral       0.70      0.36      0.48       377
    positive       0.83      0.35      0.49       270

    accuracy                           0.74      1757
   macro avg       0.76      0.56      0.60      1757
weighted avg       0.75      0.74      0.71      1757



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  unlabeled_data['predicted_sentiment'] = predicted_sentiments


Accuracy on the test set after retraining: 99.77%

Confusion Matrix after retraining:
[[1110    0    0]
 [   0  376    1]
 [   0    3  267]]

Classification Report after retraining:
              precision    recall  f1-score   support

    negative       1.00      1.00      1.00      1110
     neutral       0.99      1.00      0.99       377
    positive       1.00      0.99      0.99       270

    accuracy                           1.00      1757
   macro avg       1.00      1.00      1.00      1757
weighted avg       1.00      1.00      1.00      1757

                                                    text predicted_sentiment
1      @VirginAmerica plus you've added commercials t...            negative
5      @VirginAmerica seriously would pay $30 a fligh...            negative
10     @VirginAmerica did you know that suicide is th...             neutral
15         @VirginAmerica SFO-PDX schedule is still MIA.            negative
16     @VirginAmerica So excited for my first cross 

In [11]:
# Example of new text to test
new_text = "I am pleased with the service"

# Clean and preprocess the new text
cleaned_new_text = remove_mult_spaces(clean_hashtags(strip_all_entities(clean_text_from_emojis(new_text))))
\
# Transform the preprocessed text using the TF-IDF vectorizer
X_new = vectorizer.transform([cleaned_new_text])

# Use the trained model to predict the sentiment
predicted_sentiment = classifier3.predict(X_new)[0]

# Display the result
print(f"Predicted sentiment for the new text: {predicted_sentiment}")

Predicted sentiment for the new text: negative


In [12]:
# Clean and prepare the DataFrame
df['cleaned_text'] = df['text'].apply(lambda x: remove_mult_spaces(clean_hashtags(strip_all_entities(clean_text_from_emojis(x)))))

# Separate labeled and unlabeled data
labeled_data = df[df['airline_sentiment'].notnull()]
unlabeled_data = df[df['airline_sentiment'].isnull()]

# Separate the features (X) and labels (y)
X_labeled = vectorizer.fit_transform(labeled_data['cleaned_text'])
y_labeled = labeled_data['airline_sentiment']

# Filter out rows with None values in the target variable
mask_labeled = y_labeled.notnull()
X_labeled = X_labeled[mask_labeled]
y_labeled = y_labeled[mask_labeled]

# Split the labeled data into a training set and a test set
X_train, X_test, y_train, y_test = train_test_split(X_labeled, y_labeled, test_size=0.2, random_state=42)

# Train the initial classifier on the labeled training data

classifier4 = GradientBoostingClassifier()
classifier4.fit(X_train, y_train)

# Predict the labels on the test data
predicted_labels = classifier4.predict(X_test)

# Evaluate the accuracy, confusion matrix, and classification report on the test set
accuracy = accuracy_score(y_test, predicted_labels)
conf_matrix = confusion_matrix(y_test, predicted_labels)
class_report = classification_report(y_test, predicted_labels)

print(f"Accuracy on the test set: {accuracy:.2%}")
print("\nConfusion Matrix:")
print(conf_matrix)
print("\nClassification Report:")
print(class_report)
# Use TF-IDF to convert text to numerical features for the unlabeled data
X_unlabeled = vectorizer.transform(unlabeled_data['cleaned_text'])

# Predict sentiment on the unlabeled data
predicted_sentiments = classifier4.predict(X_unlabeled)

# Add predicted sentiments to the unlabeled dataset
unlabeled_data['predicted_sentiment'] = predicted_sentiments

# Combine labeled and unlabeled data
combined_data = pd.concat([labeled_data, unlabeled_data])

# Separate features and labels for the combined data
X_combined = vectorizer.transform(combined_data['cleaned_text'])
y_combined = combined_data['airline_sentiment']

# Filter out rows with None values in the target variable for combined data
mask_combined = y_combined.notnull()
X_combined = X_combined[mask_combined]
y_combined = y_combined[mask_combined]

# Retrain the classifier on the combined data
classifier4.fit(X_combined, y_combined)

# Predict the labels on the test data again (for evaluation)
predicted_labels_combined = classifier4.predict(X_test)

# Evaluate the accuracy, confusion matrix, and classification report on the retrained model
accuracy_combined = accuracy_score(y_test, predicted_labels_combined)
conf_matrix_combined = confusion_matrix(y_test, predicted_labels_combined)
class_report_combined = classification_report(y_test, predicted_labels_combined)

print(f"Accuracy on the test set after retraining: {accuracy_combined:.2%}")
print("\nConfusion Matrix after retraining:")
print(conf_matrix_combined)
print("\nClassification Report after retraining:")
print(class_report_combined)

# Display the result for the unlabeled data
print(unlabeled_data[['text', 'predicted_sentiment']])

Accuracy on the test set: 72.97%

Confusion Matrix:
[[1070   18   22]
 [ 276   75   26]
 [ 123   10  137]]

Classification Report:
              precision    recall  f1-score   support

    negative       0.73      0.96      0.83      1110
     neutral       0.73      0.20      0.31       377
    positive       0.74      0.51      0.60       270

    accuracy                           0.73      1757
   macro avg       0.73      0.56      0.58      1757
weighted avg       0.73      0.73      0.68      1757



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  unlabeled_data['predicted_sentiment'] = predicted_sentiments


Accuracy on the test set after retraining: 77.06%

Confusion Matrix after retraining:
[[1086    8   16]
 [ 239  118   20]
 [ 113    7  150]]

Classification Report after retraining:
              precision    recall  f1-score   support

    negative       0.76      0.98      0.85      1110
     neutral       0.89      0.31      0.46       377
    positive       0.81      0.56      0.66       270

    accuracy                           0.77      1757
   macro avg       0.82      0.62      0.66      1757
weighted avg       0.79      0.77      0.74      1757

                                                    text predicted_sentiment
1      @VirginAmerica plus you've added commercials t...            negative
5      @VirginAmerica seriously would pay $30 a fligh...            negative
10     @VirginAmerica did you know that suicide is th...            negative
15         @VirginAmerica SFO-PDX schedule is still MIA.            negative
16     @VirginAmerica So excited for my first cross 

In [13]:
# Clean and prepare the DataFrame
df['cleaned_text'] = df['text'].apply(lambda x: remove_mult_spaces(clean_hashtags(strip_all_entities(clean_text_from_emojis(x)))))

# Separate labeled and unlabeled data
labeled_data = df[df['airline_sentiment'].notnull()]
unlabeled_data = df[df['airline_sentiment'].isnull()]

# Separate the features (X) and labels (y)
X_labeled = vectorizer.fit_transform(labeled_data['cleaned_text'])
y_labeled = labeled_data['airline_sentiment']

# Filter out rows with None values in the target variable
mask_labeled = y_labeled.notnull()
X_labeled = X_labeled[mask_labeled]
y_labeled = y_labeled[mask_labeled]

# Split the labeled data into a training set and a test set
X_train, X_test, y_train, y_test = train_test_split(X_labeled, y_labeled, test_size=0.2, random_state=42)


classifier5 = MultinomialNB()
classifier5.fit(X_train, y_train)

# Predict the labels on the test data
predicted_labels = classifier5.predict(X_test)

# Evaluate the accuracy, confusion matrix, and classification report on the test set
accuracy = accuracy_score(y_test, predicted_labels)
conf_matrix = confusion_matrix(y_test, predicted_labels)
class_report = classification_report(y_test, predicted_labels)

print(f"Accuracy on the test set: {accuracy:.2%}")
print("\nConfusion Matrix:")
print(conf_matrix)
print("\nClassification Report:")
print(class_report)

# Use TF-IDF to convert text to numerical features for the unlabeled data
X_unlabeled = vectorizer.transform(unlabeled_data['cleaned_text'])

# Predict sentiment on the unlabeled data
predicted_sentiments = classifier5.predict(X_unlabeled)

# Add predicted sentiments to the unlabeled dataset
unlabeled_data['predicted_sentiment'] = predicted_sentiments

# Combine labeled and unlabeled data
combined_data = pd.concat([labeled_data, unlabeled_data])

# Separate features and labels for the combined data
X_combined = vectorizer.transform(combined_data['cleaned_text'])
y_combined = combined_data['airline_sentiment']

# Filter out rows with None values in the target variable for combined data
mask_combined = y_combined.notnull()
X_combined = X_combined[mask_combined]
y_combined = y_combined[mask_combined]

# Retrain the classifier on the combined data
classifier5.fit(X_combined, y_combined)

# Predict the labels on the test data again (for evaluation)
predicted_labels_combined = classifier5.predict(X_test)

# Evaluate the accuracy, confusion matrix, and classification report on the retrained model
accuracy_combined = accuracy_score(y_test, predicted_labels_combined)
conf_matrix_combined = confusion_matrix(y_test, predicted_labels_combined)
class_report_combined = classification_report(y_test, predicted_labels_combined)

print(f"Accuracy on the test set after retraining: {accuracy_combined:.2%}")
print("\nConfusion Matrix after retraining:")
print(conf_matrix_combined)
print("\nClassification Report after retraining:")
print(class_report_combined)

# Display the result for the unlabeled data
print(unlabeled_data[['text', 'predicted_sentiment']])

Accuracy on the test set: 67.05%

Confusion Matrix:
[[1107    2    1]
 [ 336   39    2]
 [ 232    6   32]]

Classification Report:
              precision    recall  f1-score   support

    negative       0.66      1.00      0.79      1110
     neutral       0.83      0.10      0.18       377
    positive       0.91      0.12      0.21       270

    accuracy                           0.67      1757
   macro avg       0.80      0.41      0.40      1757
weighted avg       0.74      0.67      0.57      1757



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  unlabeled_data['predicted_sentiment'] = predicted_sentiments


Accuracy on the test set after retraining: 71.09%

Confusion Matrix after retraining:
[[1108    1    1]
 [ 285   91    1]
 [ 214    6   50]]

Classification Report after retraining:
              precision    recall  f1-score   support

    negative       0.69      1.00      0.82      1110
     neutral       0.93      0.24      0.38       377
    positive       0.96      0.19      0.31       270

    accuracy                           0.71      1757
   macro avg       0.86      0.47      0.50      1757
weighted avg       0.78      0.71      0.65      1757

                                                    text predicted_sentiment
1      @VirginAmerica plus you've added commercials t...            negative
5      @VirginAmerica seriously would pay $30 a fligh...            negative
10     @VirginAmerica did you know that suicide is th...             neutral
15         @VirginAmerica SFO-PDX schedule is still MIA.            negative
16     @VirginAmerica So excited for my first cross 

In [15]:
joblib.dump(vectorizer, 'vectorizer.joblib')
joblib.dump(classifier3, 'randomforest.joblib')

['randomforest.joblib']