In [None]:
import sys,os
import pandas as pd
pd.options.mode.chained_assignment = None
import matplotlib.pyplot as plt
from matplotlib.patches import ConnectionPatch
import seaborn as sns
import numpy as np
import sklearn
import string
import re
import nltk
import tensorflow as tf
from collections import Counter
from tensorflow import keras
from sklearn.feature_extraction.text import TfidfVectorizer
from keras.utils import pad_sequences,to_categorical
from sklearn.feature_selection import SelectKBest,chi2
from sklearn.utils.class_weight import compute_class_weight
from sklearn.metrics import accuracy_score,f1_score,roc_auc_score,confusion_matrix,precision_score,recall_score,classification_report,roc_curve
from datetime import datetime
nltk.download('punkt')
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from nltk.tokenize import TweetTokenizer
from google.colab import files,drive
from wordcloud import WordCloud, STOPWORDS
from sklearn.svm import LinearSVC
from sklearn.model_selection import GridSearchCV
import warnings
# Suppress warnings whose message starts with "internal gelsd".
warnings.filterwarnings("ignore", message="^internal gelsd")

# Print the library and interpreter versions this run is using.
py_major, py_minor = sys.version_info[:2]
print(f"Running Panda Version: {pd.__version__}")
print(f"Running TensorFlow Version: {tf.__version__}")
print(f"Running Keras API Version:{keras.__version__}")
print(f"Running Python {py_major}.{py_minor}")

Running Panda Version: 2.2.2
Running TensorFlow Version: 2.19.0
Running Keras API Version:3.10.0
Running Python 3.12


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
# Single seed value for the notebook, handed to Keras' global seeding helper.
seed = 0
tf.keras.utils.set_random_seed(seed=seed)

In [None]:
# Ask for the CSV through Colab's upload widget, then load it by its expected name.
# NOTE(review): the file name is hard-coded; confirm the upload is always saved as "Tweets.csv".
uploaded = files.upload()
dataset = pd.read_csv(
    "Tweets.csv",
    na_values=['NA'],
    low_memory=False,
)

Saving Tweets.csv to Tweets.csv


# EDA (Exploratory Data Analysis)

## Descriptive statistics

#### Shape of dataset

In [None]:
# Report the dataset dimensions as a one-line summary.
n_rows, n_cols = dataset.shape
shapes = f"Dataset structure: rows: {n_rows} - columns: {n_cols}"
print(shapes)

Dataset structure: rows: 14640 - columns: 15


#### Random samples

In [None]:
# Show four random rows. Passing random_state keeps the preview identical on every
# run of this cell and avoids drawing from the global NumPy RNG, so re-running the
# cell does not shift the random stream used by later cells.
dataset.sample(n=4, random_state=seed)

Unnamed: 0,tweet_id,airline_sentiment,airline_sentiment_confidence,negativereason,negativereason_confidence,airline,airline_sentiment_gold,name,negativereason_gold,retweet_count,text,tweet_coord,tweet_created,tweet_location,user_timezone
1917,569521101736439809,negative,1.0,Customer Service Issue,0.3618,United,,BenHabbel,,0,@united completely unacceptable to seat 1st cl...,,2015-02-22 07:36:26 -0800,New York City,Pacific Time (US & Canada)
8685,568025019068108800,negative,1.0,Cancelled Flight,0.6545,Delta,,rjburnsva,,0,@JetBlue they say they have no update. I don't...,,2015-02-18 04:31:32 -0800,,Eastern Time (US & Canada)
7350,569650936911683586,negative,0.6633,Late Flight,0.6633,Delta,,jessbutl,,0,@JetBlue Tx for the info. Just don't understan...,,2015-02-22 16:12:21 -0800,"Brooklyn, NY",
8250,568558562810544129,positive,1.0,,,Delta,,TheSnortherner,,0,@JetBlue oh yes! I hope you expand to other ai...,,2015-02-19 15:51:39 -0800,NYC,Eastern Time (US & Canada)


#### Dataset Features types

In [None]:
# Per-column dtypes of the frame loaded by read_csv.
dataset.dtypes

Unnamed: 0,0
tweet_id,int64
airline_sentiment,object
airline_sentiment_confidence,float64
negativereason,object
negativereason_confidence,float64
airline,object
airline_sentiment_gold,object
name,object
negativereason_gold,object
retweet_count,int64


#### Names of columns

In [None]:
# Column labels as one comma-separated string.
", ".join(dataset.columns)

'tweet_id, airline_sentiment, airline_sentiment_confidence, negativereason, negativereason_confidence, airline, airline_sentiment_gold, name, negativereason_gold, retweet_count, text, tweet_coord, tweet_created, tweet_location, user_timezone'

#### Categorical features

In [None]:
# Summary (count / unique / top / freq) of the object-typed columns, one row per column.
dataset.describe(include="O").transpose()

Unnamed: 0,count,unique,top,freq
airline_sentiment,14640,3,negative,9178
negativereason,9178,10,Customer Service Issue,2910
airline,14640,6,United,3822
airline_sentiment_gold,40,3,negative,32
name,14640,7701,JetBlueNews,63
negativereason_gold,32,13,Customer Service Issue,12
text,14640,14427,@united thanks,6
tweet_coord,1019,832,"[0.0, 0.0]",164
tweet_created,14640,14247,2015-02-24 09:54:34 -0800,5
tweet_location,9907,3081,"Boston, MA",157


#### Numeric features

In [None]:
# Descriptive statistics of the numeric columns, one row per column.
dataset.describe(include=np.number).transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
tweet_id,14640.0,5.692184e+17,779111200000000.0,5.675883e+17,5.685592e+17,5.694779e+17,5.698905e+17,5.703106e+17
airline_sentiment_confidence,14640.0,0.9001689,0.16283,0.335,0.6923,1.0,1.0,1.0
negativereason_confidence,10522.0,0.6382983,0.3304398,0.0,0.3606,0.6706,1.0,1.0
retweet_count,14640.0,0.08265027,0.7457782,0.0,0.0,0.0,0.0,44.0


#### Target classes

In [None]:
# Distinct sentiment labels, in order of first appearance.
", ".join(dataset["airline_sentiment"].unique())

'neutral, positive, negative'

In [None]:
# Number of tweets per sentiment label, most frequent first.
dataset["airline_sentiment"].value_counts()

Unnamed: 0_level_0,count
airline_sentiment,Unnamed: 1_level_1
negative,9178
neutral,3099
positive,2363


The classes are imbalanced: there are far more negative tweets than positive or neutral ones.

#### Missing values and duplicates

In [None]:
# Count nulls per column once, then report both the breakdown and the grand total.
missing_per_column = dataset.isnull().sum()
print('Sum of Missing values accross columns\n', missing_per_column)
print('-' * 40)
print("Sum of missing values", int(missing_per_column.sum()))

Sum of Missing values accross columns
 tweet_id                            0
airline_sentiment                   0
airline_sentiment_confidence        0
negativereason                   5462
negativereason_confidence        4118
airline                             0
airline_sentiment_gold          14600
name                                0
negativereason_gold             14608
retweet_count                       0
text                                0
tweet_coord                     13621
tweet_created                       0
tweet_location                   4733
user_timezone                    4820
dtype: int64
----------------------------------------
Sum of missing values 61962


### Airlines

In [None]:
# Distinct airline names, in order of first appearance.
", ".join(dataset["airline"].unique())

'Virgin America, United, Southwest, Delta, US Airways, American'

In [None]:
# Number of tweets per airline, most frequent first.
dataset["airline"].value_counts()

Unnamed: 0_level_0,count
airline,Unnamed: 1_level_1
United,3822
US Airways,2913
American,2759
Southwest,2420
Delta,2222
Virgin America,504


#### Tweets

In [None]:
# Tweet counts for every (airline, sentiment) pair.
print("Tweets frequencies grouped by sentiments for the airlines")
print('-' * 60)
tweets_by_airline = dataset.groupby("airline", group_keys=True)
airlines_sentiments_groups = tweets_by_airline[["airline_sentiment"]].value_counts()
airlines_sentiments_groups

Tweets frequencies grouped by sentiments for the airlines
------------------------------------------------------------


Unnamed: 0_level_0,Unnamed: 1_level_0,count
airline,airline_sentiment,Unnamed: 2_level_1
American,negative,1960
American,neutral,463
American,positive,336
Delta,negative,955
Delta,neutral,723
Delta,positive,544
Southwest,negative,1186
Southwest,neutral,664
Southwest,positive,570
US Airways,negative,2263


In [None]:
# Earliest and latest creation timestamps in the dataset.
# NOTE(review): tweet_created is listed among the object columns, so min/max compare
# strings; this presumably matches chronological order only while every value shares
# the same format and UTC offset — confirm.
first_tweet_time = dataset["tweet_created"].min()
last_tweet_time = dataset["tweet_created"].max()
print("Time of first tweet in the dataset:", first_tweet_time)
print('-' * 65)
print("Time of last tweet in the dataset:", last_tweet_time)

Time of first tweet in the dataset: 2015-02-16 23:36:05 -0800
-----------------------------------------------------------------
Time of last tweet in the dataset: 2015-02-24 11:53:37 -0800


In [None]:
# Negative-reason frequencies per airline, restricted to tweets that carry a reason.
# Missing reasons are NaN in the loaded frame, so the old comparison against the
# literal string 'N/A' was True for every row and filtered nothing; notna() expresses
# the intended filter explicitly.
has_reason = dataset["negativereason"].notna()
dataset[has_reason].groupby("airline", group_keys=True)[["negativereason"]].value_counts()

Unnamed: 0_level_0,Unnamed: 1_level_0,count
airline,negativereason,Unnamed: 2_level_1
American,Customer Service Issue,768
American,Late Flight,249
American,Cancelled Flight,246
American,Can't Tell,198
American,Lost Luggage,149
American,Flight Booking Problems,130
American,Bad Flight,87
American,Flight Attendant Complaints,87
American,longlines,34
American,Damaged Luggage,12


Tweet with the highest retweet count

In [None]:
# Show the most retweeted tweet.
# idxmax() returns the row label of the first maximum of retweet_count; the previous
# code passed the maximum *value* to .loc, which selected whichever row happened to
# carry that number as its index label.
top_retweet_idx = dataset["retweet_count"].idxmax()
dataset.loc[top_retweet_idx, ["text", "airline", "name", "airline_sentiment"]]

Unnamed: 0,44
text,@VirginAmerica are flights leaving Dallas for ...
airline,Virgin America
name,papamurat
airline_sentiment,neutral


## Multivariate analysis (Visualization).