In [720]:
import sys
import pandas as pd
pd.options.mode.chained_assignment = None
import matplotlib.pyplot as plt
from matplotlib.patches import ConnectionPatch
import seaborn as sns
import numpy as np
import sklearn
import string
import re
import nltk
import tensorflow as tf
from collections import Counter
from tensorflow import keras
from sklearn.linear_model import Perceptron
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder,OneHotEncoder
from sklearn.feature_extraction.text import TfidfVectorizer,CountVectorizer
from sklearn.utils import class_weight
from keras.utils import pad_sequences
from sklearn.feature_selection import SelectKBest,chi2
from sklearn.metrics import accuracy_score,multilabel_confusion_matrix,f1_score,roc_auc_score,confusion_matrix,precision_score,recall_score
from datetime import datetime
nltk.download('punkt')
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from nltk.tokenize import TweetTokenizer
from google.colab import files,drive
from wordcloud import WordCloud, STOPWORDS
from sklearn.svm import LinearSVC,SVC
from sklearn.model_selection import GridSearchCV,StratifiedKFold
import warnings
warnings.filterwarnings(action="ignore", message="^internal gelsd")
print("Running Panda Version:"+pd.__version__)
print("Running TensorFlow Version:"+ tf.__version__)
print("Running Keras API Version:"+ keras.__version__)
print("Running Python {0}.{1}".format(sys.version_info[:2][0],sys.version_info[:2][1]))

Running Panda Version:2.2.2
Running TensorFlow Version:2.17.0
Running Keras API Version:3.4.1
Running Python 3.10


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [721]:
seed = 0
np.random.seed(seed)

In [None]:
uploaded = files.upload()

# ***Exploratory Data Analysis***

In [None]:
dataset = pd.read_csv("Tweets.csv",na_values=['NA'], low_memory=False)

### Dataset shapes

In [None]:
print('Dataset structure: rows =',dataset.shape[0], ' - columns =',dataset.shape[1])

##### Some random rows

In [None]:
dataset.sample(3)

###  Descriptive statistics for the dataset

In [None]:
print('Dataset Features types:')
dataset.dtypes

In [None]:
print("List of names of columns:\n")
print('-'*50)
dataset.columns.tolist()

In [None]:
print('Descriptive Statistics for numeric features on Dataset')
dataset.describe(include=np.number)

In [None]:
print('Range for numeric features on Dataset')
print('-'*40)
dataset.max(numeric_only=True) - dataset.min(numeric_only=True)

In [None]:
dataset.info()

In [None]:
print('Descriptive Statistics for categorical features on Dataset')
dataset.describe(include=object)

In [None]:
print('Number of classes: ',len(dataset.airline_sentiment.unique().tolist()))

In [None]:
print('Name of classes (y): ',dataset.airline_sentiment.unique().tolist())

In [None]:
print('Check for duplicates?',dataset.duplicated().any())
print('-'*70)
print('Sum of duplicated rows :',dataset.duplicated().sum())

In [None]:
print('Check for missing values ',dataset.isnull().any().any())
print('-'*70)
print('Sum of Missing values accross columns\n',dataset.isnull().sum())
print('-'*70)
print("Sum of missing values",sum(dataset.isnull().sum()))

In [None]:
missing_data=dataset[dataset.isnull().any(axis=1)]
missing_data.head(3)

In [None]:
dataset.airline_sentiment.value_counts()

#### Data imbalance: there are more negative and neutral tweets than positive ones.

In [None]:
list_of_airlines =dataset.airline.unique().tolist()
print('list of airlines: ',list_of_airlines)

In [None]:
print("Time of first tweet in the dataset:",dataset.tweet_created.min())
print('-'*70)
print("Time of last tweet in the dataset:",dataset.tweet_created.max())

In [None]:
print("Airlines with tweet count\n", dataset.airline.value_counts())

In [None]:
print("Tweets frequencies grouped by sentiments for the airlines")
print('-'*80)
airlines_sentiments_groups = dataset.groupby("airline", group_keys=True)[['airline_sentiment']].value_counts()
airlines_sentiments_groups

In [None]:
print("Maximum tweet confidence",dataset.airline_sentiment_confidence.max())
print('-'*80)
print("Minimum tweet confidence",dataset.airline_sentiment_confidence.min())

In [None]:
print(dataset.airline_sentiment_confidence.quantile([0,0.25,0.50,0.75,1]))

#### The 25th percentile of the sentiment confidence is 0.65, which means that 25 percent of the values for this measure are below 0.65.

In [None]:
dataset['negativereason'] =  dataset['negativereason'].fillna('N/A')

In [None]:
dataset[dataset['negativereason'] != 'N/A'].groupby("airline", group_keys=True)[['negativereason']].value_counts()

##### Highest retweet

In [None]:
# Select the row by the index label of the most retweeted tweet (idxmax).
# `.max()` returns the retweet count itself, which is not a row label.
print(dataset[['text','airline','airline_sentiment']].loc[dataset['retweet_count'].idxmax()])

## Data visualization (Multivariate analysis).

In [None]:
target_classes,tweet_freq = np.unique(dataset.airline_sentiment,return_counts=True)
print(target_classes,tweet_freq)

In [None]:
def func(pct, allvals)->str:
    """Build a two-line pie-wedge label: the percentage and the absolute count.

    pct     -- share of the wedge, in percent.
    allvals -- the values behind the whole pie; their sum is the 100% total.

    Returns a string such as "50.0%\\n (10)".
    """
    total = np.sum(allvals)
    # Convert the percentage back into the (rounded) number of items it stands for.
    count = int(np.round((pct / 100.) * total))
    return "{:.1f}%\n ({:d})".format(pct, count)

In [None]:
# Three-panel figure: sentiment shares (pie), negative tweets per airline (stacked bar
# connected to the first pie wedge), and tweets per airline split by sentiment (count plot).
fig, (ax1,ax2,ax3) = plt.subplots(1, 3, figsize=(20, 8))
fig.subplots_adjust(wspace=0)

# pie chart parameters
overall_ratios = tweet_freq
labels = target_classes
explode = [0.1, 0, 0]
colors =['c','y','g']
# Rotate so that the first wedge is split by the x-axis: the start angle is minus half of
# that wedge's angular extent, so the raw count has to be turned into a fraction first.
angle = -180 * overall_ratios[0] / np.sum(overall_ratios)
wedges, *_ = ax1.pie(overall_ratios, autopct=lambda pct: func(pct, overall_ratios),shadow=True,labels=labels, explode=explode,colors=colors,startangle=angle)

# bar chart parameters
negative_sentiment_ratios =[airlines_sentiments_groups[x]['negative'] for x in list_of_airlines]
airline_labels = list_of_airlines
bar_height = sum(negative_sentiment_ratios)
# The stack is drawn downwards from 0 to -bar_height, which is exactly where the two
# connector lines below are anchored.
bottom = 0
width = .2

# Adding from the top matches the legend.
for j, (height, label) in enumerate(reversed([*zip(negative_sentiment_ratios, airline_labels)])):
    bottom -= height
    # Clamp alpha so that more than six airlines cannot push it above the valid maximum of 1.
    bc = ax2.bar(0, height, width, bottom=bottom, color='C0', label=label,alpha=min(1.0, 0.1 + 0.17 * j))
    ax2.bar_label(bc, labels=[height], label_type='center')

ax2.legend(loc=3)
ax2.set_title('Negative sentiments count per airline')
ax2.axis('off')
ax2.set_xlim(- 3.5 * width, 3.5 * width)

# use ConnectionPatch to draw lines between the two plots
theta1, theta2 = wedges[0].theta1, wedges[0].theta2
center, r = wedges[0].center, wedges[0].r

# draw top connecting line (upper edge of the wedge -> top of the bar stack)
x = r * np.cos(np.pi / 180 * theta2) + center[0]
y = r * np.sin(np.pi / 180 * theta2) + center[1]
con = ConnectionPatch(xyA=(-width / 2, 0), coordsA=ax2.transData,
                      xyB=(x, y), coordsB=ax1.transData)
con.set_color([0, 0, 0])
con.set_linewidth(1)
ax2.add_artist(con)

# draw bottom connecting line (lower edge of the wedge -> bottom of the bar stack)
x = r * np.cos(np.pi / 180 * theta1) + center[0]
y = r * np.sin(np.pi / 180 * theta1) + center[1]
con = ConnectionPatch(xyA=(-width / 2, -bar_height), coordsA=ax2.transData,xyB=(x, y), coordsB=ax1.transData)
con.set_color([0, 0, 0])
con.set_linewidth(1)
ax2.add_artist(con)

# Draw on the third panel explicitly instead of relying on it being the current axes.
sns.countplot(x="airline", hue="airline_sentiment", data=dataset, ax=ax3)
ax3.set_xlabel('')
ax3.set_ylabel('Number of Tweets')
ax3.set_title('Tweets per airline within Training Dataset')


plt.show()
del ax1,ax2,ax3

##### Date decomposition (feature creation).

In [None]:
def decode_date(date_str)-> pd.Series:
  """Split a tweet timestamp into calendar features.

  date_str -- timestamp formatted as "YYYY-MM-DD HH:MM:SS", optionally followed by a
              numeric UTC offset such as " -0800".

  Returns a Series [weekday, day_of_month, hour], where weekday follows
  ``datetime.weekday()`` (0 = Monday ... 6 = Sunday). The fields describe the wall-clock
  time as written in the string; the offset is discarded, not applied.

  Raises ValueError when the remaining text does not match the expected format.
  """
  # Strip any trailing "+hhmm"/"-hhmm" offset, not only the hard-coded " -0800".
  local_str = re.sub(r'\s*[+-]\d{4}$', '', date_str.strip())
  local_dt = datetime.strptime(local_str,"%Y-%m-%d %H:%M:%S")
  return pd.Series([local_dt.weekday(),local_dt.day,local_dt.hour])

In [None]:
dataset[['week_day','day','hour']] = dataset['tweet_created'].apply(decode_date)

In [None]:
dataset.head(3)

In [None]:
fig, axes = plt.subplots(figsize=(8,5))
ax = sns.countplot(x="day", hue="airline_sentiment", data=dataset)
ax.set_xlabel('Day')
ax.set_ylabel('Number of Tweets')
ax.set_title('Daily AirLine Tweets Distribution within a month')
del ax,fig,axes

In [None]:
fig, axes = plt.subplots(figsize=(8,5))
# Pin the category order to all seven weekdays (0 = Monday ... 6 = Sunday, as produced by
# datetime.weekday()) so the tick labels below stay aligned even if a weekday has no tweets.
ax = sns.countplot(x="week_day", hue="airline_sentiment", data=dataset, order=list(range(7)), ax=axes)
ax.set_xlabel('Week day')
ax.set(xticklabels=['Monday','Tuesday','Wednesday','Thursday','Friday','Saturday','Sunday'])
ax.set_ylabel('Number of Tweets')
ax.set_title('Daily AirLine Tweets Distribution within a week')
del ax,fig,axes

In [None]:
fig, axes = plt.subplots(figsize=(8,5))
ax = sns.countplot(x="hour", hue="airline_sentiment", data=dataset)
ax.set_xlabel('Hour')
ax.set_ylabel('Number of Tweets')
ax.set_title('Tweets Distribution within 24hrs')
del ax,fig,axes

### Top 20 tweet authors

In [None]:
top_tweeter = Counter([name for name in dataset['name']])
top_tweeter_df = pd.DataFrame(top_tweeter.most_common(20))
top_tweeter_df.columns = ['top_tweet_authors','count']
print("Top 20 Authors",'-'*15)
top_tweeter_df.style.background_gradient(cmap='inferno')

### Top reasons for complaints.

In [None]:
top_complain = Counter([reason for reason in dataset[dataset['negativereason'] != 'N/A']['negativereason']])
top_complain_df = pd.DataFrame(top_complain.most_common(10))
top_complain_df.columns = ['top_complain_reason','count']
print("Top 10 complain",'-'*15)
top_complain_df.style.background_gradient(cmap='inferno')

### Data preprocessing, feature engineering.

In [None]:
#For removing user tags(@user)
def remove_user_tags(tweet:str)->tuple[str,int]:
    """Delete ``@handle`` mentions from a tweet and drop every digit.

    Returns ``(cleaned_text, flag)``; ``flag`` is 1 when at least one mention was
    removed and 0 when the tweet contained none.
    """
    without_tags, n_removed = re.subn(r'@\S+', r'', tweet)
    # Digits are stripped from the remaining text as part of the cleaning.
    cleaned = ''.join(filter(lambda ch: not ch.isdigit(), without_tags))
    return (cleaned, 1 if n_removed else 0)

#For removing Url links
def remove_url(tweet:str)->tuple[str,int]:
    """Delete URLs (``http://``, ``https://`` or ``www.`` prefixed) and every digit.

    Returns ``(cleaned_text, flag)``; ``flag`` is 1 when at least one URL was removed
    and 0 otherwise.
    """
    # The flag must be given to compile(): the third positional argument of Pattern.sub
    # is `count`, so passing re.IGNORECASE there capped the removal at two URLs and
    # left the match case-sensitive.
    url = re.compile(r'https?://\S+|www\.\S+', re.IGNORECASE)
    text, n_removed = url.subn(r'', tweet)
    # Digits are stripped from the remaining text as part of the cleaning.
    result = ''.join(ch for ch in text if not ch.isdigit())
    return (result, 1 if n_removed else 0)

In [None]:
def clean_and_create_feature(tweet:str)-> pd.Series:
    """Clean one tweet and derive its mention/URL indicator features.

    Runs ``remove_user_tags`` first and feeds its text into ``remove_url``.

    Returns a Series ``[cleaned_text, tag_flag, url_flag]``.
    """
    tagless_text, had_tag = remove_user_tags(tweet)
    cleaned_text, had_url = remove_url(tagless_text)
    features = [cleaned_text, had_tag, had_url]
    return pd.Series(features)

In [None]:
dataset[['text','user_tag','url_flag']] = dataset.apply(lambda x: clean_and_create_feature(x.text),axis=1)

In [None]:
dataset.head(3)

In [None]:
print(dataset.groupby("airline_sentiment", group_keys=True)[['user_tag']].value_counts())
print('-'*50)
print(dataset.groupby("airline_sentiment", group_keys=True)[['url_flag']].value_counts())

In [None]:
f, axes = plt.subplots(figsize=(8,5))
# Draw on the axes created above instead of relying on them being the current axes.
ax = sns.countplot(x="user_tag", hue="airline_sentiment", data=dataset, ax=axes)
# NOTE(review): a single tick label presumes every tweet carries a mention (user_tag == 1) - confirm.
ax.set(xticklabels=['With mention tag'])
ax.set_xlabel('')
ax.set_ylabel('Number of Tweets')
ax.set_title('Tweets with User tag distribution within the dataset')
del ax,f,axes

In [None]:
f, axes = plt.subplots(figsize=(8, 5))
ax = sns.countplot(x="url_flag",hue="airline_sentiment", data=dataset)
ax.set(xticklabels=['Without URL','With URL'])
ax.set_xlabel('')
ax.set_ylabel('Number of Tweets')
ax.set_title('Tweets with URL distribution within the dataset')
del ax,f,axes

In [None]:
def happy_emoticons_removal(tweet:str) ->tuple[str,int]:
    """Delete "happy" text emoticons (``:)``, ``;-)``, ``:D``, ``xD``, ``:p`` ...) and every digit.

    Returns ``(cleaned_text, flag)``; ``flag`` is 1 when at least one emoticon was
    removed and 0 otherwise.
    """
    # Eyes: ':' / ';' anywhere, or 'x' / 'X' only when not glued to a preceding letter or digit.
    # Mouth: ')' anywhere, or d / D / p / P only when not followed by a letter or digit.
    # Without these guards ordinary words such as "expect" or "experience" (which contain
    # "xp") were mutilated and wrongly flagged as containing an emoticon.
    happy = re.compile(r"(?:(?<![A-Za-z0-9])[xX]|[;:])-?(?:[dDpP](?![A-Za-z0-9])|\))")
    text, n_removed = happy.subn(r'', tweet)
    # Digits are stripped from the remaining text as part of the cleaning.
    result = ''.join(ch for ch in text if not ch.isdigit())
    return (result, 1 if n_removed else 0)

def sad_emoticons_removal(tweet:str) ->tuple[str,int]:
    """Delete "sad" text emoticons (``:(``, ``:'-(``, ``:/`` ...) from a tweet and drop every digit.

    Returns ``(cleaned_text, flag)``; ``flag`` is 1 when at least one emoticon was
    removed and 0 otherwise.
    """
    stripped, hits = re.subn(r"[:;](['\"]?[-~]?[/(\|C<>{}\[]+)", r'', tweet)
    # Digits are stripped from the remaining text as part of the cleaning.
    cleaned = ''.join(filter(lambda ch: not ch.isdigit(), stripped))
    flag = 1 if hits else 0
    return (cleaned, flag)


In [None]:
def emoticon_removal_and_feature_creation(tweet:str)-> pd.Series:
  """Strip text emoticons from one tweet and derive the happy/sad indicator features.

  Runs ``happy_emoticons_removal`` first and feeds its text into ``sad_emoticons_removal``.

  Returns a Series ``[cleaned_text, happy_emoji_flag, sad_emoji_flag]``.
  """
  text_after_happy, had_happy = happy_emoticons_removal(tweet)
  text_after_sad, had_sad = sad_emoticons_removal(text_after_happy)
  features = [text_after_sad, had_happy, had_sad]
  return pd.Series(features)

In [None]:
dataset[['text','happy_emoji','sad_emoji']] = dataset.apply(lambda x: emoticon_removal_and_feature_creation(x.text),axis=1)

In [None]:
f, axes = plt.subplots(1,2,figsize=(12, 5))
ax = sns.countplot(x="happy_emoji", hue="airline_sentiment", data=dataset,ax=axes[0])
ax.set(xticklabels=['False','True'])
ax.set_xlabel('"Happy" emoji on Tweet')
ax.set_ylabel('Number of Tweets')
ax = sns.countplot(x="happy_emoji", hue="airline_sentiment", data=dataset[dataset.happy_emoji==1],ax=axes[1])
ax.set(xticklabels=['True'])
ax.set_xlabel('"Happy" emoji on Tweet')
ax.set_ylabel('Number of Tweets')
f.suptitle('Distribution of "Happy" emoji within the dataset',fontsize=15)
del ax,f,axes

In [None]:
f, axes = plt.subplots(1,2,figsize=(12, 5))
ax = sns.countplot(x="sad_emoji", hue="airline_sentiment", data=dataset,ax=axes[0])
ax.set(xticklabels=['False','True'])
ax.set_xlabel('"Sad" emoji on Tweet')
ax.set_ylabel('Number of Tweets')
ax = sns.countplot(x="sad_emoji", hue="airline_sentiment", data=dataset[dataset.sad_emoji==1],ax=axes[1])
ax.set(xticklabels=['True'])
ax.set_xlabel('"Sad" emoji on Tweet')
ax.set_ylabel('Number of Tweets')
f.suptitle('Distribution of "Sad" emoji within the dataset',fontsize=15)
del ax,f,axes

In [None]:
print(dataset.groupby("airline_sentiment", group_keys=True)[['happy_emoji']].value_counts())
print('-'*50)
print(dataset.groupby("airline_sentiment", group_keys=True)[['sad_emoji']].value_counts())

In [None]:
positive_sentiment = dataset[dataset.airline_sentiment == "positive"]
positive_text=positive_sentiment['text']
negative_sentiment = dataset[dataset.airline_sentiment == 'negative']
negative_text=negative_sentiment['text']
neutral_sentiment = dataset[dataset.airline_sentiment == 'neutral']
neutral_text=neutral_sentiment['text']
complain_text = top_complain_df['top_complain_reason']
top_authors = top_tweeter_df["top_tweet_authors"]

In [None]:
%%time
# Create and generate a word
fig, ax = plt.subplots(1, 5, figsize=(20, 8),edgecolor = 'k')
positive_tweet = WordCloud(width = 400,height = 400,background_color = 'black',max_words = 90,stopwords = STOPWORDS).generate(str(positive_text))
negative_tweet = WordCloud(width = 400,height = 400,background_color = 'black',max_words = 90,stopwords = STOPWORDS).generate(str(negative_text))
neutral_tweet = WordCloud(width = 400,height = 400,background_color = 'black',max_words = 90,stopwords = STOPWORDS).generate(str(neutral_text))
complain_tweet = WordCloud(width = 400,height = 400,background_color = 'black',max_words = 90,stopwords = STOPWORDS).generate(str(complain_text))
top_authors_tweet = WordCloud(width = 400,height = 400,background_color = 'black',max_words = 90,stopwords = STOPWORDS).generate(str(top_authors))

ax[0].imshow(positive_tweet)
ax[0].axis('off')
ax[0].set_title('Positive Sentiment')

ax[1].imshow(negative_tweet)
ax[1].axis('off')
ax[1].set_title('Negative Sentiment')

ax[2].imshow(neutral_tweet)
ax[2].axis('off')
ax[2].set_title('Neutral Sentiment')



ax[3].imshow(complain_tweet)
ax[3].axis('off')
ax[3].set_title('Top complain from clients')


ax[4].imshow(top_authors_tweet)
ax[4].axis('off')
ax[4].set_title('top authors name')

plt.show()
#del top_tweeter_df,top_complain_df

#### Drop less beneficial columns

In [None]:
dataset.drop(['user_tag','tweet_id','name','negativereason','tweet_created','negativereason_confidence','airline_sentiment_gold','negativereason_gold','tweet_coord','tweet_location','user_timezone'],axis=1,inplace=True)

In [None]:
print("Sum of missing values",sum(dataset.isnull().sum()))

In [None]:
target_dict = {'positive':1,'negative': -1,'neutral': 0}
dataset['target'] = dataset['airline_sentiment'].map(target_dict)

In [None]:
def split_dataset(df:pd.DataFrame,test_percentage:float)-> tuple[pd.DataFrame,pd.DataFrame]:
  """Randomly partition the rows of ``df`` into a train part and a test part.

  df              -- frame to split.
  test_percentage -- fraction of the rows that goes to the test part; the row
                     count is truncated to an integer.

  Returns ``(train, test)``. The shuffle uses NumPy's global random state.
  """
  n_rows = len(df)
  order = np.random.permutation(n_rows)
  n_test = int(n_rows * test_percentage)
  # The first n_test shuffled positions form the test part, the rest the train part.
  test_part = df.iloc[order[:n_test]]
  train_part = df.iloc[order[n_test:]]
  return (train_part, test_part)

In [None]:
train, test = split_dataset(dataset,0.2)

In [None]:
print(train.shape,test.shape)

In [None]:
train.head(3)

In [None]:
def stop_word_and_stemming(tweet:str)->str:
    """Tokenize a tweet, drop English stop words and stem the remaining tokens.

    Tokenization uses NLTK's ``TweetTokenizer`` (configured with ``preserve_case=False``,
    ``strip_handles=True``, ``reduce_len=True``) and stemming uses ``PorterStemmer``.

    Returns the stemmed tokens joined by single spaces.
    """
    tknzr = TweetTokenizer(preserve_case=False,strip_handles=True, reduce_len=True)
    myStemmer = PorterStemmer()
    # A set gives constant-time membership tests; the list returned by stopwords.words()
    # would otherwise be scanned once for every token.
    stop = set(stopwords.words('english'))
    tokens = tknzr.tokenize(tweet)
    return ' '.join(myStemmer.stem(word) for word in tokens if word not in stop)

#For removing punctuation
def remove_punctuations(text:str):
    """Return ``text`` with every character of ``string.punctuation`` deleted."""
    # A translation table that maps each punctuation character to "nothing".
    drop_table = str.maketrans('', '', string.punctuation)
    return text.translate(drop_table)

In [None]:
train['text'] = train['text'].apply(stop_word_and_stemming)
test['text'] = test['text'].apply(stop_word_and_stemming)

In [None]:
train['text'] = train['text'].apply(remove_punctuations)
test['text'] = test['text'].apply(remove_punctuations)

In [None]:
train.duplicated().any()

In [None]:
train.shape

In [None]:
train.drop_duplicates(inplace=True)

In [None]:
train.head(3)

### Baseline Model Implementation


In [None]:
y = train['target'].to_numpy()

In [None]:
y_t = test['target'].to_numpy()

In [None]:
vec = CountVectorizer(ngram_range=(1,2)).fit(train['text'])

In [None]:
X = vec.transform(train['text'])

In [None]:
X_t =vec.transform(test['text'])

In [None]:
vocab_size = len(vec.vocabulary_) + 1

In [None]:
print("Vocabulary Size :", vocab_size)

In [None]:
# feature selection
def features_selection(X_train, y_train, X_test, k='all'):
    """Score features with the chi-squared test and keep the ``k`` best ones.

    X_train, y_train -- training features and labels; the selector is fitted on these only.
    X_test           -- test features, reduced to the same columns as the training set.
    k                -- number of features to keep, passed to ``SelectKBest``. The default
                        ``'all'`` keeps every feature, so the call only computes the scores.

    Returns ``(X_train_fs, X_test_fs, SKB)`` where ``SKB`` is the fitted selector.
    """
    SKB = SelectKBest(score_func=chi2, k=k)
    SKB.fit(X_train, y_train)
    X_train_fs = SKB.transform(X_train)
    X_test_fs = SKB.transform(X_test)
    return X_train_fs, X_test_fs, SKB

In [None]:
X_train_fs, X_test_fs, SKB = features_selection(X, y, X_t)

In [None]:
for i in range(3):
	print('Feature %d: %f' % (i, SKB.scores_[i]))

#### Linear support vector machine

In [None]:
param_grid = {'C': [0.01, 0.1, 1.0, 10.0, 100.0]}
clf = LinearSVC(loss='hinge',random_state=1,class_weight="balanced")

In [None]:
grids = GridSearchCV(clf, param_grid,verbose=1,n_jobs=-1)
grids = grids.fit(X_train_fs, y)
print ("Best parameters: %s" % grids.best_params_)

In [None]:
predictions = grids.predict(X_test_fs)

In [None]:
def score_board_df(y_test,predictions,model_name,averaging_method='macro')-> pd.DataFrame:
  """Summarise a classifier's test scores as a one-row DataFrame.

  y_test           -- true labels.
  predictions      -- predicted labels.
  model_name       -- value stored in the "Classifier-name" column.
  averaging_method -- ``average`` argument used for F1, precision and recall.

  Every score is stored as a string formatted with three decimals.
  """
  averaged_metrics = {
      "F-measure": f1_score,
      "Precision": precision_score,
      "Recall": recall_score,
  }
  row = {"Classifier-name": model_name}
  for column, metric in averaged_metrics.items():
    row[column] = ["%0.3f" % metric(y_test, predictions, average=averaging_method)]
  # Accuracy takes no averaging argument.
  row["Accuracy"] = ["%0.3f" % accuracy_score(y_test, predictions)]
  row["Average-method"] = averaging_method
  return pd.DataFrame(row)


In [None]:
score_board = score_board_df(y_t, predictions,"LinearSVC",'weighted')

In [None]:
score_board

In [None]:
def conf_matrix(y_test, pred_test, labels=(-1, 0, 1))-> pd.DataFrame:
    """Compute the confusion matrix, plot it as a heatmap and return it.

    y_test    -- true labels.
    pred_test -- predicted labels.
    labels    -- class labels, in the order used for rows and columns. Defaults to the
                 notebook's encoding (-1 negative, 0 neutral, 1 positive).

    Returns the matrix as a DataFrame indexed by ``labels`` on both axes (rows are true
    labels, columns are predicted labels, per scikit-learn's ``confusion_matrix``).

    Side effect: ``sns.set(font_scale=1.5)`` changes the global seaborn theme.
    """
    class_labels = list(labels)
    # Pass the labels explicitly so the matrix always has one row/column per class,
    # even when a class is missing from both y_test and pred_test.
    con_mat = confusion_matrix(y_test, pred_test, labels=class_labels)
    con_mat = pd.DataFrame(con_mat, index=class_labels, columns=class_labels)

    # Plot the confusion matrix
    plt.figure(figsize=(8,5))
    sns.set(font_scale=1.5)
    ax = sns.heatmap(con_mat, annot=True, annot_kws={"size": 16}, fmt='g', cmap='Blues', cbar=False)
    ax.set_title('Confusion matrix')
    ax.set_xlabel('Predicted label')
    ax.set_ylabel('True label')
    return con_mat


In [None]:
_ = conf_matrix(y_t, predictions)