## Install and Import

In [1]:
import warnings
warnings.filterwarnings('ignore')

In [2]:
# natural language toolkit
!pip install nltk contractions



🔑    :     https://www.nltk.org/api/nltk.tokenize.html

In [3]:
import nltk
import numpy as np
import pandas as pd 
import contractions

### Notebook settings

In [4]:
pd.set_option('display.max_colwidth', None)
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = 'all'

## Tokenization

In [5]:
sample_text= """This is pretty cool. A good quality candy might cost $3.88 in New York. 
                But I don't think we buy it. Mr.Biden said $1,000,000. 2 cars."""

In [6]:
from nltk.tokenize import sent_tokenize, wordpunct_tokenize, word_tokenize

#### Sentence Tokenization

In [83]:
# To use tokenziers
print(nltk.download('punkt'))

True


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\gulfa\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [8]:
sentence_tokens = sent_tokenize(sample_text.lower()) # Sensitive to punctuation '.' vs ','
print(sentence_tokens)

['this is pretty cool.', 'a good quality candy might cost $3.88 in new york.', "but i don't think we buy it.", 'mr.biden said $1,000,000.', '2 cars.']


#### WordPunct Tokenization

In [9]:
wordpunc_tokens = wordpunct_tokenize(sample_text.lower()) # regular-expression based tokenizer, which splits text on whitespace and punctuation
print(wordpunc_tokens)

['this', 'is', 'pretty', 'cool', '.', 'a', 'good', 'quality', 'candy', 'might', 'cost', '$', '3', '.', '88', 'in', 'new', 'york', '.', 'but', 'i', 'don', "'", 't', 'think', 'we', 'buy', 'it', '.', 'mr', '.', 'biden', 'said', '$', '1', ',', '000', ',', '000', '.', '2', 'cars', '.']


#### Word Tokenization

In [10]:
word_tokens = word_tokenize(sample_text.lower())
print(word_tokens)

['this', 'is', 'pretty', 'cool', '.', 'a', 'good', 'quality', 'candy', 'might', 'cost', '$', '3.88', 'in', 'new', 'york', '.', 'but', 'i', 'do', "n't", 'think', 'we', 'buy', 'it', '.', 'mr.biden', 'said', '$', '1,000,000', '.', '2', 'cars', '.']


## Removing Punctuation and Numbers

In [11]:
tokens_without_punc = [w for w in word_tokens if w.isalpha()] # .isalnum() for number and object # we are losing mr.biden
print(tokens_without_punc)

['this', 'is', 'pretty', 'cool', 'a', 'good', 'quality', 'candy', 'might', 'cost', 'in', 'new', 'york', 'but', 'i', 'do', 'think', 'we', 'buy', 'it', 'said', 'cars']


## Removing Stopwords

In [12]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\gulfa\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [13]:
from nltk.corpus import stopwords

In [14]:
stop_words = stopwords.words("english")
print(stop_words)
print('len stop_words :', len(stop_words))

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

In [15]:
words_to_exclude_from_stopwords = ['not', "n't", 'ain', 'aren', "aren't", 'couldn', "couldn't", 'didn', "didn't", 'doesn', "doesn't", "don't", 'hadn', 
                                   "hadn't", 'hasn', "hasn't", 'haven', "haven't", 'isn', "isn't", 'ma', 'mightn', "mightn't", 'mustn', "mustn't", 
                                   'needn', "needn't", 'shan', "shan't", 'shouldn', "shouldn't", 'wasn', "wasn't", 'weren', "weren't", 'won', 
                                   "won't", 'wouldn', "wouldn't"]

new_stopwords = [w for w in stop_words if w not in words_to_exclude_from_stopwords]
print('len new_stop_words :', len(new_stopwords))

len new_stop_words : 141


In [16]:
print(tokens_without_punc)

['this', 'is', 'pretty', 'cool', 'a', 'good', 'quality', 'candy', 'might', 'cost', 'in', 'new', 'york', 'but', 'i', 'do', 'think', 'we', 'buy', 'it', 'said', 'cars']


In [17]:
token_without_sw = [t for t in tokens_without_punc if t not in stop_words] # new_stopwords
print(token_without_sw)

['pretty', 'cool', 'good', 'quality', 'candy', 'might', 'cost', 'new', 'york', 'think', 'buy', 'said', 'cars']


## Lemmatization

In [18]:
from nltk.stem import WordNetLemmatizer

In [19]:
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\gulfa\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [20]:
WordNetLemmatizer().lemmatize("driving")
WordNetLemmatizer().lemmatize("driver")
WordNetLemmatizer().lemmatize("drives")

'driving'

'driver'

'drive'

In [21]:
lem = [WordNetLemmatizer().lemmatize(t) for t in token_without_sw]

In [22]:
print(token_without_sw)
print(lem)

['pretty', 'cool', 'good', 'quality', 'candy', 'might', 'cost', 'new', 'york', 'think', 'buy', 'said', 'cars']
['pretty', 'cool', 'good', 'quality', 'candy', 'might', 'cost', 'new', 'york', 'think', 'buy', 'said', 'car']


## Stemming

In [23]:
from nltk.stem import PorterStemmer

In [24]:
PorterStemmer().stem("driving")
PorterStemmer().stem("driver")
PorterStemmer().stem("drives")

'drive'

'driver'

'drive'

In [25]:
stem = [PorterStemmer().stem(t) for t in token_without_sw]

In [26]:
print('w/o norm :', token_without_sw)
print('stem     :', stem)
print('lemma    :', lem)

w/o norm : ['pretty', 'cool', 'good', 'quality', 'candy', 'might', 'cost', 'new', 'york', 'think', 'buy', 'said', 'cars']
stem     : ['pretti', 'cool', 'good', 'qualiti', 'candi', 'might', 'cost', 'new', 'york', 'think', 'buy', 'said', 'car']
lemma    : ['pretty', 'cool', 'good', 'quality', 'candy', 'might', 'cost', 'new', 'york', 'think', 'buy', 'said', 'car']


## Joining

In [27]:
" ".join(lem)

'pretty cool good quality candy might cost new york think buy said car'

### Expanding Contractions

In [28]:
my_text  = word_tokenize(contractions.fix("I'll go there I've got a book".lower()))
my_text
#[w for w in my_text if w.isalpha()]

['i', 'will', 'go', 'there', 'i', "'ve", 'got', 'a', 'book']

### Part of Speech Tag


In [29]:
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\gulfa\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [30]:
from nltk import pos_tag

In [31]:
text = """Steven Paul Jobs was an American business magnate, industrial designer, investor, and media proprietor. 
He was the chairman, chief executive officer (CEO), and co-founder of Apple Inc.; the chairman and majority shareholder of Pixar; 
a member of The Walt Disney Company's board of directors following its acquisition of Pixar; and the founder, chairman, and CEO of NeXT."""

In [32]:
tokens = word_tokenize(text)
pos = pos_tag(tokens)
pos

[('Steven', 'NNP'),
 ('Paul', 'NNP'),
 ('Jobs', 'NNP'),
 ('was', 'VBD'),
 ('an', 'DT'),
 ('American', 'JJ'),
 ('business', 'NN'),
 ('magnate', 'NN'),
 (',', ','),
 ('industrial', 'JJ'),
 ('designer', 'NN'),
 (',', ','),
 ('investor', 'NN'),
 (',', ','),
 ('and', 'CC'),
 ('media', 'NNS'),
 ('proprietor', 'NN'),
 ('.', '.'),
 ('He', 'PRP'),
 ('was', 'VBD'),
 ('the', 'DT'),
 ('chairman', 'NN'),
 (',', ','),
 ('chief', 'JJ'),
 ('executive', 'NN'),
 ('officer', 'NN'),
 ('(', '('),
 ('CEO', 'NNP'),
 (')', ')'),
 (',', ','),
 ('and', 'CC'),
 ('co-founder', 'NN'),
 ('of', 'IN'),
 ('Apple', 'NNP'),
 ('Inc.', 'NNP'),
 (';', ':'),
 ('the', 'DT'),
 ('chairman', 'NN'),
 ('and', 'CC'),
 ('majority', 'NN'),
 ('shareholder', 'NN'),
 ('of', 'IN'),
 ('Pixar', 'NNP'),
 (';', ':'),
 ('a', 'DT'),
 ('member', 'NN'),
 ('of', 'IN'),
 ('The', 'DT'),
 ('Walt', 'NNP'),
 ('Disney', 'NNP'),
 ('Company', 'NNP'),
 ("'s", 'POS'),
 ('board', 'NN'),
 ('of', 'IN'),
 ('directors', 'NNS'),
 ('following', 'VBG'),
 ('its', 

### NER

In [33]:
nltk.download('maxent_ne_chunker')
nltk.download('words')

[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     C:\Users\gulfa\AppData\Roaming\nltk_data...
[nltk_data]   Package maxent_ne_chunker is already up-to-date!


True

[nltk_data] Downloading package words to
[nltk_data]     C:\Users\gulfa\AppData\Roaming\nltk_data...
[nltk_data]   Package words is already up-to-date!


True

In [34]:
from nltk import ne_chunk

In [35]:
import matplotlib as mpl
import os
if os.environ.get('DISPLAY','') == '':
    print('no display found. Using non-interactive Agg backend')
    mpl.use('Agg')

for chunk in nltk.ne_chunk(pos):
      if hasattr(chunk, 'label'):
        print(chunk.label(), ' '.join(c[0] for c in chunk))
text

no display found. Using non-interactive Agg backend
PERSON Steven
PERSON Paul Jobs
GPE American
ORGANIZATION CEO
ORGANIZATION Apple Inc.
GPE Pixar
ORGANIZATION Walt Disney Company
GPE Pixar
ORGANIZATION CEO
ORGANIZATION NeXT


"Steven Paul Jobs was an American business magnate, industrial designer, investor, and media proprietor. \nHe was the chairman, chief executive officer (CEO), and co-founder of Apple Inc.; the chairman and majority shareholder of Pixar; \na member of The Walt Disney Company's board of directors following its acquisition of Pixar; and the founder, chairman, and CEO of NeXT."

## Cleaning Function

In [36]:
##### !!!!!!!!!!
a = "I don't want to fly with your company." # vs 
[token for token in word_tokenize(a) if token not in stop_words] 
b = "I do not want to fly with your company" 
[token for token in word_tokenize(b) if token not in stop_words] 

['I', "n't", 'want', 'fly', 'company', '.']

['I', 'want', 'fly', 'company']

In [37]:
"I don't want to fly with your company.".replace("'", "")

'I dont want to fly with your company.'

In [38]:
def cleaning(data):
    
    #1. Contractions Expension & Tokenize
    #text_tokens = word_tokenize(contractions.fix(data.lower())) 
    text_tokens = word_tokenize(data.replace("'", '').lower())
    
    #2. Remove Puncs
    tokens_without_punc = [w for w in text_tokens if w.isalpha()]
    
    #3. Removing Stopwords
    tokens_without_sw = [t for t in tokens_without_punc if t not in stop_words]
    
    #4. lemma
    text_cleaned = [WordNetLemmatizer().lemmatize(t) for t in tokens_without_sw]
    
    #joining
    return " ".join(text_cleaned)

## CountVectorization and TF-IDF Vectorization

#### Data
🔑 Source: https://www.kaggle.com/crowdflower/twitter-airline-sentiment?select=Tweets.csv

In [39]:
# For Colab
#from google.colab import drive
#drive.mount('/content/gdrive')

In [40]:
df = pd.read_csv('Tweets.csv')
#df = pd.read_csv("airline_tweets.csv")

In [41]:
df.head()

Unnamed: 0,tweet_id,airline_sentiment,airline_sentiment_confidence,negativereason,negativereason_confidence,airline,airline_sentiment_gold,name,negativereason_gold,retweet_count,text,tweet_coord,tweet_created,tweet_location,user_timezone
0,570306133677760513,neutral,1.0,,,Virgin America,,cairdin,,0,@VirginAmerica What @dhepburn said.,,2015-02-24 11:35:52 -0800,,Eastern Time (US & Canada)
1,570301130888122368,positive,0.3486,,0.0,Virgin America,,jnardino,,0,@VirginAmerica plus you've added commercials to the experience... tacky.,,2015-02-24 11:15:59 -0800,,Pacific Time (US & Canada)
2,570301083672813571,neutral,0.6837,,,Virgin America,,yvonnalynn,,0,@VirginAmerica I didn't today... Must mean I need to take another trip!,,2015-02-24 11:15:48 -0800,Lets Play,Central Time (US & Canada)
3,570301031407624196,negative,1.0,Bad Flight,0.7033,Virgin America,,jnardino,,0,"@VirginAmerica it's really aggressive to blast obnoxious ""entertainment"" in your guests' faces &amp; they have little recourse",,2015-02-24 11:15:36 -0800,,Pacific Time (US & Canada)
4,570300817074462722,negative,1.0,Can't Tell,1.0,Virgin America,,jnardino,,0,@VirginAmerica and it's a really big bad thing about it,,2015-02-24 11:14:45 -0800,,Pacific Time (US & Canada)


In [42]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14640 entries, 0 to 14639
Data columns (total 15 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   tweet_id                      14640 non-null  int64  
 1   airline_sentiment             14640 non-null  object 
 2   airline_sentiment_confidence  14640 non-null  float64
 3   negativereason                9178 non-null   object 
 4   negativereason_confidence     10522 non-null  float64
 5   airline                       14640 non-null  object 
 6   airline_sentiment_gold        40 non-null     object 
 7   name                          14640 non-null  object 
 8   negativereason_gold           32 non-null     object 
 9   retweet_count                 14640 non-null  int64  
 10  text                          14640 non-null  object 
 11  tweet_coord                   1019 non-null   object 
 12  tweet_created                 14640 non-null  object 
 13  t

In [43]:
df = df[['airline_sentiment','text']]
df.rename(columns={'airline_sentiment':'sentiment'}, inplace=True)
df.sample(20)

Unnamed: 0,sentiment,text
9943,positive,@USAirways Ann Marie at LGA is the best ticket agent ever! #excellentcustomerservice
2747,negative,@united your agents forced me to check a carry on bag. When I received my bag I found your crew had stolen from me. U lost my business!
5834,neutral,@SouthwestAir sent
154,neutral,"@VirginAmerica - Is Flight 713 from Love Field to SFO definitely Cancelled Flightled for Monday, February 23?"
10401,negative,@USAirways That's clever. Its clear you need more agents. 94 min hold and counting. Flex workforces are highly possible in 2015
14033,negative,@AmericanAir approaching three hours sitting in the plane on the ground at DFW on #AmericanAirlines flight 3056 - Oscar performance
7004,neutral,YASSSSS. Da Fuccc- RT @JetBlue: Our fleet's on fleek. http://t.co/oAJ5mnuchA
252,positive,"@VirginAmerica thank you for the easy itinerary shift for impending weather. Quick, painless &amp; free."
7305,positive,@JetBlue Love you guys😍😍😍 http://t.co/3X9NRUOvtS
13128,negative,@AmericanAir you keep returning my call and hanging up when I answer? Help reFlight Booking Problems a flight!


In [44]:
df2 = df.copy()

In [45]:
df2["clean_text"] = df2["text"].apply(cleaning)
df2.head()

Unnamed: 0,sentiment,text,clean_text
0,neutral,@VirginAmerica What @dhepburn said.,virginamerica dhepburn said
1,positive,@VirginAmerica plus you've added commercials to the experience... tacky.,virginamerica plus youve added commercial experience tacky
2,neutral,@VirginAmerica I didn't today... Must mean I need to take another trip!,virginamerica didnt today must mean need take another trip
3,negative,"@VirginAmerica it's really aggressive to blast obnoxious ""entertainment"" in your guests' faces &amp; they have little recourse",virginamerica really aggressive blast obnoxious entertainment guest face amp little recourse
4,negative,@VirginAmerica and it's a really big bad thing about it,virginamerica really big bad thing


In [46]:
# URLs
df2[df2['clean_text'].str.contains('http')].head()

Unnamed: 0,sentiment,text,clean_text
7,neutral,"@VirginAmerica Really missed a prime opportunity for Men Without Hats parody, there. https://t.co/mWpG7grEZP",virginamerica really missed prime opportunity men without hat parody http
13,positive,@VirginAmerica @virginmedia I'm flying your #fabulous #Seductive skies again! U take all the #stress away from travel http://t.co/ahlXHhKiyn,virginamerica virginmedia im flying fabulous seductive sky u take stress away travel http
21,positive,@VirginAmerica I love this graphic. http://t.co/UT5GrRwAaA,virginamerica love graphic http
34,positive,@VirginAmerica this is great news! America could start flights to Hawaii by end of year http://t.co/r8p2Zy3fe4 via @Pacificbiznews,virginamerica great news america could start flight hawaii end year http via pacificbiznews
35,neutral,Nice RT @VirginAmerica: Vibe with the moodlight from takeoff to touchdown. #MoodlitMonday #ScienceBehindTheExperience http://t.co/Y7O0uNxTQP,nice rt virginamerica vibe moodlight takeoff touchdown moodlitmonday sciencebehindtheexperience http


In [47]:
# Tags
df2[df2['text'].str.contains('#')].head(3)

Unnamed: 0,sentiment,text,clean_text
13,positive,@VirginAmerica @virginmedia I'm flying your #fabulous #Seductive skies again! U take all the #stress away from travel http://t.co/ahlXHhKiyn,virginamerica virginmedia im flying fabulous seductive sky u take stress away travel http
16,positive,@VirginAmerica So excited for my first cross country flight LAX to MCO I've heard nothing but great things about Virgin America. #29DaysToGo,virginamerica excited first cross country flight lax mco ive heard nothing great thing virgin america
26,negative,@VirginAmerica What happened 2 ur vegan food options?! At least say on ur site so i know I won't be able 2 eat anything for next 6 hrs #fail,virginamerica happened ur vegan food option least say ur site know wont able eat anything next hr fail


In [48]:
# Mentions
df2[df2['text'].str.contains('@')].head(3)

Unnamed: 0,sentiment,text,clean_text
0,neutral,@VirginAmerica What @dhepburn said.,virginamerica dhepburn said
1,positive,@VirginAmerica plus you've added commercials to the experience... tacky.,virginamerica plus youve added commercial experience tacky
2,neutral,@VirginAmerica I didn't today... Must mean I need to take another trip!,virginamerica didnt today must mean need take another trip


In [49]:
df2.sample(10)

Unnamed: 0,sentiment,text,clean_text
5425,negative,@SouthwestAir rocks - @AmericanAir horror,southwestair rock americanair horror
9599,neutral,"@USAirways Do your flight schedules to - from Quintana Roo, Mexico reflect their newly adopted time zone change to EST?",usairways flight schedule quintana roo mexico reflect newly adopted time zone change est
4592,neutral,"@SouthwestAir briughy me to @ComClassic, #AIF2015 &amp; so much more with the #agcommunity",southwestair briughy comclassic amp much agcommunity
13233,negative,@AmericanAir Trying to Cancelled Flight fligt 2321 O'Hare to Dallas 12:17pm. High call volume no one answer. Online no confirmation. #thankU,americanair trying cancelled flight fligt ohare dallas high call volume one answer online confirmation thanku
9966,negative,@USAirways I have nothing but issues when I fly w/u what's up w/that?! #travel #vacation #awful,usairways nothing issue fly whats travel vacation awful
12792,positive,@AmericanAir keep up the good work. Got me to my destination safe and on time today,americanair keep good work got destination safe time today
13024,negative,"@AmericanAir, I've tried no less than 8 times today to get in touch with your service desk beginning at 8:30 EST. I'm having no luck -help!",americanair ive tried le time today get touch service desk beginning est im luck
5780,negative,@SouthwestAir I love ya but your bringin me down. An hour Late Flight leaving and now we've been sitting in the runway for 20 min 😥 #fail,southwestair love ya bringin hour late flight leaving weve sitting runway min fail
13934,negative,"@AmericanAir yeah, rebooked for tomorrow morning. But extremely disappointed to miss a wedding.",americanair yeah rebooked tomorrow morning extremely disappointed miss wedding
6055,neutral,@SouthwestAir booked our flights this morning. Can't wait to move about the country.,southwestair booked flight morning cant wait move country


In [50]:
df = pd.read_csv('Tweets.csv')
#df = pd.read_csv("airline_tweets.csv")

df = df[['airline_sentiment','text']]
df.rename(columns={'airline_sentiment':'sentiment'}, inplace=True)

In [51]:
wordpunct_tokenize(df2['text'][0])

['@', 'VirginAmerica', 'What', '@', 'dhepburn', 'said', '.']

In [52]:
def updated_cleaning(data):

    import re
    
    #1. Removing URLS
    data = re.sub(r'http\S+', '', data)

    #2. Removing Tags
    data = re.sub(r'#\w+', '', data)

    #3. Removing Mentions
    data = re.sub(r'@\w+', '', data)

    #4. Contractions Expension & Tokenize
    #text_tokens = word_tokenize(contractions.fix(data.lower())) 
    text_tokens = word_tokenize(data.replace("'", '').lower())

    #5. Removing mentions
    tokens_without_mention = [w for w in text_tokens if not w.startswith('@')]
    
    #6. Remove Puncs
    tokens_without_punc = [w for w in tokens_without_mention if w.isalpha()]
    
    #7. Removing Stopwords
    tokens_without_sw = [t for t in tokens_without_punc if t not in stop_words]
    
    #8. lemma
    text_cleaned = [WordNetLemmatizer().lemmatize(t) for t in tokens_without_sw]
    
    #joining
    return " ".join(text_cleaned)

In [53]:
df2 = df.copy()
df2["clean_text"] = df2["text"].apply(updated_cleaning)

In [54]:
df2[df2['clean_text'].str.contains('http')]
df2[df2['clean_text'].str.contains('#')]
df2[df2['clean_text'].str.contains('@')]

Unnamed: 0,sentiment,text,clean_text


Unnamed: 0,sentiment,text,clean_text


Unnamed: 0,sentiment,text,clean_text


In [55]:
df2.sample(10)

Unnamed: 0,sentiment,text,clean_text
7875,positive,@JetBlue got it. Thanks,got thanks
9764,negative,"@USAirways your rude staff said"" I don't care that we are out of market place food you're going on vacation and I have to work"" nice huh",rude staff said dont care market place food youre going vacation work nice huh
7664,neutral,@JetBlue Alright I hope. My JetBlue app is showing a change in my seats and online shows an a320. Don't want any surprises,alright hope jetblue app showing change seat online show dont want surprise
2190,negative,.@united Your airline is again rated the WORST in America so you've got your work cut out for you. It's bc you treat customers like garbage.,airline rated worst america youve got work cut bc treat customer like garbage
12400,negative,@AmericanAir 140 characters aren't enough to describe how inconsiderate your employees are,character arent enough describe inconsiderate employee
9679,positive,@USAirways looks like our bag has been rescued. Thanks!,look like bag rescued thanks
3943,negative,@united next flight? Don't think I'll be spending anymore money with you guys ever. It was that bad.,next flight dont think ill spending anymore money guy ever bad
4463,neutral,@SouthwestAir @PHLAirport Will Flight 2155 that arrives at E11 be a penguin plane?,flight arrives penguin plane
9833,negative,@USAirways @AmericanAir 2hrs Late Flightr finally taking off😂😂😂,late flightr finally taking
762,negative,@united 4 passengers after a 2 hour delayed flight left with no hotel at the end of the night @ hou airport. Wtf??!! http://t.co/ZfqMpGXVS6,passenger hour delayed flight left hotel end night hou airport wtf


## CountVectorization

In [56]:
X = df2[["clean_text"]] # as a dataframe
y = df2["sentiment"]

In [57]:
from sklearn.model_selection import train_test_split

In [58]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, stratify = y, random_state = 4299)

In [59]:
X_train

Unnamed: 0,clean_text
3013,haha clean plane held overnight hangar sound lovely also dont lie screensand say weather
13868,let seriously
2391,hello flying first class behind people zone pls pas app dept board class first
9222,worry flight attendant took care
8721,claimed happy way treated
...,...
7823,yesterday flight wanted chicago sale within point range today isnt way guy still honor
9228,ive hold hour cc mile arent showing mediocre combo cc amp airline
5237,yes total hour hold cancelled flightlations one would think would staff decided cx drive
119,love team running gate la tonight waited delayed flight kept thing entertaining


In [60]:
from sklearn.feature_extraction.text import CountVectorizer

In [61]:
vectorizer1 = CountVectorizer()
X_train_count1 = vectorizer1.fit_transform(X_train['clean_text']) # INPUT: should be a list or pd.Series
X_test_count1 = vectorizer1.transform(X_test['clean_text'])

In [62]:
#vectorizer1.get_feature_names()[:25]

In [63]:
len(vectorizer1.get_feature_names())

7705

In [64]:
vectorizer1.get_feature_names()[:20]

['aa',
 'aaaand',
 'aadavantage',
 'aadv',
 'aadvantage',
 'aal',
 'aaron',
 'ab',
 'aback',
 'abandon',
 'abandoned',
 'abandonment',
 'abassinet',
 'abbreve',
 'abc',
 'abducted',
 'ability',
 'able',
 'aboard',
 'abounds']

#### min_df

In [65]:
#https://stackoverflow.com/questions/36572221/how-to-find-ngram-frequency-of-a-column-in-a-pandas-dataframe
# WHITE BOARD: https://whiteboard.office.com/me/whiteboards/13947786-4746-4a0e-ad23-4c720b363d5c
vectorizer2 = CountVectorizer(preprocessor=cleaning, min_df=2, ngram_range=(1,2))
X_train_count2 = vectorizer2.fit_transform(X_train['clean_text'])
X_test_count2 = vectorizer2.transform(X_test['clean_text'])

len(vectorizer2.get_feature_names())

13244

In [66]:
vectorizer2.get_feature_names()[:20]

['aa',
 'aa agent',
 'aa amp',
 'aa customer',
 'aa dallas',
 'aa dfw',
 'aa doesnt',
 'aa email',
 'aa employee',
 'aa family',
 'aa flight',
 'aa gate',
 'aa gold',
 'aa help',
 'aa mile',
 'aa monday',
 'aa number',
 'aa platinum',
 'aa possible',
 'aa usair']

In [67]:
X_train_count2.toarray()

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [68]:
count_df = pd.DataFrame(X_train_count2.toarray(), columns = vectorizer2.get_feature_names())
count_df

Unnamed: 0,aa,aa agent,aa amp,aa customer,aa dallas,aa dfw,aa doesnt,aa email,aa employee,aa family,...,yyz,yyz terminal,zero,zero communication,zero entertainment,zero response,zone,zone nine,zoom,zurich
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11707,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
11708,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
11709,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
11710,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [69]:
# TOP 20 TOKENS
print('Before min_df')
list(zip(vectorizer1.get_feature_names(), X_train_count1.toarray().sum(axis=0)))[:20]
print('\n')
print('After min_df')
list(zip(vectorizer2.get_feature_names(), X_train_count2.toarray().sum(axis=0)))[:20]

Before min_df


[('aa', 168),
 ('aaaand', 1),
 ('aadavantage', 1),
 ('aadv', 2),
 ('aadvantage', 9),
 ('aal', 1),
 ('aaron', 1),
 ('ab', 1),
 ('aback', 1),
 ('abandon', 1),
 ('abandoned', 1),
 ('abandonment', 1),
 ('abassinet', 1),
 ('abbreve', 1),
 ('abc', 6),
 ('abducted', 1),
 ('ability', 4),
 ('able', 93),
 ('aboard', 3),
 ('abounds', 1)]



After min_df


[('aa', 168),
 ('aa agent', 3),
 ('aa amp', 4),
 ('aa customer', 4),
 ('aa dallas', 2),
 ('aa dfw', 2),
 ('aa doesnt', 2),
 ('aa email', 2),
 ('aa employee', 4),
 ('aa family', 2),
 ('aa flight', 5),
 ('aa gate', 2),
 ('aa gold', 2),
 ('aa help', 3),
 ('aa mile', 2),
 ('aa monday', 2),
 ('aa number', 2),
 ('aa platinum', 2),
 ('aa possible', 2),
 ('aa usair', 2)]

In [70]:
X_train.loc[2, :]

clean_text    didnt today must mean need take another trip
Name: 2, dtype: object

## TF-IDF

🔑 : sklearn TD-IDF
https://towardsdatascience.com/how-sklearns-tf-idf-is-different-from-the-standard-tf-idf-275fa582e73d

In [71]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [72]:
tf_idf_vectorizer = TfidfVectorizer()
X_train_tf_idf = tf_idf_vectorizer.fit_transform(X_train['clean_text'])
X_test_tf_idf = tf_idf_vectorizer.transform(X_test['clean_text'])

In [73]:
tf_idf_vectorizer.get_feature_names()[:25]

['aa',
 'aaaand',
 'aadavantage',
 'aadv',
 'aadvantage',
 'aal',
 'aaron',
 'ab',
 'aback',
 'abandon',
 'abandoned',
 'abandonment',
 'abassinet',
 'abbreve',
 'abc',
 'abducted',
 'ability',
 'able',
 'aboard',
 'abounds',
 'abq',
 'abroad',
 'absolute',
 'absolutely',
 'absorb']

In [74]:
X_train_tf_idf.toarray()

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [75]:
pd.DataFrame(X_train_tf_idf.toarray(), columns = tf_idf_vectorizer.get_feature_names())[:10]

Unnamed: 0,aa,aaaand,aadavantage,aadv,aadvantage,aal,aaron,ab,aback,abandon,...,yyz,zabsonre,zambia,zero,zip,zipper,zone,zoom,zukes,zurich
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.312678,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [76]:
X_train.loc[5]

clean_text    seriously would pay flight seat didnt playing really bad thing flying va
Name: 5, dtype: object

In [77]:
pd.DataFrame(X_train_tf_idf.toarray(), columns = tf_idf_vectorizer.get_feature_names()).loc[1].sort_values(ascending=False)[:30]

seriously          0.774457
let                0.632627
pregame            0.000000
preregistration    0.000000
prepares           0.000000
prepared           0.000000
prepare            0.000000
preparation        0.000000
preoccupied        0.000000
premium            0.000000
premiere           0.000000
premier            0.000000
prem               0.000000
pregnant           0.000000
preggo             0.000000
prefference        0.000000
present            0.000000
preferred          0.000000
preference         0.000000
preferably         0.000000
prefer             0.000000
pref               0.000000
preemptive         0.000000
predictive         0.000000
predicted          0.000000
predictable        0.000000
predict            0.000000
precludes          0.000000
precipitation      0.000000
prescreen          0.000000
Name: 1, dtype: float64

In [78]:
df2[df2['clean_text'].str.contains('let')][['sentiment', 'clean_text']]['sentiment'].value_counts().plot(kind='bar');

In [79]:
df2[df2['clean_text'].str.contains('let ')][['sentiment', 'clean_text']]

Unnamed: 0,sentiment,clean_text
59,neutral,new marketing song let u know think
71,neutral,emailed customer service team let know need tracking number
97,negative,let scanned passenger leave plane told someone remove bag class bin
191,negative,trying book flight guy website wont let lose business
216,negative,mobile site broken show number point wont let checkin
...,...,...
14364,negative,im trying call book reward ticket one world partner automated system wont let talk great job
14375,negative,let crew member know every time happens time
14429,negative,reservation system wont even let u leave phone way fix rebooked wrong day
14528,negative,let employee treat loyal customer
