## Import pandas and RE

In [1]:
import pandas as pd
import re
from textblob import TextBlob

## Loading CSV file

In [2]:
neg_reviews_dataset = pd.read_csv(r"Tweets.csv",error_bad_lines=False)


In [3]:
neg_reviews_dataset["text"][74]

"@VirginAmerica not worried, it's been a great ride in a new plane with great crew. All airlines should be like this."

In [4]:
df = neg_reviews_dataset.filter(["airline_sentiment", "negativereason", "text"]) 

In [5]:
df.head(5)

Unnamed: 0,airline_sentiment,negativereason,text
0,neutral,,@VirginAmerica What @dhepburn said.
1,positive,,@VirginAmerica plus you've added commercials t...
2,neutral,,@VirginAmerica I didn't today... Must mean I n...
3,negative,Bad Flight,@VirginAmerica it's really aggressive to blast...
4,negative,Can't Tell,@VirginAmerica and it's a really big bad thing...


In [6]:
df = df.loc[df['airline_sentiment'] == 'negative']

In [7]:
df.head(5)

Unnamed: 0,airline_sentiment,negativereason,text
3,negative,Bad Flight,@VirginAmerica it's really aggressive to blast...
4,negative,Can't Tell,@VirginAmerica and it's a really big bad thing...
5,negative,Can't Tell,@VirginAmerica seriously would pay $30 a fligh...
15,negative,Late Flight,@VirginAmerica SFO-PDX schedule is still MIA.
17,negative,Bad Flight,@VirginAmerica I flew from NYC to SFO last we...


## Re-categorize negative reason

In [8]:
df.loc[df['negativereason'].isin(['Bad Flight', 
                                  'Flight Attendant Complaints']), 
       ['negativereason']] = 'Bad Flights'

In [9]:
df.loc[df['negativereason'].isin(['Customer Service Issue', 
                                  'Flight Booking Problems', 
                                  'longlines']), 
       ['negativereason']] = 'Customer Service'

In [10]:
df.loc[df['negativereason'].isin(['Lost Luggage',
                                  'Damaged Luggage']), 
       ['negativereason']] = 'Luggage Issues'

In [11]:
df.loc[df['negativereason'].isin(['Late Flight',
                                  'Cancelled Flight']), 
       ['negativereason']] = 'Flight Cancellation and Delays'

In [12]:
tmp = df
df

Unnamed: 0,airline_sentiment,negativereason,text
3,negative,Bad Flights,@VirginAmerica it's really aggressive to blast...
4,negative,Can't Tell,@VirginAmerica and it's a really big bad thing...
5,negative,Can't Tell,@VirginAmerica seriously would pay $30 a fligh...
15,negative,Flight Cancellation and Delays,@VirginAmerica SFO-PDX schedule is still MIA.
17,negative,Bad Flights,@VirginAmerica I flew from NYC to SFO last we...
...,...,...,...
14631,negative,Bad Flights,@AmericanAir thx for nothing on getting us out...
14633,negative,Flight Cancellation and Delays,@AmericanAir my flight was Cancelled Flightled...
14634,negative,Flight Cancellation and Delays,@AmericanAir right on cue with the delays👌
14636,negative,Customer Service,@AmericanAir leaving over 20 minutes Late Flig...


## Re-processing tweets

In [13]:
 #lowercase all words
tmp['text'] = [elem.lower() for elem in tmp['text']]

In [14]:
#Trim newline
tmp['text'] = [re.sub(r"\n", ' ', elem) for elem in tmp['text']] 

In [15]:
#Removing @Airline
tmp['text'] = [re.sub(r"@[a-zA-Z]*", '', elem) for elem in tmp['text']]

In [16]:
# Url
tmp['text'] = [re.sub(r"https?:\/\/(www\.)?[-a-zA-Z0-9@:%._\+~#=]{1,256}\.[a-zA-Z0-9()]{1,6}\b([-a-zA-Z0-9()@:%_\+.~#?&//=]*)", ' url ', elem) for elem in tmp['text']]

In [17]:
#Dealing with emoji
tmp['text'] = [elem.encode('ascii', 'ignore').decode('ascii') for elem in tmp['text']]

In [18]:
#Money
tmp['text'] = [re.sub(r"\$\d*", ' money ', elem) for elem in tmp['text']] #Money

In [19]:
# Dealing with time
tmp['text'] = [re.sub(r" min ", ' time ', elem) for elem in tmp['text']] #Minminute(s)?
tmp['text'] = [re.sub(r" minute(s)?", ' time ', elem) for elem in tmp['text']] #Min
tmp['text'] = [re.sub(r"\d\dmin", ' time ', elem) for elem in tmp['text']] 
tmp['text'] = [re.sub(r"(\d)*(h)*(r)*(\d)+min(ute)*(s)*", ' time ', elem) for elem in tmp['text']]

tmp['text'] = [re.sub(r" ?hr(s)? ", ' time ', elem) for elem in tmp['text']] #Hour\d\dhrs
tmp['text'] = [re.sub(r"\d\dhrs", ' time ', elem) for elem in tmp['text']]  #04hrs
tmp['text'] = [re.sub(r"(\d)?\d/(\d)?\d/\d(\d)?(\d)?(\d)?", ' time ', elem) for elem in tmp['text']]  #21/01/2014

    
tmp['text'] = [re.sub(r"\d?\d?:?\d?\d( )?am", ' time ', elem) for elem in tmp['text']]
tmp['text'] = [re.sub(r"\d?\d?:?\d?\d( )?pm", ' time ', elem) for elem in tmp['text']]

In [20]:
tmp['text'] = [re.sub(r"thx", ' thank ', elem) for elem in tmp['text']]
tmp['text'] = [re.sub(r"thru", ' through ', elem) for elem in tmp['text']]

In [21]:
#Dealing with year-old
tmp['text'] = [re.sub(r"yr old", ' year-old ', elem) for elem in tmp['text']]
tmp['text'] = [re.sub(r"year old", ' year-old ', elem) for elem in tmp['text']]

In [22]:
#Dealing with ur
tmp['text'] = [re.sub(r" ur ", ' your ', elem) for elem in tmp['text']]

In [23]:
tmp['text'] = [re.sub(r"flight [A-Z]{2} ?[0-9]{3,}", ' flight ', elem) for elem in tmp['text']] #flight numbers
tmp['text'] = [re.sub(r"[A-Z]{3}-[A-Z]{3}|[A-Z]{3} to [A-Z]{3}", 'flight', elem) for elem in tmp['text']] #trips ex: SFO-PDX [15]
tmp['text'] = [re.sub(r"[A-Z]{3} ", ' airport ', elem) for elem in tmp['text']] #ex: LAX [17]

In [24]:
tmp['text'][5]

" seriously would pay  money  a flight for seats that didn't have this playing. it's really the only bad thing about flying va"

## Checking spelling

In [25]:
#tmp['text'] = [TextBlob(elem).correct() for elem in tmp['text']]

In [26]:
tmp = tmp.filter(['negativereason', 'text'])
tmp

Unnamed: 0,negativereason,text
3,Bad Flights,"it's really aggressive to blast obnoxious ""en..."
4,Can't Tell,and it's a really big bad thing about it
5,Can't Tell,seriously would pay money a flight for seat...
15,Flight Cancellation and Delays,sfo-pdx schedule is still mia.
17,Bad Flights,i flew from nyc to sfo last week and couldn'...
...,...,...
14631,Bad Flights,thank for nothing on getting us out of the ...
14633,Flight Cancellation and Delays,"my flight was cancelled flightled, leaving to..."
14634,Flight Cancellation and Delays,right on cue with the delays
14636,Customer Service,leaving over 20 time late flight. no warning...


## To csv file

In [27]:
tmp.to_csv('negative-review.csv', columns = ['negativereason', 'text'], index=False)